package org.apache.lucene.analysis.synonym;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
* Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion &lt; 3.4).
* <pre class="prettyprint">
* &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
*   &lt;analyzer&gt;
*     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
*     &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
*             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
*   &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @deprecated (3.4) Use {@link SynonymFilterFactory} instead. This factory is kept only for
*             precise index backwards compatibility and will be removed in Lucene 5.0.
*/
@Deprecated
final class SlowSynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
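/**
* Reads the factory arguments ({@code synonyms}, {@code ignoreCase}, {@code expand},
* {@code tokenizerFactory}), loads the rule file(s) via the given loader, and builds
* the {@link SlowSynonymMap} used by {@link #create(TokenStream)}.
*/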
public void inform(ResourceLoader loader) throws IOException {
String synonyms = args.get("synonyms");
if (synonyms == null)
throw new IllegalArgumentException("Missing required argument 'synonyms'.");
boolean ignoreCase = getBoolean("ignoreCase", false);
boolean expand = getBoolean("expand", true);
String tf = args.get("tokenizerFactory");
TokenizerFactory tokFactory = null;
if (tf != null) {
tokFactory = loadTokenizerFactory(loader, tf);
}
Iterable<String> wlist = loadRules(synonyms, loader);
synMap = new SlowSynonymMap(ignoreCase);
parseRules(wlist, synMap, "=>", ",", expand, tokFactory);
}
/**
* Loads the synonym rules from the named resource; {@code synonyms} may be a single file
* name or a comma-separated list of file names, each loaded via the {@link ResourceLoader}.
* @return a list of all rules
*/
protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) throws IOException {
List<String> wlist=null;
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
wlist = getLines(loader, synonyms);
} else {
List<String> files = splitFileNames(synonyms);
wlist = new ArrayList<String>();
for (String file : files) {
List<String> lines = getLines(loader, file.trim());
wlist.addAll(lines);
}
}
return wlist;
}
private SlowSynonymMap synMap;
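/**
* Parses synonym rules into the map. With the separators used by this factory, a rule is either
* a bidirectional group such as {@code "i-pod, ipod, i pod"} (split on {@code synSep}) or an
* explicit mapping such as {@code "foo => bar, baz"} (split on {@code mappingSep}); when
* {@code expansion} is false, a group without {@code =>} is reduced to its first entry.
*/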
static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) throws IOException {
int count=0;
for (String rule : rules) {
// Splitting on an unescaped separator with a regex would require an expression that
// counts preceding escape characters, which can't really be done with String.split().
// Since we need to do unescaping at some point anyway, we wouldn't be saving any
// effort by using regexes.
List<String> mapping = splitSmart(rule, mappingSep, false);
List<List<String>> source;
List<List<String>> target;
if (mapping.size() > 2) {
throw new IllegalArgumentException("Invalid Synonym Rule:" + rule);
} else if (mapping.size()==2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList<List<String>>(1);
target.add(source.get(0));
}
}
boolean includeOrig=false;
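// Add every (source token sequence -> target token sequence) pair to the map.
// Originals are never kept since includeOrig is always false here; the final
// boolean passed to map.add(...) appears to merge the new entry with any existing
// entry for the same source tokens (assumption based on the SlowSynonymMap.add signature).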
for (List<String> fromToks : source) {
count++;
for (List<String> toToks : target) {
map.add(fromToks,
SlowSynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
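// i.e. split the rule side on the synonym separator, then tokenize each alternative
// into its own list of token strings.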
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) throws IOException {
List<String> strList = splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List<List<String>> synList = new ArrayList<List<String>>();
for (String toks : strList) {
List<String> tokList = tokFactory == null ?
splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
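/** Tokenizes one side of a rule with the configured tokenizer factory and returns the token texts. */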
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException{
StringReader reader = new StringReader( source );
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset(); // TokenStream contract: reset() must be called before incrementToken()
while (ts.incrementToken()){
if( termAtt.length() > 0 )
tokList.add( termAtt.toString() );
}
} finally{
reader.close();
}
return tokList;
}
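/**
* Instantiates the tokenizer factory named by the {@code tokenizerFactory} argument,
* initializing it with this factory's args and Lucene match version.
*/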
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class);
tokFactory.setLuceneMatchVersion(luceneMatchVersion);
tokFactory.init( args );
if (tokFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) tokFactory).inform(loader);
}
return tokFactory;
}
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
return tokFactory.create( reader );
}
public SlowSynonymMap getSynonymMap() {
return synMap;
}
public SlowSynonymFilter create(TokenStream input) {
return new SlowSynonymFilter(input, synMap);
}
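/**
* Splits on whitespace, honoring backslash escapes when {@code decode} is true.
* For example (illustrative): {@code splitWS("foo\\ bar baz", true)} yields
* {@code ["foo bar", "baz"]} because the escaped space is kept literal.
*/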
public static List<String> splitWS(String s, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos=0, end=s.length();
while (pos < end) {
char ch = s.charAt(pos++);
if (Character.isWhitespace(ch)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb=new StringBuilder();
}
continue;
}
if (ch=='\\') {
if (!decode) sb.append(ch);
if (pos>=end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch(ch) {
case 'n' : ch='\n'; break;
case 't' : ch='\t'; break;
case 'r' : ch='\r'; break;
case 'b' : ch='\b'; break;
case 'f' : ch='\f'; break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
/** Splits a backslash escaped string on the separator.
* <p>
* Current backslash escaping supported:
* <br> \n \t \r \b \f are escaped the same as a Java String
* <br> Other characters following a backslash are produced verbatim (\c => c)
*
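* <br> For example (illustrative): {@code splitSmart("a\\,b,c", ",", true)} yields
* {@code ["a,b", "c"]}, since the escaped comma is kept literal and empty segments are dropped.
*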
* @param s the string to split
* @param separator the separator to split on
* @param decode decode backslash escaping
*/
public static List<String> splitSmart(String s, String separator, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos=0, end=s.length();
while (pos < end) {
if (s.startsWith(separator,pos)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb=new StringBuilder();
}
pos+=separator.length();
continue;
}
char ch = s.charAt(pos++);
if (ch=='\\') {
if (!decode) sb.append(ch);
if (pos>=end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch(ch) {
case 'n' : ch='\n'; break;
case 't' : ch='\t'; break;
case 'r' : ch='\r'; break;
case 'b' : ch='\b'; break;
case 'f' : ch='\f'; break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
}