SynonymFilterFactory.java example

Explorer
solrcene-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * @version $Id: SynonymFilterFactory.java 942827 2010-05-10 17:37:45Z rmuir $
 */
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {

  public void inform(ResourceLoader loader) {
    String synonyms = args.get("synonyms");

    boolean ignoreCase = getBoolean("ignoreCase", false);
    boolean expand = getBoolean("expand", true);

    String tf = args.get("tokenizerFactory");
    TokenizerFactory tokFactory = null;
    if( tf != null ){
      tokFactory = loadTokenizerFactory( loader, tf, args );
    }

    if (synonyms != null) {
      List<String> wlist=null;
      try {
        File synonymFile = new File(synonyms);
        if (synonymFile.exists()) {
          wlist = loader.getLines(synonyms);
        } else  {
          List<String> files = StrUtils.splitFileNames(synonyms);
          wlist = new ArrayList<String>();
          for (String file : files) {
            List<String> lines = loader.getLines(file.trim());
            wlist.addAll(lines);
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      synMap = new SynonymMap(ignoreCase);
      parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
    }
  }

  private SynonymMap synMap;

  static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
    String synSep, boolean expansion, TokenizerFactory tokFactory) {
    int count=0;
    for (String rule : rules) {
      // To use regexes, we need an expression that specifies an odd number of chars.
      // This can't really be done with string.split(), and since we need to
      // do unescaping at some point anyway, we wouldn't be saving any effort
      // by using regexes.

      List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);

      List<List<String>> source;
      List<List<String>> target;

      if (mapping.size() > 2) {
        throw new RuntimeException("Invalid Synonym Rule:" + rule);
      } else if (mapping.size()==2) {
        source = getSynList(mapping.get(0), synSep, tokFactory);
        target = getSynList(mapping.get(1), synSep, tokFactory);
      } else {
        source = getSynList(mapping.get(0), synSep, tokFactory);
        if (expansion) {
          // expand to all arguments
          target = source;
        } else {
          // reduce to first argument
          target = new ArrayList<List<String>>(1);
          target.add(source.get(0));
        }
      }

      boolean includeOrig=false;
      for (List<String> fromToks : source) {
        count++;
        for (List<String> toToks : target) {
          map.add(fromToks,
                  SynonymMap.makeTokens(toToks),
                  includeOrig,
                  true
          );
        }
      }
    }
  }

  // a , b c , d e f => [[a],[b,c],[d,e,f]]
  private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
    List<String> strList = StrUtils.splitSmart(str, separator, false);
    // now split on whitespace to get a list of token strings
    List<List<String>> synList = new ArrayList<List<String>>();
    for (String toks : strList) {
      List<String> tokList = tokFactory == null ?
        StrUtils.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
      synList.add(tokList);
    }
    return synList;
  }

  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
    StringReader reader = new StringReader( source );
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      while (ts.incrementToken()){
        if( termAtt.length() > 0 )
          tokList.add( termAtt.toString() );
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    finally{
      reader.close();
    }
    return tokList;
  }

  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args){
    TokenizerFactory tokFactory = (TokenizerFactory)loader.newInstance( cname );
    tokFactory.init( args );
    return tokFactory;
  }

  private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
    return tokFactory.create( reader );
  }

  public SynonymMap getSynonymMap() {
    return synMap;
  }

  public SynonymFilter create(TokenStream input) {
    return new SynonymFilter(input,synMap);
  }
}