NewSynonymFilterFactory.java example

Explorer
montysolr-master
- contrib
package org.apache.lucene.analysis.synonym;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

/**
 * Factory for {@link SynonymFilter}.
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
 *             format="solr" ignoreCase="false" expand="true"
 *             tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * 
 * If the LUCENE-4499 gets committed, we can remove these NewSynonym... classes.
 */
public class NewSynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  // Raw factory arguments. The reference is kept so that inform() and the synonym
  // builder factory it instantiates can read (and update) the same map later on.
  protected Map<String,String> args;

  /**
   * Creates the factory and remembers the argument map.
   *
   * @param args configuration arguments; the very same map instance (not a copy) is
   *             stored in {@link #args}
   */
  public NewSynonymFilterFactory(Map<String,String> args) {
    super(args);
    this.args = args;
  }

  // Synonym map produced by inform(); stays null until inform() has run.
  private SynonymMap map;
  // Value of the "ignoreCase" argument as read by inform(); handed to SynonymFilter.
  private boolean ignoreCase;

  /**
   * Wraps the given stream with a {@link SynonymFilter} backed by the loaded synonym map.
   *
   * @param input the stream to decorate
   * @return {@code input} itself when the map holds no synonyms, otherwise a new
   *         {@link SynonymFilter} around it
   */
  @Override
  public TokenStream create(TokenStream input) {
    if (map.fst == null) {
      // A null fst means there are actually no synonyms, so there is nothing to do:
      // hand back the original stream untouched.
      return input;
    }
    return new SynonymFilter(input, map, ignoreCase);
  }
  
  /**
   * Reads the configuration and builds the synonym map.
   * <p>
   * The builder is the class named by the {@code builderFactory} argument, or
   * {@link SynonymBuilderFactory} when that argument is absent.
   *
   * @param loader used to locate the builder class and the synonym resources
   * @throws IOException if loading fails, or wrapping a {@link ParseException} raised
   *         while the synonym rules are parsed
   */
  public void inform(ResourceLoader loader) throws IOException {
    this.ignoreCase = getBoolean(args, "ignoreCase", false);

    // must set the value back (for use by the inheriting class)
    // NOTE(review): presumably getBoolean() consumes the entry from args - confirm.
    args.put("ignoreCase", String.valueOf(this.ignoreCase));

    String builderName = args.get("builderFactory");
    if (builderName == null) {
      builderName = SynonymBuilderFactory.class.getName();
    }
    SynonymBuilderFactory builderFactory = loadBuilderFactory(loader, builderName);

    try {
      map = builderFactory.create(loader);
    } catch (ParseException parseFailure) {
      throw new IOException(parseFailure);
    }
  }
  
  
  /**
   * Common base type of the synonym parsers handed out by
   * {@link SynonymBuilderFactory#getParser(Analyzer)}.
   * <p>
   * {@link SynonymBuilderFactory#create(ResourceLoader)} feeds every synonym resource to
   * {@link #add(Reader)}, so concrete parsers have to override that method.
   */
  public static class SynonymParser extends SynonymMap.Parser {

    public SynonymParser(boolean dedup, Analyzer analyzer) {
      super(dedup, analyzer);
    }

    /**
     * Adds the rules read from {@code in} to this parser. This base implementation
     * always throws; subclasses must override it.
     *
     * @param in source of the synonym rules
     * @throws IllegalAccessError always, in this base implementation
     */
    public void add(Reader in) throws IOException, ParseException {
      throw new IllegalAccessError("You must override this method");
    }

    /** Does nothing in this class; the builder factory calls {@link #add(Reader)} instead. */
    @Override
    public void parse(Reader in) throws IOException, ParseException {}
  }
  
  
  /**
   * Builds the {@link SynonymMap} used by {@link NewSynonymFilterFactory}.
   * <p>
   * Arguments read from the map: {@code synonyms} (required), {@code ignoreCase},
   * {@code tokenizerFactory}, {@code format} and {@code expand}. Subclasses customize
   * the result by overriding {@link #getParser(Analyzer)}, {@link #getAnalyzer(ResourceLoader)}
   * or {@link #inform(ResourceLoader)}.
   * <p>
   * The class extends {@link TokenizerFactory}, but the tokenizer side is not implemented:
   * {@link #create(AttributeFactory)} always throws.
   */
  public static class SynonymBuilderFactory extends TokenizerFactory implements ResourceLoaderAware {

    // Same map instance that was passed to the constructor (not a copy).
    protected Map<String,String> args;

    public SynonymBuilderFactory(Map<String,String> args) {
      super(args);
      this.args = args;
    }

    /**
     * Not implemented.
     *
     * @throws IllegalAccessError always
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
      // TODO : this could be used to parse the source data (right now Solr and WordNet synonym
      // parser do it
      throw new IllegalAccessError("Not implemented");
    }

    /**
     * Parses the configured synonym resources and builds the synonym map.
     * <p>
     * If a file named exactly like the {@code synonyms} argument exists on the local file
     * system, the argument is treated as one resource name; otherwise it is split with
     * {@code splitFileNames} and every resulting name is loaded. Resources are opened
     * through {@code loader} and decoded as UTF-8, reporting malformed or unmappable
     * input instead of replacing it.
     *
     * @param loader used to open the synonym resources and to load the tokenizer factory
     * @return the map built by the parser
     * @throws IllegalArgumentException if the {@code synonyms} argument is missing
     * @throws IOException if a resource cannot be read
     * @throws ParseException if the parser rejects a resource
     */
    public SynonymMap create(ResourceLoader loader) throws IOException, ParseException {

      String synonyms = args.get("synonyms");
      if (synonyms == null) {
        throw new IllegalArgumentException("Missing required argument 'synonyms'.");
      }

      CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
          .onMalformedInput(CodingErrorAction.REPORT)
          .onUnmappableCharacter(CodingErrorAction.REPORT);

      SynonymParser parser = getParser(getAnalyzer(loader));

      if (new File(synonyms).exists()) {
        addResource(parser, loader, synonyms, decoder);
      } else {
        for (String file : splitFileNames(synonyms)) {
          addResource(parser, loader, file, decoder);
        }
      }
      return parser.build();
    }

    /**
     * Feeds one resource to the parser and always closes the reader afterwards, also
     * when the parser throws. The shared decoder is reset before each use.
     */
    private static void addResource(SynonymParser parser, ResourceLoader loader, String resource,
        CharsetDecoder decoder) throws IOException, ParseException {
      decoder.reset();
      try (Reader reader = new BufferedReader(new InputStreamReader(loader.openResource(resource), decoder))) {
        parser.add(reader);
      }
    }

    /**
     * Creates the analyzer used to tokenize the synonym rules.
     * <p>
     * The tokenizer comes from the factory named by {@code tokenizerFactory}, or is a
     * {@link WhitespaceTokenizer} when that argument is absent. When {@code ignoreCase}
     * is true the tokens are additionally passed through a {@link LowerCaseFilter}.
     *
     * @param loader used to load the tokenizer factory class
     * @return a new analyzer
     * @throws IOException declared for overriding subclasses and the factory loading step
     */
    protected Analyzer getAnalyzer(ResourceLoader loader) throws IOException {
      final boolean ignoreCase = getBoolean(args, "ignoreCase", false);

      String tf = args.get("tokenizerFactory");

      final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf);

      return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
          TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
          return new TokenStreamComponents(tokenizer, stream);
        }
      };
    }

    /**
     * Picks the parser for the {@code format} argument: {@code solr} (also the default
     * when the argument is missing), {@code wordnet} or {@code semicolon}. The
     * {@code expand} argument (default {@code true}) is passed on to the parser.
     *
     * @param analyzer analyzer the parser uses on the rules
     * @return a new parser
     * @throws IllegalArgumentException for any other format name
     */
    protected SynonymParser getParser(Analyzer analyzer) {

      String format = args.get("format");
      boolean expand = getBoolean(args, "expand", true);

      if (format == null || format.equals("solr")) {
        // TODO: expose dedup as a parameter?
        return new NewSolrSynonymParser(true, expand, analyzer);
      } else if (format.equals("wordnet")) {
        return new NewWordnetSynonymParser(true, expand, analyzer);
      } else if (format.equals("semicolon")) {
        return new NewSemicolonSynonymParser(true, expand, analyzer);
      } else {
        // TODO: somehow make this more pluggable
        throw new IllegalArgumentException("Unrecognized synonyms format: " + format);
      }
    }


    // (there are no tests for this functionality)
    // Instantiates the named tokenizer factory through its (Map) constructor with an
    // empty argument map and informs it about the loader when it is ResourceLoaderAware.
    // Any failure in that process is rethrown wrapped in a RuntimeException.
    private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
      Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class);
      TokenizerFactory tokFactory;
      try {
        tokFactory = clazz.getConstructor(Map.class).newInstance(new HashMap<String, String>());
        if (tokFactory instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) tokFactory).inform(loader);
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
      return tokFactory;
    }

    /** Hook invoked after construction; does nothing here, subclasses may override it. */
    public void inform(ResourceLoader loader) throws IOException {
      // do nothing
    }


  }
  
  
 /**
  * Instantiates the synonym builder factory with the given class name.
  * <p>
  * The class is resolved through {@code loader}, created via its {@code (Map)}
  * constructor with this factory's {@code args}, and then informed about the loader.
  * (There are no tests for this functionality.)
  *
  * @param loader resource loader used to find the class
  * @param cname name of a {@link SynonymBuilderFactory} class
  * @return the initialized builder factory
  * @throws RuntimeException wrapping any failure during instantiation or {@code inform}
  */
 private SynonymBuilderFactory loadBuilderFactory(ResourceLoader loader, String cname) throws IOException {
   final Class<? extends SynonymBuilderFactory> builderClass =
       loader.findClass(cname, SynonymBuilderFactory.class);
   final SynonymBuilderFactory builder;
   try {
     builder = builderClass.getConstructor(Map.class).newInstance(args);
     if (builder instanceof ResourceLoaderAware) {
       final ResourceLoaderAware aware = (ResourceLoaderAware) builder;
       aware.inform(loader);
     }
   } catch (Exception failure) {
     throw new RuntimeException(failure);
   }
   return builder;
 }
  
  
  /*
   * Various configuration options - some of them are useful for indexing, others for
   * querying only
   */
  
  
  /**
   * Always includes the source token before the synonym (this is the default
   * Lucene behaviour).
   * <p>
   * "hubble space telescope was..." will be indexed as
   * <pre>
   * 0: hubble|HST
   * 1: space
   * 2: telescope
   * </pre>
   * The {@code format} and {@code expand} arguments are not consulted: the parser is
   * always a {@code NewSolrSynonymParser} created with both flags set to {@code true}.
   */
  public static class AlwaysIncludeOriginal extends SynonymBuilderFactory {
    public AlwaysIncludeOriginal(Map<String,String> args) {
      super(args);
    }

    protected SynonymParser getParser(Analyzer analyzer) {
      final boolean dedup = true;
      final boolean expand = true;
      return new NewSolrSynonymParser(dedup, expand, analyzer) {
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          // The incoming includeOrig flag is deliberately overridden with true.
          final boolean keepOriginal = true;
          super.add(input, output, keepOriginal);
        }
      };
    }
  }
  
  /**
   * This parser is useful if you want to index multi-token synonyms (as one token)
   * as well as their components. I.e. "hubble space telescope was..." will be
   * indexed as
   * <pre>
   * 0: hubble|hubble space telescope
   * 1: space
   * 2: telescope
   * </pre>
   * You need this behaviour for index-time synonym expansion, if you want to
   * retain proximity queries and phrases.
   * <p>
   * Word separators in the output are replaced by spaces, and the original tokens are
   * kept only for rules whose input consists of more than one word.
   */
  public static class BestEffort extends SynonymBuilderFactory {
    /**
     * Must be public: the enclosing factory instantiates builders reflectively with
     * {@code Class.getConstructor(Map.class)}, which only sees public constructors.
     *
     * @param args configuration arguments, passed on to the superclass
     */
    public BestEffort(Map<String,String> args) {
      super(args);
    }

    protected SynonymParser getParser(Analyzer analyzer) {
      return new NewSolrSynonymParser(true, true, analyzer) {
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          // keep the source tokens only for multi-word inputs
          super.add(input, replaceNulls(output), countWords(input) > 1);
        }
      };
    }
  }
  
  /**
   * This parser is useful if you want to index multi-token synonyms (as one token)
   * AND NOT their components.
   * <p>
   * Recognizes "multi\0word\0synonyms" (null bytes in the input string)
   * but emits "multi word synonyms" in the output.
   * <p>
   * I.e. 'hubble\0space\0telescope' will be indexed as:
   * <pre>
   * 0: hubble space telescope|hst
   * 1-3: null
   * 4: was
   * </pre>
   */
  public static class MultiTokenReplaceNulls extends SynonymBuilderFactory {
    public MultiTokenReplaceNulls(Map<String,String> args) {
      super(args);
    }

    protected SynonymParser getParser(Analyzer analyzer) {
      final boolean dedup = true;
      final boolean expand = true;
      return new NewSolrSynonymParser(dedup, expand, analyzer) {
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          // Only the output is rewritten; input and includeOrig pass through unchanged.
          final CharsRef spacedOutput = replaceNulls(output);
          super.add(input, spacedOutput, includeOrig);
        }
      };
    }
  }
  
  /**
   * This is a custom configuration for multi-token query-time synonym expansion.
   * <p>
   * The parser searches for synonyms ignoring case, but in the output returns
   * the original string (important for more complex tokenizer chains, i.e.
   * when synonyms should be found first, then acronyms detected).
   * <p>
   * The parser also returns source tokens for the multi-token group, but
   * 'eats' the source token when a single-token synonym is there, unless the
   * {@code inclOrig} argument is {@code "true"}.
   */
  public static class BestEffortSearchLowercase extends SynonymBuilderFactory {
    // Shadows the inherited field of the same name; both refer to the same map instance.
    private Map<String,String> args;

    // Whether single-token rules keep the original token; set from "inclOrig" in inform().
    boolean inclOrig = false;

    public BestEffortSearchLowercase(Map<String,String> args) {
      super(args);
      this.args = args;
    }

    /**
     * Forces {@code ignoreCase} to {@code "false"} in the arguments (the rule inputs are
     * lowercased by the parser instead) and reads the {@code inclOrig} flag, which is
     * true only for the exact value {@code "true"}.
     */
    public void inform(ResourceLoader loader) throws IOException {
      args.put("ignoreCase", "false");
      // null-safe: a missing key or a null value both mean false
      inclOrig = "true".equals(args.get("inclOrig"));
    }

    protected SynonymParser getParser(Analyzer analyzer) {
      return new NewSolrSynonymParser(true, true, analyzer) {
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          // multi-word inputs always keep the source tokens; single words follow inclOrig
          super.add(lowercase(input), replaceNulls(output), countWords(input) > 1 || inclOrig);
        }

        // Returns a lowercased copy; works on code points so that characters encoded
        // as surrogate pairs are lowercased too.
        private CharsRef lowercase(CharsRef chars) {
          final CharsRef copy = CharsRef.deepCopyOf(chars);
          final int limit = copy.offset + copy.length;
          int i = copy.offset;
          while (i < limit) {
            final int codePoint = Character.codePointAt(copy.chars, i, limit);
            final int width = Character.charCount(codePoint);
            final int lower = Character.toLowerCase(codePoint);
            // write in place only when the replacement occupies the same number of chars
            if (lower != codePoint && Character.charCount(lower) == width) {
              Character.toChars(lower, copy.chars, i);
            }
            i += width;
          }
          return copy;
        }
      };

    }
  }
  
  
  /**
   * This is a custom configuration for multi-token query-time synonym expansion.
   * <p>
   * Multi-tokens are searched lowercase and original parts are returned.
   * <p>
   * Single tokens are searched as they are written in the synonym file.
   * <p>
   * The parser also returns source tokens for the multi-token group; for
   * single tokens the behaviour is governed by the {@code inclOrig} argument.
   */
  public static class BestEffortIgnoreCaseSelectively extends SynonymBuilderFactory {
    // Shadows the inherited field of the same name; both refer to the same map instance.
    private Map<String,String> args;

    // Whether single-token rules keep the original token; set from "inclOrig" in inform().
    boolean inclOrig = false;

    public BestEffortIgnoreCaseSelectively(Map<String,String> args) {
      super(args);
      this.args = args;
    }

    /**
     * Forces {@code ignoreCase} to {@code "false"} in the arguments (multi-word rule
     * inputs are lowercased by the parser instead) and reads the {@code inclOrig} flag,
     * which is true only for the exact value {@code "true"}.
     */
    public void inform(ResourceLoader loader) throws IOException {
      args.put("ignoreCase", "false");
      // null-safe: a missing key or a null value both mean false
      inclOrig = "true".equals(args.get("inclOrig"));
    }

    protected SynonymParser getParser(Analyzer analyzer) {
      return new NewSolrSynonymParser(true, true, analyzer) {
        // NOTE: the original author observed that includeOrig "is always false" here,
        // which is why the parameter is not consulted.
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          final boolean multiWord = countWords(input) > 1;
          super.add(multiWord ? lowercase(input) : input, replaceNulls(output), multiWord || inclOrig);
        }

        // Returns a lowercased copy; works on code points so that characters encoded
        // as surrogate pairs are lowercased too.
        private CharsRef lowercase(CharsRef chars) {
          final CharsRef copy = CharsRef.deepCopyOf(chars);
          final int limit = copy.offset + copy.length;
          int i = copy.offset;
          while (i < limit) {
            final int codePoint = Character.codePointAt(copy.chars, i, limit);
            final int width = Character.charCount(codePoint);
            final int lower = Character.toLowerCase(codePoint);
            // write in place only when the replacement occupies the same number of chars
            if (lower != codePoint && Character.charCount(lower) == width) {
              Character.toChars(lower, copy.chars, i);
            }
            i += width;
          }
          return copy;
        }
      };

    }
  }
  
  
  
  /**
   * Counts the words in {@code chars}, where words are delimited by
   * {@link SynonymMap#WORD_SEPARATOR}.
   *
   * @param chars the characters to scan (only the {@code offset}/{@code length} window)
   * @return number of separators found plus one; hence 1 for an empty window
   */
  public static int countWords(CharsRef chars) {
    final char[] buffer = chars.chars;
    final int end = chars.offset + chars.length;
    int words = 1;
    for (int pos = chars.offset; pos < end; pos++) {
      if (buffer[pos] == SynonymMap.WORD_SEPARATOR) {
        words++;
      }
    }
    return words;
  }
  
  /**
   * Returns a deep copy of {@code charsRef} in which every
   * {@link SynonymMap#WORD_SEPARATOR} after the first character is replaced by a space.
   * The argument itself is not modified.
   *
   * @param charsRef the characters to sanitize
   * @return a new {@link CharsRef} with separators turned into spaces
   */
  public static CharsRef replaceNulls(CharsRef charsRef) {
    final CharsRef copy = CharsRef.deepCopyOf(charsRef);
    final char[] buffer = copy.chars;
    final int end = copy.offset + copy.length;
    // NOTE(review): scanning starts at the second character, so a separator in the
    // very first position is left as is - confirm that this is intended.
    int pos = copy.offset + 1;
    while (pos < end) {
      if (buffer[pos] == SynonymMap.WORD_SEPARATOR) {
        buffer[pos] = ' ';
      }
      pos++;
    }
    return copy;
  }
}