package org.apache.lucene.analysis.synonym; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.util.ResourceLoader; import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Version; /** * Factory for {@link SynonymFilter}. * <pre class="prettyprint" > * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"> * <analyzer> * <tokenizer class="solr.WhitespaceTokenizerFactory"/> * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" * format="solr" ignoreCase="false" expand="true" * tokenizerFactory="solr.WhitespaceTokenizerFactory"/> * </analyzer> * </fieldType></pre> * * If the LUCENE-4499 gets committed, we can remove these NewSynonym... classes. */ public class NewSynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { protected Map<String,String> args; public NewSynonymFilterFactory(Map<String,String> args) { super(args); this.args = args; } private SynonymMap map; private boolean ignoreCase; @Override public TokenStream create(TokenStream input) { // if the fst is null, it means there's actually no synonyms... just return the original stream // as there is nothing to do here. return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase); } //@Override public void inform(ResourceLoader loader) throws IOException { final boolean ignoreCase = getBoolean(args, "ignoreCase", false); this.ignoreCase = ignoreCase; // must set the value back (for use by the inheritting class) args.put("ignoreCase", ignoreCase ? "true" : "false"); String bf = args.get("builderFactory"); SynonymBuilderFactory builder = loadBuilderFactory(loader, bf != null ? bf : SynonymBuilderFactory.class.getName()); try { map = builder.create(loader); } catch (ParseException e) { throw new IOException(e); } } public static class SynonymParser extends SynonymMap.Parser { public SynonymParser(boolean dedup, Analyzer analyzer) { super(dedup, analyzer); } public void add(Reader in) throws IOException, ParseException { throw new IllegalAccessError("You must override this method"); } @Override public void parse(Reader in) throws IOException, ParseException {} } public static class SynonymBuilderFactory extends TokenizerFactory implements ResourceLoaderAware { protected Map<String,String> args; public SynonymBuilderFactory(Map<String,String> args) { super(args); this.args = args; } @Override public Tokenizer create(AttributeFactory factory) { // TODO : this could be used to parse the source data (right now Solr and WordNet synonym // parser do it throw new IllegalAccessError("Not implemented"); } public SynonymMap create(ResourceLoader loader) throws IOException, ParseException { String synonyms = args.get("synonyms"); if (synonyms == null) throw new IllegalArgumentException("Missing required argument 'synonyms'."); CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder(); decoder.onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); SynonymParser parser = getParser(getAnalyzer(loader)); File synonymFile = new File(synonyms); if (synonymFile.exists()) { decoder.reset(); parser.add(new BufferedReader(new InputStreamReader(loader.openResource(synonyms), decoder))); } else { List<String> files = splitFileNames(synonyms); for (String file : files) { decoder.reset(); parser.add(new InputStreamReader(loader.openResource(file), decoder)); } } return parser.build(); } protected Analyzer getAnalyzer(ResourceLoader loader) throws IOException { final boolean ignoreCase = getBoolean(args, "ignoreCase", false); String tf = args.get("tokenizerFactory"); final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf); return new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create(); TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer; return new TokenStreamComponents(tokenizer, stream); } }; } protected SynonymParser getParser(Analyzer analyzer) { String format = args.get("format"); boolean expand = getBoolean(args, "expand", true); if (format == null || format.equals("solr")) { // TODO: expose dedup as a parameter? return new NewSolrSynonymParser(true, expand, analyzer); } else if (format.equals("wordnet")) { return new NewWordnetSynonymParser(true, expand, analyzer); } else if (format.equals("semicolon")) { return new NewSemicolonSynonymParser(true, expand, analyzer); } else { // TODO: somehow make this more pluggable throw new IllegalArgumentException("Unrecognized synonyms format: " + format); } } // (there are no tests for this functionality) private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException { Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class); TokenizerFactory tokFactory; try { tokFactory = clazz.getConstructor(Map.class).newInstance(new HashMap<String, String>()); if (tokFactory instanceof ResourceLoaderAware) { ((ResourceLoaderAware) tokFactory).inform(loader); } } catch (Exception e) { throw new RuntimeException(e); } return tokFactory; } public void inform(ResourceLoader loader) throws IOException { // do nothing } } //(there are no tests for this functionality) private SynonymBuilderFactory loadBuilderFactory(ResourceLoader loader, String cname) throws IOException { Class<? extends SynonymBuilderFactory> clazz = loader.findClass(cname, SynonymBuilderFactory.class); try { SynonymBuilderFactory tokFactory = clazz.getConstructor(Map.class).newInstance(args); if (tokFactory instanceof ResourceLoaderAware) { ((ResourceLoaderAware) tokFactory).inform(loader); } return tokFactory; } catch (Exception e) { throw new RuntimeException(e); } } /* * Various configuration options - some of the are useful for indexing, others for * querying only */ /* * Always include the source token before the synonym (this is the default, * lucene behaviour) * * "hubble space telescope was..." will be * indexed as * * 0: hubble|HST * 1: space * 2: telescope */ public static class AlwaysIncludeOriginal extends SynonymBuilderFactory { public AlwaysIncludeOriginal(Map<String,String> args) { super(args); } protected SynonymParser getParser(Analyzer analyzer) { return new NewSolrSynonymParser(true, true, analyzer) { @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { super.add(input, output, true); } }; } } /* * This parser is useful if you want to index multi-token synonyms (as one token) * as well as their components. Ie. "hubble space telescope was..." will be * indexed as * * 0: hubble|hubble space telescope * 1: space * 2: telescope * * You need this behaviour for index-time synonym expansion, if you want to * retain proximity queries and phrases. */ public static class BestEffort extends SynonymBuilderFactory { protected BestEffort(Map<String,String> args) { super(args); } protected SynonymParser getParser(Analyzer analyzer) { return new NewSolrSynonymParser(true, true, analyzer) { @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { super.add(input, replaceNulls(output), countWords(input) > 1 ? true : false); } }; } } /* * This parser is useful if you want to index multi-token synonyms (as one token) * AND NOT their components. * * Recognize "multi\0word\0synonyms" (null bytes in the input string) * but emit "multi word synonyms" in the output * * Ie 'hubble\0space\0telescope' will be indexed as: * * 0: hubble space telescope|hst * 1-3: null * 4: was */ public static class MultiTokenReplaceNulls extends SynonymBuilderFactory { public MultiTokenReplaceNulls(Map<String,String> args) { super(args); } protected SynonymParser getParser(Analyzer analyzer) { return new NewSolrSynonymParser(true, true, analyzer) { @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { super.add(input, replaceNulls(output), includeOrig); } }; } } /* * This is a custom configuration for multi-token query-time synonym expansion. * * The parser searches for synonyms ignoring case, but in the output returns * the Original String (important for more complex tokenizer chains, ie. * when synonyms should be found first, then acronyms detected) * * The parser also returns source tokens for the multi-token group, but * 'eats' the source token when single-token synonym is there. * */ public static class BestEffortSearchLowercase extends SynonymBuilderFactory { private Map<String,String> args; public BestEffortSearchLowercase(Map<String,String> args) { super(args); this.args = args; } boolean inclOrig = false; public void inform(ResourceLoader loader) throws IOException { args.put("ignoreCase", "false"); inclOrig = args.containsKey("inclOrig") ? ((String) args.get("inclOrig")).equals("true") : false; } protected SynonymParser getParser(Analyzer analyzer) { return new NewSolrSynonymParser(true, true, analyzer) { @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { super.add(lowercase(input), replaceNulls(output), countWords(input) > 1 ? true : inclOrig); } private CharsRef lowercase(CharsRef chars) { chars = CharsRef.deepCopyOf(chars); final int limit = chars.offset + chars.length; for (int i=chars.offset;i<limit;i++) { chars.chars[i] = Character.toLowerCase(chars.chars[i]); // maybe not be always correct (?) } return chars; } }; } } /* * This is a custom configuration for multi-token query-time synonym expansion. * * Multi-tokens are searched lowercase and original parts are returned * * Single tokens are searched as they are written in the synonym file * * The parser also returns source tokens for the multi-token group, for * single-token the behaviour is governed by settings of includeOrig * */ public static class BestEffortIgnoreCaseSelectively extends SynonymBuilderFactory { private Map<String,String> args; public BestEffortIgnoreCaseSelectively(Map<String,String> args) { super(args); this.args = args; } boolean inclOrig = false; public void inform(ResourceLoader loader) throws IOException { args.put("ignoreCase", "false"); inclOrig = args.containsKey("inclOrig") ? ((String) args.get("inclOrig")).equals("true") : false; } protected SynonymParser getParser(Analyzer analyzer) { return new NewSolrSynonymParser(true, true, analyzer) { @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { //is always false :( int count = countWords(input); super.add(count > 1 ? lowercase(input) : input, replaceNulls(output), count > 1 ? true : inclOrig); } private CharsRef lowercase(CharsRef chars) { chars = CharsRef.deepCopyOf(chars); final int limit = chars.offset + chars.length; for (int i=chars.offset;i<limit;i++) { chars.chars[i] = Character.toLowerCase(chars.chars[i]); // maybe not be always correct (?) } return chars; } }; } } public static int countWords(CharsRef chars) { int wordCount = 1; int upto = chars.offset; final int limit = chars.offset + chars.length; while(upto < limit) { if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) { wordCount++; } } return wordCount; } public static CharsRef replaceNulls(CharsRef charsRef) { CharsRef sanChar = CharsRef.deepCopyOf(charsRef); final int end = sanChar.offset + sanChar.length; for(int idx=sanChar.offset+1;idx<end;idx++) { if (sanChar.chars[idx] == SynonymMap.WORD_SEPARATOR) { sanChar.chars[idx] = ' '; } } return sanChar; } }