package org.xbib.elasticsearch.index.analysis.autophrase;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.WordlistLoader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Token filter factory that creates {@link AutoPhrasingTokenFilter} instances
 * configured from the filter settings.
 *
 * <p>Settings read by this factory:
 * <ul>
 *   <li>{@code phrases} - comma-separated list of phrase resource names; a literal
 *       comma inside a name is written as {@code \,}</li>
 *   <li>{@code ignoreCase} - whether the phrase set ignores case (default {@code false})</li>
 *   <li>{@code includeTokens} - passed to the filter as its "emit single tokens"
 *       flag (default {@code false})</li>
 *   <li>{@code replaceWhitespaceWith} - optional; only its first character is
 *       handed to the filter</li>
 * </ul>
 *
 * <p>The phrase set is only populated when {@link #inform(ResourceLoader)} is called.
 */
public class AutoPhrasingTokenFilterFactory extends AbstractTokenFilterFactory implements ResourceLoaderAware {

    /** Raw value of the {@code phrases} setting; {@code null} when not configured. */
    private final String phraseSetFiles;

    /** Value of the {@code ignoreCase} setting. */
    private final boolean ignoreCase;

    /** Value of the {@code includeTokens} setting. */
    private final boolean emitSingleTokens;

    /** Phrases loaded by {@link #inform(ResourceLoader)}; {@code null} until then. */
    private CharArraySet phraseSets;

    /** Raw value of the {@code replaceWhitespaceWith} setting; {@code null} when not configured. */
    private final String replaceWhitespaceWith;

    /**
     * Reads the filter configuration from {@code settings}.
     *
     * @param indexSettings index settings, forwarded to the superclass
     * @param environment   node environment; not used by this constructor
     *                      (NOTE(review): confirm whether resources should be
     *                      resolved through it)
     * @param name          filter name, forwarded to the superclass
     * @param settings      filter settings to read the options from
     */
    public AutoPhrasingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
                                          String name, Settings settings) {
        super(indexSettings, name, settings);
        this.phraseSetFiles = settings.get("phrases");
        this.ignoreCase = settings.getAsBoolean("ignoreCase", false);
        this.emitSingleTokens = settings.getAsBoolean("includeTokens", false);
        this.replaceWhitespaceWith = settings.get("replaceWhitespaceWith");
    }

    /**
     * Loads the phrase set from the configured resources, if any were configured.
     *
     * @param loader resource loader used to open the phrase files
     * @throws IOException if a phrase resource cannot be opened or read
     */
    @Override
    public void inform(ResourceLoader loader) throws IOException {
        if (phraseSetFiles != null) {
            phraseSets = getWordSet(loader, phraseSetFiles, ignoreCase);
        }
    }

    /**
     * Wraps {@code input} in an {@link AutoPhrasingTokenFilter}.
     *
     * <p>An empty {@code replaceWhitespaceWith} value is treated like an absent
     * one; previously it caused a {@link StringIndexOutOfBoundsException} here.
     *
     * @param input the token stream to wrap
     * @return the configured auto-phrasing filter
     */
    @Override
    public TokenStream create(TokenStream input) {
        // NOTE(review): phraseSets is null unless inform() ran with a configured
        // "phrases" setting; confirm the filter tolerates a null phrase set.
        AutoPhrasingTokenFilter autoPhraseFilter =
                new AutoPhrasingTokenFilter(input, phraseSets, emitSingleTokens);
        // Guard against "" so that charAt(0) cannot go out of bounds.
        if (replaceWhitespaceWith != null && !replaceWhitespaceWith.isEmpty()) {
            autoPhraseFilter.setReplaceWhitespaceWith(replaceWhitespaceWith.charAt(0));
        }
        return autoPhraseFilter;
    }

    /**
     * Builds one word set from all lines of the given resources.
     *
     * @param loader     resource loader used to open each file
     * @param wordFiles  comma-separated resource names, {@code \,} escaping a literal comma
     * @param ignoreCase whether the resulting set ignores case
     * @return the combined set, or {@code null} when {@code wordFiles} yields no names
     * @throws IOException if a resource cannot be opened or read
     */
    private CharArraySet getWordSet(ResourceLoader loader, String wordFiles,
                                    boolean ignoreCase) throws IOException {
        List<String> files = splitFileNames(wordFiles);
        CharArraySet words = null;
        if (!files.isEmpty()) {
            words = new CharArraySet(files.size() * 10, ignoreCase);
            for (String file : files) {
                String resource = file.trim();
                // A stray or trailing comma ("a.txt, ") produces a blank name;
                // skip it instead of asking the loader for a resource called "".
                if (resource.isEmpty()) {
                    continue;
                }
                List<String> wlist = getLines(loader, resource);
                words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
            }
        }
        return words;
    }

    /**
     * Splits a comma-separated list of file names.
     *
     * <p>Commas preceded by a backslash do not split; the escaping backslash is
     * removed from the resulting name.
     *
     * @param fileNames the raw list, may be {@code null}
     * @return the individual names (untrimmed); empty when {@code fileNames} is {@code null}
     */
    private List<String> splitFileNames(String fileNames) {
        if (fileNames == null) {
            return Collections.emptyList();
        }
        List<String> result = new ArrayList<>();
        // Negative look-behind: split only on commas that are not escaped.
        for (String file : fileNames.split("(?<!\\\\),")) {
            // Drop the backslash of every "\," escape, keeping the comma.
            result.add(file.replaceAll("\\\\(?=,)", ""));
        }
        return result;
    }

    /**
     * Reads the lines of a resource as UTF-8 using {@link WordlistLoader}.
     *
     * @param loader   resource loader used to open the resource
     * @param resource name of the resource to read
     * @return the lines returned by {@code WordlistLoader.getLines}
     * @throws IOException if the resource cannot be opened or read
     */
    private List<String> getLines(ResourceLoader loader, String resource) throws IOException {
        return WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
    }
}