package org.xbib.elasticsearch.index.analysis.lemmatize;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.xbib.elasticsearch.common.fsa.Dictionary;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
/**
*
*/
public class LemmatizeTokenFilterFactory extends AbstractTokenFilterFactory {
private final Dictionary dictionary;
private final boolean respectKeywords;
private final boolean lemmaOnly;
public LemmatizeTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.respectKeywords = settings.getAsBoolean("respect_keywords", false);
this.lemmaOnly = settings.getAsBoolean("lemma_only", true);
this.dictionary = createDictionary(settings);
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new LemmatizeTokenFilter(tokenStream, dictionary, respectKeywords, lemmaOnly);
}
private Dictionary createDictionary(Settings settings) {
String language = settings.get("language", "en");
try {
String resource = settings.get("resource", "/lemmatize/lemmatization-" + language + ".fsa.gz");
if (resource.endsWith(".fsa") || resource.endsWith("fsa.gz")) {
// FSA
InputStream inputStream = getClass().getResourceAsStream(resource);
if (resource.endsWith(".gz")) {
inputStream = new GZIPInputStream(inputStream);
}
Dictionary dictionary = new Dictionary().loadFSA(inputStream);
inputStream.close();
return dictionary;
} else {
// Text
InputStream inputStream = getClass().getResourceAsStream(resource);
if (resource.endsWith(".gz")) {
inputStream = new GZIPInputStream(inputStream);
}
Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
Dictionary dictionary = new Dictionary().loadLinesReverse(reader);
reader.close();
return dictionary;
}
} catch (Exception e) {
throw new ElasticsearchException("resources for language " + language +
" in settings not found: " + settings.getAsMap(), e);
}
}
}