/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;
import org.hamcrest.MatcherAssert;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.either;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;
public class AnalysisModuleTests extends ESTestCase {
private final Settings emptyNodeSettings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
public IndexAnalyzers getIndexAnalyzers(Settings settings) throws IOException {
return getIndexAnalyzers(getNewRegistry(settings), settings);
}
public IndexAnalyzers getIndexAnalyzers(AnalysisRegistry registry, Settings settings) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
return registry.build(idxSettings);
}
public AnalysisRegistry getNewRegistry(Settings settings) {
try {
return new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
}
@Override
public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
return AnalysisPlugin.super.getCharFilters();
}
})).getAnalysisRegistry();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private Settings loadFromClasspath(String path) throws IOException {
return Settings.builder().loadFromStream(path, getClass().getResourceAsStream(path))
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
}
public void testSimpleConfigurationJson() throws IOException {
Settings settings = loadFromClasspath("/org/elasticsearch/index/analysis/test1.json");
testSimpleConfiguration(settings);
}
public void testSimpleConfigurationYaml() throws IOException {
Settings settings = loadFromClasspath("/org/elasticsearch/index/analysis/test1.yml");
testSimpleConfiguration(settings);
}
public void testDefaultFactoryTokenFilters() throws IOException {
assertTokenFilter("keyword_repeat", KeywordRepeatFilter.class);
assertTokenFilter("persian_normalization", PersianNormalizationFilter.class);
assertTokenFilter("arabic_normalization", ArabicNormalizationFilter.class);
}
public void testAnalyzerAliasNotAllowedPost5x() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.foobar.type", "standard")
.put("index.analysis.analyzer.foobar.alias","foobaz")
// analyzer aliases were removed in v5.0.0 alpha6
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_5_0_0_beta1, null))
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
AnalysisRegistry registry = getNewRegistry(settings);
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> getIndexAnalyzers(registry, settings));
assertEquals("setting [index.analysis.analyzer.foobar.alias] is not supported", e.getMessage());
}
public void testVersionedAnalyzers() throws Exception {
String yaml = "/org/elasticsearch/index/analysis/test1.yml";
Settings settings2 = Settings.builder()
.loadFromStream(yaml, getClass().getResourceAsStream(yaml))
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_5_0_0)
.build();
AnalysisRegistry newRegistry = getNewRegistry(settings2);
IndexAnalyzers indexAnalyzers = getIndexAnalyzers(newRegistry, settings2);
// registry always has the current version
assertThat(newRegistry.getAnalyzer("default"), is(instanceOf(NamedAnalyzer.class)));
NamedAnalyzer defaultNamedAnalyzer = (NamedAnalyzer) newRegistry.getAnalyzer("default");
assertThat(defaultNamedAnalyzer.analyzer(), is(instanceOf(StandardAnalyzer.class)));
assertEquals(Version.CURRENT.luceneVersion, defaultNamedAnalyzer.analyzer().getVersion());
// analysis service has the expected version
assertThat(indexAnalyzers.get("standard").analyzer(), is(instanceOf(StandardAnalyzer.class)));
assertEquals(Version.V_5_0_0.luceneVersion,
indexAnalyzers.get("standard").analyzer().getVersion());
assertEquals(Version.V_5_0_0.luceneVersion,
indexAnalyzers.get("thai").analyzer().getVersion());
assertThat(indexAnalyzers.get("custom7").analyzer(), is(instanceOf(StandardAnalyzer.class)));
assertEquals(org.apache.lucene.util.Version.fromBits(3,6,0), indexAnalyzers.get("custom7").analyzer().getVersion());
}
private void assertTokenFilter(String name, Class<?> clazz) throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get(name);
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
TokenStream stream = tokenFilter.create(tokenizer);
assertThat(stream, instanceOf(clazz));
}
private void testSimpleConfiguration(Settings settings) throws IOException {
IndexAnalyzers indexAnalyzers = getIndexAnalyzers(settings);
Analyzer analyzer = indexAnalyzers.get("custom1").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom1 = (CustomAnalyzer) analyzer;
assertThat(custom1.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
assertThat(custom1.tokenFilters().length, equalTo(2));
StopTokenFilterFactory stop1 = (StopTokenFilterFactory) custom1.tokenFilters()[0];
assertThat(stop1.stopWords().size(), equalTo(1));
// verify position increment gap
analyzer = indexAnalyzers.get("custom6").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom6 = (CustomAnalyzer) analyzer;
assertThat(custom6.getPositionIncrementGap("any_string"), equalTo(256));
// check custom class name (my)
analyzer = indexAnalyzers.get("custom4").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom4 = (CustomAnalyzer) analyzer;
assertThat(custom4.tokenFilters()[0], instanceOf(MyFilterTokenFilterFactory.class));
// // verify Czech stemmer
// analyzer = analysisService.analyzer("czechAnalyzerWithStemmer").analyzer();
// assertThat(analyzer, instanceOf(CustomAnalyzer.class));
// CustomAnalyzer czechstemmeranalyzer = (CustomAnalyzer) analyzer;
// assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
// assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
//
// // check dictionary decompounder
// analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
// assertThat(analyzer, instanceOf(CustomAnalyzer.class));
// CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
// assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
// assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
Set<?> wordList = Analysis.getWordSet(null, Version.CURRENT, settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
// MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}
public void testWordListPath() throws Exception {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
Environment env = new Environment(settings);
String[] words = new String[]{"donau", "dampf", "schiff", "spargel", "creme", "suppe"};
Path wordListFile = generateWordList(words);
settings = Settings.builder().loadFromSource("index: \n word_list_path: " + wordListFile.toAbsolutePath(), XContentType.YAML)
.build();
Set<?> wordList = Analysis.getWordSet(env, Version.CURRENT, settings, "index.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
// MatcherAssert.assertThat(wordList, hasItems(words));
Files.delete(wordListFile);
}
private Path generateWordList(String[] words) throws Exception {
Path wordListFile = createTempDir().resolve("wordlist.txt");
try (BufferedWriter writer = Files.newBufferedWriter(wordListFile, StandardCharsets.UTF_8)) {
for (String word : words) {
writer.write(word);
writer.write('\n');
}
}
return wordListFile;
}
public void testUnderscoreInAnalyzerName() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.analyzer._invalid_name.tokenizer", "keyword")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetaData.SETTING_VERSION_CREATED, "1")
.build();
try {
getIndexAnalyzers(settings);
fail("This should fail with IllegalArgumentException because the analyzers name starts with _");
} catch (IllegalArgumentException e) {
assertThat(e.getMessage(), either(equalTo("analyzer name must not start with '_'. got \"_invalid_name\""))
.or(equalTo("analyzer name must not start with '_'. got \"_invalidName\"")));
}
}
/**
* Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
* and that do not vary based on version at all.
*/
public void testPluginPreConfiguredTokenFilters() throws IOException {
// Simple token filter that appends text to the term
final class AppendTokenFilter extends TokenFilter {
private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
private final char[] appendMe;
protected AppendTokenFilter(TokenStream input, String appendMe) {
super(input);
this.appendMe = appendMe.toCharArray();
}
@Override
public boolean incrementToken() throws IOException {
if (false == input.incrementToken()) {
return false;
}
term.resizeBuffer(term.length() + appendMe.length);
System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
term.setLength(term.length() + appendMe.length);
return true;
}
}
boolean noVersionSupportsMultiTerm = randomBoolean();
boolean luceneVersionSupportsMultiTerm = randomBoolean();
boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
return Arrays.asList(
PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
(tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
PreConfiguredTokenFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
(tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString()))
);
}
})).getAnalysisRegistry();
Version version = VersionUtils.randomVersion(random());
IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
.put("index.analysis.analyzer.no_version.tokenizer", "keyword")
.put("index.analysis.analyzer.no_version.filter", "no_version")
.put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
.put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
.put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
.put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
.build());
assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
analyzers.get("no_version").normalize("", "test").utf8ToString());
assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
analyzers.get("lucene_version").normalize("", "test").utf8ToString());
assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
}
public void testRegisterHunspellDictionary() throws Exception {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
Environment environment = new Environment(settings);
InputStream aff = getClass().getResourceAsStream("/indices/analyze/conf_dir/hunspell/en_US/en_US.aff");
InputStream dic = getClass().getResourceAsStream("/indices/analyze/conf_dir/hunspell/en_US/en_US.dic");
Dictionary dictionary;
try (Directory tmp = new SimpleFSDirectory(environment.tmpFile())) {
dictionary = new Dictionary(tmp, "hunspell", aff, dic);
}
AnalysisModule module = new AnalysisModule(environment, singletonList(new AnalysisPlugin() {
@Override
public Map<String, Dictionary> getHunspellDictionaries() {
return singletonMap("foo", dictionary);
}
}));
assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
}
}