/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings;

/**
 * Plugin that registers the common analysis components: configurable char filters and
 * token filters, plus the pre-configured token filters available without settings.
 */
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        return filters;
    }

    @Override
    public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
        Map<String, AnalysisProvider<CharFilterFactory>> filters = new TreeMap<>();
        filters.put("html_strip", HtmlStripCharFilterFactory::new);
        filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceCharFilterFactory::new));
        filters.put("mapping", requriesAnalysisSettings(MappingCharFilterFactory::new));
        return filters;
    }

    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        List<PreConfiguredTokenFilter> filters = new ArrayList<>();
        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input)));
        filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
                input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
        filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
                new EdgeNGramTokenFilter(input,
                        EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
        // TODO deprecate edgeNGram
        filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input ->
                new EdgeNGramTokenFilter(input,
                        EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
        filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
                new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
        filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
        // TODO deprecate nGram
        filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input)));
        filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
        filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
        // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
        filters.add(PreConfiguredTokenFilter.singleton("stop", false,
                input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
        filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
        filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input)));
        filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
                new WordDelimiterFilter(input,
                        WordDelimiterFilter.GENERATE_WORD_PARTS
                      | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                      | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                      | WordDelimiterFilter.SPLIT_ON_NUMERICS
                      | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
                new WordDelimiterGraphFilter(input,
                        WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                      | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                      | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                      | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                      | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
        return filters;
    }
}