/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.br.BrazilianStemFilter;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import java.util.Locale;
public enum PreBuiltTokenFilters {
// TODO remove this entire class when PreBuiltTokenizers no longer needs it.....
LOWERCASE(CachingStrategy.LUCENE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new LowerCaseFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
// Extended Token Filters
ELISION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
ARABIC_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ArabicStemFilter(tokenStream);
}
},
BRAZILIAN_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new BrazilianStemFilter(tokenStream);
}
},
CZECH_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new CzechStemFilter(tokenStream);
}
},
DUTCH_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SnowballFilter(tokenStream, new DutchStemmer());
}
},
FRENCH_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SnowballFilter(tokenStream, new FrenchStemmer());
}
},
GERMAN_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new GermanStemFilter(tokenStream);
}
},
RUSSIAN_STEM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SnowballFilter(tokenStream, "Russian");
}
},
KEYWORD_REPEAT(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new KeywordRepeatFilter(tokenStream);
}
},
ARABIC_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ArabicNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new PersianNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new TypeAsPayloadTokenFilter(tokenStream);
}
},
SHINGLE(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ShingleFilter(tokenStream);
}
},
GERMAN_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new GermanNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
HINDI_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new HindiNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
INDIC_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new IndicNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
SORANI_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new SoraniNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ScandinavianNormalizationFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ScandinavianFoldingFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
APOSTROPHE(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new ApostropheFilter(tokenStream);
}
},
CJK_WIDTH(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new CJKWidthFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
DECIMAL_DIGIT(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new DecimalDigitFilter(tokenStream);
}
@Override
protected boolean isMultiTermAware() {
return true;
}
},
CJK_BIGRAM(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new CJKBigramFilter(tokenStream);
}
},
DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
}
},
LIMIT(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
}
},
;
protected boolean isMultiTermAware() {
return false;
}
public abstract TokenStream create(TokenStream tokenStream, Version version);
protected final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
private final CachingStrategy cachingStrategy;
PreBuiltTokenFilters(CachingStrategy cachingStrategy) {
this.cachingStrategy = cachingStrategy;
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
}
public CachingStrategy getCachingStrategy() {
return cachingStrategy;
}
private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
public synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
TokenFilterFactory factory = cache.get(version);
if (factory == null) {
final String finalName = name().toLowerCase(Locale.ROOT);
if (isMultiTermAware()) {
factory = new MultiTermAwareTokenFilterFactory() {
@Override
public String name() {
return finalName;
}
@Override
public TokenStream create(TokenStream tokenStream) {
return PreBuiltTokenFilters.this.create(tokenStream, version);
}
@Override
public Object getMultiTermComponent() {
return this;
}
};
} else {
factory = new TokenFilterFactory() {
@Override
public String name() {
return finalName;
}
@Override
public TokenStream create(TokenStream tokenStream) {
return PreBuiltTokenFilters.this.create(tokenStream, version);
}
};
}
cache.put(version, factory);
}
return factory;
}
}