/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.test.ESTestCase;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Alerts us if new analyzers are added to Lucene, so we don't miss them.
 * <p>
 * If we don't want to expose one for a specific reason, just map it to Void.
 */
public class AnalysisFactoryTests extends ESTestCase {

    static final Map<String,Class<?>> KNOWN_TOKENIZERS = new HashMap<String,Class<?>>() {{
        // deprecated ones, we don't care about these
        put("arabicletter", Deprecated.class);
        put("chinese", Deprecated.class);
        put("cjk", Deprecated.class);
        put("russianletter", Deprecated.class);

        // exposed in ES
        put("classic", ClassicTokenizerFactory.class);
        put("edgengram", EdgeNGramTokenizerFactory.class);
        put("keyword", KeywordTokenizerFactory.class);
        put("letter", LetterTokenizerFactory.class);
        put("lowercase", LowerCaseTokenizerFactory.class);
        put("ngram", NGramTokenizerFactory.class);
        put("pathhierarchy", PathHierarchyTokenizerFactory.class);
        put("pattern", PatternTokenizerFactory.class);
        put("standard", StandardTokenizerFactory.class);
        put("thai", ThaiTokenizerFactory.class);
        put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
        put("whitespace", WhitespaceTokenizerFactory.class);

        // this one "seems to mess up offsets" and probably shouldn't be a tokenizer...
        put("wikipedia", Void.class);
    }};
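    /*
     * How the check below works: Lucene registers its analysis factories through the Java SPI
     * (META-INF/services), and TokenizerFactory.availableTokenizers() returns the name of every
     * registered tokenizer. Any factory added in a Lucene upgrade therefore shows up in that set
     * automatically and fails the assertion until it is classified in the map above.
     */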
    public void testTokenizers() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers());
        missing.removeAll(KNOWN_TOKENIZERS.keySet());
        assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty());
    }
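    /*
     * Token filters follow the same pattern: map each Lucene name to the ES factory that exposes
     * it, to Deprecated.class for deprecated filters we ignore, or to Void.class for filters we
     * deliberately don't expose. For example, a new filter (hypothetical name) would be
     * classified as one of:
     *
     *   put("somenewfilter", SomeNewTokenFilterFactory.class); // exposed in ES
     *   put("somenewfilter", Void.class);                      // intentionally not exposed
     */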
    static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new HashMap<String,Class<?>>() {{
        // deprecated ones, we don't care about these
        put("chinese", Deprecated.class);
        put("collationkey", Deprecated.class);
        put("position", Deprecated.class);
        put("thaiword", Deprecated.class);

        // exposed in ES
        put("apostrophe", ApostropheFilterFactory.class);
        put("arabicnormalization", ArabicNormalizationFilterFactory.class);
        put("arabicstem", ArabicStemTokenFilterFactory.class);
        put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
        put("brazilianstem", BrazilianStemTokenFilterFactory.class);
        put("bulgarianstem", StemmerTokenFilterFactory.class);
        put("cjkbigram", CJKBigramFilterFactory.class);
        put("cjkwidth", CJKWidthFilterFactory.class);
        put("classic", ClassicFilterFactory.class);
        put("commongrams", CommonGramsTokenFilterFactory.class);
        put("commongramsquery", CommonGramsTokenFilterFactory.class);
        put("czechstem", CzechStemTokenFilterFactory.class);
        put("decimaldigit", DecimalDigitFilterFactory.class);
        put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class);
        put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class);
        put("edgengram", EdgeNGramTokenFilterFactory.class);
        put("elision", ElisionTokenFilterFactory.class);
        put("englishminimalstem", StemmerTokenFilterFactory.class);
        put("englishpossessive", StemmerTokenFilterFactory.class);
        put("finnishlightstem", StemmerTokenFilterFactory.class);
        put("frenchlightstem", StemmerTokenFilterFactory.class);
        put("frenchminimalstem", StemmerTokenFilterFactory.class);
        put("galicianminimalstem", StemmerTokenFilterFactory.class);
        put("galicianstem", StemmerTokenFilterFactory.class);
        put("germanstem", GermanStemTokenFilterFactory.class);
        put("germanlightstem", StemmerTokenFilterFactory.class);
        put("germanminimalstem", StemmerTokenFilterFactory.class);
        put("germannormalization", GermanNormalizationFilterFactory.class);
        put("greeklowercase", LowerCaseTokenFilterFactory.class);
        put("greekstem", StemmerTokenFilterFactory.class);
        put("hindinormalization", HindiNormalizationFilterFactory.class);
        put("hindistem", StemmerTokenFilterFactory.class);
        put("hungarianlightstem", StemmerTokenFilterFactory.class);
        put("hunspellstem", HunspellTokenFilterFactory.class);
        put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class);
        put("indicnormalization", IndicNormalizationFilterFactory.class);
        put("irishlowercase", LowerCaseTokenFilterFactory.class);
        put("indonesianstem", StemmerTokenFilterFactory.class);
        put("italianlightstem", StemmerTokenFilterFactory.class);
        put("keepword", KeepWordFilterFactory.class);
        put("keywordmarker", KeywordMarkerTokenFilterFactory.class);
        put("kstem", KStemTokenFilterFactory.class);
        put("latvianstem", StemmerTokenFilterFactory.class);
        put("length", LengthTokenFilterFactory.class);
        put("limittokencount", LimitTokenCountFilterFactory.class);
        put("lowercase", LowerCaseTokenFilterFactory.class);
        put("ngram", NGramTokenFilterFactory.class);
        put("norwegianlightstem", StemmerTokenFilterFactory.class);
        put("norwegianminimalstem", StemmerTokenFilterFactory.class);
        put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
        put("patternreplace", PatternReplaceTokenFilterFactory.class);
        put("persiannormalization", PersianNormalizationFilterFactory.class);
        put("porterstem", PorterStemTokenFilterFactory.class);
        put("portuguesestem", StemmerTokenFilterFactory.class);
        put("portugueselightstem", StemmerTokenFilterFactory.class);
        put("portugueseminimalstem", StemmerTokenFilterFactory.class);
        put("reversestring", ReverseTokenFilterFactory.class);
        put("russianlightstem", StemmerTokenFilterFactory.class);
        put("scandinavianfolding", ScandinavianFoldingFilterFactory.class);
        put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
        put("serbiannormalization", SerbianNormalizationFilterFactory.class);
        put("shingle", ShingleTokenFilterFactory.class);
        put("snowballporter", SnowballTokenFilterFactory.class);
        put("soraninormalization", SoraniNormalizationFilterFactory.class);
        put("soranistem", StemmerTokenFilterFactory.class);
        put("spanishlightstem", StemmerTokenFilterFactory.class);
        put("standard", StandardTokenFilterFactory.class);
        put("stemmeroverride", StemmerOverrideTokenFilterFactory.class);
        put("stop", StopTokenFilterFactory.class);
        put("swedishlightstem", StemmerTokenFilterFactory.class);
        put("synonym", SynonymTokenFilterFactory.class);
        put("trim", TrimTokenFilterFactory.class);
        put("truncate", TruncateTokenFilterFactory.class);
        put("turkishlowercase", LowerCaseTokenFilterFactory.class);
        put("type", KeepTypesFilterFactory.class);
        put("uppercase", UpperCaseTokenFilterFactory.class);
        put("worddelimiter", WordDelimiterTokenFilterFactory.class);

        // TODO: these tokenfilters are not yet exposed: useful?
        // suggest stop
        put("suggeststop", Void.class);
        // capitalizes tokens
        put("capitalization", Void.class);
        // like the length filter (but codepoints)
        put("codepointcount", Void.class);
        // puts hyphenated words back together
        put("hyphenatedwords", Void.class);
        // repeats anything marked as keyword
        put("keywordrepeat", Void.class);
        // like limittokencount, but by offset
        put("limittokenoffset", Void.class);
        // like limittokencount, but by position
        put("limittokenposition", Void.class);
        // stores a constant numeric value in the payload of matching tokens
        put("numericpayload", Void.class);
        // removes duplicates at the same position (this should be used by the existing factory)
        put("removeduplicates", Void.class);
        // stores a token's start/end offsets in its payload
        put("tokenoffsetpayload", Void.class);
        // puts the type into the payload
        put("typeaspayload", Void.class);
        // fingerprint
        put("fingerprint", Void.class);
        // for tee-sinks
        put("daterecognizer", Void.class);
    }};

    public void testTokenFilters() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters());
        missing.removeAll(KNOWN_TOKENFILTERS.keySet());
        assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(), missing.isEmpty());
    }

    static final Map<String,Class<?>> KNOWN_CHARFILTERS = new HashMap<String,Class<?>>() {{
        // exposed in ES
        put("htmlstrip", HtmlStripCharFilterFactory.class);
        put("mapping", MappingCharFilterFactory.class);
        put("patternreplace", PatternReplaceCharFilterFactory.class);

        // TODO: these charfilters are not yet exposed: useful?
        // handling of zwnj for persian
        put("persian", Void.class);
    }};

    public void testCharFilters() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.CharFilterFactory.availableCharFilters());
        missing.removeAll(KNOWN_CHARFILTERS.keySet());
        assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty());
    }
}