/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.test.ESTestCase;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Alerts us if new analyzers are added to Lucene, so we don't miss them.
 * <p>
 * If we don't want to expose one for a specific reason, just map it to Void.
 */
public class AnalysisFactoryTests extends ESTestCase {

    static final Map<String,Class<?>> KNOWN_TOKENIZERS = new HashMap<String,Class<?>>() {{
        // deprecated ones, we don't care about these
        put("arabicletter", Deprecated.class);
        put("chinese", Deprecated.class);
        put("cjk", Deprecated.class);
        put("russianletter", Deprecated.class);

        // exposed in ES
        put("classic", ClassicTokenizerFactory.class);
        put("edgengram", EdgeNGramTokenizerFactory.class);
        put("keyword", KeywordTokenizerFactory.class);
        put("letter", LetterTokenizerFactory.class);
        put("lowercase", LowerCaseTokenizerFactory.class);
        put("ngram", NGramTokenizerFactory.class);
        put("pathhierarchy", PathHierarchyTokenizerFactory.class);
        put("pattern", PatternTokenizerFactory.class);
        put("standard", StandardTokenizerFactory.class);
        put("thai", ThaiTokenizerFactory.class);
        put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
        put("whitespace", WhitespaceTokenizerFactory.class);

        // this one "seems to mess up offsets" and probably shouldn't be a tokenizer...
        put("wikipedia", Void.class);
    }};
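    /*
     * How the check below works: Lucene registers its analysis factories through the Java SPI
     * (META-INF/services), and TokenizerFactory.availableTokenizers() returns the name of every
     * registered tokenizer. Any factory added in a Lucene upgrade therefore shows up in that set
     * automatically and fails the assertion until it is classified in the map above.
     */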
    public void testTokenizers() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers());
        missing.removeAll(KNOWN_TOKENIZERS.keySet());
        assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty());
    }
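    /*
     * Token filters follow the same pattern: map each Lucene name to the ES factory that exposes
     * it, to Deprecated.class for deprecated filters we ignore, or to Void.class for filters we
     * deliberately don't expose. For example, a new filter (hypothetical name) would be
     * classified as one of:
     *
     *   put("somenewfilter", SomeNewTokenFilterFactory.class); // exposed in ES
     *   put("somenewfilter", Void.class);                      // intentionally not exposed
     */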
    static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new HashMap<String,Class<?>>() {{
        // deprecated ones, we don't care about these
        put("chinese", Deprecated.class);
        put("collationkey", Deprecated.class);
        put("position", Deprecated.class);
        put("thaiword", Deprecated.class);

        // exposed in ES
        put("apostrophe", ApostropheFilterFactory.class);
        put("arabicnormalization", ArabicNormalizationFilterFactory.class);
        put("arabicstem", ArabicStemTokenFilterFactory.class);
        put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
        put("brazilianstem", BrazilianStemTokenFilterFactory.class);
        put("bulgarianstem", StemmerTokenFilterFactory.class);
        put("cjkbigram", CJKBigramFilterFactory.class);
        put("cjkwidth", CJKWidthFilterFactory.class);
        put("classic", ClassicFilterFactory.class);
        put("commongrams", CommonGramsTokenFilterFactory.class);
        put("commongramsquery", CommonGramsTokenFilterFactory.class);
        put("czechstem", CzechStemTokenFilterFactory.class);
        put("decimaldigit", DecimalDigitFilterFactory.class);
        put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class);
        put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class);
        put("edgengram", EdgeNGramTokenFilterFactory.class);
        put("elision", ElisionTokenFilterFactory.class);
        put("englishminimalstem", StemmerTokenFilterFactory.class);
        put("englishpossessive", StemmerTokenFilterFactory.class);
        put("finnishlightstem", StemmerTokenFilterFactory.class);
        put("frenchlightstem", StemmerTokenFilterFactory.class);
        put("frenchminimalstem", StemmerTokenFilterFactory.class);
        put("galicianminimalstem", StemmerTokenFilterFactory.class);
        put("galicianstem", StemmerTokenFilterFactory.class);
        put("germanstem", GermanStemTokenFilterFactory.class);
        put("germanlightstem", StemmerTokenFilterFactory.class);
        put("germanminimalstem", StemmerTokenFilterFactory.class);
        put("germannormalization", GermanNormalizationFilterFactory.class);
        put("greeklowercase", LowerCaseTokenFilterFactory.class);
        put("greekstem", StemmerTokenFilterFactory.class);
        put("hindinormalization", HindiNormalizationFilterFactory.class);
        put("hindistem", StemmerTokenFilterFactory.class);
        put("hungarianlightstem", StemmerTokenFilterFactory.class);
        put("hunspellstem", HunspellTokenFilterFactory.class);
        put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class);
        put("indicnormalization", IndicNormalizationFilterFactory.class);
        put("irishlowercase", LowerCaseTokenFilterFactory.class);
        put("indonesianstem", StemmerTokenFilterFactory.class);
        put("italianlightstem", StemmerTokenFilterFactory.class);
        put("keepword", KeepWordFilterFactory.class);
        put("keywordmarker", KeywordMarkerTokenFilterFactory.class);
        put("kstem", KStemTokenFilterFactory.class);
        put("latvianstem", StemmerTokenFilterFactory.class);
        put("length", LengthTokenFilterFactory.class);
        put("limittokencount", LimitTokenCountFilterFactory.class);
        put("lowercase", LowerCaseTokenFilterFactory.class);
        put("ngram", NGramTokenFilterFactory.class);
        put("norwegianlightstem", StemmerTokenFilterFactory.class);
        put("norwegianminimalstem", StemmerTokenFilterFactory.class);
        put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
        put("patternreplace", PatternReplaceTokenFilterFactory.class);
        put("persiannormalization", PersianNormalizationFilterFactory.class);
        put("porterstem", PorterStemTokenFilterFactory.class);
        put("portuguesestem", StemmerTokenFilterFactory.class);
        put("portugueselightstem", StemmerTokenFilterFactory.class);
        put("portugueseminimalstem", StemmerTokenFilterFactory.class);
        put("reversestring", ReverseTokenFilterFactory.class);
        put("russianlightstem", StemmerTokenFilterFactory.class);
        put("scandinavianfolding", ScandinavianFoldingFilterFactory.class);
        put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
        put("serbiannormalization", SerbianNormalizationFilterFactory.class);
        put("shingle", ShingleTokenFilterFactory.class);
        put("snowballporter", SnowballTokenFilterFactory.class);
        put("soraninormalization", SoraniNormalizationFilterFactory.class);
        put("soranistem", StemmerTokenFilterFactory.class);
        put("spanishlightstem", StemmerTokenFilterFactory.class);
        put("standard", StandardTokenFilterFactory.class);
        put("stemmeroverride", StemmerOverrideTokenFilterFactory.class);
        put("stop", StopTokenFilterFactory.class);
        put("swedishlightstem", StemmerTokenFilterFactory.class);
        put("synonym", SynonymTokenFilterFactory.class);
        put("trim", TrimTokenFilterFactory.class);
        put("truncate", TruncateTokenFilterFactory.class);
        put("turkishlowercase", LowerCaseTokenFilterFactory.class);
        put("type", KeepTypesFilterFactory.class);
        put("uppercase", UpperCaseTokenFilterFactory.class);
        put("worddelimiter", WordDelimiterTokenFilterFactory.class);

        // TODO: these tokenfilters are not yet exposed: useful?
        // suggest stop
        put("suggeststop", Void.class);
        // capitalizes tokens
        put("capitalization", Void.class);
        // like the length filter (but codepoints)
        put("codepointcount", Void.class);
        // puts hyphenated words back together
        put("hyphenatedwords", Void.class);
        // repeats anything marked as keyword
        put("keywordrepeat", Void.class);
        // like limittokencount, but by offset
        put("limittokenoffset", Void.class);
        // like limittokencount, but by position
        put("limittokenposition", Void.class);
        // stores a constant numeric value in the payload of matching tokens
        put("numericpayload", Void.class);
        // removes duplicates at the same position (this should be used by the existing factory)
        put("removeduplicates", Void.class);
        // stores a token's start/end offsets in its payload
        put("tokenoffsetpayload", Void.class);
        // puts the type into the payload
        put("typeaspayload", Void.class);
        // fingerprint
        put("fingerprint", Void.class);
        // for tee-sinks
        put("daterecognizer", Void.class);
    }};

    public void testTokenFilters() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters());
        missing.removeAll(KNOWN_TOKENFILTERS.keySet());
        assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(), missing.isEmpty());
    }

    static final Map<String,Class<?>> KNOWN_CHARFILTERS = new HashMap<String,Class<?>>() {{
        // exposed in ES
        put("htmlstrip", HtmlStripCharFilterFactory.class);
        put("mapping", MappingCharFilterFactory.class);
        put("patternreplace", PatternReplaceCharFilterFactory.class);

        // TODO: these charfilters are not yet exposed: useful?
        // handling of zwnj for persian
        put("persian", Void.class);
    }};

    public void testCharFilters() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.CharFilterFactory.availableCharFilters());
        missing.removeAll(KNOWN_CHARFILTERS.keySet());
        assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty());
    }
}