SimpleIcuCollationTokenFilterTests.java example

Explorer
elasticsearch-master
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.io.StringReader;

import static org.hamcrest.Matchers.equalTo;

// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests extends ESTestCase {
    /*
    * Turkish has some funny casing.
    * This test shows how you can solve this kind of thing easily with collation.
    * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
    * Then things will sort and match correctly.
    */
    public void testBasicUsage() throws Exception {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "I WİLL USE TURKİSH CASING", "ı will use turkish casıng");
    }

    /*
    * Test usage of the decomposition option for unicode normalization.
    */
    public void testNormalization() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "tr")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.decomposition", "canonical")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng");
    }

    /*
    * Test secondary strength, for english case is not significant.
    */
    public void testSecondaryStrength() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "secondary")
                .put("index.analysis.filter.myCollator.decomposition", "no")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "TESTING", "testing");
    }

    /*
    * Setting alternate=shifted to shift whitespace, punctuation and symbols
    * to quaternary level
    */
    public void testIgnorePunctuation() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "foo-bar", "foo bar");
    }

    /*
    * Setting alternate=shifted and variableTop to shift whitespace, but not
    * punctuation or symbols, to quaternary level
    */
    public void testIgnoreWhitespace() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.alternate", "shifted")
                .put("index.analysis.filter.myCollator.variableTop", " ")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "foo bar", "foobar");
        // now assert that punctuation still matters: foo-bar < foo bar
        assertCollation(filterFactory, "foo-bar", "foo bar", -1);
    }

    /*
    * Setting numeric to encode digits with numeric value, so that
    * foobar-9 sorts before foobar-10
    */
    public void testNumerics() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.numeric", "true")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollation(filterFactory, "foobar-9", "foobar-10", -1);
    }

    /*
    * Setting caseLevel=true to create an additional case level between
    * secondary and tertiary
    */
    public void testIgnoreAccentsButNotCase() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "primary")
                .put("index.analysis.filter.myCollator.caseLevel", "true")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "résumé", "resume");
        assertCollatesToSame(filterFactory, "Résumé", "Resume");
        // now assert that case still matters: resume < Resume
        assertCollation(filterFactory, "resume", "Resume", -1);
    }

    /*
    * Setting caseFirst=upper to cause uppercase strings to sort
    * before lowercase ones.
    */
    public void testUpperCaseFirst() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.language", "en")
                .put("index.analysis.filter.myCollator.strength", "tertiary")
                .put("index.analysis.filter.myCollator.caseFirst", "upper")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollation(filterFactory, "Resume", "resume", -1);
    }

    /*
    * For german, you might want oe to sort and match with o umlaut.
    * This is not the default, but you can make a customized ruleset to do this.
    *
    * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
    *  http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
    */
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        String DIN5007_2_tailorings =
                "& ae , a\u0308 & AE , A\u0308"+
                        "& oe , o\u0308 & OE , O\u0308"+
                        "& ue , u\u0308 & UE , u\u0308";

        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Settings settings = Settings.builder()
                .put("index.analysis.filter.myCollator.type", "icu_collation")
                .put("index.analysis.filter.myCollator.rules", tailoredRules)
                .put("index.analysis.filter.myCollator.strength", "primary")
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());

        TokenFilterFactory filterFactory = analysis.tokenFilter.get("myCollator");
        assertCollatesToSame(filterFactory, "Töne", "Toene");
    }

    private void assertCollatesToSame(TokenFilterFactory factory, String string1, String string2) throws IOException {
        assertCollation(factory, string1, string2, 0);
    }

    private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(string1));
        TokenStream stream1 = factory.create(tokenizer);

        tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(string2));
        TokenStream stream2 = factory.create(tokenizer);

        assertCollation(stream1, stream2, comparison);
    }

    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);

        stream1.reset();
        stream2.reset();

        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));
        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));

        stream1.end();
        stream2.end();

        stream1.close();
        stream2.close();
    }
}