package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.testng.annotations.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
// Tests borrowed from Solr's Icu collation key filter factory test.
public class SimpleIcuCollationTokenFilterTests {
/*
* Turkish has some funny casing.
* This test shows how you can solve this kind of thing easily with collation.
* Instead of using LowerCaseFilter, use a turkish collator with primary strength.
* Then things will sort and match correctly.
*/
@Test
public void testBasicUsage() throws Exception {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test usage of the decomposition option for unicode normalization.
*/
@Test
public void testNormalization() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "tr")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.decomposition", "canonical")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test secondary strength, for english case is not significant.
*/
@Test
public void testSecondaryStrength() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "secondary")
.put("index.analysis.filter.myCollator.decomposition", "no")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String upperCase = "TESTING";
String lowerCase = "testing";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upperCase)));
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
* to quaternary level
*/
@Test
public void testIgnorePunctuation() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withPunctuation = "foo-bar";
String withoutPunctuation = "foo bar";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
TokenStream tsWithoutPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withoutPunctuation)));
assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}
/*
* Setting alternate=shifted and variableTop to shift whitespace, but not
* punctuation or symbols, to quaternary level
*/
@Test
public void testIgnoreWhitespace() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.alternate", "shifted")
.put("index.analysis.filter.myCollator.variableTop", " ")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withSpace = "foo bar";
String withoutSpace = "foobar";
String withPunctuation = "foo-bar";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithoutSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withoutSpace)));
assertCollatesToSame(tsWithSpace, tsWithoutSpace);
// now assert that punctuation still matters: foo-bar < foo bar
tsWithSpace = filterFactory.create(new KeywordTokenizer(new StringReader(withSpace)));
TokenStream tsWithPunctuation = filterFactory.create(new KeywordTokenizer(new StringReader(withPunctuation)));
assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
@Test
public void testNumerics() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.numeric", "true")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String nine = "foobar-9";
String ten = "foobar-10";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsNine = filterFactory.create(new KeywordTokenizer(new StringReader(nine)));
TokenStream tsTen = filterFactory.create(new KeywordTokenizer(new StringReader(ten)));
assertCollation(tsNine, tsTen, -1);
}
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
@Test
public void testIgnoreAccentsButNotCase() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "primary")
.put("index.analysis.filter.myCollator.caseLevel", "true")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String withAccents = "résumé";
String withoutAccents = "resume";
String withAccentsUpperCase = "Résumé";
String withoutAccentsUpperCase = "Resume";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsWithAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withAccents)));
TokenStream tsWithoutAccents = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
assertCollatesToSame(tsWithAccents, tsWithoutAccents);
TokenStream tsWithAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
TokenStream tsWithoutAccentsUpperCase = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
// now assert that case still matters: resume < Resume
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccents)));
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
assertCollation(tsLower, tsUpper, -1);
}
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
@Test
public void testUpperCaseFirst() throws IOException {
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.language", "en")
.put("index.analysis.filter.myCollator.strength", "tertiary")
.put("index.analysis.filter.myCollator.caseFirst", "upper")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String lower = "resume";
String upper = "Resume";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsLower = filterFactory.create(new KeywordTokenizer(new StringReader(lower)));
TokenStream tsUpper = filterFactory.create(new KeywordTokenizer(new StringReader(upper)));
assertCollation(tsUpper, tsLower, -1);
}
/*
* For german, you might want oe to sort and match with o umlaut.
* This is not the default, but you can make a customized ruleset to do this.
*
* The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
* http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
*/
@Test
public void testCustomRules() throws Exception {
RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
String DIN5007_2_tailorings =
"& ae , a\u0308 & AE , A\u0308"+
"& oe , o\u0308 & OE , O\u0308"+
"& ue , u\u0308 & UE , u\u0308";
RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
Index index = new Index("test");
Settings settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.filter.myCollator.type", "icu_collation")
.put("index.analysis.filter.myCollator.rules", tailoredRules)
.put("index.analysis.filter.myCollator.strength", "primary")
.build();
AnalysisService analysisService = createAnalysisService(index, settings);
String germanUmlaut = "Töne";
String germanOE = "Toene";
TokenFilterFactory filterFactory = analysisService.tokenFilter("myCollator");
TokenStream tsUmlaut = filterFactory.create(new KeywordTokenizer(new StringReader(germanUmlaut)));
TokenStream tsOE = filterFactory.create(new KeywordTokenizer(new StringReader(germanOE)));
assertCollatesToSame(tsUmlaut, tsOE);
}
private AnalysisService createAnalysisService(Index index, Settings settings) {
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IcuAnalysisBinderProcessor()))
.createChildInjector(parentInjector);
return injector.getInstance(AnalysisService.class);
}
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
assertCollation(stream1, stream2, 0);
}
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1
.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2
.addAttribute(CharTermAttribute.class);
assertThat(stream1.incrementToken(), equalTo(true));
assertThat(stream2.incrementToken(), equalTo(true));
assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
assertThat(stream1.incrementToken(), equalTo(false));
assertThat(stream2.incrementToken(), equalTo(false));
}
}