package org.xbib.elasticsearch.index.analysis.icu;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.base.Supplier;
import org.elasticsearch.common.collect.Multimaps;
import org.elasticsearch.common.collect.SetMultimap;
import org.elasticsearch.common.collect.Sets;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.junit.Test;
import org.xbib.elasticsearch.index.analysis.BaseTokenStreamTest;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;

/**
 * Tests for the {@code icu_collation} analyzer: each test builds an
 * {@link AnalysisService} from index settings, runs two (or more) strings
 * through the configured analyzer, and compares the resulting collation-key
 * byte sequences to verify the collator options (strength, decomposition,
 * alternate handling, case level, numeric ordering, custom rules).
 */
public class IcuCollationAnalyzerTests extends BaseTokenStreamTest {

    /*
     * Turkish has some funny casing.
     * This test shows how you can solve this kind of thing easily with collation.
     * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
     * Then things will sort and match correctly.
     */
    @Test
    public void testBasicUsage() throws Exception {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "tr")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.decomposition", "canonical")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        TokenStream tsUpper = analyzer.tokenStream(null, "I WİLL USE TURKİSH CASING");
        BytesRef b1 = bytesFromTokenStream(tsUpper);
        TokenStream tsLower = analyzer.tokenStream(null, "ı will use turkish casıng");
        BytesRef b2 = bytesFromTokenStream(tsLower);
        // primary strength ignores case, so both spellings yield the same key
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
    }

    /*
     * Test usage of the decomposition option for unicode normalization.
     */
    @Test
    public void testNormalization() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "tr")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.decomposition", "canonical")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        // "I" + combining dot above (U+0307): canonical decomposition must
        // normalize this to match the precomposed dotted capital I
        TokenStream tsUpper = analyzer.tokenStream(null, "I W\u0049\u0307LL USE TURKİSH CASING");
        BytesRef b1 = bytesFromTokenStream(tsUpper);
        TokenStream tsLower = analyzer.tokenStream(null, "ı will use turkish casıng");
        BytesRef b2 = bytesFromTokenStream(tsLower);
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
    }

    /*
     * Test secondary strength, for english case is not significant.
     */
    @Test
    public void testSecondaryStrength() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "secondary")
                .put("index.analysis.analyzer.myAnalyzer.decomposition", "no")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        TokenStream tsUpper = analyzer.tokenStream("content", "TESTING");
        BytesRef b1 = bytesFromTokenStream(tsUpper);
        TokenStream tsLower = analyzer.tokenStream("content", "testing");
        BytesRef b2 = bytesFromTokenStream(tsLower);
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
    }

    /*
     * Setting alternate=shifted to shift whitespace, punctuation and symbols
     * to quaternary level
     */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        TokenStream tsPunctuation = analyzer.tokenStream("content", "foo-bar");
        BytesRef b1 = bytesFromTokenStream(tsPunctuation);
        TokenStream tsWithoutPunctuation = analyzer.tokenStream("content", "foo bar");
        BytesRef b2 = bytesFromTokenStream(tsWithoutPunctuation);
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
    }

    /*
     * Setting alternate=shifted and variableTop to shift whitespace, but not
     * punctuation or symbols, to quaternary level
     */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
                .put("index.analysis.analyzer.myAnalyzer.variableTop", 4096) // SPACE
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        TokenStream tsWithoutSpace = analyzer.tokenStream(null, "foobar");
        BytesRef b1 = bytesFromTokenStream(tsWithoutSpace);
        TokenStream tsWithSpace = analyzer.tokenStream(null, "foo bar");
        BytesRef b2 = bytesFromTokenStream(tsWithSpace);
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
        // now assert that punctuation still matters: foo-bar < foo bar
        TokenStream tsWithPunctuation = analyzer.tokenStream(null, "foo-bar");
        BytesRef b3 = bytesFromTokenStream(tsWithPunctuation);
        assertTrue(compare(b3.bytes, b1.bytes) < 0);
    }

    /*
     * Setting numeric to encode digits with numeric value, so that
     * foobar-9 sorts before foobar-10
     */
    @Test
    public void testNumerics() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.numeric", true)
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        TokenStream tsNine = analyzer.tokenStream(null, "foobar-9");
        BytesRef b1 = bytesFromTokenStream(tsNine);
        TokenStream tsTen = analyzer.tokenStream(null, "foobar-10");
        BytesRef b2 = bytesFromTokenStream(tsTen);
        // compare() returns an arbitrary negative value for "less than",
        // not necessarily -1, so only the sign may be asserted
        assertTrue(compare(b1.bytes, b2.bytes) < 0);
    }

    /*
     * Setting caseLevel=true to create an additional case level between
     * secondary and tertiary
     */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.caseLevel", "true")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        String withAccents = "résumé";
        String withoutAccents = "resume";
        String withAccentsUpperCase = "Résumé";
        String withoutAccentsUpperCase = "Resume";
        TokenStream tsWithAccents = analyzer.tokenStream(null, withAccents);
        BytesRef b1 = bytesFromTokenStream(tsWithAccents);
        TokenStream tsWithoutAccents = analyzer.tokenStream(null, withoutAccents);
        BytesRef b2 = bytesFromTokenStream(tsWithoutAccents);
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
        TokenStream tsWithAccentsUpperCase = analyzer.tokenStream(null, withAccentsUpperCase);
        BytesRef b3 = bytesFromTokenStream(tsWithAccentsUpperCase);
        TokenStream tsWithoutAccentsUpperCase = analyzer.tokenStream(null, withoutAccentsUpperCase);
        BytesRef b4 = bytesFromTokenStream(tsWithoutAccentsUpperCase);
        assertTrue(compare(b3.bytes, b4.bytes) == 0);
        // now assert that case still matters: resume < Resume
        TokenStream tsLower = analyzer.tokenStream(null, withoutAccents);
        BytesRef b5 = bytesFromTokenStream(tsLower);
        TokenStream tsUpper = analyzer.tokenStream(null, withoutAccentsUpperCase);
        BytesRef b6 = bytesFromTokenStream(tsUpper);
        assertTrue(compare(b5.bytes, b6.bytes) < 0);
    }

    /*
     * Setting caseFirst=upper to cause uppercase strings to sort
     * before lowercase ones.
     */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "tertiary")
                .put("index.analysis.analyzer.myAnalyzer.caseFirst", "upper")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        String lower = "resume";
        String upper = "Resume";
        TokenStream tsLower = analyzer.tokenStream(null, lower);
        BytesRef b1 = bytesFromTokenStream(tsLower);
        TokenStream tsUpper = analyzer.tokenStream(null, upper);
        BytesRef b2 = bytesFromTokenStream(tsUpper);
        assertTrue(compare(b2.bytes, b1.bytes) < 0);
    }

    /*
     * For german, you might want oe to sort and match with o umlaut.
     * This is not the default, but you can make a customized ruleset to do this.
     *
     * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
     */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        // DIN 5007-2 tailoring: each two-letter expansion sorts with the
        // corresponding umlaut; note "UE" maps to uppercase U+diaeresis,
        // matching the AE/OE pairs (the original had a lowercase-u typo here)
        String DIN5007_2_tailorings = "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , U\u0308";
        RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
        String tailoredRules = tailoredCollator.getRules();
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.rules", tailoredRules)
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("myAnalyzer").analyzer();
        String germanUmlaut = "Töne";
        TokenStream tsUmlaut = analyzer.tokenStream(null, germanUmlaut);
        BytesRef b1 = bytesFromTokenStream(tsUmlaut);
        String germanExpandedUmlaut = "Toene";
        TokenStream tsExpanded = analyzer.tokenStream(null, germanExpandedUmlaut);
        BytesRef b2 = bytesFromTokenStream(tsExpanded);
        assertTrue(compare(b1.bytes, b2.bytes) == 0);
    }

    /*
     * Load a collation analyzer from JSON settings and verify the resulting
     * German phone-book sort order of a small word list.
     */
    @Test
    public void testCustomFromJson() throws Exception {
        Index index = new Index("test");
        Settings settings = ImmutableSettings.settingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .loadFromClasspath("org/xbib/elasticsearch/index/analysis/icu/icu_collation.json")
                .build();
        AnalysisService analysisService = createAnalysisService(index, settings);
        Analyzer analyzer = analysisService.analyzer("icu_german_collate").analyzer();
        String[] words = new String[]{
                "Göbel",
                "Goethe",
                "Goldmann",
                "Göthe",
                "Götz"
        };
        // group words by collation key; TreeMap orders groups by key,
        // TreeSet orders words within a group alphabetically
        SetMultimap<BytesRef, String> bytesRefMap =
                Multimaps.newSetMultimap(new TreeMap<BytesRef, Collection<String>>(),
                        new Supplier<Set<String>>() {
                            @Override
                            public Set<String> get() {
                                return Sets.newTreeSet();
                            }
                        });
        for (String s : words) {
            TokenStream ts = analyzer.tokenStream(null, s);
            bytesRefMap.put(bytesFromTokenStream(ts), s);
        }
        Iterator<Collection<String>> it = bytesRefMap.asMap().values().iterator();
        assertEquals("[Göbel]", it.next().toString());
        assertEquals("[Goethe, Göthe]", it.next().toString());
        assertEquals("[Götz]", it.next().toString());
        assertEquals("[Goldmann]", it.next().toString());
    }

    /**
     * Builds an {@link AnalysisService} for the given index/settings with the
     * ICU analysis components registered via {@link IcuAnalysisBinderProcessor}.
     */
    private AnalysisService createAnalysisService(Index index, Settings settings) {
        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
                new EnvironmentModule(new Environment(settings)),
                new IndicesAnalysisModule())
                .createInjector();
        Injector injector = new ModulesBuilder().add(
                new IndexSettingsModule(index, settings),
                new IndexNameModule(index),
                new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))
                        .addProcessor(new IcuAnalysisBinderProcessor()))
                .createChildInjector(parentInjector);
        return injector.getInstance(AnalysisService.class);
    }

    /**
     * Consumes the stream (reset/incrementToken/end/close per the Lucene
     * TokenStream contract) and returns an exact-size copy of the last
     * token's bytes, so callers may safely compare the raw {@code bytes}
     * arrays without trailing over-allocated garbage.
     */
    private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
        TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytesRef = termAttr.getBytesRef();
        try {
            stream.reset();
            while (stream.incrementToken()) {
                termAttr.fillBytesRef();
            }
            stream.end();
        } finally {
            // always release the stream, even if incrementToken() throws
            stream.close();
        }
        // exact-size copy: the attribute's backing array may be over-allocated
        return new BytesRef(Arrays.copyOfRange(bytesRef.bytes,
                bytesRef.offset, bytesRef.offset + bytesRef.length));
    }

    /**
     * Unsigned lexicographic byte-array comparison.
     *
     * @return a negative value if {@code left} sorts before {@code right},
     *         zero if equal, a positive value otherwise (only the sign is
     *         meaningful, not the magnitude)
     */
    private int compare(byte[] left, byte[] right) {
        for (int i = 0, j = 0; i < left.length && j < right.length; i++, j++) {
            int a = (left[i] & 0xff);
            int b = (right[j] & 0xff);
            if (a != b) {
                return a - b;
            }
        }
        return left.length - right.length;
    }
}