package org.xbib.elasticsearch.index.analysis.icu;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.settings.Settings;
import org.junit.Test;
import org.xbib.elasticsearch.index.analysis.BaseTokenStreamTest;

import java.io.IOException;
import java.util.*;

import static org.xbib.elasticsearch.MapperTestUtils.analyzer;

/**
 * Tests for analyzers of type {@code icu_collation}.
 *
 * Each test runs one or more texts through a configured analyzer, concatenates the bytes of
 * all emitted terms and compares the resulting byte strings as unsigned bytes.
 */
public class IcuCollationAnalyzerTests extends BaseTokenStreamTest {

    /** Resource holding the analyzer definitions used by the JSON-based tests. */
    private static final String COLLATION_RESOURCE =
            "org/xbib/elasticsearch/index/analysis/icu/icu_collation.json";

    /*
     * Turkish has some funny casing.
     * This test shows how you can solve this kind of thing easily with collation.
     * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
     * Then things will sort and match correctly.
     */
    @Test
    public void testBasicUsage() throws Exception {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "tr")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.decomposition", "canonical")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef upper = termBytes(analyzer, null, "I WİLL USE TURKİSH CASING");
        BytesRef lower = termBytes(analyzer, null, "ı will use turkish casıng");
        assertTrue(compare(upper.bytes, lower.bytes) == 0);
    }

    /*
     * Test usage of the decomposition option for unicode normalization.
     */
    @Test
    public void testNormalization() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "tr")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.decomposition", "canonical")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef upper = termBytes(analyzer, null, "I W\u0049\u0307LL USE TURKİSH CASING");
        BytesRef lower = termBytes(analyzer, null, "ı will use turkish casıng");
        assertTrue(compare(upper.bytes, lower.bytes) == 0);
    }

    /*
     * Test secondary strength, for english case is not significant.
     */
    @Test
    public void testSecondaryStrength() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "secondary")
                .put("index.analysis.analyzer.myAnalyzer.decomposition", "no")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef upper = termBytes(analyzer, "content", "TESTING");
        BytesRef lower = termBytes(analyzer, "content", "testing");
        assertTrue(compare(upper.bytes, lower.bytes) == 0);
    }

    /*
     * Setting alternate=shifted to shift whitespace, punctuation and symbols
     * to quaternary level
     */
    @Test
    public void testIgnorePunctuation() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef withPunctuation = termBytes(analyzer, "content", "foo-bar");
        BytesRef withoutPunctuation = termBytes(analyzer, "content", "foo bar");
        assertTrue(compare(withPunctuation.bytes, withoutPunctuation.bytes) == 0);
    }

    /*
     * Setting alternate=shifted and variableTop to shift whitespace, but not
     * punctuation or symbols, to quaternary level
     */
    @Test
    public void testIgnoreWhitespace() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
                .put("index.analysis.analyzer.myAnalyzer.variableTop", 4096) // SPACE
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef withoutSpace = termBytes(analyzer, null, "foobar");
        BytesRef withSpace = termBytes(analyzer, null, "foo bar");
        assertTrue(compare(withoutSpace.bytes, withSpace.bytes) == 0);

        // now check that punctuation still matters: foo-bar < foo bar
        BytesRef withPunctuation = termBytes(analyzer, null, "foo-bar");
        assertTrue(compare(withPunctuation.bytes, withoutSpace.bytes) < 0);
    }

    /*
     * Setting numeric to encode digits with numeric value, so that
     * foobar-9 sorts before foobar-10
     */
    @Test
    public void testNumerics() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.numeric", true)
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef nine = termBytes(analyzer, null, "foobar-9");
        BytesRef ten = termBytes(analyzer, null, "foobar-10");
        // compare() returns a byte difference, not a normalized -1/0/1, so only the sign matters.
        assertTrue(compare(nine.bytes, ten.bytes) < 0);
    }

    /*
     * Setting caseLevel=true to create an additional case level between
     * secondary and tertiary
     */
    @Test
    public void testIgnoreAccentsButNotCase() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .put("index.analysis.analyzer.myAnalyzer.caseLevel", "true")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        String withAccents = "résumé";
        String withoutAccents = "resume";
        String withAccentsUpperCase = "Résumé";
        String withoutAccentsUpperCase = "Resume";

        BytesRef accented = termBytes(analyzer, null, withAccents);
        BytesRef plain = termBytes(analyzer, null, withoutAccents);
        assertTrue(compare(accented.bytes, plain.bytes) == 0);

        BytesRef accentedUpper = termBytes(analyzer, null, withAccentsUpperCase);
        BytesRef plainUpper = termBytes(analyzer, null, withoutAccentsUpperCase);
        assertTrue(compare(accentedUpper.bytes, plainUpper.bytes) == 0);

        // now check that case still matters: resume < Resume
        BytesRef lower = termBytes(analyzer, null, withoutAccents);
        BytesRef upper = termBytes(analyzer, null, withoutAccentsUpperCase);
        assertTrue(compare(lower.bytes, upper.bytes) < 0);
    }

    /*
     * Setting caseFirst=upper to cause uppercase strings to sort
     * before lowercase ones.
     */
    @Test
    public void testUpperCaseFirst() throws IOException {
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.language", "en")
                .put("index.analysis.analyzer.myAnalyzer.strength", "tertiary")
                .put("index.analysis.analyzer.myAnalyzer.caseFirst", "upper")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef lower = termBytes(analyzer, null, "resume");
        BytesRef upper = termBytes(analyzer, null, "Resume");
        assertTrue(compare(upper.bytes, lower.bytes) < 0);
    }

    /*
     * For german, you might want oe to sort and match with o umlaut.
     * This is not the default, but you can make a customized ruleset to do this.
     *
     * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
     * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
     */
    @Test
    public void testCustomRules() throws Exception {
        RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
        // NOTE(review): the final tailoring pairs UE with a lowercase u + U+0308; an uppercase
        // U + U+0308 was probably intended. Left untouched because it changes the rule set - confirm.
        String din5007Part2Tailorings = "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308";
        RuleBasedCollator tailoredCollator =
                new RuleBasedCollator(baseCollator.getRules() + din5007Part2Tailorings);
        String tailoredRules = tailoredCollator.getRules();

        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
                .put("index.analysis.analyzer.myAnalyzer.rules", tailoredRules)
                .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
                .build();
        Analyzer analyzer = analyzer(settings, "myAnalyzer");

        BytesRef umlaut = termBytes(analyzer, null, "Töne");
        BytesRef expandedUmlaut = termBytes(analyzer, null, "Toene");
        assertTrue(compare(umlaut.bytes, expandedUmlaut.bytes) == 0);
    }

    @Test
    public void testPrimaryStrengthFromJson() throws Exception {
        Analyzer analyzer = analyzer(COLLATION_RESOURCE, "icu_german_collate");
        MultiMap<BytesRef, String> grouped =
                groupByTermBytes(analyzer, "Göbel", "Goethe", "Goldmann", "Göthe", "Götz");

        Iterator<Set<String>> it = grouped.values().iterator();
        assertEquals("[Göbel]", it.next().toString());
        assertEquals("[Goethe, Göthe]", it.next().toString());
        assertEquals("[Götz]", it.next().toString());
        assertEquals("[Goldmann]", it.next().toString());
    }

    @Test
    public void testQuaternaryStrengthFromJson() throws Exception {
        Analyzer analyzer = analyzer(COLLATION_RESOURCE, "icu_german_collate_without_punct");
        MultiMap<BytesRef, String> grouped =
                groupByTermBytes(analyzer, "Göbel", "G-oethe", "Gold*mann", "Göthe", "Götz");

        Iterator<Set<String>> it = grouped.values().iterator();
        assertEquals("[Göbel]", it.next().toString());
        assertEquals("[G-oethe]", it.next().toString());
        assertEquals("[Göthe]", it.next().toString());
        assertEquals("[Götz]", it.next().toString());
        assertEquals("[Gold*mann]", it.next().toString());
    }

    @Test
    public void testGermanPhoneBook() throws Exception {
        Analyzer analyzer = analyzer(COLLATION_RESOURCE, "german_phonebook");
        MultiMap<BytesRef, String> grouped =
                groupByTermBytes(analyzer, "Göbel", "Goethe", "Goldmann", "Göthe", "Götz");

        Iterator<Set<String>> it = grouped.values().iterator();
        assertEquals("[Göbel]", it.next().toString());
        assertEquals("[Goethe, Göthe]", it.next().toString());
        assertEquals("[Götz]", it.next().toString());
        assertEquals("[Goldmann]", it.next().toString());
    }

    @Test
    public void testReorder() throws IOException {
        Analyzer analyzer = analyzer(COLLATION_RESOURCE, "reorder");
        assertNotNull(analyzer);
    }

    /**
     * Analyzes {@code text} and returns the concatenated bytes of every emitted term.
     *
     * @param analyzer  analyzer under test
     * @param fieldName field name handed to {@link Analyzer#tokenStream}; may be {@code null}
     * @param text      text to analyze
     * @return the concatenated term bytes
     * @throws IOException if consuming the token stream fails
     */
    private BytesRef termBytes(Analyzer analyzer, String fieldName, String text) throws IOException {
        return bytesFromTokenStream(analyzer.tokenStream(fieldName, text));
    }

    /**
     * Groups {@code words} by the term bytes the analyzer produces for them. Keys iterate in
     * {@link BytesRef} order, values of one key in their natural order.
     *
     * @param analyzer analyzer under test
     * @param words    words to analyze, each with a {@code null} field name
     * @return multimap from term bytes to the words that produced them
     * @throws IOException if consuming a token stream fails
     */
    private MultiMap<BytesRef, String> groupByTermBytes(Analyzer analyzer, String... words) throws IOException {
        MultiMap<BytesRef, String> grouped = new TreeMultiMap<>();
        for (String word : words) {
            grouped.put(termBytes(analyzer, null, word), word);
        }
        return grouped;
    }

    /**
     * Consumes {@code stream} completely and returns the concatenation of all term bytes.
     *
     * The stream is driven through reset / incrementToken / end and is always closed, even
     * when consumption fails, so that the analyzer can hand out its next stream.
     *
     * @param stream token stream to drain; closed on return
     * @return a {@link BytesRef} holding the bytes of every term in emission order
     * @throws IOException if the stream fails while being consumed
     */
    private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
        BytesRefBuilder collected = new BytesRefBuilder();
        try (TokenStream ts = stream) {
            TermToBytesRefAttribute termAttr = ts.getAttribute(TermToBytesRefAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                collected.append(termAttr.getBytesRef());
            }
            ts.end();
        }
        return collected.toBytesRef();
    }

    /**
     * Compares two byte arrays lexicographically, treating each byte as unsigned.
     *
     * @param left  first array
     * @param right second array
     * @return a negative value if {@code left} sorts first, zero if both are equal, a positive
     *         value otherwise; the magnitude is a byte or length difference, not necessarily 1
     */
    private int compare(byte[] left, byte[] right) {
        int common = Math.min(left.length, right.length);
        for (int i = 0; i < common; i++) {
            int a = left[i] & 0xff;
            int b = right[i] & 0xff;
            if (a != b) {
                return a - b;
            }
        }
        // One array is a prefix of the other: the shorter one sorts first.
        return left.length - right.length;
    }

    /**
     * Minimal map from one key to a set of values.
     *
     * @param <K> key type
     * @param <V> value type
     */
    interface MultiMap<K, V> {

        void clear();

        int size();

        boolean isEmpty();

        boolean containsKey(K key);

        Collection<V> get(K key);

        Set<K> keySet();

        Collection<Set<V>> values();

        Collection<V> put(K key, V value);

        Collection<V> remove(K key);

        Collection<V> remove(K key, V value);

        void putAll(K key, Collection<V> values);
    }

    /**
     * {@link MultiMap} backed by a {@link TreeMap} of {@link TreeSet}s, so keys and the values
     * of each key both iterate in their natural order.
     *
     * @param <K> key type; must be mutually comparable
     * @param <V> value type; must be mutually comparable
     */
    static class TreeMultiMap<K, V> implements MultiMap<K, V> {

        private final Map<K, Set<V>> map = new TreeMap<>();

        @Override
        public int size() {
            return map.size();
        }

        @Override
        public void clear() {
            map.clear();
        }

        @Override
        public boolean isEmpty() {
            return map.isEmpty();
        }

        @Override
        public boolean containsKey(K key) {
            return map.containsKey(key);
        }

        @Override
        public Set<K> keySet() {
            return map.keySet();
        }

        @Override
        public Collection<Set<V>> values() {
            return map.values();
        }

        /**
         * Adds {@code value} to the set stored under {@code key}.
         *
         * @return the set that was already mapped to {@code key} (now containing
         *         {@code value}), or {@code null} if the key was new
         */
        @Override
        public Collection<V> put(K key, V value) {
            Set<V> existing = map.get(key);
            if (existing == null) {
                Set<V> created = new TreeSet<>();
                created.add(value);
                map.put(key, created);
                return null;
            }
            existing.add(value);
            return existing;
        }

        /** Adds all {@code values} to the set stored under {@code key}, creating it if needed. */
        @Override
        public void putAll(K key, Collection<V> values) {
            // Same set type as put(), so ordering does not depend on which method created the set.
            Set<V> set = map.computeIfAbsent(key, k -> new TreeSet<>());
            set.addAll(values);
        }

        /** Returns the set stored under {@code key}, or {@code null} if there is none. */
        @Override
        public Collection<V> get(K key) {
            return map.get(key);
        }

        /** Removes {@code key} and returns its former set, or {@code null} if there was none. */
        @Override
        public Set<V> remove(K key) {
            return map.remove(key);
        }

        /**
         * Removes {@code value} from the set stored under {@code key}; the key stays mapped even
         * if its set becomes empty.
         *
         * @return the (possibly empty) set stored under {@code key}, or {@code null} if there is none
         */
        @Override
        public Set<V> remove(K key, V value) {
            Set<V> set = map.get(key);
            if (set != null) {
                set.remove(value);
            }
            return set;
        }

        @Override
        public boolean equals(Object obj) {
            return obj instanceof TreeMultiMap && map.equals(((TreeMultiMap<?, ?>) obj).map);
        }

        @Override
        public int hashCode() {
            return map.hashCode();
        }

        @Override
        public String toString() {
            return map.toString();
        }
    }
}