/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.icu;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;

/**
 * Test the ICUTransformFilter with some basic examples.
 */
public class TestICUTransformFilter extends BaseTokenStreamTestCase {

  public void testBasicFunctionality() throws Exception {
    checkToken(Transliterator.getInstance("Traditional-Simplified"), "簡化字", "简化字");
    checkToken(Transliterator.getInstance("Katakana-Hiragana"), "ヒラガナ", "ひらがな");
    checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), "アルアノリウ", "ｱﾙｱﾉﾘｳ");
    checkToken(Transliterator.getInstance("Any-Latin"), "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
    checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"), "Alphabētikós Katálogos", "Alphabetikos Katalogos");
    checkToken(Transliterator.getInstance("Han-Latin"), "中国", "zhōng guó");
  }

  public void testCustomFunctionality() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
  }

  public void testCustomFunctionality2() throws Exception {
    String rules = "c { a > b; a > d;"; // an 'a' preceded by 'c' becomes 'b', any other 'a' becomes 'd'
    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
  }

  public void testOptimizer() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    final KeywordTokenizer input = new KeywordTokenizer();
    input.setReader(new StringReader(""));
    new ICUTransformFilter(input, custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
  }

  public void testOptimizer2() throws Exception {
    checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"), "ABCDE", "abcde");
  }

  public void testOptimizerSurrogate() throws Exception {
    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    final KeywordTokenizer input = new KeywordTokenizer();
    input.setReader(new StringReader(""));
    new ICUTransformFilter(input, custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
  }
  private void checkToken(Transliterator transform, String input, String expected) throws IOException {
    final KeywordTokenizer input1 = new KeywordTokenizer();
    input1.setReader(new StringReader(input));
    TokenStream ts = new ICUTransformFilter(input1, transform);
    assertTokenStreamContents(ts, new String[] { expected });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    final Transliterator transform = Transliterator.getInstance("Any-Latin");
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
      }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
      }
    };
    checkOneTerm(a, "", "");
    a.close();
  }
}