package org.apache.lucene.analysis.icu; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.TokenStream; import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UnicodeSet; /** * Test the ICUTransformFilter with some basic examples. */ public class TestICUTransformFilter extends BaseTokenStreamTestCase { public void testBasicFunctionality() throws Exception { checkToken(Transliterator.getInstance("Traditional-Simplified"), "簡化字", "简化字"); checkToken(Transliterator.getInstance("Katakana-Hiragana"), "ヒラガナ", "ひらがな"); checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), "アルアノリウ", "アルアノリウ"); checkToken(Transliterator.getInstance("Any-Latin"), "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos"); checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"), "Alphabētikós Katálogos", "Alphabetikos Katalogos"); checkToken(Transliterator.getInstance("Han-Latin"), "中国", "zhōng guó"); } public void testCustomFunctionality() throws Exception { String rules = "a > b; b > c;"; // convert a's to b's and b's to c's checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb"); } public void testCustomFunctionality2() throws Exception { String rules = "c { a > b; a > d;"; // convert a's to b's and b's to c's checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd"); } public void testOptimizer() throws Exception { String rules = "a > b; b > c;"; // convert a's to b's and b's to c's Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD); assertTrue(custom.getFilter() == null); new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom); assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]"))); } public void testOptimizer2() throws Exception { checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"), "ABCDE", "abcde"); } public void testOptimizerSurrogate() throws Exception { String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD); assertTrue(custom.getFilter() == null); new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom); assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]"))); } private void checkToken(Transliterator transform, String input, String expected) throws IOException { TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform); assertTokenStreamContents(ts, new String[] { expected }); } }