package org.apache.lucene.analysis.icu; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import com.ibm.icu.text.Replaceable; import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; /** * A {@link TokenFilter} that transforms text with ICU. * <p> * ICU provides text-transformation functionality via its Transliteration API. * Although script conversion is its most common use, a Transliterator can * actually perform a more general class of tasks. In fact, Transliterator * defines a very general API which specifies only that a segment of the input * text is replaced by new text. The particulars of this conversion are * determined entirely by subclasses of Transliterator. * </p> * <p> * Some useful transformations for search are built-in: * <ul> * <li>Conversion from Traditional to Simplified Chinese characters * <li>Conversion from Hiragana to Katakana * <li>Conversion from Fullwidth to Halfwidth forms. * <li>Script conversions, for example Serbian Cyrillic to Latin * </ul> * </p> * <p> * Example usage: <blockquote>stream = new ICUTransformFilter(stream, * Transliterator.getInstance("Traditional-Simplified"));</blockquote> * </p> * For more details, see the <a * href="http://userguide.icu-project.org/transforms/general">ICU User * Guide</a>. */ public final class ICUTransformFilter extends TokenFilter { // Transliterator to transform the text private final Transliterator transform; // Reusable position object private final Transliterator.Position position = new Transliterator.Position(); // term attribute, will be updated with transformed text. private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); // Wraps a termAttribute around the replaceable interface. private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute(); /** * Create a new ICUTransformFilter that transforms text on the given stream. * * @param input {@link TokenStream} to filter. * @param transform Transliterator to transform the text. */ public ICUTransformFilter(TokenStream input, Transliterator transform) { super(input); this.transform = transform; /* * This is cheating, but speeds things up a lot. * If we wanted to use pkg-private APIs we could probably do better. */ if (transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) { final UnicodeSet sourceSet = transform.getSourceSet(); if (sourceSet != null && !sourceSet.isEmpty()) transform.setFilter(sourceSet); } } @Override public boolean incrementToken() throws IOException { /* * Wrap around replaceable. clear the positions, and transliterate. */ if (input.incrementToken()) { replaceableAttribute.setText(termAtt); final int length = termAtt.length(); position.start = 0; position.limit = length; position.contextStart = 0; position.contextLimit = length; transform.filteredTransliterate(replaceableAttribute, position, false); return true; } else { return false; } } /** * Wrap a {@link CharTermAttribute} with the Replaceable API. */ final class ReplaceableTermAttribute implements Replaceable { private char buffer[]; private int length; private CharTermAttribute token; void setText(final CharTermAttribute token) { this.token = token; this.buffer = token.buffer(); this.length = token.length(); } public int char32At(int pos) { return UTF16.charAt(buffer, 0, length, pos); } public char charAt(int pos) { return buffer[pos]; } public void copy(int start, int limit, int dest) { char text[] = new char[limit - start]; getChars(start, limit, text, 0); replace(dest, dest, text, 0, limit - start); } public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) { System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart); } public boolean hasMetaData() { return false; } public int length() { return length; } public void replace(int start, int limit, String text) { final int charsLen = text.length(); final int newLength = shiftForReplace(start, limit, charsLen); // insert the replacement text text.getChars(0, charsLen, buffer, start); token.setLength(length = newLength); } public void replace(int start, int limit, char[] text, int charsStart, int charsLen) { // shift text if necessary for the replacement final int newLength = shiftForReplace(start, limit, charsLen); // insert the replacement text System.arraycopy(text, charsStart, buffer, start, charsLen); token.setLength(length = newLength); } /** shift text (if necessary) for a replacement operation */ private int shiftForReplace(int start, int limit, int charsLen) { final int replacementLength = limit - start; final int newLength = length - replacementLength + charsLen; // resize if necessary if (newLength > length) buffer = token.resizeBuffer(newLength); // if the substring being replaced is longer or shorter than the // replacement, need to shift things around if (replacementLength != charsLen && limit < length) System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit); return newLength; } } }