package org.apache.lucene.analysis.icu.segmentation; /** * Copyright (C) 1999-2010, International Business Machines * Corporation and others. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, and/or sell copies of the * Software, and to permit persons to whom the Software is furnished to do so, * provided that the above copyright notice(s) and this permission notice appear * in all copies of the Software and that both the above copyright notice(s) and * this permission notice appear in supporting documentation. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall not * be used in advertising or otherwise to promote the sale, use or other * dealings in this Software without prior written authorization of the * copyright holder. */ import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.UTF16; /** * An iterator that locates ISO 15924 script boundaries in text. * <p> * This is not the same as simply looking at the Unicode block, or even the * Script property. Some characters are 'common' across multiple scripts, and * some 'inherit' the script value of text surrounding them. * <p> * This is similar to ICU (internal-only) UScriptRun, with the following * differences: * <ul> * <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this * is not necessary. Its also quite expensive. * <li>Non-spacing marks inherit the script of their base character, following * recommendations from UTR #24. * </ul> * @lucene.experimental */ final class ScriptIterator { private char text[]; private int start; private int limit; private int index; private int scriptStart; private int scriptLimit; private int scriptCode; /** * Get the start of this script run * * @return start position of script run */ int getScriptStart() { return scriptStart; } /** * Get the index of the first character after the end of this script run * * @return position of the first character after this script run */ int getScriptLimit() { return scriptLimit; } /** * Get the UScript script code for this script run * * @return code for the script of the current run */ int getScriptCode() { return scriptCode; } /** * Iterates to the next script run, returning true if one exists. * * @return true if there is another script run, false otherwise. */ boolean next() { if (scriptLimit >= limit) return false; scriptCode = UScript.COMMON; scriptStart = scriptLimit; while (index < limit) { final int ch = UTF16.charAt(text, start, limit, index - start); final int sc = getScript(ch); /* * From UTR #24: Implementations that determine the boundaries between * characters of given scripts should never break between a non-spacing * mark and its base character. Thus for boundary determinations and * similar sorts of processing, a non-spacing mark — whatever its script * value — should inherit the script value of its base character. */ if (isSameScript(scriptCode, sc) || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) { index += UTF16.getCharCount(ch); /* * Inherited or Common becomes the script code of the surrounding text. */ if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { scriptCode = sc; } } else { break; } } scriptLimit = index; return true; } /** Determine if two scripts are compatible. */ private static boolean isSameScript(int scriptOne, int scriptTwo) { return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; } /** * Set a new region of text to be examined by this iterator * * @param text text buffer to examine * @param start offset into buffer * @param length maximum length to examine */ void setText(char text[], int start, int length) { this.text = text; this.start = start; this.index = start; this.limit = start + length; this.scriptStart = start; this.scriptLimit = start; this.scriptCode = UScript.INVALID_CODE; } /** linear fast-path for basic latin case */ private static final int basicLatin[] = new int[128]; static { for (int i = 0; i < basicLatin.length; i++) basicLatin[i] = UScript.getScript(i); } /** fast version of UScript.getScript(). Basic Latin is an array lookup */ private static int getScript(int codepoint) { if (0 <= codepoint && codepoint < basicLatin.length) return basicLatin[codepoint]; else return UScript.getScript(codepoint); } }