package org.xbib.elasticsearch.index.analysis.icu.segmentation; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.UTF16; /** * An iterator that locates ISO 15924 script boundaries in text. * This is not the same as simply looking at the Unicode block, or even the * Script property. Some characters are 'common' across multiple scripts, and * some 'inherit' the script value of text surrounding them. * This is similar to ICU (internal-only) UScriptRun, with the following * differences: * <ul> * <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this * is not necessary. Its also quite expensive. * <li>Non-spacing marks inherit the script of their base character, following * recommendations from UTR #24. * </ul> */ final class ScriptIterator { /** * linear fast-path for basic latin case. */ private static final int[] basicLatin = new int[128]; static { for (int i = 0; i < basicLatin.length; i++) { basicLatin[i] = UScript.getScript(i); } } private final boolean combineCJ; private char[] text; private int start; private int limit; private int index; private int scriptStart; private int scriptLimit; private int scriptCode; /** * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}. */ ScriptIterator(boolean combineCJ) { this.combineCJ = combineCJ; } /** * Determine if two scripts are compatible. */ private static boolean isSameScript(int scriptOne, int scriptTwo) { return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo; } /** * Get the start of this script run. * * @return start position of script run */ int getScriptStart() { return scriptStart; } /** * Get the index of the first character after the end of this script run. * * @return position of the first character after this script run */ int getScriptLimit() { return scriptLimit; } /** * Get the UScript script code for this script run. * * @return code for the script of the current run */ int getScriptCode() { return scriptCode; } /** * Iterates to the next script run, returning true if one exists. * * @return true if there is another script run, false otherwise. */ boolean next() { if (scriptLimit >= limit) { return false; } scriptCode = UScript.COMMON; scriptStart = scriptLimit; while (index < limit) { final int ch = UTF16.charAt(text, start, limit, index - start); final int sc = getScript(ch); /* * From UTR #24: Implementations that determine the boundaries between * characters of given scripts should never break between a non-spacing * mark and its base character. Thus for boundary determinations and * similar sorts of processing, a non-spacing mark — whatever its script * value — should inherit the script value of its base character. */ if (isSameScript(scriptCode, sc) || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK) { index += UTF16.getCharCount(ch); /* * Inherited or Common becomes the script code of the surrounding text. */ if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { scriptCode = sc; } } else { break; } } scriptLimit = index; return true; } /** * Set a new region of text to be examined by this iterator. * * @param text text buffer to examine * @param start offset into buffer * @param length maximum length to examine */ void setText(char[] text, int start, int length) { this.text = text; this.start = start; this.index = start; this.limit = start + length; this.scriptStart = start; this.scriptLimit = start; this.scriptCode = UScript.INVALID_CODE; } /** * fast version of UScript.getScript(). Basic Latin is an array lookup */ private int getScript(int codepoint) { if (0 <= codepoint && codepoint < basicLatin.length) { return basicLatin[codepoint]; } else { int script = UScript.getScript(codepoint); if (combineCJ) { if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) { return UScript.JAPANESE; } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) { // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise // they are treated as punctuation. we currently have no cleaner way to fix this! return UScript.LATIN; } else { return script; } } else { return script; } } } }