/* * * @(#)Normalizer.java 1.12 06/10/10 * * Portions Copyright 2000-2008 Sun Microsystems, Inc. All Rights * Reserved. Use is subject to license terms. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 only, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License version 2 for more details (a copy is * included at /legal/license.txt). * * You should have received a copy of the GNU General Public License * version 2 along with this work; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA * * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa * Clara, CA 95054 or visit www.sun.com if you need additional * information or have any questions. */ /* * (C) Copyright IBM Corp. 1996-2002 - All Rights Reserved * * The original version of this source code and documentation is * copyrighted and owned by IBM. These materials are provided * under terms of a License Agreement between IBM and Sun. * This technology is protected by multiple US and International * patents. This notice and attribution to IBM may not be removed. */ package sun.text; import java.lang.Character; import java.text.CharacterIterator; import java.text.StringCharacterIterator; /** * <tt>Normalizer</tt> transforms Unicode text into an equivalent composed or * decomposed form, allowing for easier sorting and searching of text. * <tt>Normalizer</tt> supports the standard normalization forms described in * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> * Unicode Technical Report #15</a>. 
* <p>
 * Characters with accents or other adornments can be encoded in
 * several different ways in Unicode.  For example, take the character "&Aacute;"
 * (A-acute).   In Unicode, this can be encoded as a single character (the
 * "composed" form):
 * <pre>
 *      00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
 * or as two separate characters (the "decomposed" form):
 * <pre>
 *      0041    LATIN CAPITAL LETTER A
 *      0301    COMBINING ACUTE ACCENT</pre>
 * <p>
 * To a user of your program, however, both of these sequences should be
 * treated as the same "user-level" character "&Aacute;".  When you are searching or
 * comparing text, you must ensure that these two sequences are treated
 * equivalently.  In addition, you must handle characters with more than one
 * accent.  Sometimes the order of a character's combining accents is
 * significant, while in other cases accent sequences in different orders are
 * really equivalent.
 * <p>
 * Similarly, the string "ffi" can be encoded as three separate letters:
 * <pre>
 *      0066    LATIN SMALL LETTER F
 *      0066    LATIN SMALL LETTER F
 *      0069    LATIN SMALL LETTER I</pre>
 * or as the single character
 * <pre>
 *      FB03    LATIN SMALL LIGATURE FFI</pre>
 * <p>
 * The ffi ligature is not a distinct semantic character, and strictly speaking
 * it shouldn't be in Unicode at all, but it was included for compatibility
 * with existing character sets that already provided it.  The Unicode standard
 * identifies such characters by giving them "compatibility" decompositions
 * into the corresponding semantic characters.  When sorting and searching, you
 * will often want to use these mappings.
 * <p>
 * <tt>Normalizer</tt> helps solve these problems by transforming text into the
 * canonical composed and decomposed forms as shown in the first example above.
 * In addition, you can have it perform compatibility decompositions so that
 * you can treat compatibility characters the same as their equivalents.
* Finally, <tt>Normalizer</tt> rearranges accents into the proper canonical
 * order, so that you do not have to worry about accent rearrangement on your
 * own.
 * <p>
 * <tt>Normalizer</tt> adds one optional behavior, {@link #IGNORE_HANGUL},
 * that differs from
 * the standard Unicode Normalization Forms.  This option can be passed
 * to the {@link #Normalizer constructors} and to the static
 * {@link #compose compose} and {@link #decompose decompose} methods.  This
 * option, and any that are added in the future, will be turned off by default.
 * <p>
 * There are three common usage models for <tt>Normalizer</tt>.  In the first,
 * the static {@link #normalize normalize()} method is used to process an
 * entire input string at once.  Second, you can create a <tt>Normalizer</tt>
 * object and use it to iterate through the normalized form of a string by
 * calling {@link #first} and {@link #next}.  Finally, you can use the
 * {@link #setIndex setIndex()} and {@link #getIndex} methods to perform
 * random-access iteration, which is very useful for searching.
 * <p>
 * <b>Note:</b> <tt>Normalizer</tt> objects behave like iterators and have
 * methods such as <tt>setIndex</tt>, <tt>next</tt>, <tt>previous</tt>, etc.
 * You should note that while the <tt>setIndex</tt> and <tt>getIndex</tt> refer
 * to indices in the underlying <em>input</em> text being processed, the
 * <tt>next</tt> and <tt>previous</tt> methods iterate through characters
 * in the normalized <em>output</em>.  This means that there is not
 * necessarily a one-to-one correspondence between characters returned
 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
 * returned from <tt>setIndex</tt> and <tt>getIndex</tt>.  It is for this
 * reason that <tt>Normalizer</tt> does not implement the
 * {@link CharacterIterator} interface.
 * <p>
 * <b>Note:</b> <tt>Normalizer</tt> is currently based on version 3.0
 * of the <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
* It will be updated as later versions of Unicode are released. If you are * using this class on a JDK that supports an earlier version of Unicode, it * is possible that <tt>Normalizer</tt> may generate composed or dedecomposed * characters for which your JDK's {@link java.lang.Character} class does not * have any data. * <p> * @author Laura Werner, Mark Davis */ public final class Normalizer implements Cloneable { /** * Constant indicating that the end of the iteration has been reached. * This is guaranteed to have the same value as {@link CharacterIterator#DONE}. */ public static final char DONE = CharacterIterator.DONE; // This tells us what the bits in the "mode" object mean. private static final int COMPAT_BIT = 1; private static final int DECOMP_BIT = 2; private static final int COMPOSE_BIT = 4; /** * This class represents the mode of a {@link Normalizer} * object, <i>i.e.</i> the Unicode Normalization Form of the * text that the <tt>Normalizer</tt> produces. <tt>Mode</tt> objects * are used as arguments to the {@link Normalizer#Normalizer constructors} * and {@link Normalizer#setMode setMode} method of <tt>Normalizer</tt>. * <p> * Clients cannot create <tt>Mode</tt> objects directly. * Instead, use the predefined constants {@link Normalizer#NO_OP}, * {@link Normalizer#COMPOSE}, {@link Normalizer#COMPOSE_COMPAT}, * {@link Normalizer#DECOMP}, and {@link Normalizer#DECOMP_COMPAT}. * <p> * @see Normalizer */ public static final class Mode { Mode(int m) { mode = m; } final boolean compat() { return (mode & COMPAT_BIT) != 0; } final boolean compose() { return (mode & COMPOSE_BIT) != 0; } final boolean decomp() { return (mode & DECOMP_BIT) != 0; } final int mode; }; /** * Null operation for use with the {@link #Normalizer constructors} * and the static {@link #normalize normalize} method. This value tells * the <tt>Normalizer</tt> to do nothing but return unprocessed characters * from the underlying String or CharacterIterator. 
If you have code which
     * requires raw text at some times and normalized text at others, you can
     * use <tt>NO_OP</tt> for the cases where you want raw text, rather
     * than having a separate code path that bypasses <tt>Normalizer</tt>
     * altogether.
     * <p>
     * This mode has none of the compose, decompose or compatibility bits set.
     * <p>
     * @see #setMode
     */
    public static final Mode NO_OP = new Mode(0);

    /**
     * Canonical decomposition followed by canonical composition.  Used with the
     * {@link #Normalizer constructors} and the static {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical Form</a>
     * <b>C</b>.
     * <p>
     * Only the compose bit is set in this mode.
     * <p>
     * @see #setMode
     */
    public static final Mode COMPOSE = new Mode(COMPOSE_BIT);

    /**
     * Compatibility decomposition followed by canonical composition.
     * Used with the {@link #Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical Form</a>
     * <b>KC</b>.
     * <p>
     * The compose and compatibility bits are both set in this mode.
     * <p>
     * @see #setMode
     */
    public static final Mode COMPOSE_COMPAT = new Mode(COMPOSE_BIT | COMPAT_BIT);

    /**
     * Canonical decomposition.  This value is passed to the
     * {@link #Normalizer constructors} and the static {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical Form</a>
     * <b>D</b>.
     * <p>
     * Only the decompose bit is set in this mode.
     * <p>
     * @see #setMode
     */
    public static final Mode DECOMP = new Mode(DECOMP_BIT);

    /**
     * Compatibility decomposition.
This value is passed to the
     * {@link #Normalizer constructors} and the static {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical Form</a>
     * <b>KD</b>.
     * <p>
     * The decompose and compatibility bits are both set in this mode.
     * <p>
     * @see #setMode
     */
    public static final Mode DECOMP_COMPAT = new Mode(DECOMP_BIT | COMPAT_BIT);

    /**
     * Option to disable Hangul/Jamo composition and decomposition.
     * This option applies to Korean text,
     * which can be represented either in the Jamo alphabet or in Hangul
     * characters, which are really just two or three Jamo combined
     * into one visual glyph.  Since Jamo takes up more storage space than
     * Hangul, applications that process only Hangul text may wish to turn
     * this option on when decomposing text.
     * <p>
     * The Unicode standard treats Hangul to Jamo conversion as a
     * canonical decomposition, so this option must be turned <b>off</b> if you
     * wish to transform strings into one of the standard
     * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
     * Unicode Normalization Forms</a>.
     * <p>
     * @see #setOption
     */
    public static final int IGNORE_HANGUL = 0x0001;

    //-------------------------------------------------------------------------
    // Constructors
    //-------------------------------------------------------------------------

    /**
     * Creates a new <tt>Normalizer</tt> object for iterating over the
     * normalized form of a given string.
     * <p>
     * The string is wrapped in a {@link StringCharacterIterator} and no
     * optional features are enabled (the options value is 0).
     * <p>
     * @param str   The string to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     */
    public Normalizer(String str, Mode mode) {
        this(new StringCharacterIterator(str), mode, 0);
    }

    /**
     * Creates a new <tt>Normalizer</tt> object for iterating over the
     * normalized form of a given string.
* <p> * The <tt>options</tt> parameter specifies which optional * <tt>Normalizer</tt> features are to be enabled for this object. * <p> * @param str The string to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * * @param opt Any optional features to be enabled. * Currently the only available option is {@link #IGNORE_HANGUL}. * If you want the default behavior corresponding to one of the * standard Unicode Normalization Forms, use 0 for this argument. */ public Normalizer(String str, Mode mode, int opt) { this(new StringCharacterIterator(str), mode, opt); } /** * Creates a new <tt>Normalizer</tt> object for iterating over the * normalized form of the given text. * <p> * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * */ public Normalizer(CharacterIterator iter, Mode mode) { this(iter, mode, 0); } /** * Creates a new <tt>Normalizer</tt> object for iterating over the * normalized form of the given text. * <p> * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * * @param opt Any optional features to be enabled. * Currently the only available option is {@link #IGNORE_HANGUL}. * If you want the default behavior corresponding to one of the * standard Unicode Normalization Forms, use 0 for this argument. */ public Normalizer(CharacterIterator iter, Mode mode, int opt) { text = iter; this.mode = mode; options = opt; // Compatibility explosions have lower indices; skip them if necessary minDecomp = mode.compat() ? 0 : DecompData.MAX_COMPAT; } /** * Clones this <tt>Normalizer</tt> object. All properties of this * object are duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor * or to {@link #setText(CharacterIterator) setText}. 
* However, the text storage underlying * the <tt>CharacterIterator</tt> is not duplicated unless the * iterator's <tt>clone</tt> method does so. */ public Object clone() { try { Normalizer copy = (Normalizer) super.clone(); copy.text = (CharacterIterator) text.clone(); // old version did not clone the buffer this causes serious // problems in the collation element iterator if (buffer != null) { copy.buffer = new StringBuffer(); if (buffer.length() > 0) { copy.buffer.append(buffer); } } return copy; } catch (CloneNotSupportedException e) { throw new InternalError(e.toString()); } } //------------------------------------------------------------------------- // Static utility methods //------------------------------------------------------------------------- /** * Normalizes a <tt>String</tt> using the given normalization operation. * <p> * The <tt>options</tt> parameter specifies which optional * <tt>Normalizer</tt> features are to be enabled for this operation. * Currently the only available option is {@link #IGNORE_HANGUL}. * If you want the default behavior corresponding to one of the standard * Unicode Normalization Forms, use 0 for this argument. * <p> * @param str the input string to be normalized. * * @param aMode the normalization mode * * @param options the optional features to be enabled. */ public static String normalize(String str, Mode mode, int options) { return normalize(str, mode, options, false); } public static String normalize(String str, Mode mode, int options, boolean addSingleQuotation) { if (mode.compose()) { // compose() handles decomposition and reordering; // don't call decompose() first. return compose(str, mode.compat(), options); } if (mode.decomp()) { return decompose(str, mode.compat(), options, addSingleQuotation); } return str; } //------------------------------------------------------------------------- // Compose methods //------------------------------------------------------------------------- /** * Compose a <tt>String</tt>. 
* <p>
     * The <tt>options</tt> parameter specifies which optional
     * <tt>Normalizer</tt> features are to be enabled for this operation.
     * Currently the only available option is {@link #IGNORE_HANGUL}.
     * If you want the default behavior corresponding
     * to Unicode Normalization Form <b>C</b> or <b>KC</b>,
     * use 0 for this argument.
     * <p>
     * The input is processed one character at a time.  Characters produced by
     * an "explosion" (a decomposition or a re-queued combining mark) are
     * placed in a local explode buffer and consumed before the next source
     * character is read.
     * <p>
     * NOTE(review): <tt>options</tt> is accepted but never read in this
     * method body, so {@link #IGNORE_HANGUL} does not appear to affect the
     * result here -- confirm whether that is intended.
     * <p>
     * @param source    the string to be composed.
     *
     * @param compat    Perform compatibility decomposition before composition.
     *                  If this argument is <tt>false</tt>, only canonical
     *                  decomposition will be performed.
     *
     * @param options   the optional features to be enabled.
     *
     * @return          the composed string.
     */
    public static String compose(String source, boolean compat, int options) {
        StringBuffer result = new StringBuffer();
        StringBuffer explodeBuf = new StringBuffer();

        int     explodePos = EMPTY;         // Position in input buffer
        int     basePos = 0;                // Position of last base in output string
        int     baseIndex = 0;              // Index of last base in "actions" array
        int     classesSeenL = 0;           // Combining classes seen since last base
        int     classesSeenH = 0;           //  64-bit mask
        int     action;

        // Compatibility explosions have lower indices; skip them if necessary
        int minExplode = compat ? 0 : ComposeData.MAX_COMPAT;
        int minDecomp  = compat ? 0 : DecompData.MAX_COMPAT;

        if (DEBUG) System.out.println("minExplode = " + minExplode);

        int i = 0;
        // Keep going while there is source text left OR pending exploded chars.
        while (i < source.length() || explodePos != EMPTY) {
            // Get the next char from either the buffer or the source
            char ch;
            if (explodePos == EMPTY) {
                ch = source.charAt(i++);
            } else {
                ch = explodeBuf.charAt(explodePos++);
                if (explodePos >= explodeBuf.length()) {
                    // Explode buffer exhausted; go back to reading the source.
                    explodePos = EMPTY;
                    explodeBuf.setLength(0);
                }
            }

            // Get the basic info for the character
            int charInfo = composeLookup(ch);
            int type = charInfo & ComposeData.TYPE_MASK;
            int index = charInfo >>> ComposeData.INDEX_SHIFT;

            if (DEBUG) System.out.println("Got char " + Utility.hex(ch) + ", type=" + type + ", index=" + index);

            // Examples of NON_COMPOSING_COMBINING with an index < minExplode:
            //   00A8 017F 03D2 1FBF 1FFE
            if (type == ComposeData.BASE || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)) {
                if (DEBUG) System.out.println("New base " + Utility.hex(ch) + ", type=" + type + ", index=" + index);
                // Start a new base: forget the combining classes seen so far.
                classesSeenL = classesSeenH = 0;
                baseIndex = index;
                basePos = result.length();
                result.append(ch);
            }
            else if (type == ComposeData.COMBINING) {
                // assert(index > 0);
                int cclass = ComposeData.typeBit[index];
                // typeBit is a bit value from 0..63, indicating the class.
                // We use a bit mask of 2 32-bit ints.
                boolean seen = 0 != ((cclass < 32) ? (classesSeenL & (1 << cclass)) : (classesSeenH & (1 << (cclass & 31))));

                if (DEBUG) System.out.println("Class of " + Utility.hex(ch) + " = " + cclass + " seen:" + seen + " baseIndex:" + baseIndex + " action:" + composeAction(baseIndex, index));

                // We can only combine a character with the base if we haven't
                // already seen a combining character with the same canonical class.
                // We only combine characters with an index from
                // 1..COMBINING_COUNT-1.  Indices >= COMBINING_COUNT are
                // also combining characters, but we know that they don't
                // compose with anything.
                if (index < ComposeData.COMBINING_COUNT && !seen && (action = composeAction(baseIndex, index)) > 0) {
                    if (action > ComposeData.MAX_COMPOSED) {
                        // Pairwise explosion.  Actions above this value are really
                        // indices into an array that in turn contains indices
                        // into the exploding string table
                        // TODO: What if there are unprocessed chars in the explode buffer?
                        if (DEBUG) System.out.println("Pairwise exploding");
                        char newBase = pairExplode(explodeBuf, action);
                        explodePos = 0;
                        result.setCharAt(basePos, newBase);

                        baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
                        if (DEBUG) System.out.println("New base " + Utility.hex(newBase));
                    } else {
                        // Normal pairwise combination.  Replace the base char
                        if (DEBUG) System.out.println("Pairwise combining");
                        char newBase = (char) action;
                        result.setCharAt(basePos, newBase);

                        baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT;
                        if (DEBUG) System.out.println("New base " + Utility.hex(newBase));
                    }
                    //
                    // Since there are Unicode characters that cannot be combined in arbitrary
                    // order, we have to re-process any combining marks that go with this
                    // base character.  There are only four characters in Unicode that have
                    // this problem.  If they are fixed in Unicode 3.0, this code can go away.
                    //
                    int len = result.length();
                    if (len - basePos > 1) {
                        // Move every mark after the base back into the explode
                        // buffer so it is re-examined against the new base.
                        for (int j = basePos+1; j < len; j++) {
                            explodeBuf.append(result.charAt(j));
                        }
                        result.setLength(basePos+1);
                        classesSeenL = classesSeenH = 0;
                        if (explodePos == EMPTY) explodePos = 0;
                    }
                } else {
                    // No combination with this character
                    if (DEBUG) System.out.println("No action");
                    bubbleAppend(result, ch, cclass);
                    // Record this combining class in the 64-bit "seen" mask.
                    if (cclass < 32) {
                        classesSeenL |= 1 << cclass;
                    } else {
                        classesSeenH |= 1 << (cclass & 31);
                    }
                }
            }
            else if (index > minExplode) {
                // Single exploding character
                explode(explodeBuf, index);
                explodePos = 0;
                if (DEBUG) System.out.println("explosion: " + Utility.hex(ch) + " --> " + Utility.hex(explodeBuf));
            }
            else if (type == ComposeData.HANGUL && minExplode == 0) {
                // If we're in compatibility mode we need to decompose Hangul to Jamo,
                // because some of the Jamo might have compatibility decompositions.
                hangulToJamo(ch, explodeBuf, minDecomp);
                if (DEBUG) System.out.println("decomposed hangul " + Utility.hex(ch) + " to jamo " + Utility.hex(explodeBuf));
                explodePos = 0;
            }
            else if (type == ComposeData.INITIAL_JAMO) {
                classesSeenL = classesSeenH = 0;
                baseIndex = ComposeData.INITIAL_JAMO_INDEX;
                basePos = result.length();
                result.append(ch);
                if (DEBUG) System.out.println("got initial jamo " + Utility.hex(ch));
            }
            else if (type == ComposeData.MEDIAL_JAMO && classesSeenL == 0 && classesSeenH == 0
                            && baseIndex == ComposeData.INITIAL_JAMO_INDEX) {
                // If the last character was an initial jamo, we can combine it with this
                // one to create a Hangul character.
                int l = result.charAt(basePos) - JAMO_LBASE;
                int v = ch - JAMO_VBASE;
                char newCh = (char)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
                result.setCharAt(basePos, newCh);

                if (DEBUG) System.out.println("got medial jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));

                baseIndex = ComposeData.MEDIAL_JAMO_INDEX;
            }
            else if (type == ComposeData.FINAL_JAMO && classesSeenL == 0 && classesSeenH == 0
                            && baseIndex == ComposeData.MEDIAL_JAMO_INDEX) {
                // If the last character was a medial jamo that we turned into Hangul,
                // we can add this character too.
                char newCh = (char)(result.charAt(basePos) + (ch - JAMO_TBASE));
                result.setCharAt(basePos, newCh);

                if (DEBUG) System.out.println("got final jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));

                // The syllable is complete; nothing further can attach to it.
                baseIndex = 0;
                basePos = -1;
                classesSeenL = classesSeenH = 0;
            } else {
                if (DEBUG) System.out.println("No base as of " + Utility.hex(ch));
                // Anything else is copied through and resets the base state.
                baseIndex = 0;
                basePos = -1;
                classesSeenL = classesSeenH = 0;
                result.append(ch);
            }
        }
        return result.toString();
    }

    /**
     * Compose starting with current input character and continuing
     * until just before the next base char.
* <p> * <b>Input</b>: * <ul> * <li>underlying char iter points to first character to compose * </ul> * <p> * <b>Output:</b> * <ul> * <li>returns first char of composition or DONE if at end * <li>Underlying char iter is pointing at next base char or past end * </ul> */ private char nextCompose() { if (DEBUG) System.out.println("--------------- top of nextCompose() ---------------"); int explodePos = EMPTY; // Position in input buffer int basePos = 0; // Position of last base in output string int baseIndex = 0; // Index of last base in "actions" array int classesSeenL = 0; // Combining classes seen since last base int classesSeenH = 0; // 64-bit mask int action; char lastBase = 0; boolean chFromText = true; currentIndex = nextIndex; text.setIndex(currentIndex); // Compatibility explosions have lower indices; skip them if necessary int minExplode = mode.compat() ? 0 : ComposeData.MAX_COMPAT; int minDecomp = mode.compat() ? 0 : DecompData.MAX_COMPAT; initBuffer(); if (explodeBuf == null) { explodeBuf = new StringBuffer(); } else { explodeBuf.setLength(0); } char ch = curForward(); while (ch != DONE) { // Get the basic info for the character int charInfo = composeLookup(ch); int type = charInfo & ComposeData.TYPE_MASK; int index = charInfo >>> ComposeData.INDEX_SHIFT; if (type == ComposeData.BASE || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)) { if (getBufferLength() > 0 && chFromText && explodePos == EMPTY) { // When we hit a base char in the source text, we can return the text // that's been composed so far. We'll re-process this char next time through. 
if (DEBUG) System.out.println("returning early because we hit a new base"); break; } classesSeenL = classesSeenH = 0; baseIndex = index; basePos = getBufferLength(); buffer.append(ch); if (DEBUG) System.out.println("got BASE char " + Utility.hex(ch) + ", type=" + type + ", index=" + index); lastBase = ch; } else if (type == ComposeData.COMBINING) { // assert(index > 0); int cclass = ComposeData.typeBit[index]; boolean seen = 0 != ((cclass < 32) ? (classesSeenL & (1 << cclass)) : (classesSeenH & (1 << (cclass & 31)))); if (DEBUG) System.out.println("got COMBINING char " + Utility.hex(ch) + ", type=" + type + ", index=" + index + ", class=" + cclass); // We can only combine a character with the base if we haven't // already seen a combining character with the same canonical class. if (index < ComposeData.COMBINING_COUNT && !seen && (action = composeAction(baseIndex, index)) > 0) { if (action > ComposeData.MAX_COMPOSED) { // Pairwise explosion. Actions above this value are really // indices into an array that in turn contains indices // into the exploding string table // TODO: What if there are unprocessed chars in the explode buffer? char newBase = pairExplode(explodeBuf, action); explodePos = 0; buffer.setCharAt(basePos, newBase); baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT; if (DEBUG) System.out.println("Pairwise explosion: " + Utility.hex(lastBase) + "," + Utility.hex(ch) + " --> " + Utility.hex(newBase) + "," + Utility.hex(explodeBuf)); lastBase = newBase; } else { // Normal pairwise combination. 
Replace the base char char newBase = (char) action; buffer.setCharAt(basePos, newBase); baseIndex = composeLookup(newBase) >>> ComposeData.INDEX_SHIFT; if (DEBUG) System.out.println("Pairwise combination: " + Utility.hex(lastBase) + "," + Utility.hex(ch) + " --> " + Utility.hex(newBase)); lastBase = newBase; } // // Since there are Unicode characters that cannot be combined in arbitrary // order, we have to re-process any combining marks that go with this // base character. There are only four characters in Unicode that have // this problem. If they are fixed in Unicode 3.0, this code can go away. // int len = getBufferLength(); if (len - basePos > 1) { if (DEBUG) System.out.println("Reprocessing combining marks"); for (int j = basePos+1; j < len; j++) { explodeBuf.append(buffer.charAt(j)); } buffer.setLength(basePos+1); classesSeenL = classesSeenH = 0; if (explodePos == EMPTY) explodePos = 0; } } else { if (DEBUG) System.out.println("char doesn't combine"); // No combination with this character bubbleAppend(buffer, ch, cclass); if (cclass < 32) { classesSeenL |= 1 << cclass; } else { classesSeenH |= 1 << (cclass & 31); } } } else if (index > minExplode) { // Single exploding character explode(explodeBuf, index); explodePos = 0; if (DEBUG) System.out.println("explosion: " + Utility.hex(ch) + " --> " + Utility.hex(explodeBuf)); } else if (type == ComposeData.HANGUL && minExplode == 0) { // If we're in compatibility mode we need to decompose Hangul to Jamo, // because some of the Jamo might have compatibility decompositions. hangulToJamo(ch, explodeBuf, minDecomp); if (DEBUG) System.out.println("decomposed hangul " + Utility.hex(ch) + " to jamo " + Utility.hex(explodeBuf)); explodePos = 0; } else if (type == ComposeData.INITIAL_JAMO) { if (getBufferLength() > 0 && chFromText && explodePos == EMPTY) { // When we hit a base char in the source text, we can return the text // that's been composed so far. We'll re-process this char next time through. 
if (DEBUG) System.out.println("returning early because we hit a new base"); break; } classesSeenL = classesSeenH = 0; baseIndex = ComposeData.INITIAL_JAMO_INDEX; basePos = getBufferLength(); buffer.append(ch); if (DEBUG) System.out.println("got initial jamo " + Utility.hex(ch)); } else if (type == ComposeData.MEDIAL_JAMO && classesSeenL == 0 && classesSeenH == 0 && baseIndex == ComposeData.INITIAL_JAMO_INDEX) { // If the last character was an initial jamo, we can combine it with this // one to create a Hangul character. int l = buffer.charAt(basePos) - JAMO_LBASE; int v = ch - JAMO_VBASE; char newCh = (char)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT); buffer.setCharAt(basePos, newCh); if (DEBUG) System.out.println("got medial jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh)); baseIndex = ComposeData.MEDIAL_JAMO_INDEX; } else if (type == ComposeData.FINAL_JAMO && classesSeenL == 0 && classesSeenH == 0 && baseIndex == ComposeData.MEDIAL_JAMO_INDEX) { // If the last character was a medial jamo that we turned into Hangul, // we can add this character too. 
char newCh = (char)(buffer.charAt(basePos) + (ch - JAMO_TBASE));
                buffer.setCharAt(basePos, newCh);

                if (DEBUG) System.out.println("got final jamo " + Utility.hex(ch) + ", replacing with Hangul " + Utility.hex(newCh));

                // Reset the base-tracking state after the replacement.
                baseIndex = 0;
                basePos = -1;
                classesSeenL = classesSeenH = 0;
            } else {
                // TODO: deal with JAMO character types
                baseIndex = 0;
                basePos = -1;
                classesSeenL = classesSeenH = 0;
                buffer.append(ch);
                if (DEBUG) System.out.println("UNKNOWN char " + Utility.hex(ch));
            }

            // Fetch the next character: from the input text when no exploded
            // characters are pending, otherwise from explodeBuf.
            if (explodePos == EMPTY) {
                ch = text.next();
                chFromText = true;
            } else {
                ch = explodeBuf.charAt(explodePos++);
                if (explodePos >= explodeBuf.length()) {
                    // explodeBuf is exhausted; empty it and go back to the text.
                    explodePos = EMPTY;
                    explodeBuf.setLength(0);
                }
                chFromText = false;
            }
        }
        // The result is the first buffered character, or DONE if nothing was buffered.
        if (getBufferLength() > 0) {
            ch = buffer.charAt(0);
        } else {
            ch = DONE;
        }
        nextIndex = text.getIndex();
        return ch;
    }

    /**
     * Compose starting with the input char just before the current position
     * and continuing backward until (and including) the previous base char.
     * <p>
     * <b>Input</b>:
     * <ul>
     *  <li>underlying char iter points just after last char to compose
     * </ul>
     * <p>
     * <b>Output:</b>
     * <ul>
     *  <li>returns last char of the resulting composed sequence, or
     *      <tt>DONE</tt> if no character could be read
     *  <li>underlying iter points to lowest-index char that was consumed,
     *      i.e. the base char; <tt>currentIndex</tt> is set to that position
     * </ul>
     */
    private char prevCompose() {
        if (DEBUG) System.out.println("--------------- top of prevCompose() ---------------");

        // Compatibility explosions have lower indices; skip them if necessary
        int minExplode = mode.compat() ? 0 : ComposeData.MAX_COMPAT;

        nextIndex=currentIndex;

        initBuffer();

        // Slurp up characters until we hit a base char or an initial Jamo
        char ch;
        while ((ch = curBackward()) != DONE) {
            // Reading backward, so each char goes in front of what we have so far.
            buffer.insert(0, ch);

            // Get the basic info for the character
            int charInfo = composeLookup(ch);
            int type = charInfo & ComposeData.TYPE_MASK;
            int index = charInfo >>> ComposeData.INDEX_SHIFT;

            if (DEBUG) System.out.println("prevCompose got char " + Utility.hex(ch) + ", type=" + type + ", index=" + index + ", minExplode=" + minExplode);

            if (type == ComposeData.BASE
                || (type == ComposeData.NON_COMPOSING_COMBINING && index < minExplode)
                || type == ComposeData.HANGUL
                || type == ComposeData.INITIAL_JAMO)
            {
                break;
            }
        }

        // If anything was collected (one character or more), compose it all at once.
        if (getBufferLength() > 0) {
            // TODO: The performance of this is awful; add a way to compose
            // a StringBuffer in place.
            String composed = compose(buffer.toString(), mode.compat(), options);

            if (DEBUG) System.out.println("prevCompose called compose(" + Utility.hex(buffer) + ")->" + Utility.hex(composed));

            buffer.setLength(0);
            buffer.append(composed);

            // Position at the last composed char, since we are iterating backward.
            // NOTE(review): charAt(0) below assumes compose() never returns an
            // empty string for non-empty input -- confirm.
            if (getBufferLength() > 1) {
                bufferPos = getBufferLength()-1;
                ch = buffer.charAt(bufferPos);
            } else {
                ch = buffer.charAt(0);
            }
        }
        else {
            ch = DONE;
        }
        currentIndex = text.getIndex();
        if (DEBUG) System.out.println("prevCompose returning " + Utility.hex(ch));
        return ch;
    }

    /**
     * Inserts <tt>ch</tt> into <tt>target</tt>, scanning backward from the end
     * past every character whose compose class is greater than <tt>cclass</tt>.
     * The character is inserted just after the first character (from the end)
     * whose compose class is less than or equal to <tt>cclass</tt>, or at the
     * front if there is none.  When <tt>cclass</tt> is 1 no scan is done and
     * the character is simply appended.
     *
     * @param target the buffer to insert into
     * @param ch     the character to insert
     * @param cclass the compose class of <tt>ch</tt> as supplied by the caller
     */
    private static void bubbleAppend(StringBuffer target, char ch, int cclass) {
        if (DEBUG) System.out.println(" bubbleAppend(" + Utility.hex(target) + ", " + Utility.hex(ch) + ", " + cclass + ")" );
        if (DEBUG) System.out.println(" getComposeClass(" + Utility.hex(ch) + ")=" + getComposeClass(ch) );
        if (DEBUG) System.out.println(" target before bubbling is : " + Utility.hex(target));

        int i = target.length()-1;
        if (cclass != 1) { // 1 means combining class 0!!!
            for (; i >= 0; --i ) {
                int iClass = getComposeClass(target.charAt(i));
                if (DEBUG) System.out.println(" getComposeClass(" + Utility.hex(target.charAt(i)) + ")=" + getComposeClass(target.charAt(i)) );
                if (DEBUG) System.out.println(" bubbleAppend: target[" + i + "]=" + Utility.hex(target.charAt(i)) + " is iClass=" + iClass);
                if (DEBUG) System.out.println(" bubbleAppend: for ch="+ Utility.hex(ch) + " class="+cclass);

                if (iClass <= cclass) {
                    // We've hit something we can't bubble this character past, so insert here
                    break;
                }
            }
        }
        // We need to insert just after character "i"
        if (DEBUG) System.out.println(" bubbleAppend inserting "+ Utility.hex(ch) + " at index " + (i+1));

        target.insert(i+1, ch);

        if (DEBUG) System.out.println(" target is : " + Utility.hex(target));
    }

    /**
     * Returns the compose class of <tt>ch</tt>: the <tt>ComposeData.typeBit</tt>
     * entry for its index when the character's type is
     * <tt>ComposeData.COMBINING</tt>, and 0 for every other type.
     */
    private static int getComposeClass(char ch) {
        int cclass = 0;
        int charInfo = composeLookup(ch);
        int type = charInfo & ComposeData.TYPE_MASK;
        if (type == ComposeData.COMBINING) {
            cclass = ComposeData.typeBit[charInfo >>> ComposeData.INDEX_SHIFT];
        }
        return cclass;
    }

    /**
     * Returns the <tt>ComposeData.lookup</tt> entry for <tt>ch</tt>.  Callers
     * in this class split it into a type (<tt>TYPE_MASK</tt>) and an index
     * (<tt>INDEX_SHIFT</tt>).
     */
    static final int composeLookup(char ch) {
        return ComposeData.lookup.elementAt(ch);
    }

    /**
     * Returns the <tt>ComposeData.actions</tt> entry at position
     * <tt>baseIndex + ComposeData.MAX_BASES * comIndex</tt> (truncated to a char).
     */
    static final int composeAction(int baseIndex, int comIndex) {
        return ComposeData.actions.elementAt((char)(baseIndex + ComposeData.MAX_BASES*comIndex));
    }

    /**
     * Appends to <tt>target</tt> the characters returned by
     * <tt>ComposeData.replaceCharAt</tt> starting at <tt>index</tt>, stopping
     * at (and not appending) the first 0 character.
     */
    static final void explode(StringBuffer target, int index) {
        char ch;
        while ((ch = ComposeData.replaceCharAt(index++)) != 0)
            target.append(ch);
    }

    /**
     * Looks up the replacement sequence for a pairwise <tt>action</tt> via
     * <tt>ComposeData.actionIndex</tt>, appends everything after its first
     * character to <tt>target</tt>, and returns that first character.
     *
     * @return the new base character
     */
    static final char pairExplode(StringBuffer target, int action) {
        int index = ComposeData.actionIndex[action - ComposeData.MAX_COMPOSED];
        explode(target, index + 1);
        return ComposeData.replaceCharAt(index); // New base char
    }

    //-------------------------------------------------------------------------
    // Decompose methods
    //-------------------------------------------------------------------------

    /**
     * Static method to decompose a <tt>String</tt>.
* <p> * The <tt>options</tt> parameter specifies which optional * <tt>Normalizer</tt> features are to be enabled for this operation. * Currently the only available option is {@link #IGNORE_HANGUL}. * The desired options should be OR'ed together to determine the value * of this argument. If you want the default behavior corresponding * to Unicode Normalization Form <b>D</b> or <b>KD</b>, * use 0 for this argument. * <p> * @param str the string to be decomposed. * * @param compat Perform compatibility decomposition. * If this argument is <tt>false</tt>, only canonical * decomposition will be performed. * * * @return the decomposed string. */ public static String decompose(String source, boolean compat, int options) { return decompose(source, compat, options, false); } public static String decompose(String source, boolean compat, int options, boolean addSingleQuotation) { if (DEBUG) System.out.println("--------------- top of decompose() ---------------"); boolean hangul = (options & IGNORE_HANGUL) == 0; int minDecomp = compat ? 
0 : DecompData.MAX_COMPAT; StringBuffer result = new StringBuffer(); StringBuffer buffer = null; StringBuffer tmpBuf = null; int i = 0, bufPtr = -1; if (addSingleQuotation) { tmpBuf = new StringBuffer(); } while (i < source.length() || bufPtr >= 0) { char ch; if (bufPtr >= 0) { ch = buffer.charAt(bufPtr++); if (bufPtr == buffer.length()) { bufPtr = -1; } } else { ch = source.charAt(i++); } int offset = DecompData.offsets.elementAt(ch); int index = offset & DecompData.DECOMP_MASK; if (DEBUG) System.out.println("decompose got " + Utility.hex(ch)); if (index > minDecomp) { if ((offset & DecompData.DECOMP_RECURSE) != 0) { if (DEBUG) System.out.println(" " + Utility.hex(ch) + " has RECURSIVE decomposition, index=" + index); if (buffer == null) { buffer = new StringBuffer(); } else { buffer.setLength(0); } DecompData.doAppend(index, buffer); bufPtr = 0; } else { if (DEBUG) System.out.println(" " + Utility.hex(ch) + " has decomposition, index=" + index); if (!addSingleQuotation) { DecompData.doAppend(index, result); } else { tmpBuf.setLength(0); DecompData.doAppend(index, tmpBuf); if ((tmpBuf.length() > 1) || (ch == 0x037e) || // normalized to ';' (ch == 0x1fef)) { // normalized to '`' for (int j=0; j < tmpBuf.length(); j++) { char c = tmpBuf.charAt(j); if ((c >= 0x0009 && c <= 0x000D) || (c >= 0x0020 && c <= 0x002F) || (c >= 0x003A && c <= 0x0040) || (c >= 0x005B && c <= 0x0060) || (c >= 0x007B && c <= 0x007E)) { result.append('\''); result.append(c); result.append('\''); } else { result.append(c); } } } else { result.append(tmpBuf); } } } } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) { hangulToJamo(ch, result, minDecomp); } else { result.append(ch); } } fixCanonical(result); return result.toString(); } /** * Decompose starting with current input character and continuing * until just before the next base char. 
* <p>
     * <b>Input</b>:
     * <ul>
     *  <li>underlying char iter points to first character to decompose
     * </ul>
     * <p>
     * <b>Output:</b>
     * <ul>
     *  <li>returns first char of decomposition or DONE if at end
     *  <li>Underlying char iter is pointing at next base char or past end
     * </ul>
     */
    private char nextDecomp() {
        if (DEBUG) System.out.println("--------------- top of nextDecomp() ---------------");

        boolean hangul = (options & IGNORE_HANGUL) == 0;
        currentIndex = nextIndex;
        char ch = curForward();

        int offset = DecompData.offsets.elementAt(ch);
        int index = offset & DecompData.DECOMP_MASK;

        initBuffer();

        // Take the long path when the char itself decomposes or is not a base char.
        if (index > minDecomp || DecompData.canonClass.elementAt(ch) != DecompData.BASE)
        {
            if (index > minDecomp) {
                if (DEBUG) System.out.println(" " + Utility.hex(ch) + " has decomposition, index=" + index);
                DecompData.doAppend(index, buffer);

                if ((offset & DecompData.DECOMP_RECURSE) != 0) {
                    // Need to decompose the output of this decomposition recursively.
                    for (int i = 0; i < getBufferLength(); i++) {
                        ch = buffer.charAt(i);
                        index = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;

                        if (index > minDecomp) {
                            // NOTE(review): the return value of doReplace is used
                            // here as an increment for i -- confirm against
                            // DecompData.doReplace.
                            i += DecompData.doReplace(index, buffer, i);
                        }
                    }
                }

            } else {
                buffer.append(ch);
            }
            boolean needToReorder = false;

            // Any other combining characters that immediately follow the decomposed
            // character must be included in the buffer too, because they're
            // conceptually part of the same logical character.
            while ((ch = text.next()) != DONE
                && DecompData.canonClass.elementAt(ch) != DecompData.BASE)
            {
                needToReorder = true;
                // Decompose any of these characters that need it - Liu
                index = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
                if (index > minDecomp) {
                    DecompData.doAppend(index, buffer);
                } else {
                    buffer.append(ch);
                }
            }

            if (getBufferLength() > 1 && needToReorder) {
                // If there is more than one combining character in the buffer,
                // put them into the canonical order.
                // But we don't need to sort if the only characters are the ones that
                // resulted from decomposing the base character.
                fixCanonical(buffer);
            }
            ch = buffer.charAt(0);
        } else {
            // Just use this character, but first advance to the next one
            text.next();
            buffer.setLength(0);
            buffer.append(ch);
            // Do Hangul -> Jamo decomposition if necessary
            if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
                // Drop the syllable just appended; the jamo replace it.
                clearBuffer();
                hangulToJamo(ch, buffer, minDecomp);
                ch = buffer.charAt(0);
            }
        }
        nextIndex = text.getIndex();
        if (DEBUG) System.out.println("nextDecomp getBufferLength() " + getBufferLength() + " buffer : " + buffer.toString());
        if (DEBUG) System.out.println("nextDecomp returning " + Utility.hex(ch) + ", text index=" + text.getIndex());
        return ch;
    }

    /**
     * Decompose starting with the input char just before the current position
     * and continuing backward until (and including) the previous base char.
     * <p>
     * <b>Input</b>:
     * <ul>
     *  <li>underlying char iter points just after last char to decompose
     * </ul>
     * <p>
     * <b>Output:</b>
     * <ul>
     *  <li>returns last char of resulting decomposition sequence
     *  <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
     * </ul>
     */
    private char prevDecomp() {
        if (DEBUG) System.out.println("--------------- top of prevDecomp() ---------------");

        boolean hangul = (options & IGNORE_HANGUL) == 0;

        nextIndex=currentIndex;

        char ch = curBackward();

        int offset = DecompData.offsets.elementAt(ch);
        int index = offset & DecompData.DECOMP_MASK;

        if (DEBUG) System.out.println("prevDecomp got input char " + Utility.hex(ch));

        initBuffer();

        if (index > minDecomp || DecompData.canonClass.elementAt(ch) != DecompData.BASE)
        {
            // This method rewritten to pass conformance tests. - Liu
            // Collect all characters up to the previous base char
            while (ch != DONE) {
                buffer.insert(0, ch);
                if (DecompData.canonClass.elementAt(ch) == DecompData.BASE) break;
                ch = text.previous();
            }

            if (DEBUG) System.out.println("prevDecomp buffer: " + Utility.hex(buffer));

            // Decompose the buffer
            for (int i = 0; i < getBufferLength(); i++) {
                ch = buffer.charAt(i);
                offset = DecompData.offsets.elementAt(ch);
                index = offset & DecompData.DECOMP_MASK;

                if (index > minDecomp) {
                    // NOTE(review): j (the doReplace result) is used below as a
                    // loop bound and then assigned to i, while the inner loop
                    // adds the doReplace result to i -- confirm which meaning
                    // DecompData.doReplace actually has.
                    int j = DecompData.doReplace(index, buffer, i);
                    if ((offset & DecompData.DECOMP_RECURSE) != 0) {
                        // Need to decompose this recursively
                        for (; i < j; ++i) {
                            ch = buffer.charAt(i);
                            index = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
                            if (index > minDecomp) {
                                i += DecompData.doReplace(index, buffer, i);
                            }
                        }
                    }
                    i = j;
                }
            }

            if (DEBUG) System.out.println("prevDecomp buffer after decomp: " + Utility.hex(buffer));

            if (getBufferLength() > 1) {
                // If there is more than one combining character in the buffer,
                // put them into the canonical order.
                fixCanonical(buffer);
            }
            // Iterating backward, so start at the last char of the buffer.
            bufferPos = getBufferLength()-1;
            ch = buffer.charAt(bufferPos);
        }
        else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
            hangulToJamo(ch, buffer, minDecomp);
            // NOTE(review): the result of this call is discarded.
            getBufferLength();
            bufferPos = getBufferLength()-1;
            ch = buffer.charAt(bufferPos);
        }
        else {
            buffer.append(ch);
            // NOTE(review): the result of this call is discarded.
            getBufferLength();
            bufferPos=getBufferLength()-1;
        }
        currentIndex = text.getIndex();
        if (DEBUG) System.out.println(" prevDecomp getBufferLength() " + getBufferLength() + " buffer : " + buffer.toString());
        if (DEBUG) System.out.println(" prevDecomp returning '" + ch + "' " + Utility.hex(ch) + ", text index=" + text.getIndex());
        return ch;
    }

    /**
     * Returns the <tt>DecompData.canonClass</tt> entry for <tt>ch</tt>, with
     * negative entries shifted up by 256 so the result is never negative.
     * (Presumably the table stores the class as a signed byte -- confirm
     * against <tt>DecompData</tt>.)
     */
    public static final int getClass(char ch) {
        int value = DecompData.canonClass.elementAt(ch);
        return (value >= 0) ? value : value + 256;
    }

    //-------------------------------------------------------------------------
    // CharacterIterator overrides
    //-------------------------------------------------------------------------

    /**
     * Return the current character in the normalized text.
     */
    public char current() {
        if (bufferPos >= getBufferLength() || getBufferLength()==0) {
            bufferPos = 0; // Buffer is now out of date
            // i have a problem with this, we are moving one away from
            // the current position, which isn't right.
            // even when currentIndex is still at the present position
            // text.index would have shifted because of next*()
            // we have to reset the text to the former position
            // admittedly, this isn't the best solution
            if (mode.compose()) {
                currentChar = nextCompose();
                text.setIndex(currentIndex);
            } else if (mode.decomp()) {
                currentChar = nextDecomp();
                text.setIndex(currentIndex);
            } else {
                // NOTE(review): both branches below do the same thing; the
                // alternative for the non-zero case is commented out.
                if (currentIndex == 0) {
                    currentChar = text.current();
                } else{
                    /* text.setIndex(currentIndex - 1);
                    currentChar = text.next();
                    */
                    currentChar = text.current();
                }
            }
        } else {
            currentChar = buffer.charAt(bufferPos);
        }
        return currentChar;
    }

    /**
     * Return the first character in the normalized text.  This resets
     * the <tt>Normalizer's</tt> position to the beginning of the text.
     */
    public char first() {
        reset();
        return next();
    }

    /**
     * Return the last character in the normalized text.  This resets
     * the <tt>Normalizer's</tt> position to be just before the
     * input text corresponding to that normalized character.
     */
    public char last() {
        currentIndex = nextIndex = text.getEndIndex() - 1;
        text.setIndex(currentIndex); // Setting to getEndIndex() fails in 1.1
        atEnd = true; // so work around the bug

        currentChar = DONE; // The current char hasn't been processed
        clearBuffer(); // The buffer is empty too
        return previous();
    }

    /**
     * Return the current character in the normalized text and advance
     * the iteration position by one.  If the end
     * of the text has already been reached, {@link #DONE} is returned.
*/ public char next() { if (buffer != null && (++bufferPos) < buffer.length()) { currentChar= buffer.charAt(bufferPos); } else { bufferPos = 0; // Buffer is now out of date if (mode.compose()) { currentChar = nextCompose(); } else if (mode.decomp()) { currentChar = nextDecomp(); } else { // If we're not really doing decomposition, just return the current char currentChar = text.current(); text.next(); //Set the indicies for no op if (currentChar != CharacterIterator.DONE) { currentIndex = nextIndex+=1; } } } return currentChar; } /** * Return the previous character in the normalized text and decrement * the iteration position by one. If the beginning * of the text has already been reached, {@link #DONE} is returned. */ public char previous() { if (bufferPos > 0) { // There are output characters left in the buffer currentChar = buffer.charAt(--bufferPos); } else { bufferPos = 0; // Buffer is now out of date if (mode.compose()) { currentChar = prevCompose(); } else if (mode.decomp()) { currentChar = prevDecomp(); } else { text.setIndex(currentIndex); currentChar = text.previous(); if (currentIndex != 0) { currentIndex = nextIndex -= 1; } } } return currentChar; } private int getBufferLength() { if (buffer == null){ return 0; } else { return buffer.length(); } } /** * Set the iteration position in the input text that is being normalized * and return the first normalized character at that position. * <p> * @param index the desired index in the input text. * * @return the first normalized character that is the result of iterating * forward starting at the given index. * * @throws IllegalArgumentException if the given index is less than * {@link #getBeginIndex} or greater than {@link #getEndIndex}. 
*/ public char setIndex(int index) { setIndexOnly(index); return current(); } public void setIndexOnly(int index) { currentIndex = nextIndex = index; text.setIndex(index); // Checks range currentChar = DONE; // The current char hasn't been processed clearBuffer(); // The buffer is empty too } /** * Retrieve the current iteration position in the input text that is * being normalized. This method is useful in applications such as * searching, where you need to be able to determine the position in * the input text that corresponds to a given normalized output character. */ public final int getIndex() { return text.getIndex(); } /** * Retrieve the index of the start of the input text. This is the begin index * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> * over which this <tt>Normalizer</tt> is iterating */ public final int getBeginIndex() { return text.getBeginIndex(); } /** * Retrieve the index of the end of the input text. This is the end index * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> * over which this <tt>Normalizer</tt> is iterating */ public final int getEndIndex() { return text.getEndIndex(); } //------------------------------------------------------------------------- // Property access methods //------------------------------------------------------------------------- /** * Set the normalization mode for this object. * <p> * <b>Note:</b>If the normalization mode is changed while iterating * over a string, calls to {@link #next} and {@link #previous} may * return previously buffers characters in the old normalization mode * until the iteration is able to re-sync at the next base character. * It is safest to call {@link #setText setText()}, {@link #first}, * {@link #last}, etc. after calling <tt>setMode</tt>. * <p> * @param newMode the new mode for this <tt>Normalizer</tt>. 
* The supported modes are: * <ul> * <li>{@link #COMPOSE} - Unicode canonical decompositiion * followed by canonical composition. * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion * follwed by canonical composition. * <li>{@link #DECOMP} - Unicode canonical decomposition * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition. * <li>{@link #NO_OP} - Do nothing but return characters * from the underlying input text. * </ul> * * @see #getMode */ public void setMode(Mode newMode) { mode = newMode; minDecomp = mode.compat() ? 0 : DecompData.MAX_COMPAT; } /** * Return the basic operation performed by this <tt>Normalizer</tt> * * @see #setMode */ public Mode getMode() { return mode; } /** * Set options that affect this <tt>Normalizer</tt>'s operation. * Options do not change the basic composition or decomposition operation * that is being performed , but they control whether * certain optional portions of the operation are done. * Currently the only available option is: * <p> * <ul> * <li>{@link #IGNORE_HANGUL} - Do not decompose Hangul syllables into the Jamo alphabet * and vice-versa. This option is off by default (<i>i.e.</i> Hangul processing * is enabled) since the Unicode standard specifies that Hangul to Jamo * is a canonical decomposition. For any of the standard Unicode Normalization * Forms, you should leave this option off. * </ul> * <p> * @param option the option whose value is to be set. * @param value the new setting for the option. Use <tt>true</tt> to * turn the option on and <tt>false</tt> to turn it off. * * @see #getOption */ public void setOption(int option, boolean value) { if (option != IGNORE_HANGUL) { throw new IllegalArgumentException("Illegal option"); } if (value) { options |= option; } else { options &= (~option); } } /** * Determine whether an option is turned on or off. 
* <p> * @see #setOption */ public boolean getOption(int option) { return (options & option) != 0; } /** * Set the input text over which this <tt>Normalizer</tt> will iterate. * The iteration position will be reset to the beginning. * <p> * @param newText The new string to be normalized. */ public void setText(String newText) { text = new StringCharacterIterator(newText); reset(); } /** * Set the input text over which this <tt>Normalizer</tt> will iterate. * The iteration position will be reset to the beginning. * <p> * @param newText The new text to be normalized. */ public void setText(CharacterIterator newText) { text = newText; reset(); } //------------------------------------------------------------------------- // Private utility methods //------------------------------------------------------------------------- private final char curForward() { char ch = text.current(); if (DEBUG) System.out.println(" curForward returning " + Utility.hex(ch) + ", text index=" + text.getIndex()); return ch; } private final char curBackward() { char ch = atEnd ? text.current() : text.previous(); atEnd = false; if (DEBUG) System.out.println(" curBackward returning " + Utility.hex(ch) + ", text index=" + text.getIndex()); return ch; } public void reset() { currentIndex = nextIndex = text.getBeginIndex(); text.setIndex(currentIndex); atEnd = false; bufferPos = 0; clearBuffer(); } private final void initBuffer() { if (buffer == null) { buffer = new StringBuffer(10); } else { buffer.setLength(0); } clearBuffer(); } private final void clearBuffer() { bufferPos = 0; if (buffer != null) { buffer.setLength(0); } } /** * Fixes the sorting sequence of non-spacing characters according to * their combining class. The algorithm is listed on p.3-11 in the * Unicode Standard 2.0. The table of combining classes is on p.4-2 * in the Unicode Standard 2.0. * @param result the string to fix. 
*/
    private static void fixCanonical(StringBuffer result) {
        if (result.length() == 0) return; // don't bother with empty strings!

        // Walk from the end toward the front, comparing each character's
        // class (as returned by getClass) with that of its right-hand neighbor.
        int i = result.length() - 1;
        int currentType = getClass(result.charAt(i));
        int lastType;

        for (--i; i >= 0; --i) {
            lastType = currentType;
            currentType = getClass(result.charAt(i));

            //
            // a swap is presumed to be rare (and a double-swap very rare),
            // so don't worry about efficiency here.
            //
            // Swap only when the left character has the higher class and the
            // right character is not a base character.
            if (currentType > lastType && lastType != DecompData.BASE) {
                // swap characters
                char temp = result.charAt(i);
                result.setCharAt(i, result.charAt(i+1));
                result.setCharAt(i+1, temp);
                // if not at end, backup (one further, to compensate for for-loop)
                if (i < result.length() - 2) {
                    i += 2;
                }
                // reset type, since we swapped.
                currentType = getClass(result.charAt(i));
            }
        }
    }

    //-------------------------------------------------------------------------
    // Hangul / Jamo conversion utilities for internal use
    // See section 3.10 of The Unicode Standard, v 2.0.
    //
    // Package-accessible for use by ComposedCharIter

    // Range of Hangul syllables handled here: [HANGUL_BASE, HANGUL_LIMIT).
    static final char HANGUL_BASE = 0xac00;
    static final char HANGUL_LIMIT = 0xd7a4;

    // Base code points used for leading (L), vowel (V) and trailing (T) jamo.
    private static final char JAMO_LBASE = 0x1100;
    private static final char JAMO_VBASE = 0x1161;
    private static final char JAMO_TBASE = 0x11a7;
    // Sizes of the L, V and T index ranges; JAMO_NCOUNT is the number of
    // syllables per leading jamo (V count times T count).
    private static final int JAMO_LCOUNT = 19;
    private static final int JAMO_VCOUNT = 21;
    private static final int JAMO_TCOUNT = 28;
    private static final int JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT;

    /**
     * Convert a single Hangul syllable into one or more Jamo characters.
     *
     * @param ch          the Hangul syllable to convert; callers in this class
     *                    first check that it lies in
     *                    [<tt>HANGUL_BASE</tt>, <tt>HANGUL_LIMIT</tt>).
     * @param result      the buffer the Jamo are appended to.
     * @param decompLimit the decomposition threshold handed on to
     *                    <tt>jamoAppend</tt> for each Jamo.
     *
     * @return the sum of the lengths reported by <tt>jamoAppend</tt>.
*/
    static int hangulToJamo(char ch, StringBuffer result, int decompLimit) {
        // Split the syllable index into leading, vowel and trailing parts.
        char sIndex = (char)(ch - HANGUL_BASE);
        char leading = (char)(JAMO_LBASE + sIndex / JAMO_NCOUNT);
        char vowel = (char)(JAMO_VBASE + (sIndex % JAMO_NCOUNT) / JAMO_TCOUNT);
        char trailing= (char)(JAMO_TBASE + (sIndex % JAMO_TCOUNT));

        int length = 0;

        length += jamoAppend(leading, decompLimit, result);
        length += jamoAppend(vowel, decompLimit, result);
        // A trailing index of 0 means the syllable has no trailing consonant.
        if (trailing != JAMO_TBASE) {
            length += jamoAppend(trailing, decompLimit, result);
        }
        return length;
    }

    /**
     * Appends a single Jamo to <tt>dest</tt>, or its table decomposition when
     * its <tt>DecompData.offsets</tt> entry is above <tt>limit</tt>.
     *
     * @param ch    the Jamo to append.
     * @param limit the threshold the offsets entry is compared against.
     * @param dest  the buffer to append to.
     *
     * @return the value returned by <tt>DecompData.doAppend</tt> when the
     *         decomposition is used, otherwise 1.
     */
    static final int jamoAppend(char ch, int limit, StringBuffer dest) {
        // NOTE(review): other lookups in this class mask the offsets entry
        // with DecompData.DECOMP_MASK before comparing it; this one does
        // not -- confirm that is intended.
        int offset = DecompData.offsets.elementAt(ch);
        if (offset > limit) {
            return DecompData.doAppend(offset, dest);
        } else {
            dest.append(ch);
            return 1;
        }
    }

    /**
     * Composes conjoining Jamo in <tt>buffer</tt> into Hangul syllables, in
     * place, starting at index <tt>start</tt>.  A leading Jamo followed by a
     * vowel Jamo becomes one syllable, and a trailing consonant directly
     * after the pair is folded into it.  Every other character, and
     * everything before <tt>start</tt>, is left as it is.  The buffer is
     * shortened to its new length.
     * <p>
     * Compared with the previous version: output is now written from
     * <tt>start</tt> rather than from index 0, which used to overwrite the
     * characters in front of <tt>start</tt>; and a trailing index of 0
     * (U+11A7, which is not a trailing consonant) is no longer consumed.
     *
     * @param buffer the text to compose in place.
     * @param start  the index of the first character to examine.
     */
    private static void jamoToHangul(StringBuffer buffer, int start) {
        if (start >= buffer.length()) {
            return;     // nothing to examine, and nothing to truncate
        }

        int out = start;    // next position to write; the prefix stays untouched
        int limit = buffer.length() - 1;

        int in, l, v, t;

        for (in = start; in < limit; in++) {
            char ch = buffer.charAt(in);

            if ((l = ch - JAMO_LBASE) >= 0 && l < JAMO_LCOUNT
                    && (v = buffer.charAt(in+1) - JAMO_VBASE) >= 0 && v < JAMO_VCOUNT) {
                //
                // We've found a pair of Jamo characters to compose.
                // Snarf the Jamo vowel and see if there's also a trailing char
                //
                in++;   // Snarf the Jamo vowel too.

                t = (in < limit) ? buffer.charAt(in+1) : 0;
                t -= JAMO_TBASE;

                // Index 0 stands for "no trailing consonant", so only
                // indices 1..JAMO_TCOUNT-1 name a real trailing Jamo.
                if (t > 0 && t < JAMO_TCOUNT) {
                    in++;   // Snarf the trailing consonant too
                } else {
                    t = 0;  // No trailing consonant
                }
                buffer.setCharAt(out++, (char)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT
                                                + t + HANGUL_BASE));
            } else {
                buffer.setCharAt(out++, ch);
            }
        }
        // Copy over whatever the loop did not reach (at most the last character).
        while (in < buffer.length()) {
            buffer.setCharAt(out++, buffer.charAt(in++));
        }

        buffer.setLength(out);
    }

    //-------------------------------------------------------------------------
    // Private data
    //-------------------------------------------------------------------------

    private static final boolean DEBUG = false;

    private Mode mode = DECOMP;
    private int options = 0;
    private transient int minDecomp;

    private int currentIndex = 0;
    private int nextIndex = 0;

    // The input text and our position in it
    private CharacterIterator text;
    private boolean atEnd = false;

    // A buffer for holding intermediate results
    private StringBuffer buffer = null;
    private int bufferPos = 0;
    private char currentChar;

    // Another buffer for use during iterative composition
    private static final int EMPTY = -1;
    private StringBuffer explodeBuf = null;

    // These must agree with the constants used in NormalizerBuilder
    static final int STR_INDEX_SHIFT = 2;
    static final int STR_LENGTH_MASK = 0x0003;
}