/* * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* * Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; import java.io.IOException; import java.nio.ByteBuffer; import java.text.Normalizer; // Original filename in ICU4J: Normalizer2Impl.java public final class NormalizerImpl { public static final class Hangul { /* Korean Hangul and Jamo constants */ public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ public static final int HANGUL_BASE=0xac00; public static final int HANGUL_END=0xd7a3; public static final int JAMO_L_COUNT=19; public static final int JAMO_V_COUNT=21; public static final int JAMO_T_COUNT=28; public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; public static boolean isHangul(int c) { return HANGUL_BASE<=c && c<HANGUL_LIMIT; } public static boolean isHangulWithoutJamoT(char c) { c-=HANGUL_BASE; return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; } /** * Decomposes c, which must be a Hangul syllable, into buffer * and returns the length of the decomposition (2 or 3). */ public static int decompose(int c, Appendable buffer) { try { c-=HANGUL_BASE; int c2=c%JAMO_T_COUNT; c/=JAMO_T_COUNT; buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); if(c2==0) { return 2; } else { buffer.append((char)(JAMO_T_BASE+c2)); return 3; } } catch(IOException e) { throw new InternalError(e); } } } /** * Writable buffer that takes care of canonical ordering. * Its Appendable methods behave like the C++ implementation's * appendZeroCC() methods. * <p> * If dest is a StringBuilder, then the buffer writes directly to it. * Otherwise, the buffer maintains a StringBuilder for intermediate text segments * until no further changes are necessary and whole segments are appended. * append() methods that take combining-class values always write to the StringBuilder. * Other append() methods flush and append to the Appendable. 
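     * <p>Illustrative example (combining classes per Unicode: U+0301 COMBINING ACUTE ACCENT
     * has ccc=230, U+0323 COMBINING DOT BELOW has ccc=220): appending 'a', U+0301, U+0323
     * one by one with their combining classes produces the canonically ordered
     * text 'a' U+0323 U+0301, because U+0323 is inserted before the higher-ccc U+0301.
     * <pre>
     *     ReorderingBuffer buffer=new ReorderingBuffer(impl, new StringBuilder(), 8);
     *     buffer.append('a', 0);        // ccc=0: reordering restarts after 'a'
     *     buffer.append(0x0301, 230);   // appended after 'a'
     *     buffer.append(0x0323, 220);   // lower ccc: inserted before U+0301
     * </pre>
     * (impl above stands for any loaded NormalizerImpl instance.)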
*/ public static final class ReorderingBuffer implements Appendable { public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { impl=ni; app=dest; if (app instanceof StringBuilder) { appIsStringBuilder=true; str=(StringBuilder)dest; // In Java, the constructor subsumes public void init(int destCapacity) str.ensureCapacity(destCapacity); reorderStart=0; if(str.length()==0) { lastCC=0; } else { setIterator(); lastCC=previousCC(); // Set reorderStart after the last code point with cc<=1 if there is one. if(lastCC>1) { while(previousCC()>1) {} } reorderStart=codePointLimit; } } else { appIsStringBuilder=false; str=new StringBuilder(); reorderStart=0; lastCC=0; } } public boolean isEmpty() { return str.length()==0; } public int length() { return str.length(); } public int getLastCC() { return lastCC; } public StringBuilder getStringBuilder() { return str; } public boolean equals(CharSequence s, int start, int limit) { return UTF16Plus.equal(str, 0, str.length(), s, start, limit); } // For Hangul composition, replacing the Leading consonant Jamo with the syllable. public void setLastChar(char c) { str.setCharAt(str.length()-1, c); } public void append(int c, int cc) { if(lastCC<=cc || cc==0) { str.appendCodePoint(c); lastCC=cc; if(cc<=1) { reorderStart=str.length(); } } else { insert(c, cc); } } // s must be in NFD, otherwise change the implementation. public void append(CharSequence s, int start, int limit, int leadCC, int trailCC) { if(start==limit) { return; } if(lastCC<=leadCC || leadCC==0) { if(trailCC<=1) { reorderStart=str.length()+(limit-start); } else if(leadCC<=1) { reorderStart=str.length()+1; // Ok if not a code point boundary. } str.append(s, start, limit); lastCC=trailCC; } else { int c=Character.codePointAt(s, start); start+=Character.charCount(c); insert(c, leadCC); // insert first code point while(start<limit) { c=Character.codePointAt(s, start); start+=Character.charCount(c); if(start<limit) { // s must be in NFD, otherwise we need to use getCC(). leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); } else { leadCC=trailCC; } append(c, leadCC); } } } // The following append() methods work like C++ appendZeroCC(). // They assume that the cc or trailCC of their input is 0. // Most of them implement Appendable interface methods. // @Override when we switch to Java 6 public ReorderingBuffer append(char c) { str.append(c); lastCC=0; reorderStart=str.length(); return this; } public void appendZeroCC(int c) { str.appendCodePoint(c); lastCC=0; reorderStart=str.length(); } // @Override when we switch to Java 6 public ReorderingBuffer append(CharSequence s) { if(s.length()!=0) { str.append(s); lastCC=0; reorderStart=str.length(); } return this; } // @Override when we switch to Java 6 public ReorderingBuffer append(CharSequence s, int start, int limit) { if(start!=limit) { str.append(s, start, limit); lastCC=0; reorderStart=str.length(); } return this; } /** * Flushes from the intermediate StringBuilder to the Appendable, * if they are different objects. * Used after recomposition. * Must be called at the end when writing to a non-StringBuilder Appendable. */ public void flush() { if(appIsStringBuilder) { reorderStart=str.length(); } else { try { app.append(str); str.setLength(0); reorderStart=0; } catch(IOException e) { throw new InternalError(e); // Avoid declaring "throws IOException". } } lastCC=0; } /** * Flushes from the intermediate StringBuilder to the Appendable, * if they are different objects. * Then appends the new text to the Appendable or StringBuilder. 
* Normally used after quick check loops find a non-empty sequence. */ public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { if(appIsStringBuilder) { str.append(s, start, limit); reorderStart=str.length(); } else { try { app.append(str).append(s, start, limit); str.setLength(0); reorderStart=0; } catch(IOException e) { throw new InternalError(e); // Avoid declaring "throws IOException". } } lastCC=0; return this; } public void remove() { str.setLength(0); lastCC=0; reorderStart=0; } public void removeSuffix(int suffixLength) { int oldLength=str.length(); str.delete(oldLength-suffixLength, oldLength); lastCC=0; reorderStart=str.length(); } // Inserts c somewhere before the last character. // Requires 0<cc<lastCC which implies reorderStart<limit. private void insert(int c, int cc) { for(setIterator(), skipPrevious(); previousCC()>cc;) {} // insert c at codePointLimit, after the character with prevCC<=cc if(c<=0xffff) { str.insert(codePointLimit, (char)c); if(cc<=1) { reorderStart=codePointLimit+1; } } else { str.insert(codePointLimit, Character.toChars(c)); if(cc<=1) { reorderStart=codePointLimit+2; } } } private final NormalizerImpl impl; private final Appendable app; private final StringBuilder str; private final boolean appIsStringBuilder; private int reorderStart; private int lastCC; // private backward iterator private void setIterator() { codePointStart=str.length(); } private void skipPrevious() { // Requires 0<codePointStart. codePointLimit=codePointStart; codePointStart=str.offsetByCodePoints(codePointStart, -1); } private int previousCC() { // Returns 0 if there is no previous character. codePointLimit=codePointStart; if(reorderStart>=codePointStart) { return 0; } int c=str.codePointBefore(codePointStart); codePointStart-=Character.charCount(c); if(c<MIN_CCC_LCCC_CP) { return 0; } return getCCFromYesOrMaybe(impl.getNorm16(c)); } private int codePointStart, codePointLimit; } // TODO: Propose as public API on the UTF16 class. // TODO: Propose widening UTF16 methods that take char to take int. // TODO: Propose widening UTF16 methods that take String to take CharSequence. public static final class UTF16Plus { /** * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), * is it a lead surrogate? * @param c code unit or code point * @return true or false */ public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } /** * Compares two CharSequence subsequences for binary equality. 
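         * <p>For example (illustrative), {@code equal("abc", 0, 3, "xabcy", 1, 4)} returns true;
         * subsequences of different lengths never compare equal, and no normalization is applied.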
* @param s1 first sequence * @param start1 start offset in first sequence * @param limit1 limit offset in first sequence * @param s2 second sequence * @param start2 start offset in second sequence * @param limit2 limit offset in second sequence * @return true if s1.subSequence(start1, limit1) contains the same text * as s2.subSequence(start2, limit2) */ public static boolean equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2) { if((limit1-start1)!=(limit2-start2)) { return false; } if(s1==s2 && start1==start2) { return true; } while(start1<limit1) { if(s1.charAt(start1++)!=s2.charAt(start2++)) { return false; } } return true; } } public NormalizerImpl() {} private static final class IsAcceptable implements ICUBinary.Authenticate { // @Override when we switch to Java 6 public boolean isDataVersionAcceptable(byte version[]) { return version[0]==2; } } private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" public NormalizerImpl load(ByteBuffer bytes) { try { dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 if(indexesLength<=IX_MIN_MAYBE_YES) { throw new IOException("Normalizer2 data: not enough indexes"); } int[] inIndexes=new int[indexesLength]; inIndexes[0]=indexesLength*4; for(int i=1; i<indexesLength; ++i) { inIndexes[i]=bytes.getInt(); } minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; minYesNo=inIndexes[IX_MIN_YES_NO]; minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; minNoNo=inIndexes[IX_MIN_NO_NO]; limitNoNo=inIndexes[IX_LIMIT_NO_NO]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; normTrie=Trie2_16.createFromSerialized(bytes); int trieLength=normTrie.getSerializedLength(); if(trieLength>(nextOffset-offset)) { throw new IOException("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; int numChars=(nextOffset-offset)/2; char[] chars; if(numChars!=0) { chars=new char[numChars]; for(int i=0; i<numChars; ++i) { chars[i]=bytes.getChar(); } maybeYesCompositions=new String(chars); extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); } // smallFCD: new in formatVersion 2 offset=nextOffset; smallFCD=new byte[0x100]; for(int i=0; i<0x100; ++i) { smallFCD[i]=bytes.get(); } // Build tccc180[]. // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 
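        // smallFCD is a bitmap over the BMP: bit (lead>>5)&7 of smallFCD[lead>>8] is set
        // if any of the 32 code units starting at lead&~0x1f may carry non-zero FCD data
        // (see singleLeadMightHaveNonZeroFCD16()).
        // An FCD16 value packs lccc<<8|tccc; for example, with the standard NFC data,
        // U+00C5 decomposes to U+0041 U+030A (ccc 0 and 230), so
        // getFCD16FromNormData(0xc5)==0xe6 and tccc180[0xc5] is set to 0xe6 by the loop below.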
tccc180=new int[0x180]; int bits=0; for(int c=0; c<0x180; bits>>=1) { if((c&0xff)==0) { bits=smallFCD[c>>8]; // one byte per 0x100 code points } if((bits&1)!=0) { for(int i=0; i<0x20; ++i, ++c) { tccc180[c]=getFCD16FromNormData(c)&0xff; } } else { c+=0x20; } } return this; } catch(IOException e) { throw new InternalError(e); } } public NormalizerImpl load(String name) { return load(ICUBinary.getRequiredData(name)); } public int getNorm16(int c) { return normTrie.get(c); } public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } public int getCC(int norm16) { if(norm16>=MIN_NORMAL_MAYBE_YES) { return norm16&0xff; } if(norm16<minNoNo || limitNoNo<=norm16) { return 0; } return getCCFromNoNo(norm16); } public static int getCCFromYesOrMaybe(int norm16) { return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ public int getFCD16(int c) { if(c<0) { return 0; } else if(c<0x180) { return tccc180[c]; } else if(c<=0xffff) { if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } } return getFCD16FromNormData(c); } /** Returns the FCD data for U+0000<=c<U+0180. */ public int getFCD16FromBelow180(int c) { return tccc180[c]; } /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ public boolean singleLeadMightHaveNonZeroFCD16(int lead) { // 0<=lead<=0xffff byte bits=smallFCD[lead>>8]; if(bits==0) { return false; } return ((bits>>((lead>>5)&7))&1)!=0; } /** Gets the FCD value from the regular normalization data. */ public int getFCD16FromNormData(int c) { // Only loops for 1:1 algorithmic mappings. for(;;) { int norm16=getNorm16(c); if(norm16<=minYesNo) { // no decomposition or Hangul syllable, all zeros return 0; } else if(norm16>=MIN_NORMAL_MAYBE_YES) { // combining mark norm16&=0xff; return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data int firstUnit=extraData.charAt(norm16); if((firstUnit&MAPPING_LENGTH_MASK)==0) { // A character that is deleted (maps to an empty string) must // get the worst-case lccc and tccc values because arbitrary // characters on both sides will become adjacent. return 0x1ff; } else { int fcd16=firstUnit>>8; // tccc if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc } return fcd16; } } } } /** * Gets the decomposition for one code point. 
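     * <p>For example, with the standard NFC/NFD data, {@code getDecomposition(0xc5)}
     * returns the two-character string U+0041 U+030A, {@code getDecomposition(0xac00)}
     * returns the algorithmic Hangul decomposition U+1100 U+1161,
     * and {@code getDecomposition('A')} returns null because A does not decompose.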
* @param c code point * @return c's decomposition, if it has one; returns null if it does not have a decomposition */ public String getDecomposition(int c) { int decomp=-1; int norm16; for(;;) { if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { // c does not decompose } else if(isHangul(norm16)) { // Hangul syllable: decompose algorithmically StringBuilder buffer=new StringBuilder(); Hangul.decompose(c, buffer); return buffer.toString(); } else if(isDecompNoAlgorithmic(norm16)) { decomp=c=mapAlgorithmic(c, norm16); continue; } else { // c decomposes, get everything from the variable-length extra data int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; return extraData.substring(norm16, norm16+length); } if(decomp<0) { return null; } else { return UTF16.valueOf(decomp); } } } public static final int MIN_CCC_LCCC_CP=0x300; public static final int MIN_YES_YES_WITH_CC=0xff01; public static final int JAMO_VT=0xff00; public static final int MIN_NORMAL_MAYBE_YES=0xfe00; public static final int MAX_DELTA=0x40; // Byte offsets from the start of the data, after the generic header. public static final int IX_NORM_TRIE_OFFSET=0; public static final int IX_EXTRA_DATA_OFFSET=1; public static final int IX_SMALL_FCD_OFFSET=2; // Code point thresholds for quick check codes. public static final int IX_MIN_DECOMP_NO_CP=8; public static final int IX_MIN_COMP_NO_MAYBE_CP=9; // Norm16 value thresholds for quick check combinations and types of extra data. // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. public static final int IX_MIN_YES_NO=10; public static final int IX_MIN_NO_NO=11; public static final int IX_LIMIT_NO_NO=12; public static final int IX_MIN_MAYBE_YES=13; // Mappings only in [minYesNoMappingsOnly..minNoNo[. public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; public static final int MAPPING_LENGTH_MASK=0x1f; public static final int COMP_1_LAST_TUPLE=0x8000; public static final int COMP_1_TRIPLE=1; public static final int COMP_1_TRAIL_LIMIT=0x3400; public static final int COMP_1_TRAIL_MASK=0x7ffe; public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit public static final int COMP_2_TRAIL_SHIFT=6; public static final int COMP_2_TRAIL_MASK=0xffc0; // higher-level functionality ------------------------------------------ *** /** * Decomposes s[src, limit[ and writes the result to dest. * limit can be NULL if src is NUL-terminated. * destLengthEstimate is the initial dest buffer capacity and can be -1. 
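     * <p>Illustrative use, assuming this instance (impl) was loaded with the NFC/NFD data:
     * <pre>
     *     String s=String.valueOf((char)0xc5);         // U+00C5, A with ring above
     *     StringBuilder dest=new StringBuilder();
     *     impl.decompose(s, 0, s.length(), dest, -1);  // dest: U+0041 U+030A
     * </pre>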
*/ public void decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate) { if(destLengthEstimate<0) { destLengthEstimate=limit-src; } dest.setLength(0); ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); decompose(s, src, limit, buffer); } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer) { int minNoCP=minDecompNoCP; int prevSrc; int c=0; int norm16=0; // only for quick check int prevBoundary=src; int prevCC=0; for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))<minNoCP || isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { --src; c=Character.toCodePoint(c2, (char)c); } } if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { src+=Character.charCount(c); } else { break; } } } // copy these code units all at once if(src!=prevSrc) { if(buffer!=null) { buffer.flushAndAppendZeroCC(s, prevSrc, src); } else { prevCC=0; prevBoundary=src; } } if(src==limit) { break; } // Check one above-minimum, relevant code point. src+=Character.charCount(c); if(buffer!=null) { decompose(c, norm16, buffer); } else { if(isDecompYes(norm16)) { int cc=getCCFromYesOrMaybe(norm16); if(prevCC<=cc || cc==0) { prevCC=cc; if(cc<=1) { prevBoundary=src; } continue; } } return prevBoundary; // "no" or cc out of order } } return src; } public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { int limit=s.length(); if(limit==0) { return; } if(doDecompose) { decompose(s, 0, limit, buffer); return; } // Just merge the strings at the boundary. int c=Character.codePointAt(s, 0); int src=0; int firstCC, prevCC, cc; firstCC=prevCC=cc=getCC(getNorm16(c)); while(cc!=0) { prevCC=cc; src+=Character.charCount(c); if(src>=limit) { break; } c=Character.codePointAt(s, src); cc=getCC(getNorm16(c)); }; buffer.append(s, 0, src, firstCC, prevCC); buffer.append(s, src, limit); } // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize // !doCompose: isNormalized (buffer must be empty and initialized) public boolean compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer) { int minNoMaybeCP=minCompNoMaybeCP; /* * prevBoundary points to the last character before the current one * that has a composition boundary before it with ccc==0 and quick check "yes". * Keeping track of prevBoundary saves us looking for a composition boundary * when we find a "no" or "maybe". * * When we back out from prevSrc back to prevBoundary, * then we also remove those same characters (which had been simply copied * or canonically-order-inserted) from the ReorderingBuffer. * Therefore, at all times, the [prevBoundary..prevSrc[ source units * must correspond 1:1 to destination units at the end of the destination buffer. 
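         * For example, composing the two code points U+0061 U+0301 ('a' plus combining
         * acute): the quick check loop copies the 'a', then stops at U+0301, which
         * combines backward. The copied 'a' is removed again via removeSuffix(),
         * [prevBoundary..src[ is decomposed into the buffer and recomposed there,
         * and the buffer ends up holding the single code point U+00E1.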
*/ int prevBoundary=src; int prevSrc; int c=0; int norm16=0; // only for isNormalized int prevCC=0; for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { --src; c=Character.toCodePoint(c2, (char)c); } } if(isCompYesAndZeroCC(norm16=getNorm16(c))) { src+=Character.charCount(c); } else { break; } } } // copy these code units all at once if(src!=prevSrc) { if(src==limit) { if(doCompose) { buffer.flushAndAppendZeroCC(s, prevSrc, src); } break; } // Set prevBoundary to the last character in the quick check loop. prevBoundary=src-1; if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && Character.isHighSurrogate(s.charAt(prevBoundary-1)) ) { --prevBoundary; } if(doCompose) { // The last "quick check yes" character is excluded from the // flush-and-append call in case it needs to be modified. buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); buffer.append(s, prevBoundary, src); } else { prevCC=0; } // The start of the current character (c). prevSrc=src; } else if(src==limit) { break; } src+=Character.charCount(c); /* * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) * or has ccc!=0. * Check for Jamo V/T, then for regular characters. * c is not a Hangul syllable or Jamo L because those have "yes" properties. */ if(isJamoVT(norm16) && prevBoundary!=prevSrc) { char prev=s.charAt(prevSrc-1); boolean needToDecompose=false; if(c<Hangul.JAMO_T_BASE) { // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. prev-=Hangul.JAMO_L_BASE; if(prev<Hangul.JAMO_L_COUNT) { if(!doCompose) { return false; } char syllable=(char) (Hangul.HANGUL_BASE+ (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* Hangul.JAMO_T_COUNT); char t; if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { ++src; syllable+=t; // The next character was a Jamo T. prevBoundary=src; buffer.setLastChar(syllable); continue; } // If we see L+V+x where x!=T then we drop to the slow path, // decompose and recompose. // This is to deal with NFKC finding normal L and V but a // compatibility variant of a T. We need to either fully compose that // combination here (which would complicate the code and may not work // with strange custom data) or use the slow path -- or else our replacing // two input characters (L+V) with one output character (LV syllable) // would violate the invariant that [prevBoundary..prevSrc[ has the same // length as what we appended to the buffer since prevBoundary. needToDecompose=true; } } else if(Hangul.isHangulWithoutJamoT(prev)) { // c is a Jamo Trailing consonant, // compose with previous Hangul LV that does not contain a Jamo T. if(!doCompose) { return false; } buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); prevBoundary=src; continue; } if(!needToDecompose) { // The Jamo V/T did not compose into a Hangul syllable. 
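                // (For reference: the arithmetic above maps, for example,
                // U+1100 U+1161 U+11A8 to the LVT syllable U+AC01,
                // 0xac00+(0*21+0)*28+1.)
                // Append the non-composing Jamo V/T as is: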
if(doCompose) { buffer.append((char)c); } else { prevCC=0; } continue; } } /* * Source buffer pointers: * * all done quick check current char not yet * "yes" but (c) processed * may combine * forward * [-------------[-------------[-------------[-------------[ * | | | | | * orig. src prevBoundary prevSrc src limit * * * Destination buffer pointers inside the ReorderingBuffer: * * all done might take not filled yet * characters for * reordering * [-------------[-------------[-------------[ * | | | | * start reorderStart limit | * +remainingCap.+ */ if(norm16>=MIN_YES_YES_WITH_CC) { int cc=norm16&0xff; // cc!=0 if( onlyContiguous && // FCC (doCompose ? buffer.getLastCC() : prevCC)==0 && prevBoundary<prevSrc && // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) // passed the quick check "yes && ccc==0" test. // Check whether the last character was a "yesYes" or a "yesNo". // If a "yesNo", then we get its trailing ccc from its // mapping and check for canonical order. // All other cases are ok. getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ) { // Fails FCD test, need to decompose and contiguously recompose. if(!doCompose) { return false; } } else if(doCompose) { buffer.append(c, cc); continue; } else if(prevCC<=cc) { prevCC=cc; continue; } else { return false; } } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { return false; } /* * Find appropriate boundaries around this character, * decompose the source text from between the boundaries, * and recompose it. * * We may need to remove the last few characters from the ReorderingBuffer * to account for source text that was copied or appended * but needs to take part in the recomposition. */ /* * Find the last composition boundary in [prevBoundary..src[. * It is either the decomposition of the current character (at prevSrc), * or prevBoundary. */ if(hasCompBoundaryBefore(c, norm16)) { prevBoundary=prevSrc; } else if(doCompose) { buffer.removeSuffix(prevSrc-prevBoundary); } // Find the next composition boundary in [src..limit[ - // modifies src to point to the next starter. src=findNextCompBoundary(s, src, limit); // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. int recomposeStartIndex=buffer.length(); decomposeShort(s, prevBoundary, src, buffer); recompose(buffer, recomposeStartIndex, onlyContiguous); if(!doCompose) { if(!buffer.equals(s, prevBoundary, src)) { return false; } buffer.remove(); prevCC=0; } // Move to the next starter. We never need to look back before this point again. prevBoundary=src; } return true; } /** * Very similar to compose(): Make the same changes in both places if relevant. * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) * !doSpan: quickCheck * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and * bit 0: set if "maybe"; otherwise, if the span length<s.length() * then the quick check result is "no" */ public int composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan) { int qcResult=0; int minNoMaybeCP=minCompNoMaybeCP; /* * prevBoundary points to the last character before the current one * that has a composition boundary before it with ccc==0 and quick check "yes". 
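         * For example, with NFC data and doSpan=false, the two code points U+0061 U+0301
         * return (2<<1)|1: the whole string spans as "yes or maybe", and bit 0 is set
         * because U+0301 has NFC_QC=Maybe.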
*/ int prevBoundary=src; int prevSrc; int c=0; int norm16=0; int prevCC=0; for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { --src; c=Character.toCodePoint(c2, (char)c); } } if(isCompYesAndZeroCC(norm16=getNorm16(c))) { src+=Character.charCount(c); } else { break; } } } if(src!=prevSrc) { // Set prevBoundary to the last character in the quick check loop. prevBoundary=src-1; if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && Character.isHighSurrogate(s.charAt(prevBoundary-1)) ) { --prevBoundary; } prevCC=0; // The start of the current character (c). prevSrc=src; } src+=Character.charCount(c); /* * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) * or has ccc!=0. */ if(isMaybeOrNonZeroCC(norm16)) { int cc=getCCFromYesOrMaybe(norm16); if( onlyContiguous && // FCC cc!=0 && prevCC==0 && prevBoundary<prevSrc && // prevCC==0 && prevBoundary<prevSrc tell us that // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) // passed the quick check "yes && ccc==0" test. // Check whether the last character was a "yesYes" or a "yesNo". // If a "yesNo", then we get its trailing ccc from its // mapping and check for canonical order. // All other cases are ok. getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ) { // Fails FCD test. } else if(prevCC<=cc || cc==0) { prevCC=cc; if(norm16<MIN_YES_YES_WITH_CC) { if(!doSpan) { qcResult=1; } else { return prevBoundary<<1; // spanYes does not care to know it's "maybe" } } continue; } } return prevBoundary<<1; // "no" } } public void composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { int firstStarterInSrc=findNextCompBoundary(s, 0, limit); if(0!=firstStarterInSrc) { int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), buffer.length()); StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ firstStarterInSrc+16); middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastStarterInDest); middle.append(s, 0, firstStarterInSrc); compose(middle, 0, middle.length(), onlyContiguous, true, buffer); src=firstStarterInSrc; } } if(doCompose) { compose(s, src, limit, onlyContiguous, true, buffer); } else { buffer.append(s, src, limit); } } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { // Note: In this function we use buffer->appendZeroCC() because we track // the lead and trail combining classes here, rather than leaving it to // the ReorderingBuffer. // The exception is the call to decomposeShort() which uses the buffer // in the normal way. // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 
// Similar to the prevBoundary in the compose() implementation. int prevBoundary=src; int prevSrc; int c=0; int prevFCD16=0; int fcd16=0; for(;;) { // count code units with lccc==0 for(prevSrc=src; src!=limit;) { if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { prevFCD16=~c; ++src; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { prevFCD16=0; ++src; } else { if(UTF16.isSurrogate((char)c)) { char c2; if(UTF16Plus.isSurrogateLead(c)) { if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { --src; c=Character.toCodePoint(c2, (char)c); } } } if((fcd16=getFCD16FromNormData(c))<=0xff) { prevFCD16=fcd16; src+=Character.charCount(c); } else { break; } } } // copy these code units all at once if(src!=prevSrc) { if(src==limit) { if(buffer!=null) { buffer.flushAndAppendZeroCC(s, prevSrc, src); } break; } prevBoundary=src; // We know that the previous character's lccc==0. if(prevFCD16<0) { // Fetching the fcd16 value was deferred for this below-U+0300 code point. int prev=~prevFCD16; prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); if(prevFCD16>1) { --prevBoundary; } } else { int p=src-1; if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && Character.isHighSurrogate(s.charAt(p-1)) ) { --p; // Need to fetch the previous character's FCD value because // prevFCD16 was just for the trail surrogate code point. prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); // Still known to have lccc==0 because its lead surrogate unit had lccc==0. } if(prevFCD16>1) { prevBoundary=p; } } if(buffer!=null) { // The last lccc==0 character is excluded from the // flush-and-append call in case it needs to be modified. buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); buffer.append(s, prevBoundary, src); } // The start of the current character (c). prevSrc=src; } else if(src==limit) { break; } src+=Character.charCount(c); // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. // Check for proper order, and decompose locally if necessary. if((prevFCD16&0xff)<=(fcd16>>8)) { // proper order: prev tccc <= current lccc if((fcd16&0xff)<=1) { prevBoundary=src; } if(buffer!=null) { buffer.appendZeroCC(c); } prevFCD16=fcd16; continue; } else if(buffer==null) { return prevBoundary; // quick check "no" } else { /* * Back out the part of the source that we copied or appended * already but is now going to be decomposed. * prevSrc is set to after what was copied/appended. */ buffer.removeSuffix(prevSrc-prevBoundary); /* * Find the part of the source that needs to be decomposed, * up to the next safe boundary. */ src=findNextFCDBoundary(s, src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ decomposeShort(s, prevBoundary, src, buffer); prevBoundary=src; prevFCD16=0; } } return src; } // Note: hasDecompBoundary() could be implemented as aliases to // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() // at the cost of building the FCD trie for a decomposition normalizer. 
public boolean hasDecompBoundary(int c, boolean before) { for(;;) { if(c<minDecompNoCP) { return true; } int norm16=getNorm16(c); if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { return true; } else if(norm16>MIN_NORMAL_MAYBE_YES) { return false; // ccc!=0 } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); } else { // c decomposes, get everything from the variable-length extra data int firstUnit=extraData.charAt(norm16); if((firstUnit&MAPPING_LENGTH_MASK)==0) { return false; } if(!before) { // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return false; // trailCC>1 } if(firstUnit<=0xff) { return true; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary } // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; } } } public boolean hasCompBoundaryBefore(int c) { return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); } private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } private boolean isHangul(int norm16) { return norm16==minYesNo; } private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } // UBool isCompYes(uint16_t norm16) const { // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; // } // UBool isCompYesOrMaybe(uint16_t norm16) const { // return norm16<minNoNo || minMaybeYes<=norm16; // } // private boolean hasZeroCCFromDecompYes(int norm16) { // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; // } private boolean isDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==JAMO_VT || (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); } /** * A little faster and simpler than isDecompYesAndZeroCC() but does not include * the MaybeYes which combine-forward and have ccc=0. * (Standard Unicode 5.2 normalization does not have such characters.) */ private boolean isMostDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; } private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; // } private int getCCFromNoNo(int norm16) { if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { return extraData.charAt(norm16-1)&0xff; } else { return 0; } } // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { int c; if(cpStart==(cpLimit-1)) { c=s.charAt(cpStart); } else { c=Character.codePointAt(s, cpStart); } int prevNorm16=getNorm16(c); if(prevNorm16<=minYesNo) { return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 } else { return extraData.charAt(prevNorm16)>>8; // tccc from yesNo } } // Requires algorithmic-NoNo. private int mapAlgorithmic(int c, int norm16) { return c+norm16-(minMaybeYes-MAX_DELTA-1); } // Requires minYesNo<norm16<limitNoNo. 
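    // Rough sketch of the norm16 value ranges, as implied by the thresholds and the
    // predicates above (the authoritative description is in ICU's normalizer2impl.h):
    //   0                                    inert (yes, ccc=0, no data)
    //   [1..minYesNo[                        yesYes with a compositions list (combines forward)
    //   [minYesNo..minYesNoMappingsOnly[     yesNo with mapping & compositions
    //                                        (minYesNo itself = Hangul LV/LVT)
    //   [minYesNoMappingsOnly..minNoNo[      yesNo with mapping only
    //   [minNoNo..limitNoNo[                 noNo with mapping in extraData
    //   [limitNoNo..minMaybeYes[             noNo with algorithmic 1:1 mapping (delta)
    //   [minMaybeYes..MIN_NORMAL_MAYBE_YES[  maybeYes with a compositions list
    //   [MIN_NORMAL_MAYBE_YES..0xffff]       MIN_NORMAL_MAYBE_YES+ccc for other combining marks;
    //                                        JAMO_VT=0xff00; >=MIN_YES_YES_WITH_CC: yesYes, ccc!=0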
// private int getMapping(int norm16) { return /*extraData+*/norm16; } /** * @return index into maybeYesCompositions, or -1 */ private int getCompositionsListForDecompYes(int norm16) { if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { return -1; } else { if((norm16-=minMaybeYes)<0) { // norm16<minMaybeYes: index into extraData which is a substring at // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list } return norm16; } } /** * @return index into maybeYesCompositions */ private int getCompositionsListForComposite(int norm16) { // composite has both mapping & compositions list int firstUnit=extraData.charAt(norm16); return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions 1+ // +1 to skip the first unit with the mapping lenth (firstUnit&MAPPING_LENGTH_MASK); // + mapping length } // Decompose a short piece of text which is likely to contain characters that // fail the quick check loop and/or where the quick check loop's overhead // is unlikely to be amortized. // Called by the compose() and makeFCD() implementations. // Public in Java for collation implementation code. public void decomposeShort(CharSequence s, int src, int limit, ReorderingBuffer buffer) { while(src<limit) { int c=Character.codePointAt(s, src); src+=Character.charCount(c); decompose(c, getNorm16(c), buffer); } } private void decompose(int c, int norm16, ReorderingBuffer buffer) { // Only loops for 1:1 algorithmic mappings. for(;;) { // get the decomposition and the lead and trail cc's if(isDecompYes(norm16)) { // c does not decompose buffer.append(c, getCCFromYesOrMaybe(norm16)); } else if(isHangul(norm16)) { // Hangul syllable: decompose algorithmically Hangul.decompose(c, buffer); } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); norm16=getNorm16(c); continue; } else { // c decomposes, get everything from the variable-length extra data int firstUnit=extraData.charAt(norm16); int length=firstUnit&MAPPING_LENGTH_MASK; int leadCC, trailCC; trailCC=firstUnit>>8; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { leadCC=extraData.charAt(norm16-1)>>8; } else { leadCC=0; } ++norm16; // skip over the firstUnit buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); } return; } } /** * Finds the recomposition result for * a forward-combining "lead" character, * specified with a pointer to its compositions list, * and a backward-combining "trail" character. * * <p>If the lead and trail characters combine, then this function returns * the following "compositeAndFwd" value: * <pre> * Bits 21..1 composite character * Bit 0 set if the composite is a forward-combining starter * </pre> * otherwise it returns -1. * * <p>The compositions list has (trail, compositeAndFwd) pair entries, * encoded as either pairs or triples of 16-bit units. * The last entry has the high bit of its first unit set. * * <p>The list is sorted by ascending trail characters (there are no duplicates). * A linear search is used. * * <p>See normalizer2impl.h for a more detailed description * of the compositions list format. 
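     * <p>For example (illustrative), when list is the compositions list of U+0061 'a'
     * and trail is U+0301, the composite compositeAndFwd>>1 is U+00E1; bit 0 then tells
     * the caller whether U+00E1 itself combines forward (see the use in recompose()).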
*/ private static int combine(String compositions, int list, int trail) { int key1, firstUnit; if(trail<COMP_1_TRAIL_LIMIT) { // trail character is 0..33FF // result entry may have 2 or 3 units key1=(trail<<1); while(key1>(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); } else { return compositions.charAt(list+1); } } } else { // trail character is 3400..10FFFF // result entry has 3 units key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; int secondUnit; for(;;) { if(key1>(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(key2>(secondUnit=compositions.charAt(list+1))) { if((firstUnit&COMP_1_LAST_TUPLE)!=0) { break; } else { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); } else { break; } } else { break; } } } return -1; } /* * Recomposes the buffer text starting at recomposeStartIndex * (which is in NFD - decomposed and canonically ordered), * and truncates the buffer contents. * * Note that recomposition never lengthens the text: * Any character consists of either one or two code units; * a composition may contain at most one more code unit than the original starter, * while the combining mark that is removed has at least one code unit. */ private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous) { StringBuilder sb=buffer.getStringBuilder(); int p=recomposeStartIndex; if(p==sb.length()) { return; } int starter, pRemove; int compositionsList; int c, compositeAndFwd; int norm16; int cc, prevCC; boolean starterIsSupplementary; // Some of the following variables are not used until we have a forward-combining starter // and are only initialized now to avoid compiler warnings. compositionsList=-1; // used as indicator for whether we have a forward-combining starter starter=-1; starterIsSupplementary=false; prevCC=0; for(;;) { c=sb.codePointAt(p); p+=Character.charCount(c); norm16=getNorm16(c); cc=getCCFromYesOrMaybe(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList>=0 && // the backward-combining character is not blocked (prevCC<cc || prevCC==0)) { if(isJamoVT(norm16)) { // c is a Jamo V/T, see if we can compose it with the previous character. if(c<Hangul.JAMO_T_BASE) { // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); if(prev<Hangul.JAMO_L_COUNT) { pRemove=p-1; char syllable=(char) (Hangul.HANGUL_BASE+ (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* Hangul.JAMO_T_COUNT); char t; if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { ++p; syllable+=t; // The next character was a Jamo T. } sb.setCharAt(starter, syllable); // remove the Jamo V/T sb.delete(pRemove, p); p=pRemove; } } /* * No "else" for Jamo T: * Since the input is in NFD, there are no Hangul LV syllables that * a Jamo T could combine with. * All Jamo Ts are combined above when handling Jamo Vs. 
*/ if(p==sb.length()) { break; } compositionsList=-1; continue; } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { // The starter and the combining mark (c) do combine. int composite=compositeAndFwd>>1; // Remove the combining mark. pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark sb.delete(pRemove, p); p=pRemove; // Replace the starter with the composite. if(starterIsSupplementary) { if(composite>0xffff) { // both are supplementary sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); } else { sb.setCharAt(starter, (char)c); sb.deleteCharAt(starter+1); // The composite is shorter than the starter, // move the intermediate characters forward one. starterIsSupplementary=false; --p; } } else if(composite>0xffff) { // The composite is longer than the starter, // move the intermediate characters back one. starterIsSupplementary=true; sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); ++p; } else { // both are on the BMP sb.setCharAt(starter, (char)composite); } // Keep prevCC because we removed the combining mark. if(p==sb.length()) { break; } // Is the composite a starter that combines forward? if((compositeAndFwd&1)!=0) { compositionsList= getCompositionsListForComposite(getNorm16(composite)); } else { compositionsList=-1; } // We combined; continue with looking for compositions. continue; } } // no combination this time prevCC=cc; if(p==sb.length()) { break; } // If c did not combine, then check if it is a starter. if(cc==0) { // Found a new starter. if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { // It may combine with something, prepare for it. if(c<=0xffff) { starterIsSupplementary=false; starter=p-1; } else { starterIsSupplementary=true; starter=p-2; } } } else if(onlyContiguous) { // FCC: no discontiguous compositions; any intervening character blocks. compositionsList=-1; } } buffer.flush(); } /** * Does c have a composition boundary before it? * True if its decomposition begins with a character that has * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes * (isCompYesAndZeroCC()) so we need not decompose. */ private boolean hasCompBoundaryBefore(int c, int norm16) { for(;;) { if(isCompYesAndZeroCC(norm16)) { return true; } else if(isMaybeOrNonZeroCC(norm16)) { return false; } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); norm16=getNorm16(c); } else { // c decomposes, get everything from the variable-length extra data int firstUnit=extraData.charAt(norm16); if((firstUnit&MAPPING_LENGTH_MASK)==0) { return false; } if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { return false; // non-zero leadCC } return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); } } } private int findPreviousCompBoundary(CharSequence s, int p) { while(p>0) { int c=Character.codePointBefore(s, p); p-=Character.charCount(c); if(hasCompBoundaryBefore(c)) { break; } // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, // but that's probably not worth the extra cost. 
} return p; } private int findNextCompBoundary(CharSequence s, int p, int limit) { while(p<limit) { int c=Character.codePointAt(s, p); int norm16=normTrie.get(c); if(hasCompBoundaryBefore(c, norm16)) { break; } p+=Character.charCount(c); } return p; } private int findNextFCDBoundary(CharSequence s, int p, int limit) { while(p<limit) { int c=Character.codePointAt(s, p); if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { break; } p+=Character.charCount(c); } return p; } /** * Get the canonical decomposition * sherman for ComposedCharIter */ public static int getDecompose(int chars[], String decomps[]) { Normalizer2 impl = Normalizer2.getNFDInstance(); int length=0; int norm16 = 0; int ch = -1; int i = 0; while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff //TBD !!!! the hack code heres save us about 50ms for startup //need a better solution/lookup if (ch == 0x30ff) ch = 0xf900; else if (ch == 0x115bc) ch = 0x1d15e; else if (ch == 0x1d1c1) ch = 0x2f800; String s = impl.getDecomposition(ch); if(s != null && i < chars.length) { chars[i] = ch; decomps[i++] = s; } } return i; } //------------------------------------------------------ // special method for Collation (RBTableBuilder.build()) //------------------------------------------------------ private static boolean needSingleQuotation(char c) { return (c >= 0x0009 && c <= 0x000D) || (c >= 0x0020 && c <= 0x002F) || (c >= 0x003A && c <= 0x0040) || (c >= 0x005B && c <= 0x0060) || (c >= 0x007B && c <= 0x007E); } public static String canonicalDecomposeWithSingleQuotation(String string) { Normalizer2 impl = Normalizer2.getNFDInstance(); char[] src = string.toCharArray(); int srcIndex = 0; int srcLimit = src.length; char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 int destIndex = 0; int destLimit = dest.length; int prevSrc; String norm; int reorderStartIndex, length; char c1, c2; int cp; int minNoMaybe = 0x00c0; int cc, prevCC, trailCC; char[] p; int pStart; // initialize reorderStartIndex = 0; prevCC = 0; norm = null; cp = 0; pStart = 0; cc = trailCC = -1; // initialize to bogus value c1 = 0; for (;;) { prevSrc=srcIndex; //quick check (1)less than minNoMaybe (2)no decomp (3)hangual while (srcIndex != srcLimit && ((c1 = src[srcIndex]) < minNoMaybe || (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables prevCC = 0; srcIndex += (cp < 0x10000) ? 1 : 2; } // copy these code units all at once if (srcIndex != prevSrc) { length = srcIndex - prevSrc; if ((destIndex + length) <= destLimit) { System.arraycopy(src,prevSrc,dest,destIndex,length); } destIndex += length; reorderStartIndex = destIndex; } // end of source reached? if (srcIndex == srcLimit) { break; } // cp already contains *src and norm32 is set for it, increment src srcIndex += (cp < 0x10000) ? 
1 : 2; if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { c2 = 0; length = 1; if (Character.isHighSurrogate(c1) || Character.isLowSurrogate(c1)) { norm = null; } } else { length = 2; c2 = src[srcIndex-1]; } // get the decomposition and the lead and trail cc's if (norm == null) { // cp does not decompose cc = trailCC = UCharacter.getCombiningClass(cp); p = null; pStart = -1; } else { pStart = 0; p = norm.toCharArray(); length = p.length; int cpNum = norm.codePointCount(0, length); cc= UCharacter.getCombiningClass(norm.codePointAt(0)); trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); if (length == 1) { // fastpath a single code unit from decomposition c1 = p[pStart]; c2 = 0; p = null; pStart = -1; } } if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations // buffer overflow char[] tmpBuf = new char[destLimit * 2]; System.arraycopy(dest, 0, tmpBuf, 0, destIndex); dest = tmpBuf; destLimit = dest.length; } // append the decomposition to the destination buffer, assume length>0 { int reorderSplit = destIndex; if (p == null) { // fastpath: single code point if (needSingleQuotation(c1)) { //if we need single quotation, no need to consider "prevCC" //and it must NOT be a supplementary pair dest[destIndex++] = '\''; dest[destIndex++] = c1; dest[destIndex++] = '\''; trailCC = 0; } else if(cc != 0 && cc < prevCC) { // (c1, c2) is out of order with respect to the preceding // text destIndex += length; trailCC = insertOrdered(dest, reorderStartIndex, reorderSplit, destIndex, c1, c2, cc); } else { // just append (c1, c2) dest[destIndex++] = c1; if(c2 != 0) { dest[destIndex++] = c2; } } } else { // general: multiple code points (ordered by themselves) // from decomposition if (needSingleQuotation(p[pStart])) { dest[destIndex++] = '\''; dest[destIndex++] = p[pStart++]; dest[destIndex++] = '\''; length--; do { dest[destIndex++] = p[pStart++]; } while(--length > 0); } else if (cc != 0 && cc < prevCC) { destIndex += length; trailCC = mergeOrdered(dest, reorderStartIndex, reorderSplit, p, pStart, pStart+length); } else { // just append the decomposition do { dest[destIndex++] = p[pStart++]; } while (--length > 0); } } } prevCC = trailCC; if(prevCC == 0) { reorderStartIndex = destIndex; } } return new String(dest, 0, destIndex); } /** * simpler, single-character version of mergeOrdered() - * bubble-insert one single code point into the preceding string * which is already canonically ordered * (c, c2) may or may not yet have been inserted at src[current]..src[p] * * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 
1 : 2) * * before: src[start]..src[current] is already ordered, and * src[current]..src[p] may or may not hold (c, c2) but * must be exactly the same length as (c, c2) * after: src[start]..src[p] is ordered * * @return the trailing combining class */ private static int/*unsigned byte*/ insertOrdered(char[] source, int start, int current, int p, char c1, char c2, int/*unsigned byte*/ cc) { int back, preBack; int r; int prevCC, trailCC=cc; if (start<current && cc!=0) { // search for the insertion point where cc>=prevCC preBack=back=current; PrevArgs prevArgs = new PrevArgs(); prevArgs.current = current; prevArgs.start = start; prevArgs.src = source; prevArgs.c1 = c1; prevArgs.c2 = c2; // get the prevCC prevCC=getPrevCC(prevArgs); preBack = prevArgs.current; if(cc<prevCC) { // this will be the last code point, so keep its cc trailCC=prevCC; back=preBack; while(start<preBack) { prevCC=getPrevCC(prevArgs); preBack=prevArgs.current; if(cc>=prevCC) { break; } back=preBack; } // this is where we are right now with all these indicies: // [start]..[pPreBack] 0..? code points that we can ignore // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) // [current]..[p] 1 code point (c, c2) with cc // move the code units in between up r=p; do { source[--r]=source[--current]; } while (back!=current); } } // insert (c1, c2) source[current] = c1; if (c2!=0) { source[(current+1)] = c2; } // we know the cc of the last code point return trailCC; } /** * merge two UTF-16 string parts together * to canonically order (order by combining classes) their concatenation * * the two strings may already be adjacent, so that the merging is done * in-place if the two strings are not adjacent, then the buffer holding the * first one must be large enough * the second string may or may not be ordered in itself * * before: [start]..[current] is already ordered, and * [next]..[limit] may be ordered in itself, but * is not in relation to [start..current[ * after: [start..current+(limit-next)[ is ordered * * the algorithm is a simple bubble-sort that takes the characters from * src[next++] and inserts them in correct combining class order into the * preceding part of the string * * since this function is called much less often than the single-code point * insertOrdered(), it just uses that for easier maintenance * * @return the trailing combining class */ private static int /*unsigned byte*/ mergeOrdered(char[] source, int start, int current, char[] data, int next, int limit) { int r; int /*unsigned byte*/ cc, trailCC=0; boolean adjacent; adjacent= current==next; NextCCArgs ncArgs = new NextCCArgs(); ncArgs.source = data; ncArgs.next = next; ncArgs.limit = limit; if(start!=current) { while(ncArgs.next<ncArgs.limit) { cc=getNextCC(ncArgs); if(cc==0) { // does not bubble back trailCC=0; if(adjacent) { current=ncArgs.next; } else { data[current++]=ncArgs.c1; if(ncArgs.c2!=0) { data[current++]=ncArgs.c2; } } break; } else { r=current+(ncArgs.c2==0 ? 
1 : 2); trailCC=insertOrdered(source,start, current, r, ncArgs.c1, ncArgs.c2, cc); current=r; } } } if(ncArgs.next==ncArgs.limit) { // we know the cc of the last code point return trailCC; } else { if(!adjacent) { // copy the second string part do { source[current++]=data[ncArgs.next++]; } while(ncArgs.next!=ncArgs.limit); ncArgs.limit=current; } PrevArgs prevArgs = new PrevArgs(); prevArgs.src = data; prevArgs.start = start; prevArgs.current = ncArgs.limit; return getPrevCC(prevArgs); } } private static final class PrevArgs{ char[] src; int start; int current; char c1; char c2; } private static final class NextCCArgs{ char[] source; int next; int limit; char c1; char c2; } private static int /*unsigned*/ getPrevCC(PrevArgs args) { args.c1=args.src[--args.current]; args.c2=0; if (args.c1 < MIN_CCC_LCCC_CP) { return 0; } else if (UTF16.isLeadSurrogate(args.c1)) { /* unpaired first surrogate */ return 0; } else if (!UTF16.isTrailSurrogate(args.c1)) { return UCharacter.getCombiningClass(args.c1); } else if (args.current!=args.start && UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { --args.current; return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); } else { /* unpaired second surrogate */ args.c2=0; return 0; } } private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { args.c1=args.source[args.next++]; args.c2=0; if (UTF16.isTrailSurrogate(args.c1)) { /* unpaired second surrogate */ return 0; } else if (!UTF16.isLeadSurrogate(args.c1)) { return UCharacter.getCombiningClass(args.c1); } else if (args.next!=args.limit && UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ ++args.next; return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); } else { /* unpaired first surrogate */ args.c2=0; return 0; } } private VersionInfo dataVersion; // Code point thresholds for quick check codes. private int minDecompNoCP; private int minCompNoMaybeCP; // Norm16 value thresholds for quick check combinations and types of extra data. private int minYesNo; private int minYesNoMappingsOnly; private int minNoNo; private int limitNoNo; private int minMaybeYes; private Trie2_16 normTrie; private String maybeYesCompositions; private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 private int[] tccc180; // [0x180] tccc values for U+0000..U+017F }
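
// Illustrative usage sketch (not part of this class). The data resource name passed to
// load() below is a placeholder; the real name is supplied by this package's callers.
//
//     NormalizerImpl nfcImpl=new NormalizerImpl().load(nfcDataName);
//     String s="a"+(char)0x0301;                       // 'a' + combining acute accent
//     StringBuilder sb=new StringBuilder();
//     ReorderingBuffer buf=new ReorderingBuffer(nfcImpl, sb, s.length());
//     nfcImpl.compose(s, 0, s.length(), false /*onlyContiguous*/, true /*doCompose*/, buf);
//     // sb now holds the single code point U+00E1.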