/* * Copyright (c) 2005, 2006, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * * * * The original version of this source code and documentation is copyrighted * * and owned by IBM, These materials are provided under terms of a License * * Agreement between IBM and Sun. This technology is protected by multiple * * US and International patents. This notice and attribution to IBM may not * * to removed. * ******************************************************************************* */ package sun.text.normalizer; import java.io.BufferedInputStream; import java.io.InputStream; import java.io.IOException; import java.text.BreakIterator; import java.util.Locale; /** * <p>Internal class used for Unicode character property database.</p> * <p>This classes store binary data read from uprops.icu. * It does not have the capability to parse the data into more high-level * information. It only returns bytes of information when required.</p> * <p>Due to the form most commonly used for retrieval, array of char is used * to store the binary data.</p> * <p>UCharacterPropertyDB also contains information on accessing indexes to * significant points in the binary data.</p> * <p>Responsibility for molding the binary data into more meaning form lies on * <a href=UCharacter.html>UCharacter</a>.</p> * @author Syn Wee Quek * @since release 2.1, february 1st 2002 * @draft 2.1 */ public final class UCharacterProperty implements Trie.DataManipulate { // public data members ----------------------------------------------- /** * Trie data */ public CharTrie m_trie_; /** * Optimization * CharTrie index array */ public char[] m_trieIndex_; /** * Optimization * CharTrie data array */ public char[] m_trieData_; /** * Optimization * CharTrie data offset */ public int m_trieInitialValue_; /** * Character property table */ public int m_property_[]; /** * Unicode version */ public VersionInfo m_unicodeVersion_; /** * Exception indicator for uppercase type */ public static final int EXC_UPPERCASE_ = 0; /** * Exception indicator for lowercase type */ public static final int EXC_LOWERCASE_ = 1; /** * Exception indicator for titlecase type */ public static final int EXC_TITLECASE_ = 2; /** * Exception indicator for digit type */ public static final int EXC_UNUSED_ = 3; /** * Exception indicator for numeric type */ public static final int EXC_NUMERIC_VALUE_ = 4; /** * Exception indicator for denominator type */ public static final int EXC_DENOMINATOR_VALUE_ = 5; /** * Exception indicator for mirror type */ public static final int EXC_MIRROR_MAPPING_ = 6; /** * Exception indicator for special casing type */ public static final int EXC_SPECIAL_CASING_ = 7; /** * Exception indicator for case folding type */ public static final int EXC_CASE_FOLDING_ = 8; /** * EXC_COMBINING_CLASS_ is not found in ICU. * Used to retrieve the combining class of the character in the exception * value */ public static final int EXC_COMBINING_CLASS_ = 9; /** * Latin lowercase i */ public static final char LATIN_SMALL_LETTER_I_ = 0x69; /** * Character type mask */ public static final int TYPE_MASK = 0x1F; /** * Exception test mask */ public static final int EXCEPTION_MASK = 0x20; // public methods ---------------------------------------------------- /** * Java friends implementation */ public void setIndexData(CharTrie.FriendAgent friendagent) { m_trieIndex_ = friendagent.getPrivateIndex(); m_trieData_ = friendagent.getPrivateData(); m_trieInitialValue_ = friendagent.getPrivateInitialValue(); } /** * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's * data the index array offset of the indexes for that lead surrogate. * @param value data value for a surrogate from the trie, including the * folding offset * @return data offset or 0 if there is no data for the lead surrogate */ public int getFoldingOffset(int value) { if ((value & SUPPLEMENTARY_FOLD_INDICATOR_MASK_) != 0) { return (value & SUPPLEMENTARY_FOLD_OFFSET_MASK_); } else { return 0; } } /** * Gets the property value at the index. * This is optimized. * Note this is alittle different from CharTrie the index m_trieData_ * is never negative. * @param ch code point whose property value is to be retrieved * @return property value of code point */ public int getProperty(int ch) { if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { // BMP codepoint // optimized try { return m_property_[ m_trieData_[ (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]]; } catch (ArrayIndexOutOfBoundsException e) { return m_property_[m_trieInitialValue_]; } } if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { return m_property_[ m_trieData_[ (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]]; } // for optimization if (ch <= UTF16.CODEPOINT_MAX_VALUE) { // look at the construction of supplementary characters // trail forms the ends of it. return m_property_[m_trie_.getSurrogateValue( UTF16.getLeadSurrogate(ch), (char)(ch & Trie.SURROGATE_MASK_))]; } // return m_dataOffset_ if there is an error, in this case we return // the default value: m_initialValue_ // we cannot assume that m_initialValue_ is at offset 0 // this is for optimization. return m_property_[m_trieInitialValue_]; // return m_property_[m_trie_.getCodePointValue(ch)]; } /** * Getting the signed numeric value of a character embedded in the property * argument * @param prop the character * @return signed numberic value */ public static int getSignedValue(int prop) { return (prop >> VALUE_SHIFT_); } /** * Getting the exception index for argument property * @param prop character property * @return exception index */ public static int getExceptionIndex(int prop) { return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; } /** * Determines if the exception value passed in has the kind of information * which the indicator wants, e.g if the exception value contains the digit * value of the character * @param index exception index * @param indicator type indicator * @return true if type value exist */ public boolean hasExceptionValue(int index, int indicator) { return (m_exception_[index] & (1 << indicator)) != 0; } /** * Gets the exception value at the index, assuming that data type is * available. Result is undefined if data is not available. Use * hasExceptionValue() to determine data's availability. * @param index * @param etype exception data type * @return exception data type value at index */ public int getException(int index, int etype) { // contained in exception data if (etype == EXC_COMBINING_CLASS_) { return m_exception_[index]; } // contained in the exception digit address index = addExceptionOffset(m_exception_[index], etype, ++ index); return m_exception_[index]; } /** * Gets the folded case value at the index * @param index of the case value to be retrieved * @param count number of characters to retrieve * @param str string buffer to which to append the result */ public void getFoldCase(int index, int count, StringBuffer str) { // first 2 chars are for the simple mappings index += 2; while (count > 0) { str.append(m_case_[index]); index ++; count --; } } /** * Gets the unicode additional properties. * C version getUnicodeProperties. * @param codepoint codepoint whose additional properties is to be * retrieved * @return unicode properties */ public int getAdditional(int codepoint) { return m_additionalVectors_[m_additionalTrie_.getCodePointValue(codepoint)]; } /** * <p>Get the "age" of the code point.</p> * <p>The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character.</p> * <p>This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters.</p> * <p>The data is from the UCD file DerivedAge.txt.</p> * <p>This API does not check the validity of the codepoint.</p> * @param codepoint The code point. * @return the Unicode version number * @draft ICU 2.1 */ public VersionInfo getAge(int codepoint) { int version = getAdditional(codepoint) >> AGE_SHIFT_; return VersionInfo.getInstance( (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, version & LAST_NIBBLE_MASK_, 0, 0); } /** * Forms a supplementary code point from the argument character<br> * Note this is for internal use hence no checks for the validity of the * surrogate characters are done * @param lead lead surrogate character * @param trail trailing surrogate character * @return code point of the supplementary character */ public static int getRawSupplementary(char lead, char trail) { return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; } /** * Loads the property data and initialize the UCharacterProperty instance. * @throws RuntimeException when data is missing or data has been corrupted */ public static UCharacterProperty getInstance() throws RuntimeException { if (INSTANCE_ == null) { try { INSTANCE_ = new UCharacterProperty(); } catch (Exception e) { throw new RuntimeException(e.getMessage()); } } return INSTANCE_; } /** * Checks if the argument c is to be treated as a white space in ICU * rules. Usually ICU rule white spaces are ignored unless quoted. * @param c codepoint to check * @return true if c is a ICU white space */ public static boolean isRuleWhiteSpace(int c) { /* "white space" in the sense of ICU rule parsers This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. See UTR #31: http://www.unicode.org/reports/tr31/. U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 */ return (c >= 0x0009 && c <= 0x2029 && (c <= 0x000D || c == 0x0020 || c == 0x0085 || c == 0x200E || c == 0x200F || c >= 0x2028)); } // protected variables ----------------------------------------------- /** * Case table */ char m_case_[]; /** * Exception property table */ int m_exception_[]; /** * Extra property trie */ CharTrie m_additionalTrie_; /** * Extra property vectors, 1st column for age and second for binary * properties. */ int m_additionalVectors_[]; /** * Number of additional columns */ int m_additionalColumnsCount_; /** * Maximum values for block, bits used as in vector word * 0 */ int m_maxBlockScriptValue_; /** * Maximum values for script, bits used as in vector word * 0 */ int m_maxJTGValue_; // private variables ------------------------------------------------- /** * UnicodeData.txt property object */ private static UCharacterProperty INSTANCE_ = null; /** * Default name of the datafile */ private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu"; /** * Default buffer size of datafile */ private static final int DATA_BUFFER_SIZE_ = 25000; /** * This, from what i infer is the max size of the indicators used for the * exception values. * Number of bits in an 8-bit integer value */ private static final int EXC_GROUP_ = 8; /** * Mask to get the group */ private static final int EXC_GROUP_MASK_ = 255; /** * Mask to get the digit value in the exception result */ private static final int EXC_DIGIT_MASK_ = 0xFFFF; /** * Offset table for data in exception block.<br> * Table formed by the number of bits used for the index, e.g. 0 = 0 bits, * 1 = 1 bits. */ private static final byte FLAGS_OFFSET_[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; /** * Numeric value shift */ private static final int VALUE_SHIFT_ = 20; /** * Mask to be applied after shifting to obtain an unsigned numeric value */ private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0x7FF; /** * */ private static final int NUMERIC_TYPE_SHIFT = 12; /** * Folding indicator mask */ private static final int SUPPLEMENTARY_FOLD_INDICATOR_MASK_ = 0x8000; /** * Folding offset mask */ private static final int SUPPLEMENTARY_FOLD_OFFSET_MASK_ = 0x7FFF; /** * Shift value for lead surrogate to form a supplementary character. */ private static final int LEAD_SURROGATE_SHIFT_ = 10; /** * Offset to add to combined surrogate pair to avoid msking. */ private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE - (UTF16.SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_) - UTF16.TRAIL_SURROGATE_MIN_VALUE; /** * To get the last character out from a data type */ private static final int LAST_CHAR_MASK_ = 0xFFFF; /** * First nibble shift */ private static final int FIRST_NIBBLE_SHIFT_ = 0x4; /** * Second nibble mask */ private static final int LAST_NIBBLE_MASK_ = 0xF; /** * Age value shift */ private static final int AGE_SHIFT_ = 24; // private constructors -------------------------------------------------- /** * Constructor * @exception thrown when data reading fails or data corrupted */ private UCharacterProperty() throws IOException { // jar access InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_); BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_); UCharacterPropertyReader reader = new UCharacterPropertyReader(b); reader.read(this); b.close(); m_trie_.putIndexData(this); } /* Is followed by {case-ignorable}* cased ? */ /** * Getting the correct address for data in the exception value * @param evalue exception value * @param indicator type of data to retrieve * @param address current address to move from * @return the correct address */ private int addExceptionOffset(int evalue, int indicator, int address) { int result = address; if (indicator >= EXC_GROUP_) { result += FLAGS_OFFSET_[evalue & EXC_GROUP_MASK_]; evalue >>= EXC_GROUP_; indicator -= EXC_GROUP_; } int mask = (1 << indicator) - 1; result += FLAGS_OFFSET_[evalue & mask]; return result; } private static final int TAB = 0x0009; private static final int LF = 0x000a; private static final int FF = 0x000c; private static final int CR = 0x000d; private static final int U_A = 0x0041; private static final int U_Z = 0x005a; private static final int U_a = 0x0061; private static final int U_z = 0x007a; private static final int DEL = 0x007f; private static final int NL = 0x0085; private static final int NBSP = 0x00a0; private static final int CGJ = 0x034f; private static final int FIGURESP= 0x2007; private static final int HAIRSP = 0x200a; private static final int ZWNJ = 0x200c; private static final int ZWJ = 0x200d; private static final int RLM = 0x200f; private static final int NNBSP = 0x202f; private static final int WJ = 0x2060; private static final int INHSWAP = 0x206a; private static final int NOMDIG = 0x206f; private static final int ZWNBSP = 0xfeff; public UnicodeSet addPropertyStarts(UnicodeSet set) { int c; /* add the start code point of each same-value range of each trie */ //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); TrieIterator propsIter = new TrieIterator(m_trie_); RangeValueIterator.Element propsResult = new RangeValueIterator.Element(); while(propsIter.next(propsResult)){ set.add(propsResult.start); } //utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, set); TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_); RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element(); while(propsVectorsIter.next(propsVectorsResult)){ set.add(propsVectorsResult.start); } /* add code points with hardcoded properties, plus the ones following them */ /* add for IS_THAT_CONTROL_SPACE() */ set.add(TAB); /* range TAB..CR */ set.add(CR+1); set.add(0x1c); set.add(0x1f+1); set.add(NL); set.add(NL+1); /* add for u_isIDIgnorable() what was not added above */ set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ set.add(HAIRSP); set.add(RLM+1); set.add(INHSWAP); set.add(NOMDIG+1); set.add(ZWNBSP); set.add(ZWNBSP+1); /* add no-break spaces for u_isWhitespace() what was not added above */ set.add(NBSP); set.add(NBSP+1); set.add(FIGURESP); set.add(FIGURESP+1); set.add(NNBSP); set.add(NNBSP+1); /* add for u_charDigitValue() */ set.add(0x3007); set.add(0x3008); set.add(0x4e00); set.add(0x4e01); set.add(0x4e8c); set.add(0x4e8d); set.add(0x4e09); set.add(0x4e0a); set.add(0x56db); set.add(0x56dc); set.add(0x4e94); set.add(0x4e95); set.add(0x516d); set.add(0x516e); set.add(0x4e03); set.add(0x4e04); set.add(0x516b); set.add(0x516c); set.add(0x4e5d); set.add(0x4e5e); /* add for u_digit() */ set.add(U_a); set.add(U_z+1); set.add(U_A); set.add(U_Z+1); /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ set.add(WJ); /* range WJ..NOMDIG */ set.add(0xfff0); set.add(0xfffb+1); set.add(0xe0000); set.add(0xe0fff+1); /* add for UCHAR_GRAPHEME_BASE and others */ set.add(CGJ); set.add(CGJ+1); /* add for UCHAR_JOINING_TYPE */ set.add(ZWNJ); /* range ZWNJ..ZWJ */ set.add(ZWJ+1); /* add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE */ set.add(0x1100); int value= UCharacter.HangulSyllableType.LEADING_JAMO; int value2; for(c=0x115a; c<=0x115f; ++c) { value2= UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE); if(value!=value2) { value=value2; set.add(c); } } set.add(0x1160); value=UCharacter.HangulSyllableType.VOWEL_JAMO; for(c=0x11a3; c<=0x11a7; ++c) { value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE); if(value!=value2) { value=value2; set.add(c); } } set.add(0x11a8); value=UCharacter.HangulSyllableType.TRAILING_JAMO; for(c=0x11fa; c<=0x11ff; ++c) { value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE); if(value!=value2) { value=value2; set.add(c); } } /* * Omit code points for u_charCellWidth() because * - it is deprecated and not a real Unicode property * - they are probably already set from the trie enumeration */ /* * Omit code points with hardcoded specialcasing properties * because we do not build property UnicodeSets for them right now. */ return set; // for chaining } /*---------------------------------------------------------------- * Inclusions list *----------------------------------------------------------------*/ /* * Return a set of characters for property enumeration. * The set implicitly contains 0x110000 as well, which is one more than the highest * Unicode code point. * * This set is used as an ordered list - its code points are ordered, and * consecutive code points (in Unicode code point order) in the set define a range. * For each two consecutive characters (start, limit) in the set, * all of the UCD/normalization and related properties for * all code points start..limit-1 are all the same, * except for character names and ISO comments. * * All Unicode code points U+0000..U+10ffff are covered by these ranges. * The ranges define a partition of the Unicode code space. * ICU uses the inclusions set to enumerate properties for generating * UnicodeSets containing all code points that have a certain property value. * * The Inclusion List is generated from the UCD. It is generated * by enumerating the data tries, and code points for hardcoded properties * are added as well. * * -------------------------------------------------------------------------- * * The following are ideas for getting properties-unique code point ranges, * with possible optimizations beyond the current implementation. * These optimizations would require more code and be more fragile. * The current implementation generates one single list (set) for all properties. * * To enumerate properties efficiently, one needs to know ranges of * repetitive values, so that the value of only each start code point * can be applied to the whole range. * This information is in principle available in the uprops.icu/unorm.icu data. * * There are two obstacles: * * 1. Some properties are computed from multiple data structures, * making it necessary to get repetitive ranges by intersecting * ranges from multiple tries. * * 2. It is not economical to write code for getting repetitive ranges * that are precise for each of some 50 properties. * * Compromise ideas: * * - Get ranges per trie, not per individual property. * Each range contains the same values for a whole group of properties. * This would generate currently five range sets, two for uprops.icu tries * and three for unorm.icu tries. * * - Combine sets of ranges for multiple tries to get sufficient sets * for properties, e.g., the uprops.icu main and auxiliary tries * for all non-normalization properties. * * Ideas for representing ranges and combining them: * * - A UnicodeSet could hold just the start code points of ranges. * Multiple sets are easily combined by or-ing them together. * * - Alternatively, a UnicodeSet could hold each even-numbered range. * All ranges could be enumerated by using each start code point * (for the even-numbered ranges) as well as each limit (end+1) code point * (for the odd-numbered ranges). * It should be possible to combine two such sets by xor-ing them, * but no more than two. * * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, * but the first one is certainly simpler and applicable for combining more than * two range sets. * * It is possible to combine all range sets for all uprops/unorm tries into one * set that can be used for all properties. * As an optimization, there could be less-combined range sets for certain * groups of properties. * The relationship of which less-combined range set to use for which property * depends on the implementation of the properties and must be hardcoded * - somewhat error-prone and higher maintenance but can be tested easily * by building property sets "the simple way" in test code. * * --- * * Do not use a UnicodeSet pattern because that causes infinite recursion; * UnicodeSet depends on the inclusions set. */ public UnicodeSet getInclusions() { UnicodeSet set = new UnicodeSet(); NormalizerImpl.addPropertyStarts(set); addPropertyStarts(set); return set; } }