/*
 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 * Copyright (C) 1996-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;

import sun.text.normalizer.UCharacter.HangulSyllableType;
import sun.text.normalizer.UCharacter.NumericType;

/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>This class also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into more meaningful form
 * lies on <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */

final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /*
     * public singleton instance
     */
    public static final UCharacterProperty INSTANCE;

    /**
     * Trie data
     */
    public Trie2_16 m_trie_;

    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;

    /**
     * Character type mask
     */
    public static final int TYPE_MASK = 0x1F;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** From uchar.c/uprops.icu main trie */
    public static final int SRC_CHAR=1;
    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** From ubidi_props.c/ubidi.icu */
    public static final int SRC_BIDI=5;
    /** From normalizer2impl.cpp/nfc.nrm */
    public static final int SRC_NFC=8;
    /** From normalizer2impl.cpp/nfkc.nrm */
    public static final int SRC_NFKC=9;

    // public methods ----------------------------------------------------

    /**
     * Gets the main property value for code point ch.
     * @param ch code point whose property value is to be retrieved
     * @return property value of code point
     */
    public final int getProperty(int ch)
    {
        return m_trie_.get(ch);
    }

    /**
     * Gets the unicode additional properties.
     * Java version of C u_getUnicodeProperties().
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
     * @param column The column index; must not be negative (checked by an
     *               assertion only).
     * @return unicode properties, or 0 if {@code column} is not less than
     *         the number of additional columns that were loaded
     */
    public int getAdditional(int codepoint, int column) {
        assert column >= 0;
        if (column >= m_additionalColumnsCount_) {
            return 0;
        }
        // The additional trie yields the index of the first vector word for
        // this code point; the column selects the word within that vector.
        return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
    }

    /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        // Bits 31..24 of vector word 0 hold major/minor, one nibble each.
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    // int-value and enumerated properties --------------------------------- ***

    /**
     * Gets the character type of a code point, i.e. the bits of its main
     * property value selected by {@link #TYPE_MASK}.
     * @param c code point
     * @return main property value of {@code c} masked with TYPE_MASK
     */
    public int getType(int c) {
        return getProperty(c)&TYPE_MASK;
    }

    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
     */
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The code below checks for the array length.
         */
    };

    /**
     * Integer-valued property read from a column of the additional
     * property vectors by masking and shifting.
     * Non-static on purpose: {@link #getValue(int)} calls the enclosing
     * instance's {@link #getAdditional(int, int)}.
     */
    private class IntProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        int shift;

        IntProperty(int column, int mask, int shift) {
            this.column=column;
            this.mask=mask;
            this.shift=shift;
        }

        /** Creates a property identified only by its source; mask and shift stay 0. */
        IntProperty(int source) {
            this.column=source;
            this.mask=0;
        }

        int getValue(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)>>>shift;
        }
    }

    /** Integer property whose source is {@link #SRC_BIDI}. */
    private class BiDiIntProperty extends IntProperty {
        BiDiIntProperty() {
            super(SRC_BIDI);
        }
    }

    /** Integer property for a combining class; only records its source. */
    private class CombiningClassIntProperty extends IntProperty {
        CombiningClassIntProperty(int source) {
            super(source);
        }
    }

    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
        int which;
        int max;

        NormQuickCheckIntProperty(int source, int which, int max) {
            super(source);
            this.which=which;
            this.max=max;
        }
    }

    /** The only integer property served by {@link #getIntPropertyValue(int, int)}. */
    private IntProperty intProp = new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
        @Override
        int getValue(int c) {
            return UBiDiProps.INSTANCE.getPairedBracketType(c);
        }
    };

    /**
     * Gets the value of an integer/enumerated property for a code point.
     * Only {@link #BIDI_PAIRED_BRACKET_TYPE} is supported here.
     * @param c code point
     * @param which property selector
     * @return the paired bracket type of {@code c} when {@code which} is
     *         BIDI_PAIRED_BRACKET_TYPE, otherwise 0
     */
    public int getIntPropertyValue(int c, int which) {
        if (which == BIDI_PAIRED_BRACKET_TYPE) {
            return intProp.getValue(c);
        }
        return 0; // undefined
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail)
    {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Gets the type mask
     * @param type character type
     * @return mask, i.e. {@code 1 << type}
     */
    public static final int getMask(int type)
    {
        return 1 << type;
    }

    /**
     * Returns the digit values of characters like 'A' - 'Z', normal,
     * half-width and full-width. This method assumes that the other digit
     * characters are checked by the calling method.
     * The accepted ranges are 0x41..0x5a, 0x61..0x7a, 0xff21..0xff3a and
     * 0xff41..0xff5a; the first character of each range maps to 10.
     * @param ch character to test
     * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
     *         its corresponding digit will be returned.
     */
    public static int getEuropeanDigit(int ch) {
        if ((ch > 0x7a && ch < 0xff21)
            || ch < 0x41 || (ch > 0x5a && ch < 0x61)
            || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
            return -1;
        }
        if (ch <= 0x7a) {
            // ch is in 0x41..0x5a or in 0x61..0x7a
            return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
        }
        // ch >= 0xff21
        if (ch <= 0xff3a) {
            return ch + 10 - 0xff21;
        }
        // ch >= 0xff41 && ch <= 0xff5a
        return ch + 10 - 0xff41;
    }

    /**
     * Gets the decimal digit value of a code point.
     * @param c code point
     * @return the stored numeric type value of {@code c} minus
     *         NTV_DECIMAL_START_ when that difference is at most 9
     *         (which is -1 for a stored value of NTV_NONE_), otherwise -1
     */
    public int digit(int c) {
        int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
        if(value<=9) {
            return value;
        } else {
            return -1;
        }
    }

    // protected variables -----------------------------------------------

    /**
     * Extra property trie
     */
    Trie2_16 m_additionalTrie_;

    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];

    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;

    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;

    /**
     * Maximum values for script, bits used as in vector word
     * 0
     */
    int m_maxJTGValue_;

    /**
     * Script_Extensions data
     */
    public char[] m_scriptExtensions_;

    // private variables -------------------------------------------------

    /**
     * Default name of the datafile
     */
    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
        UTF16.SUPPLEMENTARY_MIN_VALUE -
        (UTF16.SURROGATE_MIN_VALUE <<
        LEAD_SURROGATE_SHIFT_) -
        UTF16.TRAIL_SURROGATE_MIN_VALUE;

    // property data constants -------------------------------------------------

    /**
     * Numeric types and values in the main properties words.
     */
    private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;

    /** Extracts the numeric type value from a main properties word. */
    private static final int getNumericTypeValue(int props) {
        return props >> NUMERIC_TYPE_VALUE_SHIFT_;
    }

    /* constants for the storage form of numeric types and values */
    /** No numeric value. */
    private static final int NTV_NONE_ = 0;
    /** Decimal digits: nv=0..9 */
    private static final int NTV_DECIMAL_START_ = 1;
    /** Other digits: nv=0..9 */
    private static final int NTV_DIGIT_START_ = 11;
    /** Small integers: nv=0..154 */
    private static final int NTV_NUMERIC_START_ = 21;

    /** Maps a stored numeric type value to the corresponding NumericType constant. */
    private static final int ntvGetType(int ntv) {
        return
            (ntv==NTV_NONE_) ? NumericType.NONE :
            (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
            (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
            NumericType.NUMERIC;
    }

    /*
     * Properties in vector word 0
     * Bits
     * 31..24   DerivedAge version major/minor one nibble each
     * 23..22   3..1: Bits 7..0 = Script_Extensions index
     *             3: Script value from Script_Extensions
     *             2: Script=Inherited
     *             1: Script=Common
     *             0: Script=bits 7..0
     * 21..20   reserved
     * 19..17   East Asian Width
     * 16.. 8   UBlockCode
     *  7.. 0   UScriptCode
     */

    /**
     * Script_Extensions: mask includes Script
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;

    /**
     * Integer properties mask and shift values for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;

    /**
     * Integer properties mask and shift values for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;

    /**
     * Integer properties mask and shift values for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;

    /**
     * Integer properties mask and shift values for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;

    /**
     * Integer properties mask and shift values for scripts.
     * Equivalent to icu4c UPROPS_SHIFT_MASK
     * (NOTE(review): presumably UPROPS_SCRIPT_MASK is meant - confirm
     * against icu4c uprops.h)
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /*
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_    = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;

    /*
     * Properties in vector word 2
     * Bits
     * 31..26   reserved
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    private static final int LB_MASK          = 0x03f00000;
    private static final int LB_SHIFT         = 20;

    private static final int SB_MASK          = 0x000f8000;
    private static final int SB_SHIFT         = 15;

    private static final int WB_MASK          = 0x00007c00;
    private static final int WB_SHIFT         = 10;

    private static final int GCB_MASK         = 0x000003e0;
    private static final int GCB_SHIFT        = 5;

    /**
     * Integer properties mask for decomposition type.
     * Equivalent to icu4c UPROPS_DT_MASK.
     */
    private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;

    /**
     * Last nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;

    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // private constructors --------------------------------------------------

    /**
     * Constructor. Reads the header, the 16 index words, the main
     * properties trie and, when present, the additional-properties trie,
     * the property vectors and the Script_Extensions data from the data file.
     * @throws IOException thrown when data reading fails or data corrupted
     */
    private UCharacterProperty() throws IOException
    {
        // jar access
        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
        m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
        // Read or skip the 16 indexes.
        int propertyOffset = bytes.getInt();
        /* exceptionOffset = */ bytes.getInt();
        /* caseOffset = */ bytes.getInt();
        int additionalOffset = bytes.getInt();
        int additionalVectorsOffset = bytes.getInt();
        m_additionalColumnsCount_ = bytes.getInt();
        int scriptExtensionsOffset = bytes.getInt();
        int reservedOffset7 = bytes.getInt();
        /* reservedOffset8 = */ bytes.getInt();
        /* dataTopOffset = */ bytes.getInt();
        m_maxBlockScriptValue_ = bytes.getInt();
        m_maxJTGValue_ = bytes.getInt();
        // 12 of the 16 index words were consumed above; skip the other 4
        // (4 bytes each).
        ICUBinary.skipBytes(bytes, (16 - 12) << 2);

        // read the main properties trie
        m_trie_ = Trie2_16.createFromSerialized(bytes);
        // Offsets are multiplied by 4 to turn them into byte counts; the
        // trie starts right after the 16 index words.
        int expectedTrieLength = (propertyOffset - 16) * 4;
        int trieLength = m_trie_.getSerializedLength();
        if(trieLength > expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for main trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // skip unused intervening data structures
        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

        if(m_additionalColumnsCount_ > 0) {
            // reads the additional property block
            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
            trieLength = m_additionalTrie_.getSerializedLength();
            if(trieLength > expectedTrieLength) {
                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
            }
            // skip padding after trie bytes
            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

            // additional properties
            int size = scriptExtensionsOffset - additionalVectorsOffset;
            m_additionalVectors_ = new int[size];
            for (int i = 0; i < size; i ++) {
                m_additionalVectors_[i] = bytes.getInt();
            }
        }

        // Script_Extensions
        // Two chars per index unit between the two offsets.
        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
        if(numChars > 0) {
            m_scriptExtensions_ = new char[numChars];
            for(int i = 0; i < numChars; ++i) {
                m_scriptExtensions_[i] = bytes.getChar();
            }
        }
    }

    /** Accepts data files whose first format-version byte is 7. */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        // @Override when we switch to Java 6
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 7;
        }
    }

    private static final int DATA_FORMAT = 0x5550726F;  // "UPro"

    /**
     * Adds to {@code set} the start code point of each same-value range of
     * the properties vectors trie, stopping at the first range that is
     * flagged as a lead-surrogate range. Does nothing when no additional
     * columns were loaded.
     * @param set set that receives the range start code points
     */
    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                set.add(range.startCodePoint);
            }
        }
    }

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UCharacterProperty();
        }
        catch (IOException e) {
            // MissingResourceException has no public constructor taking a
            // cause, so attach the original failure explicitly instead of
            // keeping only its message.
            MissingResourceException missing =
                new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");
            missing.initCause(e);
            throw missing;
        }
    }


    // Moved from UProperty.java
    /**
     * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
     * Used in UAX #9: Unicode Bidirectional Algorithm
     * (http://www.unicode.org/reports/tr9/)
     * Returns UCharacter.BidiPairedBracketType values.
     * @stable ICU 52
     */
    public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;

}