/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.rdfxml.xmlinput.impl;

import java.text.Normalizer;

/**
 * Some support for the Character Model Recommendation
 * from the W3C (currently in second last call working
 * draft).
 *
 * <p>All checks are short-circuited when the compile-time constant
 * {@code SWITCH_OFF} is {@code true}: the "is normalized" tests then
 * answer {@code true} and the "is composing" tests answer {@code false}.
 */
public class CharacterModel {

    /** Compile-time kill switch; when {@code true} every check is disabled. */
    private static final boolean SWITCH_OFF = false;

    /**
     * Is this string in Unicode Normal Form C.
     *
     * @param str The string to be tested.
     * @return {@code true} if {@code str} is in NFC (or checking is switched off).
     */
    public static boolean isNormalFormC(String str) {
        try {
            return SWITCH_OFF || Normalizer.isNormalized(str, Normalizer.Form.NFC);
        } catch (ArrayIndexOutOfBoundsException e) {
            // NOTE(review): presumably a workaround for Normalizer.isNormalized
            // failing on some inputs/JDKs - confirm whether it is still needed.
            // Fall back to normalizing and comparing with the input.
            String normalized = Normalizer.normalize(str, Normalizer.Form.NFC);
            return normalized.equals(str);
        }
    }

    /**
     * Does this string start with a composing character as defined
     * by the
     * <a href="http://www.w3.org/TR/charmod">
     * Character Model 2nd Last Call Working Draft</a>.
     *
     * <p>The first <em>code point</em> of the string is examined, so a
     * leading supplementary-plane character encoded as a surrogate pair
     * is classified as a whole rather than by its high surrogate alone.
     *
     * @param str The string to be tested.
     * @return {@code true} if the string is non-empty and its first code
     *         point is a composing character; {@code false} for the empty
     *         string or when checking is switched off.
     */
    public static boolean startsWithComposingCharacter(String str) {
        if (SWITCH_OFF || str.isEmpty()) {
            return false;
        }
        return isComposingCodePoint(str.codePointAt(0));
    }

    /**
     * Is this string fully normalized as defined
     * by the
     * <a href="http://www.w3.org/TR/charmod">
     * Character Model 2nd Last Call Working Draft</a>.
     *
     * @param str The string to be tested.
     * @return {@code true} if the string is in NFC and does not start with
     *         a composing character (or checking is switched off).
     */
    public static boolean isFullyNormalizedConstruct(String str) {
        return SWITCH_OFF || (isNormalFormC(str) && !startsWithComposingCharacter(str));
    }

    /**
     * Is the character a composing character as defined
     * by the
     * <a href="http://www.w3.org/TR/charmod">
     * Character Model 2nd Last Call Working Draft</a>.
     *
     * <p>A {@code char} can only express a BMP code point (or a lone
     * surrogate); strings are checked by code point in
     * {@link #startsWithComposingCharacter(String)}.
     *
     * @param x The character to be tested.
     * @return {@code true} if {@code x} is a composing character;
     *         always {@code false} when checking is switched off.
     */
    public static boolean isComposingChar(char x) {
        if (SWITCH_OFF) {
            return false;
        }
        return isComposingCodePoint(x);
    }

    /**
     * Code point based test behind {@link #isComposingChar(char)}: the
     * explicitly listed characters below, plus every code point accepted
     * by {@link #isCombining(int)}.
     */
    private static boolean isComposingCodePoint(int codePoint) {
        switch (codePoint) {
            // Brahmi-derived scripts
            case 0x09BE: // BENGALI VOWEL SIGN AA
            case 0x09D7: // BENGALI AU LENGTH MARK
            case 0x0B3E: // ORIYA VOWEL SIGN AA
            case 0x0B56: // ORIYA AI LENGTH MARK
            case 0x0B57: // ORIYA AU LENGTH MARK
            case 0x0BBE: // TAMIL VOWEL SIGN AA
            case 0x0BD7: // TAMIL AU LENGTH MARK
            case 0x0CC2: // KANNADA VOWEL SIGN UU
            case 0x0CD5: // KANNADA LENGTH MARK
            case 0x0CD6: // KANNADA AI LENGTH MARK
            case 0x0D3E: // MALAYALAM VOWEL SIGN AA
            case 0x0D57: // MALAYALAM AU LENGTH MARK
            case 0x0DCF: // SINHALA VOWEL SIGN AELA-PILLA
            case 0x0DDF: // SINHALA VOWEL SIGN GAYANUKITTA
            case 0x0FB5: // TIBETAN SUBJOINED LETTER SSA
            case 0x0FB7: // TIBETAN SUBJOINED LETTER HA
            case 0x102E: // MYANMAR VOWEL SIGN II
            // Hangul vowels
            case 0x1161: // HANGUL JUNGSEONG A
            case 0x1162: // HANGUL JUNGSEONG AE
            case 0x1163: // HANGUL JUNGSEONG YA
            case 0x1164: // HANGUL JUNGSEONG YAE
            case 0x1165: // HANGUL JUNGSEONG EO
            case 0x1166: // HANGUL JUNGSEONG E
            case 0x1167: // HANGUL JUNGSEONG YEO
            case 0x1168: // HANGUL JUNGSEONG YE
            case 0x1169: // HANGUL JUNGSEONG O
            case 0x116A: // HANGUL JUNGSEONG WA
            case 0x116B: // HANGUL JUNGSEONG WAE
            case 0x116C: // HANGUL JUNGSEONG OE
            case 0x116D: // HANGUL JUNGSEONG YO
            case 0x116E: // HANGUL JUNGSEONG U
            case 0x116F: // HANGUL JUNGSEONG WEO
            case 0x1170: // HANGUL JUNGSEONG WE
            case 0x1171: // HANGUL JUNGSEONG WI
            case 0x1172: // HANGUL JUNGSEONG YU
            case 0x1173: // HANGUL JUNGSEONG EU
            case 0x1174: // HANGUL JUNGSEONG YI
            case 0x1175: // HANGUL JUNGSEONG I
            // Hangul trailing consonants
            case 0x11A8: // HANGUL JONGSEONG KIYEOK
            case 0x11A9: // HANGUL JONGSEONG SSANGKIYEOK
            case 0x11AA: // HANGUL JONGSEONG KIYEOK-SIOS
            case 0x11AB: // HANGUL JONGSEONG NIEUN
            case 0x11AC: // HANGUL JONGSEONG NIEUN-CIEUC
            case 0x11AD: // HANGUL JONGSEONG NIEUN-HIEUH
            case 0x11AE: // HANGUL JONGSEONG TIKEUT
            case 0x11AF: // HANGUL JONGSEONG RIEUL
            case 0x11B0: // HANGUL JONGSEONG RIEUL-KIYEOK
            case 0x11B1: // HANGUL JONGSEONG RIEUL-MIEUM
            case 0x11B2: // HANGUL JONGSEONG RIEUL-PIEUP
            case 0x11B3: // HANGUL JONGSEONG RIEUL-SIOS
            case 0x11B4: // HANGUL JONGSEONG RIEUL-THIEUTH
            case 0x11B5: // HANGUL JONGSEONG RIEUL-PHIEUPH
            case 0x11B6: // HANGUL JONGSEONG RIEUL-HIEUH
            case 0x11B7: // HANGUL JONGSEONG MIEUM
            case 0x11B8: // HANGUL JONGSEONG PIEUP
            case 0x11B9: // HANGUL JONGSEONG PIEUP-SIOS
            case 0x11BA: // HANGUL JONGSEONG SIOS
            case 0x11BB: // HANGUL JONGSEONG SSANGSIOS
            case 0x11BC: // HANGUL JONGSEONG IEUNG
            case 0x11BD: // HANGUL JONGSEONG CIEUC
            case 0x11BE: // HANGUL JONGSEONG CHIEUCH
            case 0x11BF: // HANGUL JONGSEONG KHIEUKH
            case 0x11C0: // HANGUL JONGSEONG THIEUTH
            case 0x11C1: // HANGUL JONGSEONG PHIEUPH
            case 0x11C2: // HANGUL JONGSEONG HIEUH
                return true;
            default:
                return isCombining(codePoint);
        }
    }

    /**
     * Replacing icu4j function for non-zero combining class.
     * Data from
     * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B:Canonical_Combining_Class!=0:%5D">
     * the Unicode set of characters with Canonical_Combining_Class != 0</a>.
     *
     * <p>The table is a fixed snapshot hard-coded in this method; code
     * points not listed here are reported as non-combining.
     *
     * @param c The code point to be tested.
     * @return {@code true} if {@code c} is listed in the table below.
     */
    private static boolean isCombining(int c) {
        // Common case: nothing below U+0300 is in the table.
        if (c < 0x0300) return false;

        // Contiguous ranges.
        if (c >= 0x0300 && c <= 0x034E) return true;
        if (c >= 0x0350 && c <= 0x036F) return true;
        if (c >= 0x0483 && c <= 0x0487) return true;
        if (c >= 0x0591 && c <= 0x05BD) return true;
        if (c >= 0x0610 && c <= 0x061A) return true;
        if (c >= 0x064B && c <= 0x065F) return true;
        if (c >= 0x06D6 && c <= 0x06DC) return true;
        if (c >= 0x06DF && c <= 0x06E4) return true;
        if (c >= 0x06EA && c <= 0x06ED) return true;
        if (c >= 0x0730 && c <= 0x074A) return true;
        if (c >= 0x07EB && c <= 0x07F3) return true;
        if (c >= 0x0816 && c <= 0x0819) return true;
        if (c >= 0x081B && c <= 0x0823) return true;
        if (c >= 0x0825 && c <= 0x0827) return true;
        if (c >= 0x0829 && c <= 0x082D) return true;
        if (c >= 0x0859 && c <= 0x085B) return true;
        if (c >= 0x0951 && c <= 0x0954) return true;
        if (c >= 0x0E38 && c <= 0x0E3A) return true;
        if (c >= 0x0E48 && c <= 0x0E4B) return true;
        if (c >= 0x0EC8 && c <= 0x0ECB) return true;
        if (c >= 0x0F7A && c <= 0x0F7D) return true;
        if (c >= 0x0F82 && c <= 0x0F84) return true;
        if (c >= 0x135D && c <= 0x135F) return true;
        if (c >= 0x1939 && c <= 0x193B) return true;
        if (c >= 0x1A75 && c <= 0x1A7C) return true;
        if (c >= 0x1B6B && c <= 0x1B73) return true;
        if (c >= 0x1CD0 && c <= 0x1CD2) return true;
        if (c >= 0x1CD4 && c <= 0x1CE0) return true;
        if (c >= 0x1CE2 && c <= 0x1CE8) return true;
        if (c >= 0x1DC0 && c <= 0x1DE6) return true;
        if (c >= 0x1DFC && c <= 0x1DFF) return true;
        if (c >= 0x20D0 && c <= 0x20DC) return true;
        if (c >= 0x20E5 && c <= 0x20F0) return true;
        if (c >= 0x2CEF && c <= 0x2CF1) return true;
        if (c >= 0x2DE0 && c <= 0x2DFF) return true;
        if (c >= 0x302A && c <= 0x302F) return true;
        if (c >= 0xA8E0 && c <= 0xA8F1) return true;
        if (c >= 0xA92B && c <= 0xA92D) return true;
        if (c >= 0xAAB2 && c <= 0xAAB4) return true;
        if (c >= 0xFE20 && c <= 0xFE26) return true;
        if (c >= 0x00010A38 && c <= 0x00010A3A) return true;
        if (c >= 0x0001D165 && c <= 0x0001D169) return true;
        if (c >= 0x0001D16D && c <= 0x0001D172) return true;
        if (c >= 0x0001D17B && c <= 0x0001D182) return true;
        if (c >= 0x0001D185 && c <= 0x0001D18B) return true;
        if (c >= 0x0001D1AA && c <= 0x0001D1AD) return true;
        if (c >= 0x0001D242 && c <= 0x0001D244) return true;

        // Isolated code points.
        switch (c) {
            case 0x05BF:
            case 0x05C1:
            case 0x05C2:
            case 0x05C4:
            case 0x05C5:
            case 0x05C7:
            case 0x0670:
            case 0x06E7:
            case 0x06E8:
            case 0x0711:
            case 0x093C:
            case 0x094D:
            case 0x09BC:
            case 0x09CD:
            case 0x0A3C:
            case 0x0A4D:
            case 0x0ABC:
            case 0x0ACD:
            case 0x0B3C:
            case 0x0B4D:
            case 0x0BCD:
            case 0x0C4D:
            case 0x0C55:
            case 0x0C56:
            case 0x0CBC:
            case 0x0CCD:
            case 0x0D4D:
            case 0x0DCA:
            case 0x0EB8:
            case 0x0EB9:
            case 0x0F18:
            case 0x0F19:
            case 0x0F35:
            case 0x0F37:
            case 0x0F39:
            case 0x0F71:
            case 0x0F72:
            case 0x0F74:
            case 0x0F80:
            case 0x0F86:
            case 0x0F87:
            case 0x0FC6:
            case 0x1037:
            case 0x1039:
            case 0x103A:
            case 0x108D:
            case 0x1714:
            case 0x1734:
            case 0x17D2:
            case 0x17DD:
            case 0x18A9:
            case 0x1A17:
            case 0x1A18:
            case 0x1A60:
            case 0x1A7F:
            case 0x1B34:
            case 0x1B44:
            case 0x1BAA:
            case 0x1BE6:
            case 0x1BF2:
            case 0x1BF3:
            case 0x1C37:
            case 0x1CED:
            case 0x20E1:
            case 0x2D7F:
            case 0x3099:
            case 0x309A:
            case 0xA66F:
            case 0xA67C:
            case 0xA67D:
            case 0xA6F0:
            case 0xA6F1:
            case 0xA806:
            case 0xA8C4:
            case 0xA953:
            case 0xA9B3:
            case 0xA9C0:
            case 0xAAB0:
            case 0xAAB7:
            case 0xAAB8:
            case 0xAABE:
            case 0xAABF:
            case 0xAAC1:
            case 0xABED:
            case 0xFB1E:
            case 0x000101FD:
            case 0x00010A0D:
            case 0x00010A0F:
            case 0x00010A3F:
            case 0x00011046:
            case 0x000110B9:
            case 0x000110BA:
                return true;
            default:
                return false;
        }
    }
}