package com.thaiopensource.datatype.xsd.regex.java;

import com.thaiopensource.datatype.xsd.regex.RegexSyntaxException;
import com.thaiopensource.util.Localizer;
import com.thaiopensource.util.Utf16;

import java.math.BigDecimal;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

/**
 * Translates XML Schema regexes into <code>java.util.regex</code> regexes.
 *
 * <p>The translator is a single-pass recursive-descent parser over the input string. Plain
 * characters and quantifiers are copied to the output more or less verbatim; escapes and
 * character class expressions are first turned into a {@link CharClass} tree and then
 * serialized in <code>java.util.regex</code> syntax.
 *
 * @see java.util.regex.Pattern
 * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
 */
public class Translator {
  /** The XML Schema regular expression being translated. */
  private final String regExp;
  /** Index of the next character to read; always one past the index of {@link #curChar}. */
  private int pos = 0;
  /** Cached length of {@link #regExp}. */
  private final int length;
  /** The current lookahead character, or {@link #EOS} once the input is exhausted. */
  private char curChar;
  /** Set once {@link #advance()} has run off the end of the input. */
  private boolean eos = false;
  /** Accumulates the translated regular expression. */
  private final StringBuffer result = new StringBuffer();

  /**
   * When true, non-BMP characters are written directly as surrogate pairs inside character
   * classes; when false, they are rewritten into explicit surrogate alternatives.
   */
  static private final boolean surrogatesDirect = RegexFeatures.SURROGATES_DIRECT;

  /** One-letter general category codes; the index is used for {@link #categoryCharClasses}. */
  static private final String categories = "LMNPZSC";
  /** Lazily filled cache of character classes, parallel to {@link #categories}. */
  static private final CharClass[] categoryCharClasses = new CharClass[categories.length()];
  /** Two-letter sub-category codes, concatenated; entry i occupies chars 2i and 2i+1. */
  static private final String subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
  /** Lazily filled cache of character classes, parallel to {@link #subCategories}. */
  static private final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2];

  /** Lowest code point outside the Basic Multilingual Plane. */
  static private final int NONBMP_MIN = 0x10000;
  /** Highest code point. */
  static private final int NONBMP_MAX = 0x10FFFF;
  /** Lowest low (second) surrogate. */
  static private final char SURROGATE2_MIN = '\uDC00';
  /** Highest low (second) surrogate. */
  static private final char SURROGATE2_MAX = '\uDFFF';

  /** Source of the localized messages used in {@link RegexSyntaxException}s. */
  static final Localizer localizer = new Localizer(Translator.class);

  /**
   * Names of blocks that are accepted after the <code>Is</code> prefix and are passed on to
   * Java with an <code>In</code> prefix.
   */
  static private final String[] blockNames = {
    "BasicLatin",
    "Latin-1Supplement",
    "LatinExtended-A",
    "LatinExtended-B",
    "IPAExtensions",
    "SpacingModifierLetters",
    "CombiningDiacriticalMarks",
    "Greek",
    "Cyrillic",
    "Armenian",
    "Hebrew",
    "Arabic",
    "Syriac",
    "Thaana",
    "Devanagari",
    "Bengali",
    "Gurmukhi",
    "Gujarati",
    "Oriya",
    "Tamil",
    "Telugu",
    "Kannada",
    "Malayalam",
    "Sinhala",
    "Thai",
    "Lao",
    "Tibetan",
    "Myanmar",
    "Georgian",
    "HangulJamo",
    "Ethiopic",
    "Cherokee",
    "UnifiedCanadianAboriginalSyllabics",
    "Ogham",
    "Runic",
    "Khmer",
    "Mongolian",
    "LatinExtendedAdditional",
    "GreekExtended",
    "GeneralPunctuation",
    "SuperscriptsandSubscripts",
    "CurrencySymbols",
    "CombiningMarksforSymbols",
    "LetterlikeSymbols",
    "NumberForms",
    "Arrows",
    "MathematicalOperators",
    "MiscellaneousTechnical",
    "ControlPictures",
    "OpticalCharacterRecognition",
    "EnclosedAlphanumerics",
    "BoxDrawing",
    "BlockElements",
    "GeometricShapes",
    "MiscellaneousSymbols",
    "Dingbats",
    "BraillePatterns",
    "CJKRadicalsSupplement",
    "KangxiRadicals",
    "IdeographicDescriptionCharacters",
    "CJKSymbolsandPunctuation",
    "Hiragana",
    "Katakana",
    "Bopomofo",
    "HangulCompatibilityJamo",
    "Kanbun",
    "BopomofoExtended",
    "EnclosedCJKLettersandMonths",
    "CJKCompatibility",
    "CJKUnifiedIdeographsExtensionA",
    "CJKUnifiedIdeographs",
    "YiSyllables",
    "YiRadicals",
    "HangulSyllables",
    // surrogates excluded because there are never any *characters* with codes in surrogate range
    // "PrivateUse", excluded because 3.1 adds non-BMP ranges
    "CJKCompatibilityIdeographs",
    "AlphabeticPresentationForms",
    "ArabicPresentationForms-A",
    "CombiningHalfMarks",
    "CJKCompatibilityForms",
    "SmallFormVariants",
    "ArabicPresentationForms-B",
    "Specials",
    "HalfwidthandFullwidthForms",
    "Specials"
  };

  /**
   * Names of blocks including ranges outside the BMP.
   */
  static private final String[] specialBlockNames = {
    "OldItalic",
    "Gothic",
    "Deseret",
    "ByzantineMusicalSymbols",
    "MusicalSymbols",
    "MathematicalAlphanumericSymbols",
    "CJKUnifiedIdeographsExtensionB",
    "CJKCompatibilityIdeographsSupplement",
    "Tags",
    "PrivateUse"
  };

  /**
   * CharClass for each block name in specialBlockNames.
   */
  static private final CharClass[] specialBlockCharClasses = {
    new CharRange(0x10300, 0x1032F),
    new CharRange(0x10330, 0x1034F),
    new CharRange(0x10400, 0x1044F),
    new CharRange(0x1D000, 0x1D0FF),
    new CharRange(0x1D100, 0x1D1FF),
    new CharRange(0x1D400, 0x1D7FF),
    new CharRange(0x20000, 0x2A6D6),
    new CharRange(0x2F800, 0x2FA1F),
    new CharRange(0xE0000, 0xE007F),
    new Union(new CharClass[] {
      new CharRange(0xE000, 0xF8FF),
      new CharRange(0xF0000, 0xFFFFD),
      new CharRange(0x100000, 0x10FFFD)
    })
  };

  /** The <code>.</code> wildcard: anything except line feed and carriage return. */
  static private final CharClass DOT = new Complement(new Union(new CharClass[] { new SingleChar('\n'), new SingleChar('\r') }));

  /** <code>\d</code>: category Nd. */
  static private final CharClass ESC_d = new Property("Nd");

  /** <code>\D</code>: complement of <code>\d</code>. */
  static private final CharClass ESC_D = new Complement(ESC_d);

  /** <code>\W</code>: union of categories P, Z and C. */
  static private final CharClass ESC_W = new Union(new CharClass[] {new Property("P"), new Property("Z"), new Property("C")});

  /** <code>\w</code>: complement of <code>\W</code>. */
  static private final CharClass ESC_w = new Complement(ESC_W);

  /** <code>\s</code>: space, line feed, carriage return and tab. */
  static private final CharClass ESC_s = new Union(new CharClass[] {
    new SingleChar(' '),
    new SingleChar('\n'),
    new SingleChar('\r'),
    new SingleChar('\t')
  });

  /** <code>\S</code>: complement of <code>\s</code>. */
  static private final CharClass ESC_S = new Complement(ESC_s);

  /** <code>\i</code>: built from the NMSTRT tables in <code>NamingExceptions</code>. */
  static private final CharClass ESC_i = makeCharClass(NamingExceptions.NMSTRT_CATEGORIES,
                                                       NamingExceptions.NMSTRT_INCLUDES,
                                                       NamingExceptions.NMSTRT_EXCLUDE_RANGES);

  /** <code>\I</code>: complement of <code>\i</code>. */
  static private final CharClass ESC_I = new Complement(ESC_i);

  /** <code>\c</code>: built from the NMCHAR tables in <code>NamingExceptions</code>. */
  static private final CharClass ESC_c = makeCharClass(NamingExceptions.NMCHAR_CATEGORIES,
                                                       NamingExceptions.NMCHAR_INCLUDES,
                                                       NamingExceptions.NMCHAR_EXCLUDE_RANGES);

  /** <code>\C</code>: complement of <code>\c</code>. */
  static private final CharClass ESC_C = new Complement(ESC_c);

  /**
   * Value of {@link #curChar} at end of input. A NUL may also occur literally in the input, so
   * code that needs to tell the two apart must check {@link #eos} as well.
   */
  static private final char EOS = '\0';

  /**
   * Creates a translator positioned on the first character of <code>regExp</code>.
   *
   * @param regExp the XML Schema regular expression to translate
   */
  private Translator(String regExp) {
    this.regExp = regExp;
    this.length = regExp.length();
    advance();
  }

  /**
   * Translates a regular expression in the syntax of XML Schemas Part 2 into a regular
   * expression in the syntax of <code>java.util.regex.Pattern</code>.  The translation
   * assumes that the string to be matched against the regex uses surrogate pairs correctly.
   * If the string comes from XML content, a conforming XML parser will automatically
   * check this; if the string comes from elsewhere, it may be necessary to check
   * surrogate usage before matching.
   *
   * @param regexp a String containing a regular expression in the syntax of XML Schemas Part 2
   * @return a String containing a regular expression in the syntax of java.util.regex.Pattern
   * @throws RegexSyntaxException if <code>regexp</code> is not a regular expression in the
   * syntax of XML Schemas Part 2
   * @see java.util.regex.Pattern
   * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
   */
  static public String translate(String regexp) throws RegexSyntaxException {
    Translator tr = new Translator(regexp);
    tr.translateTop();
    return tr.result.toString();
  }

  /**
   * Moves to the next input character. At end of input sets {@link #curChar} to {@link #EOS}
   * and raises {@link #eos}; <code>pos</code> is still incremented so that
   * <code>pos - 1</code> keeps designating the current position for error reporting.
   */
  private void advance() {
    if (pos < length)
      curChar = regExp.charAt(pos++);
    else {
      pos++;
      curChar = EOS;
      eos = true;
    }
  }

  /**
   * Translates a complete regular expression and checks that all input was consumed.
   *
   * @throws RegexSyntaxException if the expression is malformed or has trailing input
   */
  private void translateTop() throws RegexSyntaxException {
    translateRegExp();
    if (!eos)
      throw makeException("expected_eos");
  }

  /**
   * Translates one or more branches separated by <code>|</code>.
   *
   * @throws RegexSyntaxException if a branch is malformed
   */
  private void translateRegExp() throws RegexSyntaxException {
    translateBranch();
    while (curChar == '|') {
      copyCurChar();
      translateBranch();
    }
  }

  /**
   * Translates a possibly empty sequence of atoms, each followed by an optional quantifier.
   *
   * @throws RegexSyntaxException if an atom or quantifier is malformed
   */
  private void translateBranch() throws RegexSyntaxException {
    while (translateAtom())
      translateQuantifier();
  }

  /**
   * Translates an optional quantifier: <code>*</code>, <code>?</code>, <code>+</code> or a
   * braced quantity. Does nothing if the current character does not start a quantifier.
   *
   * @throws RegexSyntaxException if a braced quantity is malformed
   */
  private void translateQuantifier() throws RegexSyntaxException {
    switch (curChar) {
    case '*':
    case '?':
    case '+':
      copyCurChar();
      return;
    case '{':
      copyCurChar();
      translateQuantity();
      expect('}');
      copyCurChar();
    }
  }

  /**
   * Translates the inside of a braced quantity: <code>n</code>, <code>n,</code> or
   * <code>n,m</code>. Bounds that do not fit in an <code>int</code> are clamped to
   * <code>Integer.MAX_VALUE</code> in the output, while the lower-not-greater-than-upper check
   * is still carried out on the exact values.
   *
   * @throws RegexSyntaxException if a bound is not a digit sequence or lower exceeds upper
   */
  private void translateQuantity() throws RegexSyntaxException {
    String lower = parseQuantExact();
    // -1 marks a lower bound too large for an int; real bounds are never negative
    int lowerValue = -1;
    try {
      lowerValue = Integer.parseInt(lower);
      result.append(lower);
    }
    catch (NumberFormatException e) {
      // JDK 1.4 cannot handle ranges bigger than this
      result.append(Integer.MAX_VALUE);
    }
    if (curChar == ',') {
      copyCurChar();
      if (curChar != '}') {
        String upper = parseQuantExact();
        try {
          int upperValue = Integer.parseInt(upper);
          result.append(upper);
          // an overflowing lower bound is necessarily bigger than an upper bound that fits
          if (lowerValue < 0 || upperValue < lowerValue)
            throw makeException("invalid_quantity_range");
        }
        catch (NumberFormatException e) {
          result.append(Integer.MAX_VALUE);
          // both bounds overflow: compare them exactly
          if (lowerValue < 0 && new BigDecimal(lower).compareTo(new BigDecimal(upper)) > 0)
            throw makeException("invalid_quantity_range");
        }
      }
    }
  }

  /**
   * Reads a non-empty run of ASCII digits terminated by <code>,</code> or <code>}</code>. The
   * terminator is left as the current character.
   *
   * @return the digits read
   * @throws RegexSyntaxException if a non-digit is found before the terminator
   */
  private String parseQuantExact() throws RegexSyntaxException {
    StringBuffer buf = new StringBuffer();
    do {
      if ("0123456789".indexOf(curChar) < 0)
        throw makeException("expected_digit");
      buf.append(curChar);
      advance();
    } while (curChar != ',' && curChar != '}');
    return buf.toString();
  }

  /** Appends the current character to the output unchanged and advances. */
  private void copyCurChar() {
    result.append(curChar);
    advance();
  }

  /*
   * Containment levels of a character class with respect to a set of characters (the BMP or
   * the non-BMP planes). The values are chosen so that unary minus gives the level of the
   * complement, Math.min the level of an intersection and Math.max that of a union.
   */
  static final int NONE = -1;
  static final int SOME = 0;
  static final int ALL = 1;

  /** Class matching any high (first) surrogate. */
  static final String SURROGATES1_CLASS = "[\uD800-\uDBFF]";
  /** Class matching any low (second) surrogate. */
  static final String SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
  /** Class that can never match: NUL intersected with not-NUL. */
  static final String NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";

  /**
   * An inclusive range of code points. Ranges order by ascending minimum and, for equal
   * minimums, by descending maximum.
   */
  static final class Range implements Comparable {
    private final int min;
    private final int max;

    Range(int min, int max) {
      this.min = min;
      this.max = max;
    }

    int getMin() {
      return min;
    }

    int getMax() {
      return max;
    }

    public int compareTo(Object o) {
      Range other = (Range)o;
      if (this.min < other.min)
        return -1;
      if (this.min > other.min)
        return 1;
      // same start: the wider range sorts first
      if (this.max > other.max)
        return -1;
      if (this.max < other.max)
        return 1;
      return 0;
    }
  }

  /**
   * A set of characters that knows how to write itself in <code>java.util.regex</code> syntax.
   */
  static abstract class CharClass {

    private final int containsBmp;
    // if it contains ALL and containsBmp != NONE, then the generated class for containsBmp must
    // contain all the high surrogates
    private final int containsNonBmp;

    /**
     * @param containsBmp NONE, SOME or ALL, describing the BMP characters in this class
     * @param containsNonBmp NONE, SOME or ALL, describing the non-BMP characters in this class
     */
    protected CharClass(int containsBmp, int containsNonBmp) {
      this.containsBmp = containsBmp;
      this.containsNonBmp = containsNonBmp;
    }

    int getContainsBmp() {
      return containsBmp;
    }

    int getContainsNonBmp() {
      return containsNonBmp;
    }

    /**
     * Appends a regex matching this class, choosing the output strategy according to
     * {@link Translator#surrogatesDirect}.
     */
    final void output(StringBuffer buf) {
      if (surrogatesDirect)
        outputDirect(buf);
      else
        outputMungeSurrogates(buf);
    }

    /**
     * Appends a regex matching this class in which non-BMP characters are expressed as explicit
     * sequences of a high surrogate followed by a low surrogate, grouped with the BMP part in a
     * parenthesized alternation.
     */
    final void outputMungeSurrogates(StringBuffer buf) {
      switch (containsNonBmp) {
      case NONE:
        if (containsBmp == NONE)
          buf.append(NOT_ALLOWED_CLASS);
        else
          outputDirect(buf);
        break;
      case ALL:
        buf.append('(');
        if (containsBmp == NONE) {
          buf.append(SURROGATES1_CLASS);
          buf.append(SURROGATES2_CLASS);
        }
        else {
          // the BMP class is required to contain the high surrogates (see containsNonBmp),
          // so an optional low surrogate after it covers every non-BMP character
          outputDirect(buf);
          buf.append(SURROGATES2_CLASS);
          buf.append('?');
        }
        buf.append(')');
        break;
      case SOME:
        buf.append('(');
        boolean needSep = false;
        if (containsBmp != NONE) {
          needSep = true;
          outputDirect(buf);
        }
        List ranges = new Vector();
        addNonBmpRanges(ranges);
        sortRangeList(ranges);
        // high surrogates whose whole block of low surrogates is included
        String hi = highSurrogateRanges(ranges);
        if (hi.length() > 0) {
          if (needSep)
            buf.append('|');
          else
            needSep = true;
          buf.append('[');
          for (int i = 0, len = hi.length(); i < len; i += 2) {
            char min = hi.charAt(i);
            char max = hi.charAt(i + 1);
            if (min == max)
              buf.append(min);
            else {
              buf.append(min);
              buf.append('-');
              buf.append(max);
            }
          }
          buf.append(']');
          buf.append(SURROGATES2_CLASS);
        }
        // high surrogates with only part of their low surrogates included
        String lo = lowSurrogateRanges(ranges);
        for (int i = 0, len = lo.length(); i < len; i += 3) {
          if (needSep)
            buf.append('|');
          else
            needSep = true;
          buf.append(lo.charAt(i));
          char min = lo.charAt(i + 1);
          char max = lo.charAt(i + 2);
          if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)))
            buf.append(min);
          else {
            buf.append('[');
            // merge consecutive triples that share the same high surrogate into one class
            for (;;) {
              if (min == max)
                buf.append(min);
              else {
                buf.append(min);
                buf.append('-');
                buf.append(max);
              }
              if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))
                break;
              i += 3;
              min = lo.charAt(i + 1);
              max = lo.charAt(i + 2);
            }
            buf.append(']');
          }
        }
        if (!needSep)
          buf.append(NOT_ALLOWED_CLASS);
        buf.append(')');
        break;
      }
    }

    /**
     * Computes the high surrogates for which every low surrogate falls inside a range.
     *
     * @param ranges list of {@link Range} of non-BMP code points
     * @return a string of (min, max) pairs of high surrogates, two chars per pair
     */
    static String highSurrogateRanges(List ranges) {
      StringBuffer highRanges = new StringBuffer();
      for (int i = 0, len = ranges.size(); i < len; i++) {
        Range r = (Range)ranges.get(i);
        char min1 = Utf16.surrogate1(r.getMin());
        char min2 = Utf16.surrogate2(r.getMin());
        char max1 = Utf16.surrogate1(r.getMax());
        char max2 = Utf16.surrogate2(r.getMax());
        // drop the first and last high surrogate if they are only partially covered
        if (min2 != SURROGATE2_MIN)
          min1++;
        if (max2 != SURROGATE2_MAX)
          max1--;
        if (max1 >= min1) {
          highRanges.append(min1);
          highRanges.append(max1);
        }
      }
      return highRanges.toString();
    }

    /**
     * Computes the partially covered high surrogates together with the low surrogates that
     * are covered for each.
     *
     * @param ranges list of {@link Range} of non-BMP code points
     * @return a string of (high, lowMin, lowMax) triples, three chars per triple
     */
    static String lowSurrogateRanges(List ranges) {
      StringBuffer lowRanges = new StringBuffer();
      for (int i = 0, len = ranges.size(); i < len; i++) {
        Range r = (Range)ranges.get(i);
        char min1 = Utf16.surrogate1(r.getMin());
        char min2 = Utf16.surrogate2(r.getMin());
        char max1 = Utf16.surrogate1(r.getMax());
        char max2 = Utf16.surrogate2(r.getMax());
        if (min1 == max1) {
          if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) {
            lowRanges.append(min1);
            lowRanges.append(min2);
            lowRanges.append(max2);
          }
        }
        else {
          if (min2 != SURROGATE2_MIN) {
            lowRanges.append(min1);
            lowRanges.append(min2);
            lowRanges.append(SURROGATE2_MAX);
          }
          if (max2 != SURROGATE2_MAX) {
            lowRanges.append(max1);
            lowRanges.append(SURROGATE2_MIN);
            lowRanges.append(max2);
          }
        }
      }
      return lowRanges.toString();
    }

    /** Appends a regex for this class, writing members as they are. */
    abstract void outputDirect(StringBuffer buf);

    /** Appends a regex for the complement of this class, writing members as they are. */
    abstract void outputComplementDirect(StringBuffer buf);

    /**
     * @return the code point if this class is a single character, otherwise -1
     */
    int singleChar() {
      return -1;
    }

    /**
     * Adds the non-BMP ranges contained in this class to <code>ranges</code>. The default
     * implementation adds nothing.
     *
     * @param ranges list of {@link Range} to add to
     */
    void addNonBmpRanges(List ranges) {
    }

    /**
     * Sorts a list of {@link Range} in place and merges ranges that overlap or are adjacent.
     *
     * @param ranges the list to normalize
     */
    static void sortRangeList(List ranges) {
      Collections.sort(ranges);
      int toIndex = 0;
      int fromIndex = 0;
      int len = ranges.size();
      while (fromIndex < len) {
        Range r = (Range)ranges.get(fromIndex);
        int min = r.getMin();
        int max = r.getMax();
        // absorb every following range that starts no later than one past the current end
        while (++fromIndex < len) {
          Range r2 = (Range)ranges.get(fromIndex);
          if (r2.getMin() > max + 1)
            break;
          if (r2.getMax() > max)
            max = r2.getMax();
        }
        if (max != r.getMax())
          r = new Range(min, max);
        ranges.set(toIndex++, r);
      }
      // discard the now unused tail
      while (len > toIndex)
        ranges.remove(--len);
    }

  }

  /**
   * A character class that can be written as items inside a single pair of square brackets.
   */
  static abstract class SimpleCharClass extends CharClass {
    SimpleCharClass(int containsBmp, int containsNonBmp) {
      super(containsBmp, containsNonBmp);
    }

    void outputDirect(StringBuffer buf) {
      buf.append('[');
      inClassOutputDirect(buf);
      buf.append(']');
    }

    // must not call if containsBmp == ALL && !surrogatesDirect
    void outputComplementDirect(StringBuffer buf) {
      if (!surrogatesDirect && getContainsBmp() == NONE)
        buf.append("[\u0000-\uFFFF]");
      else {
        buf.append("[^");
        inClassOutputDirect(buf);
        buf.append(']');
      }
    }

    /** Appends this class in the form used inside square brackets, without the brackets. */
    abstract void inClassOutputDirect(StringBuffer buf);

    /** Appends the surrogate pair for code point <code>c</code>. */
    static void outputWide(StringBuffer buf, int c) {
      buf.append(Utf16.surrogate1(c));
      buf.append(Utf16.surrogate2(c));
    }
  }

  /** A single BMP character. */
  static class SingleChar extends SimpleCharClass {
    private final char c;

    SingleChar(char c) {
      super(SOME, NONE);
      this.c = c;
    }

    int singleChar() {
      return c;
    }

    // a lone character needs no brackets
    void outputDirect(StringBuffer buf) {
      inClassOutputDirect(buf);
    }

    void inClassOutputDirect(StringBuffer buf) {
      if (isJavaMetaChar(c))
        buf.append('\\');
      buf.append(c);
    }

  }

  /** A single non-BMP character. */
  static class WideSingleChar extends SimpleCharClass {
    private final int c;

    WideSingleChar(int c) {
      super(NONE, SOME);
      this.c = c;
    }

    void inClassOutputDirect(StringBuffer buf) {
      // in munge mode the character is emitted via addNonBmpRanges, never directly
      if (!surrogatesDirect)
        throw new RuntimeException("BMP output botch");
      outputWide(buf, c);
    }

    int singleChar() {
      return c;
    }

    void addNonBmpRanges(List ranges) {
      ranges.add(new Range(c, c));
    }
  }

  /** An inclusive range of code points, which may span the BMP boundary. */
  static class CharRange extends SimpleCharClass {
    private final int lower;
    private final int upper;

    CharRange(int lower, int upper) {
      super(lower < NONBMP_MIN ? SOME : NONE,
            // don't use ALL here, because that requires that the BMP class contains high surrogates
            upper >= NONBMP_MIN ? SOME : NONE);
      this.lower = lower;
      this.upper = upper;
    }

    void inClassOutputDirect(StringBuffer buf) {
      if (lower < NONBMP_MIN) {
        if (isJavaMetaChar((char)lower))
          buf.append('\\');
        buf.append((char)lower);
      }
      else if (surrogatesDirect)
        outputWide(buf, lower);
      else
        throw new RuntimeException("BMP output botch");
      buf.append('-');
      if (upper < NONBMP_MIN) {
        if (isJavaMetaChar((char)upper))
          buf.append('\\');
        buf.append((char)upper);
      }
      else if (surrogatesDirect)
        outputWide(buf, upper);
      else
        // munge mode: only the BMP part is written here, so cap the range at the top of the BMP
        buf.append('\uFFFF');
    }

    void addNonBmpRanges(List ranges) {
      if (upper >= NONBMP_MIN)
        ranges.add(new Range(lower < NONBMP_MIN ? NONBMP_MIN : lower, upper));
    }
  }

  /** A class written with the Java <code>\p{name}</code> / <code>\P{name}</code> syntax. */
  static class Property extends SimpleCharClass {
    private final String name;

    Property(String name) {
      super(SOME, NONE);
      this.name = name;
    }

    // a property escape is valid both inside and outside brackets
    void outputDirect(StringBuffer buf) {
      inClassOutputDirect(buf);
    }

    void inClassOutputDirect(StringBuffer buf) {
      buf.append("\\p{");
      buf.append(name);
      buf.append('}');
    }

    void outputComplementDirect(StringBuffer buf) {
      buf.append("\\P{");
      buf.append(name);
      buf.append('}');
    }
  }

  /** The characters of one class that are not in another. */
  static class Subtraction extends CharClass {
    private final CharClass cc1;
    private final CharClass cc2;

    Subtraction(CharClass cc1, CharClass cc2) {
      // min corresponds to intersection
      // complement corresponds to negation
      super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
            Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
      this.cc1 = cc1;
      this.cc2 = cc2;
    }

    // cc1 intersected with the complement of cc2
    void outputDirect(StringBuffer buf) {
      buf.append('[');
      cc1.outputDirect(buf);
      buf.append("&&");
      cc2.outputComplementDirect(buf);
      buf.append(']');
    }

    // the complement of cc1 united with cc2
    void outputComplementDirect(StringBuffer buf) {
      buf.append('[');
      cc1.outputComplementDirect(buf);
      cc2.outputDirect(buf);
      buf.append(']');
    }

    void addNonBmpRanges(List ranges) {
      List posList = new Vector();
      cc1.addNonBmpRanges(posList);
      List negList = new Vector();
      cc2.addNonBmpRanges(negList);
      sortRangeList(posList);
      sortRangeList(negList);
      // walk both sorted lists in parallel, cutting the negative ranges out of the positive ones
      Iterator negIter = negList.iterator();
      Range negRange;
      if (negIter.hasNext())
        negRange = (Range)negIter.next();
      else
        negRange = null;
      for (int i = 0, len = posList.size(); i < len; i++) {
        Range posRange = (Range)posList.get(i);
        // skip negative ranges that lie entirely before this positive range
        while (negRange != null && negRange.getMax() < posRange.getMin()) {
          if (negIter.hasNext())
            negRange = (Range)negIter.next();
          else
            negRange = null;
        }
        // if negRange != null, negRange.max >= posRange.min
        int min = posRange.getMin();
        while (negRange != null && negRange.getMin() <= posRange.getMax()) {
          if (min < negRange.getMin()) {
            ranges.add(new Range(min, negRange.getMin() - 1));
          }
          min = negRange.getMax() + 1;
          // this negative range reaches past the positive one; keep it for the next positive range
          if (min > posRange.getMax())
            break;
          if (negIter.hasNext())
            negRange = (Range)negIter.next();
          else
            negRange = null;
        }
        if (min <= posRange.getMax())
          ranges.add(new Range(min, posRange.getMax()));
      }
    }
  }

  /** The union of a list of character classes. */
  static class Union extends CharClass {
    private final List members;

    Union(CharClass[] v) {
      this(toList(v));
    }

    static private List toList(CharClass[] v) {
      List members = new Vector();
      for (int i = 0; i < v.length; i++)
        members.add(v[i]);
      return members;
    }

    Union(List members) {
      super(computeContainsBmp(members), computeContainsNonBmp(members));
      this.members = members;
    }

    void outputDirect(StringBuffer buf) {
      buf.append('[');
      for (int i = 0, len = members.size(); i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        // in munge mode, members without BMP characters are emitted via addNonBmpRanges instead
        if (surrogatesDirect || cc.getContainsBmp() != NONE) {
          if (cc instanceof SimpleCharClass)
            ((SimpleCharClass)cc).inClassOutputDirect(buf);
          else
            cc.outputDirect(buf);
        }
      }
      buf.append(']');
    }

    void outputComplementDirect(StringBuffer buf) {
      boolean first = true;
      int len = members.size();
      // first pass: all simple members go together into one negated bracket expression
      for (int i = 0; i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if ((surrogatesDirect || cc.getContainsBmp() != NONE) && cc instanceof SimpleCharClass) {
          if (first) {
            buf.append("[^");
            first = false;
          }
          ((SimpleCharClass)cc).inClassOutputDirect(buf);
        }
      }
      // second pass: intersect with the complement of each remaining member
      for (int i = 0; i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if ((surrogatesDirect || cc.getContainsBmp() != NONE) && !(cc instanceof SimpleCharClass)) {
          if (first) {
            buf.append('[');
            first = false;
          }
          else
            buf.append("&&");
          // can't have any members that are ALL, because that would make this ALL, which violates
          // the precondition for outputComplementDirect
          cc.outputComplementDirect(buf);
        }
      }
      if (first)
        // all members are NONE, so this is NONE, so complement is everything
        buf.append("[\u0000-\uFFFF]");
      else
        buf.append(']');
    }

    void addNonBmpRanges(List ranges) {
      for (int i = 0, len = members.size(); i < len; i++)
        ((CharClass)members.get(i)).addNonBmpRanges(ranges);
    }

    private static int computeContainsBmp(List members) {
      int ret = NONE;
      for (int i = 0, len = members.size(); i < len; i++)
        ret = Math.max(ret, ((CharClass)members.get(i)).getContainsBmp());
      return ret;
    }

    private static int computeContainsNonBmp(List members) {
      int ret = NONE;
      for (int i = 0, len = members.size(); i < len; i++)
        ret = Math.max(ret, ((CharClass)members.get(i)).getContainsNonBmp());
      return ret;
    }
  }

  /** The complement of a character class. */
  static class Complement extends CharClass {
    private final CharClass cc;

    Complement(CharClass cc) {
      super(-cc.getContainsBmp(), -cc.getContainsNonBmp());
      this.cc = cc;
    }

    void outputDirect(StringBuffer buf) {
      cc.outputComplementDirect(buf);
    }

    void outputComplementDirect(StringBuffer buf) {
      cc.outputDirect(buf);
    }

    void addNonBmpRanges(List ranges) {
      List tem = new Vector();
      cc.addNonBmpRanges(tem);
      sortRangeList(tem);
      // emit the gaps between the wrapped class's ranges; c is the start of the next gap
      int c = NONBMP_MIN;
      for (int i = 0, len = tem.size(); i < len; i++) {
        Range r = (Range)tem.get(i);
        if (r.getMin() > c)
          ranges.add(new Range(c, r.getMin() - 1));
        c = r.getMax() + 1;
      }
      if (c != NONBMP_MAX + 1)
        ranges.add(new Range(c, NONBMP_MAX));
    }
  }

  /**
   * Translates one atom: a parenthesized group, an escape, a character class expression, the
   * wildcard or an ordinary character.
   *
   * @return false, consuming nothing, if the current character cannot start an atom
   * @throws RegexSyntaxException if the atom is malformed
   */
  private boolean translateAtom() throws RegexSyntaxException {
    switch (curChar) {
    case EOS:
      // a NUL that is really in the input is an ordinary character
      if (!eos)
        break;
      // fall through
    case '?':
    case '*':
    case '+':
    case ')':
    case '{':
    case '}':
    case '|':
    case ']':
      return false;
    case '(':
      copyCurChar();
      translateRegExp();
      expect(')');
      copyCurChar();
      return true;
    case '\\':
      advance();
      parseEsc().output(result);
      return true;
    case '[':
      advance();
      parseCharClassExpr().output(result);
      return true;
    case '.':
      DOT.output(result);
      advance();
      return true;
    case '$':
    case '^':
      // written escaped so they are copied as literal characters below
      result.append('\\');
      break;
    }
    copyCurChar();
    return true;
  }

  /**
   * Builds a class from name-character tables: the union of some categories and individual
   * characters, minus some ranges.
   *
   * @param categories concatenated two-letter category names
   * @param includes additional characters to include; runs of three or more consecutive
   * characters are turned into ranges
   * @param excludeRanges (min, max) char pairs to exclude, two chars per pair
   * @return the resulting character class
   */
  static private CharClass makeCharClass(String categories, String includes, String excludeRanges) {
    List includeList = new Vector();
    for (int i = 0, len = categories.length(); i < len; i += 2)
      includeList.add(new Property(categories.substring(i, i + 2)));
    for (int i = 0, len = includes.length(); i < len; i++) {
      // find the end j of the run of consecutive characters starting at i
      int j = i + 1;
      for (; j < len && includes.charAt(j) - includes.charAt(i) == j - i; j++)
        ;
      --j;
      // a run of exactly two is emitted as two single characters rather than a range
      if (i == j - 1)
        --j;
      if (i == j)
        includeList.add(new SingleChar(includes.charAt(i)));
      else
        includeList.add(new CharRange(includes.charAt(i), includes.charAt(j)));
      i = j;
    }
    List excludeList = new Vector();
    for (int i = 0, len = excludeRanges.length(); i < len; i += 2) {
      char min = excludeRanges.charAt(i);
      char max = excludeRanges.charAt(i + 1);
      if (min == max)
        excludeList.add(new SingleChar(min));
      else if (min == max - 1) {
        excludeList.add(new SingleChar(min));
        excludeList.add(new SingleChar(max));
      }
      else
        excludeList.add(new CharRange(min, max));
    }
    if (surrogatesDirect)
      excludeList.add(new CharRange(NONBMP_MIN, NONBMP_MAX)); // Unicode 4.0 adds some non-BMP letters
    return new Subtraction(new Union(includeList), new Union(excludeList));
  }

  /**
   * Parses the part of an escape that follows the backslash.
   *
   * @return the character class denoted by the escape
   * @throws RegexSyntaxException if the escape is not recognized or a property is malformed
   */
  private CharClass parseEsc() throws RegexSyntaxException {
    switch (curChar) {
    case 'n':
      advance();
      return new SingleChar('\n');
    case 'r':
      advance();
      return new SingleChar('\r');
    case 't':
      advance();
      return new SingleChar('\t');
    case '\\':
    case '|':
    case '.':
    case '-':
    case '^':
    case '?':
    case '*':
    case '+':
    case '(':
    case ')':
    case '{':
    case '}':
    case '[':
    case ']':
      // escaped metacharacter: handled after the switch as the literal character
      break;
    case 's':
      advance();
      return ESC_s;
    case 'S':
      advance();
      return ESC_S;
    case 'i':
      advance();
      return ESC_i;
    case 'I':
      advance();
      return ESC_I;
    case 'c':
      advance();
      return ESC_c;
    case 'C':
      advance();
      return ESC_C;
    case 'd':
      advance();
      return ESC_d;
    case 'D':
      advance();
      return ESC_D;
    case 'w':
      advance();
      return ESC_w;
    case 'W':
      advance();
      return ESC_W;
    case 'p':
      advance();
      return parseProp();
    case 'P':
      advance();
      return new Complement(parseProp());
    default:
      throw makeException("bad_escape");
    }
    CharClass tem = new SingleChar(curChar);
    advance();
    return tem;
  }

  /**
   * Parses the braced name of a category or block escape. The current character must be the
   * opening brace.
   *
   * @return the character class for the named category or block
   * @throws RegexSyntaxException if the braces are missing or the name is not recognized
   */
  private CharClass parseProp() throws RegexSyntaxException {
    expect('{');
    int start = pos;
    for (;;) {
      advance();
      if (curChar == '}')
        break;
      // anything other than a name character here means the closing brace is missing
      if (!isAsciiAlnum(curChar) && curChar != '-')
        expect('}');
    }
    String propertyName = regExp.substring(start, pos - 1);
    advance();
    switch (propertyName.length()) {
    case 0:
      throw makeException("empty_property_name");
    case 2:
      int sci = subCategories.indexOf(propertyName);
      // an odd index is a match straddling two adjacent entries, not a real sub-category
      if (sci < 0 || sci % 2 == 1)
        throw makeException("bad_category", propertyName);
      return getSubCategoryCharClass(sci / 2);
    case 1:
      int ci = categories.indexOf(propertyName.charAt(0));
      if (ci < 0)
        throw makeException("bad_category", propertyName);
      return getCategoryCharClass(ci);
    default:
      if (!propertyName.startsWith("Is"))
        break;
      String blockName = propertyName.substring(2);
      for (int i = 0; i < specialBlockNames.length; i++)
        if (blockName.equals(specialBlockNames[i]))
          return specialBlockCharClasses[i];
      if (!isBlock(blockName))
        throw makeException("bad_block_name", blockName);
      return new Property( "In" + blockName);
    }
    throw makeException("bad_property_name", propertyName);
  }

  /**
   * @return true if <code>name</code> is listed in {@link #blockNames}
   */
  static private boolean isBlock(String name) {
    for (int i = 0; i < blockNames.length; i++)
      if (name.equals(blockNames[i]))
        return true;
    return false;
  }

  /**
   * @return true if <code>c</code> is an ASCII letter or digit
   */
  static private boolean isAsciiAlnum(char c) {
    if ('a' <= c && c <= 'z')
      return true;
    if ('A' <= c && c <= 'Z')
      return true;
    if ('0' <= c && c <= '9')
      return true;
    return false;
  }

  /**
   * Checks that the current character is <code>c</code>, without consuming it.
   *
   * @throws RegexSyntaxException if the current character is something else
   */
  private void expect(char c) throws RegexSyntaxException {
    if (curChar != c)
      throw makeException("expected", new String(new char[]{c}));
  }

  /**
   * Parses a character class expression, starting just after the opening bracket and consuming
   * the closing bracket. Handles negation, ranges and subtraction of a nested expression.
   *
   * @return the character class denoted by the expression
   * @throws RegexSyntaxException if the expression is malformed
   */
  private CharClass parseCharClassExpr() throws RegexSyntaxException {
    boolean compl;
    if (curChar == '^') {
      advance();
      compl = true;
    }
    else
      compl = false;
    List members = new Vector();
    do {
      CharClass lower = parseCharClassEscOrXmlChar();
      members.add(lower);
      if (curChar == '-') {
        advance();
        // a hyphen directly before an opening bracket starts a subtraction
        if (curChar == '[')
          break;
        CharClass upper = parseCharClassEscOrXmlChar();
        if (lower.singleChar() < 0 || upper.singleChar() < 0)
          throw makeException("multi_range");
        if (lower.singleChar() > upper.singleChar())
          throw makeException("invalid_range");
        members.set(members.size() - 1,
                    new CharRange(lower.singleChar(), upper.singleChar()));
        if (curChar == '-') {
          advance();
          expect('[');
          break;
        }
      }
    } while (curChar != ']');
    CharClass charClass;
    if (members.size() == 1)
      charClass = (CharClass)members.get(0);
    else
      charClass = new Union(members);
    if (compl)
      charClass = new Complement(charClass);
    if (curChar == '[') {
      advance();
      charClass = new Subtraction(charClass, parseCharClassExpr());
      expect(']');
    }
    advance();
    return charClass;
  }

  /**
   * Parses one item of a character class expression: an escape or a single character, which
   * may be a surrogate pair.
   *
   * @return the character class for the item
   * @throws RegexSyntaxException at end of input, on an unquoted bracket or hyphen, or on a
   * badly formed surrogate pair
   */
  private CharClass parseCharClassEscOrXmlChar() throws RegexSyntaxException {
    switch (curChar) {
    case EOS:
      // at real end of input this always throws, reporting the missing closing bracket
      if (eos)
        expect(']');
      break;
    case '\\':
      advance();
      return parseEsc();
    case '[':
    case ']':
    case '-':
      throw makeException("should_quote", new String(new char[]{curChar}));
    }
    CharClass tem;
    if (Utf16.isSurrogate(curChar)) {
      if (!Utf16.isSurrogate1(curChar))
        throw makeException("invalid_surrogate");
      char c1 = curChar;
      advance();
      if (!Utf16.isSurrogate2(curChar))
        throw makeException("invalid_surrogate");
      tem = new WideSingleChar(Utf16.scalarValue(c1, curChar));
    }
    else
      tem = new SingleChar(curChar);
    advance();
    return tem;
  }

  /**
   * Creates a syntax exception located at the current character.
   *
   * @param key key of the localized message
   */
  private RegexSyntaxException makeException(String key) {
    return new RegexSyntaxException(localizer.message(key), pos - 1);
  }

  /**
   * Creates a syntax exception located at the current character.
   *
   * @param key key of the localized message
   * @param arg argument passed to the localizer along with the key
   */
  private RegexSyntaxException makeException(String key, String arg) {
    return new RegexSyntaxException(localizer.message(key, arg), pos - 1);
  }

  /**
   * @return true if <code>c</code> is backslash-escaped when written to the output
   */
  static private boolean isJavaMetaChar(char c) {
    switch (c) {
    case '\\':
    case '^':
    case '?':
    case '*':
    case '+':
    case '(':
    case ')':
    case '{':
    case '}':
    case '|':
    case '[':
    case ']':
    case '-':
    case '&':
    case '$':
    case '.':
      return true;
    }
    return false;
  }

  /**
   * Returns the class for a one-letter category, computing and caching it on first use.
   *
   * @param ci index into {@link #categories}
   */
  static private synchronized CharClass getCategoryCharClass(int ci) {
    if (categoryCharClasses[ci] == null)
      categoryCharClasses[ci] = computeCategoryCharClass(categories.charAt(ci));
    return categoryCharClasses[ci];
  }

  /**
   * Returns the class for a two-letter sub-category, computing and caching it on first use.
   *
   * @param sci index of the entry in {@link #subCategories}, that is, char offset divided by two
   */
  static private synchronized CharClass getSubCategoryCharClass(int sci) {
    if (subCategoryCharClasses[sci] == null)
      subCategoryCharClasses[sci] = computeSubCategoryCharClass(subCategories.substring(sci * 2, (sci + 1) * 2));
    return subCategoryCharClasses[sci];
  }

  static private final char UNICODE_3_1_ADD_Lu = '\u03F4';   // added in 3.1
  static private final char UNICODE_3_1_ADD_Ll = '\u03F5';   // added in 3.1
  // 3 characters changed from No to Nl between 3.0 and 3.1
  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
  static private final String CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; // Java doesn't know about category Pi
  static private final String CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; // Java doesn't know about category Pf

  /**
   * Builds the class for a one-letter category: the Java property of that name plus
   * corrections for what the Java property is assumed to leave out.
   *
   * @param code the category letter
   */
  static private CharClass computeCategoryCharClass(char code) {
    List classes = new Vector();
    classes.add(new Property(new String(new char[] { code })));
    if (!surrogatesDirect) {
      // add the ranges of every sub-category whose name contains the category letter
      for (int ci = Categories.CATEGORY_NAMES.indexOf(code); ci >= 0; ci = Categories.CATEGORY_NAMES.indexOf(code, ci + 1)) {
        int[] addRanges = Categories.CATEGORY_RANGES[ci/2];
        for (int i = 0; i < addRanges.length; i += 2)
          classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
      }
    }
    if (code == 'P')
      classes.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
    if (code == 'L') {
      classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
      classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
    }
    if (code == 'C')
      classes.add(computeSubCategoryCharClass("Cn")); // JDK 1.4 leaves Cn out of C?
    if (classes.size() == 1)
      return (CharClass)classes.get(0);
    return new Union(classes);
  }

  /**
   * Builds the class for a two-letter sub-category: the Java property of that name plus
   * corrections, except for Pi and Pf, which are built from explicit character lists.
   *
   * @param name the two-letter sub-category name
   */
  static private CharClass computeSubCategoryCharClass(String name) {
    if (name.equals("Pi"))
      return makeCharClass(CATEGORY_Pi);
    if (name.equals("Pf"))
      return makeCharClass(CATEGORY_Pf);
    CharClass base = new Property(name);
    if (name.equals("Cn")) {
      // Unassigned
      List assignedRanges = new Vector();
      assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Lu));
      assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Ll));
      if (!surrogatesDirect) {
        for (int i = 0; i < Categories.CATEGORY_RANGES.length; i++)
          for (int j = 0; j < Categories.CATEGORY_RANGES[i].length; j += 2)
            assignedRanges.add(new CharRange(Categories.CATEGORY_RANGES[i][j],
                                             Categories.CATEGORY_RANGES[i][j + 1]));
        // start from every non-BMP code point and subtract the assigned ones
        base = new Union(new CharClass[] { base, new CharRange(NONBMP_MIN, NONBMP_MAX) });
      }
      return new Subtraction(base, new Union(assignedRanges));
    }
    List classes = new Vector();
    classes.add(base);
    if (!surrogatesDirect) {
      int sci = Categories.CATEGORY_NAMES.indexOf(name);
      if (sci >= 0) {
        int[] addRanges = Categories.CATEGORY_RANGES[sci/2];
        for (int i = 0; i < addRanges.length; i += 2)
          classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
      }
    }
    if (name.equals("Lu"))
      classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
    else if (name.equals("Ll"))
      classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
    else if (name.equals("Nl"))
      classes.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    else if (name.equals("No"))
      return new Subtraction(new Union(classes),
                             new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
                                           UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    if (classes.size() == 1)
      return base;
    return new Union(classes);
  }

  /**
   * Builds the union of the individual characters of <code>members</code>.
   *
   * @param members the characters to include
   */
  private static CharClass makeCharClass(String members) {
    List list = new Vector();
    for (int i = 0, len = members.length(); i < len; i++)
      list.add(new SingleChar(members.charAt(i)));
    return new Union(list);
  }

  /**
   * Translates the regular expression given as the first argument and prints the result on
   * standard error. Characters outside the printable ASCII range are printed as a backslash,
   * the letter u and four uppercase hexadecimal digits.
   *
   * @param args the XML Schema regular expression as the first element
   * @throws RegexSyntaxException if the argument is not a valid XML Schema regular expression
   * @throws IllegalArgumentException if no argument is given
   */
  public static void main(String[] args) throws RegexSyntaxException {
    if (args.length < 1)
      throw new IllegalArgumentException("usage: java " + Translator.class.getName() + " <regex>");
    String s = translate(args[0]);
    for (int i = 0, len = s.length(); i < len; i++) {
      char c = s.charAt(i);
      if (c >= 0x20 && c <= 0x7e)
        System.err.print(c);
      else {
        System.err.print("\\u");
        for (int shift = 12; shift >= 0; shift -= 4)
          System.err.print("0123456789ABCDEF".charAt((c >> shift) & 0xF));
      }
    }
    System.err.println();
  }
}