package org.exist.xquery.util;
import java.math.BigDecimal;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import org.exist.util.XMLChar;
import org.exist.xquery.Constants;
/**
* This class translates XML Schema regex syntax into JDK 1.4 regex syntax.
* Copied from Saxon package net.sf.saxon.type without change. The original
* author is James Clark. See license.
*/
public class RegexTranslator {
// ---------------------------------------------------------------------
// Per-translation parser state. One RegexTranslator instance is created
// by translate() for each regular expression and then discarded.
// ---------------------------------------------------------------------
/**
* The source regular expression (XML Schema / XPath syntax) being translated into
* <code>java.util.regex</code> syntax.
*
* @see java.util.regex.Pattern
* @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
*/
private final String regExp;
// true when the XPath extensions are enabled; assigned by translate() from its xpath argument
private boolean isXPath;
// index of the NEXT character to read from regExp (advance() post-increments it)
private int pos = 0;
// cached regExp.length()
private final int length;
// the character currently being examined; EOS once the input is exhausted
private char curChar;
// set by advance() when it is called with no input left
private boolean eos = false;
// accumulates the translated (java.util.regex) expression
private final StringBuilder result = new StringBuilder();
// one-letter Unicode general category names accepted by parseProp()
static private final String categories = "LMNPZSC";
// one slot per entry of 'categories'; presumably a lazily filled cache - its use is outside this section
static private final CharClass[] categoryCharClasses = new CharClass[categories.length()];
// two-letter Unicode sub-category names, concatenated; parseProp() requires a match at an even offset
static private final String subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
// one slot per two-letter entry of 'subCategories'
static private final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2];
// first and last code points outside the Basic Multilingual Plane
static private final int NONBMP_MIN = 0x10000;
static private final int NONBMP_MAX = 0x10FFFF;
// first and last low (second) surrogate code units
static private final char SURROGATE2_MIN = '\uDC00';
static private final char SURROGATE2_MAX = '\uDFFF';
//static final Localizer localizer = new Localizer(RegexTranslator.class);
static private final String[] blockNames = {
"BasicLatin",
"Latin-1Supplement",
"LatinExtended-A",
"LatinExtended-B",
"IPAExtensions",
"SpacingModifierLetters",
"CombiningDiacriticalMarks",
"Greek",
"Cyrillic",
"Armenian",
"Hebrew",
"Arabic",
"Syriac",
"Thaana",
"Devanagari",
"Bengali",
"Gurmukhi",
"Gujarati",
"Oriya",
"Tamil",
"Telugu",
"Kannada",
"Malayalam",
"Sinhala",
"Thai",
"Lao",
"Tibetan",
"Myanmar",
"Georgian",
"HangulJamo",
"Ethiopic",
"Cherokee",
"UnifiedCanadianAboriginalSyllabics",
"Ogham",
"Runic",
"Khmer",
"Mongolian",
"LatinExtendedAdditional",
"GreekExtended",
"GeneralPunctuation",
"SuperscriptsandSubscripts",
"CurrencySymbols",
"CombiningMarksforSymbols",
"LetterlikeSymbols",
"NumberForms",
"Arrows",
"MathematicalOperators",
"MiscellaneousTechnical",
"ControlPictures",
"OpticalCharacterRecognition",
"EnclosedAlphanumerics",
"BoxDrawing",
"BlockElements",
"GeometricShapes",
"MiscellaneousSymbols",
"Dingbats",
"BraillePatterns",
"CJKRadicalsSupplement",
"KangxiRadicals",
"IdeographicDescriptionCharacters",
"CJKSymbolsandPunctuation",
"Hiragana",
"Katakana",
"Bopomofo",
"HangulCompatibilityJamo",
"Kanbun",
"BopomofoExtended",
"EnclosedCJKLettersandMonths",
"CJKCompatibility",
"CJKUnifiedIdeographsExtensionA",
"CJKUnifiedIdeographs",
"YiSyllables",
"YiRadicals",
"HangulSyllables",
// surrogates excluded because there are never any *characters* with codes in surrogate range
// "PrivateUse", excluded because 3.1 adds non-BMP ranges
"CJKCompatibilityIdeographs",
"AlphabeticPresentationForms",
"ArabicPresentationForms-A",
"CombiningHalfMarks",
"CJKCompatibilityForms",
"SmallFormVariants",
"ArabicPresentationForms-B",
"Specials",
"HalfwidthandFullwidthForms",
"Specials"
};
/**
* Names of blocks including ranges outside the BMP.
* <p>
* This array is index-aligned with <code>specialBlockCharClasses</code>:
* parseProp() looks a block name up here and returns the character class
* stored at the same index, so the two arrays must be kept in step.
*/
static private final String[] specialBlockNames = {
"OldItalic",
"Gothic",
"Deseret",
"ByzantineMusicalSymbols",
"MusicalSymbols",
"MathematicalAlphanumericSymbols",
"CJKUnifiedIdeographsExtensionB",
"CJKCompatibilityIdeographsSupplement",
"Tags",
"PrivateUse",
"HighSurrogates",
"HighPrivateUseSurrogates",
"LowSurrogates",
};
// This file was automatically generated by CategoriesGen
static final String CATEGORY_NAMES = "NoLoMnCfLlNlPoLuMcNdSoSmCo";
static final int[][] CATEGORY_RANGES = {
{
// No
0x10107, 0x10133,
0x10320, 0x10323
},
{
// Lo
0x10000, 0x1000b,
0x1000d, 0x10026,
0x10028, 0x1003a,
0x1003c, 0x1003d,
0x1003f, 0x1004d,
0x10050, 0x1005d,
0x10080, 0x100fa,
0x10300, 0x1031e,
0x10330, 0x10349,
0x10380, 0x1039d,
0x10450, 0x1049d,
0x10800, 0x10805,
0x10808, 0x10808,
0x1080a, 0x10835,
0x10837, 0x10838,
0x1083c, 0x1083c,
0x1083f, 0x1083f,
0x20000, 0x2a6d6,
0x2f800, 0x2fa1d
},
{
// Mn
0x1d167, 0x1d169,
0x1d17b, 0x1d182,
0x1d185, 0x1d18b,
0x1d1aa, 0x1d1ad,
0xe0100, 0xe01ef
},
{
// Cf
0x1d173, 0x1d17a,
0xe0001, 0xe0001,
0xe0020, 0xe007f
},
{
// Ll
0x10428, 0x1044f,
0x1d41a, 0x1d433,
0x1d44e, 0x1d454,
0x1d456, 0x1d467,
0x1d482, 0x1d49b,
0x1d4b6, 0x1d4b9,
0x1d4bb, 0x1d4bb,
0x1d4bd, 0x1d4c3,
0x1d4c5, 0x1d4cf,
0x1d4ea, 0x1d503,
0x1d51e, 0x1d537,
0x1d552, 0x1d56b,
0x1d586, 0x1d59f,
0x1d5ba, 0x1d5d3,
0x1d5ee, 0x1d607,
0x1d622, 0x1d63b,
0x1d656, 0x1d66f,
0x1d68a, 0x1d6a3,
0x1d6c2, 0x1d6da,
0x1d6dc, 0x1d6e1,
0x1d6fc, 0x1d714,
0x1d716, 0x1d71b,
0x1d736, 0x1d74e,
0x1d750, 0x1d755,
0x1d770, 0x1d788,
0x1d78a, 0x1d78f,
0x1d7aa, 0x1d7c2,
0x1d7c4, 0x1d7c9
},
{
// Nl
0x1034a, 0x1034a
},
{
// Po
0x10100, 0x10101,
0x1039f, 0x1039f
},
{
// Lu
0x10400, 0x10427,
0x1d400, 0x1d419,
0x1d434, 0x1d44d,
0x1d468, 0x1d481,
0x1d49c, 0x1d49c,
0x1d49e, 0x1d49f,
0x1d4a2, 0x1d4a2,
0x1d4a5, 0x1d4a6,
0x1d4a9, 0x1d4ac,
0x1d4ae, 0x1d4b5,
0x1d4d0, 0x1d4e9,
0x1d504, 0x1d505,
0x1d507, 0x1d50a,
0x1d50d, 0x1d514,
0x1d516, 0x1d51c,
0x1d538, 0x1d539,
0x1d53b, 0x1d53e,
0x1d540, 0x1d544,
0x1d546, 0x1d546,
0x1d54a, 0x1d550,
0x1d56c, 0x1d585,
0x1d5a0, 0x1d5b9,
0x1d5d4, 0x1d5ed,
0x1d608, 0x1d621,
0x1d63c, 0x1d655,
0x1d670, 0x1d689,
0x1d6a8, 0x1d6c0,
0x1d6e2, 0x1d6fa,
0x1d71c, 0x1d734,
0x1d756, 0x1d76e,
0x1d790, 0x1d7a8
},
{
// Mc
0x1d165, 0x1d166,
0x1d16d, 0x1d172
},
{
// Nd
0x104a0, 0x104a9,
0x1d7ce, 0x1d7ff
},
{
// So
0x10102, 0x10102,
0x10137, 0x1013f,
0x1d000, 0x1d0f5,
0x1d100, 0x1d126,
0x1d12a, 0x1d164,
0x1d16a, 0x1d16c,
0x1d183, 0x1d184,
0x1d18c, 0x1d1a9,
0x1d1ae, 0x1d1dd,
0x1d300, 0x1d356
},
{
// Sm
0x1d6c1, 0x1d6c1,
0x1d6db, 0x1d6db,
0x1d6fb, 0x1d6fb,
0x1d715, 0x1d715,
0x1d735, 0x1d735,
0x1d74f, 0x1d74f,
0x1d76f, 0x1d76f,
0x1d789, 0x1d789,
0x1d7a9, 0x1d7a9,
0x1d7c3, 0x1d7c3
},
{
// Co
0xf0000, 0xffffd,
0x100000, 0x10fffd
}
};
// end of generated code
/**
* CharClass for each block name in specialBlockNames.
* <p>
* Entry <code>i</code> is the class returned by parseProp() for
* <code>specialBlockNames[i]</code>; the order here must therefore match
* the order of that array exactly.
*/
static private final CharClass[] specialBlockCharClasses = {
// OldItalic
new CharRange(0x10300, 0x1032F),
// Gothic
new CharRange(0x10330, 0x1034F),
// Deseret
new CharRange(0x10400, 0x1044F),
// ByzantineMusicalSymbols
new CharRange(0x1D000, 0x1D0FF),
// MusicalSymbols
new CharRange(0x1D100, 0x1D1FF),
// MathematicalAlphanumericSymbols
new CharRange(0x1D400, 0x1D7FF),
// CJKUnifiedIdeographsExtensionB
new CharRange(0x20000, 0x2A6D6),
// CJKCompatibilityIdeographsSupplement
new CharRange(0x2F800, 0x2FA1F),
// Tags
new CharRange(0xE0000, 0xE007F),
// PrivateUse: one BMP range plus the two supplementary private-use planes
new Union(new CharClass[]{
new CharRange(0xE000, 0xF8FF),
new CharRange(0xF0000, 0xFFFFD),
new CharRange(0x100000, 0x10FFFD)
}),
// HighSurrogates, HighPrivateUseSurrogates, LowSurrogates: mapped to the empty class
Empty.getInstance(),
Empty.getInstance(),
Empty.getInstance()
};
// Pre-built character classes for "." and the multi-character escapes handed out by parseEsc().
// "." (non-XPath mode): everything except line feed and carriage return
static private final CharClass DOT = new Complement(new Union(new CharClass[]{new SingleChar('\n'), new SingleChar('\r')}));
// \d: category Nd, and \D: its complement
static private final CharClass ESC_d = new Property("Nd");
static private final CharClass ESC_D = new Complement(ESC_d);
// \W: union of the P, Z and C categories; \w is defined as the complement of \W
static private final CharClass ESC_W = new Union(new CharClass[]{new Property("P"), new Property("Z"), new Property("C")});
static private final CharClass ESC_w = new Complement(ESC_W);
// \s: space, line feed, carriage return, tab
static private final CharClass ESC_s = new Union(new CharClass[]{
new SingleChar(' '),
new SingleChar('\n'),
new SingleChar('\r'),
new SingleChar('\t')
});
// \b and \B: emitted unchanged as a backslash followed by the letter (see Escape)
static private final CharClass ESC_b = new Escape('b');
static private final CharClass ESC_B = new Escape('B');
// This file was automatically generated by NamingExceptionsGen
// class NamingExceptions {
static final String NMSTRT_INCLUDES =
"\u003A\u005F\u02BB\u02BC\u02BD\u02BE\u02BF\u02C0\u02C1\u0559" +
"\u06E5\u06E6\u212E";
static final String NMSTRT_EXCLUDE_RANGES =
"\u00AA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149\u017F\u017F" +
"\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233\u02A9\u02AD" +
"\u03D7\u03D7\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1" +
"\u0400\u0400\u040D\u040D\u0450\u0450\u045D\u045D\u048C\u048F" +
"\u04EC\u04ED\u0587\u0587\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF" +
"\u06FA\u07A5\u0950\u0950\u0AD0\u0AD0\u0D85\u0DC6\u0E2F\u0E2F" +
"\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u1055\u1101\u1101\u1104\u1104" +
"\u1108\u1108\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D" +
"\u113F\u113F\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153" +
"\u1156\u1158\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168" +
"\u116A\u116C\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2" +
"\u11A9\u11AA\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB" +
"\u11C3\u11EA\u11EC\u11EF\u11F1\u11F8\u1200\u18A8\u207F\u2124" +
"\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u3006\u3038\u303A" +
"\u3131\u4DB5\uA000\uA48C\uF900\uFFDC";
static final String NMSTRT_CATEGORIES = "LlLuLoLtNl";
static final String NMCHAR_INCLUDES =
"\u002D\u002E\u003A\u005F\u00B7\u0387\u212E";
static final String NMCHAR_EXCLUDE_RANGES =
"\u00AA\u00B5\u00BA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149" +
"\u017F\u017F\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233" +
"\u02A9\u02B8\u02E0\u02EE\u0346\u034E\u0362\u037A\u03D7\u03D7" +
"\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1\u0400\u0400" +
"\u040D\u040D\u0450\u0450\u045D\u045D\u0488\u048F\u04EC\u04ED" +
"\u0587\u0587\u0653\u0655\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF" +
"\u06FA\u07B0\u0950\u0950\u0AD0\u0AD0\u0D82\u0DF3\u0E2F\u0E2F" +
"\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u0F6A\u0F96\u0F96\u0FAE\u0FB0" +
"\u0FB8\u0FB8\u0FBA\u1059\u1101\u1101\u1104\u1104\u1108\u1108" +
"\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D\u113F\u113F" +
"\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153\u1156\u1158" +
"\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168\u116A\u116C" +
"\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2\u11A9\u11AA" +
"\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB\u11C3\u11EA" +
"\u11EC\u11EF\u11F1\u11F8\u1200\u18A9\u207F\u207F\u20DD\u20E0" +
"\u20E2\u2124\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u2183" +
"\u3006\u3006\u3038\u303A\u3131\u4DB5\uA000\uA48C\uF900\uFFDC";
static final String NMCHAR_CATEGORIES = "LlLuLoLtNlMcMeMnLmNd";
// end of generated code
// \S: complement of \s
static private final CharClass ESC_S = new Complement(ESC_s);
// \i: built from the NMSTRT_* tables (categories plus includes, minus exclude ranges)
static private final CharClass ESC_i = makeCharClass(NMSTRT_CATEGORIES,
NMSTRT_INCLUDES,
NMSTRT_EXCLUDE_RANGES);
// \I: complement of \i
static private final CharClass ESC_I = new Complement(ESC_i);
// \c: built from the NMCHAR_* tables in the same way
static private final CharClass ESC_c = makeCharClass(NMCHAR_CATEGORIES,
NMCHAR_INCLUDES,
NMCHAR_EXCLUDE_RANGES);
// \C: complement of \c
static private final CharClass ESC_C = new Complement(ESC_c);
// value stored in curChar once the input is exhausted; the 'eos' flag distinguishes
// real end of input from a literal NUL character in the expression
static private final char EOS = '\0';
/**
* Creates a translator positioned on the first character of the expression.
* Private: instances are only created by the static translate() entry point.
*
* @param regExp the regular expression to translate; must not be null
*               (its length is read immediately)
*/
private RegexTranslator(String regExp) {
this.regExp = regExp;
this.length = regExp.length();
// prime curChar (or set eos for an empty expression)
advance();
}
/**
 * Converts a regular expression written in the XML Schema Part 2 syntax into the
 * equivalent <code>java.util.regex.Pattern</code> syntax.
 * <p>
 * The generated expression assumes that any string it is matched against uses
 * surrogate pairs correctly. Strings taken from XML content have already been
 * checked by a conforming parser; strings from other sources may need their
 * surrogate usage validated before matching.
 *
 * @param regexp the regular expression in XML Schema Part 2 syntax
 * @param xpath  <code>true</code> to accept the XPath 2.0 F+O extensions to the
 *               schema regex syntax
 * @return the expression rewritten in java.util.regex.Pattern syntax
 * @throws RegexSyntaxException if <code>regexp</code> is not a valid regular
 *                              expression in the accepted syntax
 * @see java.util.regex.Pattern
 * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
 */
static public String translate(String regexp, boolean xpath) throws RegexSyntaxException {
    final RegexTranslator translator = new RegexTranslator(regexp);
    translator.isXPath = xpath;
    translator.translateTop();
    final StringBuilder translated = translator.result;
    return translated.toString();
}
/**
 * Moves to the next input character. Once the input is exhausted, curChar
 * becomes EOS and the eos flag is raised; pos is still incremented so that
 * position arithmetic stays uniform.
 */
private void advance() {
    if (pos >= length) {
        pos++;
        curChar = EOS;
        eos = true;
        return;
    }
    curChar = regExp.charAt(pos);
    pos++;
}
/**
 * Translates the complete expression and verifies that all input was consumed.
 *
 * @throws RegexSyntaxException if the expression is malformed or has trailing input
 */
private void translateTop() throws RegexSyntaxException {
    translateRegExp();
    if (eos) {
        return;
    }
    throw makeException("expected end of string");
}
/**
 * Translates one or more branches separated by '|', copying each separator
 * to the output unchanged.
 *
 * @throws RegexSyntaxException if a branch is malformed
 */
private void translateRegExp() throws RegexSyntaxException {
    for (;;) {
        translateBranch();
        if (curChar != '|') {
            return;
        }
        copyCurChar();
    }
}
/**
 * Translates a sequence of atoms, each optionally followed by a quantifier,
 * stopping at the first position where no atom can start.
 *
 * @throws RegexSyntaxException if an atom or quantifier is malformed
 */
private void translateBranch() throws RegexSyntaxException {
    for (;;) {
        if (!translateAtom()) {
            return;
        }
        translateQuantifier();
    }
}
/**
 * Translates an optional quantifier ('*', '?', '+' or a braced quantity).
 * In XPath mode a trailing '?' (reluctant modifier) is copied as well.
 * Does nothing when the current character does not start a quantifier.
 *
 * @throws RegexSyntaxException if a braced quantity is malformed
 */
private void translateQuantifier() throws RegexSyntaxException {
    final char first = curChar;
    if (first == '{') {
        copyCurChar();
        translateQuantity();
        expect('}');
        copyCurChar();
    } else if (first == '*' || first == '?' || first == '+') {
        copyCurChar();
    } else {
        // not a quantifier: leave the input untouched
        return;
    }
    if (isXPath && curChar == '?') {
        copyCurChar();
    }
}
/**
 * Translates the inside of a braced quantifier: <code>n</code>, <code>n,</code>
 * or <code>n,m</code>. Bounds too large for an int are written as
 * Integer.MAX_VALUE; a range whose upper bound is below its lower bound is rejected.
 *
 * @throws RegexSyntaxException if a bound is not numeric or the range is inverted
 */
private void translateQuantity() throws RegexSyntaxException {
    final String lower = parseQuantExact();
    int lowerValue;
    try {
        lowerValue = Integer.parseInt(lower);
        result.append(lower);
    } catch (NumberFormatException e) {
        // JDK 1.4 cannot handle ranges bigger than this
        lowerValue = -1;
        result.append(Integer.MAX_VALUE);
    }
    if (curChar != ',') {
        return;
    }
    copyCurChar();
    if (curChar == '}') {
        // open-ended quantity, no upper bound to translate
        return;
    }
    final String upper = parseQuantExact();
    // the sentinel value marks a lower bound that did not fit in an int
    final boolean lowerTooBig = lowerValue == Constants.STRING_NOT_FOUND;
    final int upperValue;
    try {
        upperValue = Integer.parseInt(upper);
    } catch (NumberFormatException e) {
        result.append(Integer.MAX_VALUE);
        // both bounds overflowed: compare them with arbitrary precision
        if (lowerTooBig && new BigDecimal(lower).compareTo(new BigDecimal(upper)) > 0) {
            throw makeException("invalid quantity range");
        }
        return;
    }
    result.append(upper);
    if (lowerTooBig || upperValue < lowerValue) {
        throw makeException("invalid quantity range");
    }
}
/**
 * Reads one or more decimal digits, stopping in front of the ',' or '}' that
 * terminates the number.
 *
 * @return the digits read, as a string
 * @throws RegexSyntaxException if a non-digit is met before the terminator
 */
private String parseQuantExact() throws RegexSyntaxException {
    final StringBuilder digits = new StringBuilder();
    boolean more = true;
    while (more) {
        if ("0123456789".indexOf(curChar) == Constants.STRING_NOT_FOUND) {
            throw makeException("expected digit");
        }
        digits.append(curChar);
        advance();
        more = curChar != ',' && curChar != '}';
    }
    return digits.toString();
}
/**
* Appends the current character to the output unchanged and moves on to the next one.
*/
private void copyCurChar() {
result.append(curChar);
advance();
}
// Tri-state coverage values used by CharClass for the BMP and non-BMP parts of a class.
// The numeric values matter: Complement negates them (NONE <-> ALL, SOME stays SOME),
// Union takes the maximum and Subtraction the minimum.
static final int NONE = -1;
static final int SOME = 0;
static final int ALL = 1;
// java.util.regex class matching any high (first) surrogate code unit
static final String SURROGATES1_CLASS = "[\uD800-\uDBFF]";
// java.util.regex class matching any low (second) surrogate code unit
static final String SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
// a class that can never match: NUL intersected with not-NUL
static final String NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";
/**
 * An inclusive range of code points. Ranges order by ascending lower bound;
 * for equal lower bounds the range with the larger upper bound comes first.
 */
static final class Range implements Comparable {
    private final int min;
    private final int max;

    Range(int min, int max) {
        this.min = min;
        this.max = max;
    }

    int getMin() {
        return min;
    }

    int getMax() {
        return max;
    }

    public int compareTo(Object o) {
        final Range that = (Range) o;
        if (min != that.min) {
            return min < that.min ? Constants.INFERIOR : Constants.SUPERIOR;
        }
        if (max != that.max) {
            // the wider of two ranges sharing a lower bound sorts first
            return max > that.max ? Constants.INFERIOR : Constants.SUPERIOR;
        }
        return Constants.EQUAL;
    }
}
/**
* Base class of all character classes produced by the parser. A class records
* separately how much of the BMP and how much of the supplementary planes it
* covers (NONE, SOME or ALL) and knows how to write itself as a
* java.util.regex fragment that works on UTF-16 code units, spelling out
* supplementary characters as surrogate pairs.
*/
static abstract class CharClass {
// coverage of the BMP: NONE, SOME or ALL
private final int containsBmp;
// if it contains ALL and containsBmp != NONE, then the generated class for containsBmp must
// contain all the high surrogates
private final int containsNonBmp;
protected CharClass(int containsBmp, int containsNonBmp) {
this.containsBmp = containsBmp;
this.containsNonBmp = containsNonBmp;
}
int getContainsBmp() {
return containsBmp;
}
int getContainsNonBmp() {
return containsNonBmp;
}
/**
* Appends the java.util.regex form of this class to <code>buf</code>.
* The shape of the output depends on the non-BMP coverage:
* NONE emits only the BMP part, ALL emits a group that also accepts any
* surrogate pair, and SOME emits an alternation of the BMP part and the
* explicit surrogate-pair ranges.
*/
final void output(StringBuilder buf) {
switch (containsNonBmp) {
case NONE:
// nothing at all to match: emit the never-matching class
if (containsBmp == NONE)
buf.append(NOT_ALLOWED_CLASS);
else
outputBmp(buf);
break;
case ALL:
buf.append("(?:");
if (containsBmp == NONE) {
// any high surrogate followed by any low surrogate
buf.append(SURROGATES1_CLASS);
buf.append(SURROGATES2_CLASS);
} else {
// BMP class, optionally followed by a low surrogate (see the note on containsNonBmp)
outputBmp(buf);
buf.append(SURROGATES2_CLASS);
buf.append('?');
}
buf.append(')');
break;
case SOME:
buf.append("(?:");
// needSep: an alternative has already been written, so the next one needs a '|'
boolean needSep = false;
if (containsBmp != NONE) {
needSep = true;
outputBmp(buf);
}
List ranges = new Vector();
addNonBmpRanges(ranges);
sortRangeList(ranges);
// hi holds (min, max) pairs of high surrogates whose whole low-surrogate block is covered
String hi = highSurrogateRanges(ranges);
if (hi.length() > 0) {
if (needSep)
buf.append('|');
else
needSep = true;
buf.append('[');
for (int i = 0, len = hi.length(); i < len; i += 2) {
char min = hi.charAt(i);
char max = hi.charAt(i + 1);
if (min == max)
buf.append(min);
else {
buf.append(min);
buf.append('-');
buf.append(max);
}
}
buf.append(']');
buf.append(SURROGATES2_CLASS);
}
// lo holds (high surrogate, low min, low max) triples for partially covered blocks
String lo = lowSurrogateRanges(ranges);
for (int i = 0, len = lo.length(); i < len; i += 3) {
if (needSep)
buf.append('|');
else
needSep = true;
buf.append(lo.charAt(i));
char min = lo.charAt(i + 1);
char max = lo.charAt(i + 2);
// a single low surrogate with no further triple for the same high surrogate needs no brackets
if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)))
buf.append(min);
else {
buf.append('[');
// merge all consecutive triples that share this high surrogate into one bracket expression
for (; ;) {
if (min == max)
buf.append(min);
else {
buf.append(min);
buf.append('-');
buf.append(max);
}
if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))
break;
i += 3;
min = lo.charAt(i + 1);
max = lo.charAt(i + 2);
}
buf.append(']');
}
}
// no alternative was produced at all: the class matches nothing
if (!needSep)
buf.append(NOT_ALLOWED_CLASS);
buf.append(')');
break;
}
}
/**
* For each range, computes the span of high surrogates for which every low
* surrogate is inside the range. Returns the spans as (min, max) character pairs
* concatenated into one string; ranges with no such span contribute nothing.
*/
static String highSurrogateRanges(List ranges) {
StringBuilder highRanges = new StringBuilder();
for (int i = 0, len = ranges.size(); i < len; i++) {
Range r = (Range)ranges.get(i);
char min1 = XMLChar.highSurrogate(r.getMin());
char min2 = XMLChar.lowSurrogate(r.getMin());
char max1 = XMLChar.highSurrogate(r.getMax());
char max2 = XMLChar.lowSurrogate(r.getMax());
// the first high surrogate is only partially covered: skip it
if (min2 != SURROGATE2_MIN)
min1++;
// likewise for the last high surrogate
if (max2 != SURROGATE2_MAX)
max1--;
if (max1 >= min1) {
highRanges.append(min1);
highRanges.append(max1);
}
}
return highRanges.toString();
}
/**
* For each range, computes the partially covered low-surrogate spans at its
* start and end. Returns them as (high surrogate, low min, low max) character
* triples concatenated into one string.
*/
static String lowSurrogateRanges(List ranges) {
StringBuilder lowRanges = new StringBuilder();
for (int i = 0, len = ranges.size(); i < len; i++) {
Range r = (Range)ranges.get(i);
char min1 = XMLChar.highSurrogate(r.getMin());
char min2 = XMLChar.lowSurrogate(r.getMin());
char max1 = XMLChar.highSurrogate(r.getMax());
char max2 = XMLChar.lowSurrogate(r.getMax());
if (min1 == max1) {
// the whole range lives under one high surrogate; a full block is
// already reported by highSurrogateRanges
if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) {
lowRanges.append(min1);
lowRanges.append(min2);
lowRanges.append(max2);
}
} else {
// partial block at the start of the range
if (min2 != SURROGATE2_MIN) {
lowRanges.append(min1);
lowRanges.append(min2);
lowRanges.append(SURROGATE2_MAX);
}
// partial block at the end of the range
if (max2 != SURROGATE2_MAX) {
lowRanges.append(max1);
lowRanges.append(SURROGATE2_MIN);
lowRanges.append(max2);
}
}
}
return lowRanges.toString();
}
// writes the BMP part of this class
abstract void outputBmp(StringBuilder buf);
// writes the complement of the BMP part of this class
abstract void outputComplementBmp(StringBuilder buf);
// the single code point this class stands for, or -1 when it is not a single character
int singleChar() {
return -1;
}
// adds the supplementary-plane ranges of this class to 'ranges'; none by default
void addNonBmpRanges(List ranges) {
}
/**
* Sorts the list of Range objects in place and merges ranges that overlap or
* are directly adjacent, shrinking the list to the merged result.
*/
static void sortRangeList(List ranges) {
Collections.sort(ranges);
// toIndex: next slot to write a merged range; fromIndex: next range to read
int toIndex = 0;
int fromIndex = 0;
int len = ranges.size();
while (fromIndex < len) {
Range r = (Range)ranges.get(fromIndex);
int min = r.getMin();
int max = r.getMax();
// absorb every following range that overlaps or touches [min, max]
while (++fromIndex < len) {
Range r2 = (Range)ranges.get(fromIndex);
if (r2.getMin() > max + 1)
break;
if (r2.getMax() > max)
max = r2.getMax();
}
if (max != r.getMax())
r = new Range(min, max);
ranges.set(toIndex++, r);
}
// drop the now unused tail
while (len > toIndex)
ranges.remove(--len);
}
}
/**
 * A character class whose BMP part can be written directly inside a
 * bracket expression. Subclasses supply the bracket contents.
 */
static abstract class SimpleCharClass extends CharClass {
    SimpleCharClass(int containsBmp, int containsNonBmp) {
        super(containsBmp, containsNonBmp);
    }

    /** Writes the bracket contents; called with the brackets already handled by the caller. */
    abstract void inClassOutputBmp(StringBuilder buf);

    void outputBmp(StringBuilder buf) {
        buf.append('[');
        inClassOutputBmp(buf);
        buf.append(']');
    }

    // must not call if containsBmp == ALL
    void outputComplementBmp(StringBuilder buf) {
        if (getContainsBmp() != NONE) {
            buf.append("[^");
            inClassOutputBmp(buf);
            buf.append(']');
            return;
        }
        // nothing to exclude, so the complement is the whole BMP
        buf.append("[\u0000-\uFFFF]");
    }
}
/**
 * A class consisting of exactly one BMP character. Outside a bracket
 * expression it is written as the bare (escaped if necessary) character.
 */
static class SingleChar extends SimpleCharClass {
    private final char ch;

    SingleChar(char c) {
        super(SOME, NONE);
        this.ch = c;
    }

    int singleChar() {
        return ch;
    }

    void outputBmp(StringBuilder buf) {
        // no brackets needed for a lone character
        inClassOutputBmp(buf);
    }

    void inClassOutputBmp(StringBuilder buf) {
        final boolean needsEscape = isJavaMetaChar(ch);
        if (needsEscape) {
            buf.append('\\');
        }
        buf.append(ch);
    }
}
/**
 * A class consisting of exactly one supplementary (non-BMP) code point.
 * It has no BMP part, so asking for BMP output is a programming error.
 */
static class WideSingleChar extends SimpleCharClass {
    private final int codePoint;

    WideSingleChar(int c) {
        super(NONE, SOME);
        this.codePoint = c;
    }

    int singleChar() {
        return codePoint;
    }

    void addNonBmpRanges(List ranges) {
        final Range single = new Range(codePoint, codePoint);
        ranges.add(single);
    }

    void inClassOutputBmp(StringBuilder buf) {
        throw new RuntimeException("BMP output botch");
    }
}
/**
 * The class that contains no characters at all; a shared singleton.
 */
static class Empty extends SimpleCharClass {
    static private final Empty SINGLETON = new Empty();

    static Empty getInstance() {
        return SINGLETON;
    }

    private Empty() {
        super(NONE, NONE);
    }

    void inClassOutputBmp(StringBuilder buf) {
        // there is no BMP content to write; being asked for it is a bug in the caller
        throw new RuntimeException("BMP output botch");
    }
}
/**
 * An inclusive range of code points that may lie in the BMP, outside it, or
 * straddle the boundary. The BMP portion is written as a bracket range and the
 * supplementary portion is reported through addNonBmpRanges.
 */
static class CharRange extends SimpleCharClass {
    private final int lower;
    private final int upper;

    CharRange(int lower, int upper) {
        super(lower < NONBMP_MIN ? SOME : NONE,
                // don't use ALL here, because that requires that the BMP class contains high surrogates
                upper >= NONBMP_MIN ? SOME : NONE);
        this.lower = lower;
        this.upper = upper;
    }

    /** Appends a BMP character, preceded by a backslash when it is a regex metacharacter. */
    private static void appendEscaped(StringBuilder buf, char c) {
        if (isJavaMetaChar(c)) {
            buf.append('\\');
        }
        buf.append(c);
    }

    void inClassOutputBmp(StringBuilder buf) {
        if (lower >= NONBMP_MIN) {
            throw new RuntimeException("BMP output botch");
        }
        appendEscaped(buf, (char) lower);
        buf.append('-');
        if (upper >= NONBMP_MIN) {
            // the range continues past the BMP: clip the BMP part at its last code unit
            buf.append('\uFFFF');
        } else {
            appendEscaped(buf, (char) upper);
        }
    }

    void addNonBmpRanges(List ranges) {
        if (upper < NONBMP_MIN) {
            return;
        }
        final int start = lower < NONBMP_MIN ? NONBMP_MIN : lower;
        ranges.add(new Range(start, upper));
    }
}
/**
 * A class defined by a named java.util.regex property, written with the
 * lower-case p escape (or the upper-case P escape for its complement).
 */
static class Property extends SimpleCharClass {
    private final String name;

    Property(String name) {
        super(SOME, NONE);
        this.name = name;
    }

    void outputBmp(StringBuilder buf) {
        // a property escape is valid both inside and outside brackets
        inClassOutputBmp(buf);
    }

    void inClassOutputBmp(StringBuilder buf) {
        buf.append("\\p{").append(name).append('}');
    }

    void outputComplementBmp(StringBuilder buf) {
        buf.append("\\P{").append(name).append('}');
    }
}
/**
* The set difference of two character classes: everything in <code>cc1</code>
* that is not in <code>cc2</code>.
*/
static class Subtraction extends CharClass {
// the class being subtracted from
private final CharClass cc1;
// the class being removed
private final CharClass cc2;
Subtraction(CharClass cc1, CharClass cc2) {
// min corresponds to intersection
// complement corresponds to negation
super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
this.cc1 = cc1;
this.cc2 = cc2;
}
// BMP part: cc1 intersected with the complement of cc2
void outputBmp(StringBuilder buf) {
buf.append('[');
cc1.outputBmp(buf);
buf.append("&&");
cc2.outputComplementBmp(buf);
buf.append(']');
}
// complement of the BMP part: (not cc1) united with cc2, written side by side in one bracket expression
void outputComplementBmp(StringBuilder buf) {
buf.append('[');
cc1.outputComplementBmp(buf);
cc2.outputBmp(buf);
buf.append(']');
}
/**
* Adds to <code>ranges</code> the supplementary-plane ranges of cc1 with the
* ranges of cc2 cut out. Both range lists are sorted and merged first, then
* walked in parallel.
*/
void addNonBmpRanges(List ranges) {
List posList = new Vector();
cc1.addNonBmpRanges(posList);
List negList = new Vector();
cc2.addNonBmpRanges(negList);
sortRangeList(posList);
sortRangeList(negList);
Iterator negIter = negList.iterator();
// negRange: the current range to subtract, or null once the negative list is exhausted
Range negRange;
if (negIter.hasNext())
negRange = (Range)negIter.next();
else
negRange = null;
for (int i = 0, len = posList.size(); i < len; i++) {
Range posRange = (Range)posList.get(i);
// skip negative ranges that end before this positive range starts
while (negRange != null && negRange.getMax() < posRange.getMin()) {
if (negIter.hasNext())
negRange = (Range)negIter.next();
else
negRange = null;
}
// if negRange != null, negRange.max >= posRange.min
// min: start of the part of posRange not yet emitted or removed
int min = posRange.getMin();
while (negRange != null && negRange.getMin() <= posRange.getMax()) {
// emit the gap in front of the negative range, if any
if (min < negRange.getMin()) {
ranges.add(new Range(min, negRange.getMin() - 1));
}
min = negRange.getMax() + 1;
// the negative range reaches past posRange: keep it for the next positive range
if (min > posRange.getMax())
break;
if (negIter.hasNext())
negRange = (Range)negIter.next();
else
negRange = null;
}
// emit whatever is left after the last overlapping negative range
if (min <= posRange.getMax())
ranges.add(new Range(min, posRange.getMax()));
}
}
}
/**
 * The union of several character classes. Its BMP and non-BMP coverage is the
 * maximum of the members' coverage.
 */
static class Union extends CharClass {
    private final List members;

    Union(CharClass[] v) {
        this(toList(v));
    }

    Union(List members) {
        super(computeContainsBmp(members), computeContainsNonBmp(members));
        this.members = members;
    }

    /** Copies the array into a fresh list, preserving order. */
    static private List toList(CharClass[] v) {
        final List copy = new Vector();
        for (CharClass cc : v) {
            copy.add(cc);
        }
        return copy;
    }

    void outputBmp(StringBuilder buf) {
        buf.append('[');
        for (Iterator it = members.iterator(); it.hasNext();) {
            final CharClass cc = (CharClass) it.next();
            if (cc.getContainsBmp() == NONE) {
                continue;
            }
            if (cc instanceof SimpleCharClass) {
                // simple members are written without their own brackets
                ((SimpleCharClass) cc).inClassOutputBmp(buf);
            } else {
                cc.outputBmp(buf);
            }
        }
        buf.append(']');
    }

    void outputComplementBmp(StringBuilder buf) {
        boolean opened = false;
        // first pass: all simple members share one negated bracket list
        for (Iterator it = members.iterator(); it.hasNext();) {
            final CharClass cc = (CharClass) it.next();
            if (cc.getContainsBmp() == NONE || !(cc instanceof SimpleCharClass)) {
                continue;
            }
            if (!opened) {
                buf.append("[^");
                opened = true;
            }
            ((SimpleCharClass) cc).inClassOutputBmp(buf);
        }
        // second pass: intersect with the complement of every non-simple member
        for (Iterator it = members.iterator(); it.hasNext();) {
            final CharClass cc = (CharClass) it.next();
            if (cc.getContainsBmp() == NONE || cc instanceof SimpleCharClass) {
                continue;
            }
            if (opened) {
                buf.append("&&");
            } else {
                buf.append('[');
                opened = true;
            }
            // can't have any members that are ALL, because that would make this ALL, which violates
            // the precondition for outputComplementBmp
            cc.outputComplementBmp(buf);
        }
        if (opened) {
            buf.append(']');
        } else {
            // all members are NONE, so this is NONE, so complement is everything
            buf.append("[\u0000-\uFFFF]");
        }
    }

    void addNonBmpRanges(List ranges) {
        for (Iterator it = members.iterator(); it.hasNext();) {
            ((CharClass) it.next()).addNonBmpRanges(ranges);
        }
    }

    private static int computeContainsBmp(List members) {
        int best = NONE;
        for (Iterator it = members.iterator(); it.hasNext();) {
            final int coverage = ((CharClass) it.next()).getContainsBmp();
            best = Math.max(best, coverage);
        }
        return best;
    }

    private static int computeContainsNonBmp(List members) {
        int best = NONE;
        for (Iterator it = members.iterator(); it.hasNext();) {
            final int coverage = ((CharClass) it.next()).getContainsNonBmp();
            best = Math.max(best, coverage);
        }
        return best;
    }
}
/**
 * A numbered back-reference, written as a backslash followed by the group
 * number. The same text is produced for every output variant.
 */
static class BackReference extends CharClass {
    private final int group;

    BackReference(int i) {
        super(SOME, NONE);
        this.group = i;
    }

    void inClassOutputBmp(StringBuilder buf) {
        buf.append('\\').append(group);
    }

    void outputBmp(StringBuilder buf) {
        inClassOutputBmp(buf);
    }

    void outputComplementBmp(StringBuilder buf) {
        inClassOutputBmp(buf);
    }
}
/**
 * An escape that is passed through to java.util.regex unchanged: a backslash
 * followed by the given character. The same text is produced for every
 * output variant.
 */
static class Escape extends CharClass {
    private final char letter;

    Escape(char ch) {
        super(SOME, NONE);
        this.letter = ch;
    }

    void inClassOutputBmp(StringBuilder buf) {
        buf.append('\\').append(letter);
    }

    void outputBmp(StringBuilder buf) {
        inClassOutputBmp(buf);
    }

    void outputComplementBmp(StringBuilder buf) {
        inClassOutputBmp(buf);
    }
}
/**
 * Thrown when a syntactically incorrect regular expression is detected.
 */
static public class RegexSyntaxException extends Exception {
    /**
     * Represents an unknown position within a string containing a regular expression.
     */
    static public final int UNKNOWN_POSITION = -1;

    private final int position;

    /**
     * Creates an exception whose error position is unknown.
     *
     * @param detail the error message
     */
    public RegexSyntaxException(String detail) {
        super(detail);
        this.position = UNKNOWN_POSITION;
    }

    /**
     * Creates an exception for an error detected at a known position.
     *
     * @param detail   the error message
     * @param position the index into the regular expression where the error was detected
     */
    public RegexSyntaxException(String detail, int position) {
        super(detail);
        this.position = position;
    }

    /**
     * Returns the index into the regular expression where the error was detected.
     *
     * @return the index into the regular expression where the error was detected,
     *         or <code>UNKNOWN_POSITION</code> if this is unknown
     */
    public int getPosition() {
        return position;
    }
}
// public static class Localizer {
// private final Class cls;
// private ResourceBundle bundle;
//
// public Localizer(Class cls) {
// this.cls = cls;
// }
//
// public String message(String key) {
// return MessageFormat.format(getBundle().getString(key), new Object[]{});
// }
//
// public String message(String key, Object arg) {
// return MessageFormat.format(getBundle().getString(key),
// new Object[]{arg});
// }
//
// public String message(String key, Object arg1, Object arg2) {
// return MessageFormat.format(getBundle().getString(key),
// new Object[]{arg1, arg2});
// }
//
// public String message(String key, Object[] args) {
// return MessageFormat.format(getBundle().getString(key), args);
// }
//
// private ResourceBundle getBundle() {
// if (bundle == null) {
// String s = cls.getName();
// int i = s.lastIndexOf('.');
// if (i > 0)
// s = s.substring(0, i + 1);
// else
// s = "";
// bundle = ResourceBundle.getBundle(s + "resources.Messages");
// }
// return bundle;
// }
// }
/**
 * The complement of another character class. Coverage values are negated
 * (NONE and ALL swap, SOME stays SOME) and BMP output is delegated to the
 * wrapped class with the roles of class and complement exchanged.
 */
static class Complement extends CharClass {
    private final CharClass cc;

    Complement(CharClass cc) {
        super(-cc.getContainsBmp(), -cc.getContainsNonBmp());
        this.cc = cc;
    }

    void outputBmp(StringBuilder buf) {
        cc.outputComplementBmp(buf);
    }

    void outputComplementBmp(StringBuilder buf) {
        cc.outputBmp(buf);
    }

    void addNonBmpRanges(List ranges) {
        final List covered = new Vector();
        cc.addNonBmpRanges(covered);
        sortRangeList(covered);
        // 'next' is the first supplementary code point not yet accounted for
        int next = NONBMP_MIN;
        for (Iterator it = covered.iterator(); it.hasNext();) {
            final Range r = (Range) it.next();
            if (next < r.getMin()) {
                // gap in front of a covered range belongs to the complement
                ranges.add(new Range(next, r.getMin() - 1));
            }
            next = r.getMax() + 1;
        }
        if (next != NONBMP_MAX + 1) {
            ranges.add(new Range(next, NONBMP_MAX));
        }
    }
}
/**
 * Translates a single atom: a parenthesised sub-expression, an escape, a
 * bracketed character class, '.', an anchor, or an ordinary character.
 *
 * @return <code>false</code> if the current character cannot start an atom
 *         (end of input, a quantifier character, or a closing delimiter);
 *         <code>true</code> if an atom was translated
 * @throws RegexSyntaxException if the atom is malformed
 */
private boolean translateAtom() throws RegexSyntaxException {
    switch (curChar) {
        case '?':
        case '*':
        case '+':
        case ')':
        case '{':
        case '}':
        case '|':
        case ']':
            return false;
        case EOS:
            if (eos) {
                return false;
            }
            // a literal NUL inside the expression is an ordinary character
            break;
        case '(': {
            copyCurChar();
            translateRegExp();
            expect(')');
            copyCurChar();
            return true;
        }
        case '\\': {
            advance();
            final CharClass escaped = parseEsc();
            escaped.output(result);
            return true;
        }
        case '[': {
            advance();
            final CharClass bracketed = parseCharClassExpr();
            bracketed.output(result);
            return true;
        }
        case '.':
            if (!isXPath) {
                DOT.output(result);
                advance();
                return true;
            }
            // TODO: in XPath, modify "." so that it matches a surrogate pair
            break;
        case '$':
        case '^':
            // anchors only exist in XPath mode; otherwise the character is a literal
            if (!isXPath) {
                result.append('\\');
            }
            break;
    }
    copyCurChar();
    return true;
}
/**
 * Builds the character class "categories plus includes, minus excludeRanges".
 *
 * @param categories    two-letter Unicode category names, concatenated
 * @param includes      individual characters to add; runs of three or more
 *                      consecutive characters are folded into a range
 * @param excludeRanges pairs of characters (first, last) giving inclusive
 *                      ranges to remove
 * @return the resulting class
 */
static private CharClass makeCharClass(String categories, String includes, String excludeRanges) {
    final List included = new Vector();
    final int catLen = categories.length();
    for (int c = 0; c < catLen; c += 2) {
        included.add(new Property(categories.substring(c, c + 2)));
    }

    final int incLen = includes.length();
    int start = 0;
    while (start < incLen) {
        // find the last index of the run of consecutive characters beginning at 'start'
        int end = start;
        while (end + 1 < incLen
                && includes.charAt(end + 1) - includes.charAt(start) == end + 1 - start) {
            end++;
        }
        // a run of exactly two is not worth a range: emit the first character alone
        if (end - start == 1) {
            end = start;
        }
        if (end == start) {
            included.add(new SingleChar(includes.charAt(start)));
        } else {
            included.add(new CharRange(includes.charAt(start), includes.charAt(end)));
        }
        start = end + 1;
    }

    final List excluded = new Vector();
    final int excLen = excludeRanges.length();
    for (int p = 0; p < excLen; p += 2) {
        final char first = excludeRanges.charAt(p);
        final char last = excludeRanges.charAt(p + 1);
        if (first == last) {
            excluded.add(new SingleChar(first));
        } else if (first == last - 1) {
            // two adjacent characters: list them individually
            excluded.add(new SingleChar(first));
            excluded.add(new SingleChar(last));
        } else {
            excluded.add(new CharRange(first, last));
        }
    }
    return new Subtraction(new Union(included), new Union(excluded));
}
/**
 * Parses the escape sequence that follows a backslash. On entry curChar is
 * the character directly after the backslash; on return the whole escape has
 * been consumed.
 * <p>
 * Handles single-character escapes (newline, carriage return, tab and escaped
 * metacharacters), the multi-character escapes (s, S, i, I, c, C, d, D, w, W,
 * b, B), category/block escapes (p, P) and, in XPath mode only, single-digit
 * back-references 1-9 and an escaped dollar sign.
 *
 * @return the character class the escape stands for
 * @throws RegexSyntaxException if the escape is not valid in the current mode
 */
private CharClass parseEsc() throws RegexSyntaxException {
    switch (curChar) {
        case 'n':
            advance();
            return new SingleChar('\n');
        case 'r':
            advance();
            return new SingleChar('\r');
        case 't':
            advance();
            return new SingleChar('\t');
        case '\\':
        case '|':
        case '.':
        case '-':
        case '^':
        case '?':
        case '*':
        case '+':
        case '(':
        case ')':
        case '{':
        case '}':
        case '[':
        case ']':
            // escaped metacharacter: handled as a literal after the switch
            break;
        case 's':
            advance();
            return ESC_s;
        case 'S':
            advance();
            return ESC_S;
        case 'i':
            advance();
            return ESC_i;
        case 'I':
            advance();
            return ESC_I;
        case 'c':
            advance();
            return ESC_c;
        case 'C':
            advance();
            return ESC_C;
        case 'd':
            advance();
            return ESC_d;
        case 'D':
            advance();
            return ESC_D;
        case 'w':
            advance();
            return ESC_w;
        case 'W':
            advance();
            return ESC_W;
        case 'b':
            advance();
            return ESC_b;
        case 'B':
            advance();
            return ESC_B;
        case 'p':
            advance();
            return parseProp();
        case 'P':
            advance();
            return new Complement(parseProp());
        case '0':
            // Group numbers start at 1, so backslash-zero is never a valid
            // back-reference. Passing it through would hand java.util.regex the
            // start of an octal escape, which it either rejects later or, when
            // digits follow, silently turns into a different character.
            if (isXPath) {
                throw makeException("bad escape sequence");
            }
            throw makeException("digit not allowed after \\");
        case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            if (isXPath) {
                char c = curChar;
                advance();
                // TODO: test multi-digit back-references
                return new BackReference(c - '0');
            } else {
                throw makeException("digit not allowed after \\");
            }
        case '$':
            if (isXPath) {
                break;
            }
            // otherwise fall through
        default:
            throw makeException("bad escape sequence");
    }
    // single escaped literal character
    CharClass tem = new SingleChar(curChar);
    advance();
    return tem;
}
/**
 * Parses the braced property name of a category or block escape. On entry
 * curChar must be the opening brace; on return the closing brace has been
 * consumed.
 * <p>
 * A one-letter name is looked up in <code>categories</code>, a two-letter name
 * in <code>subCategories</code>, and a longer name must start with "Is" and
 * name a Unicode block.
 *
 * @return the character class for the named category or block
 * @throws RegexSyntaxException if the braces are missing, or the name is empty
 *                              or not a known category or block; the offending
 *                              name is passed to the exception where there is one
 */
private CharClass parseProp() throws RegexSyntaxException {
    expect('{');
    // pos already points past the opening brace, i.e. at the first name character
    int start = pos;
    for (; ;) {
        advance();
        if (curChar == '}')
            break;
        // anything other than a name character here (including end of input) is an error
        if (!isAsciiAlnum(curChar) && curChar != '-')
            expect('}');
    }
    // pos points past the closing brace, so the name ends at pos - 1
    String propertyName = regExp.substring(start, pos - 1);
    advance();
    switch (propertyName.length()) {
        case 0:
            throw makeException("empty property name");
        case 2:
            int sci = subCategories.indexOf(propertyName);
            // an odd offset means the match straddles two adjacent sub-category names
            if (sci == Constants.STRING_NOT_FOUND || sci % 2 == 1)
                throw makeException("bad category", propertyName);
            return getSubCategoryCharClass(sci / 2);
        case 1:
            int ci = categories.indexOf(propertyName.charAt(0));
            if (ci == Constants.STRING_NOT_FOUND)
                throw makeException("bad category", propertyName);
            return getCategoryCharClass(ci);
        default:
            if (!propertyName.startsWith("Is"))
                break;
            String blockName = propertyName.substring(2);
            // blocks with supplementary-plane content have hand-built classes
            for (int i = 0; i < specialBlockNames.length; i++)
                if (blockName.equals(specialBlockNames[i]))
                    return specialBlockCharClasses[i];
            if (!isBlock(blockName))
                throw makeException("bad block name", blockName);
            return new Property("In" + blockName);
    }
    throw makeException("bad property name", propertyName);
}
/**
 * Tells whether the given name appears in <code>blockNames</code>.
 *
 * @param name the candidate block name (without the "Is" prefix)
 * @return true if an entry of <code>blockNames</code> equals the name
 */
static private boolean isBlock(String name) {
    for (String known : blockNames) {
        if (name.equals(known)) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether a character is an ASCII letter (a-z, A-Z) or an ASCII digit (0-9).
 *
 * @param c the character to test
 * @return true for ASCII letters and digits, false for everything else
 */
static private boolean isAsciiAlnum(char c) {
    final boolean lower = c >= 'a' && c <= 'z';
    final boolean upper = c >= 'A' && c <= 'Z';
    final boolean digit = c >= '0' && c <= '9';
    return lower || upper || digit;
}
/**
 * Checks that the current character is the given one; does not consume it.
 *
 * @param c the character required at the current position
 * @throws RegexSyntaxException if <code>curChar</code> differs from <code>c</code>
 */
private void expect(char c) throws RegexSyntaxException {
    if (curChar == c) {
        return;
    }
    throw makeException("expected", String.valueOf(c));
}
/**
 * Parses the inside of a bracketed character class expression, including an
 * optional leading '^' (complement), ranges written as lower-upper, and an
 * optional trailing subtraction of a nested bracketed expression.
 *
 * @return the character class described by the expression
 * @throws RegexSyntaxException if a range end point is not a single character
 *         ("multi_range"), a range is reversed ("invalid_range"), or an
 *         expected bracket is missing
 */
private CharClass parseCharClassExpr() throws RegexSyntaxException {
    final boolean negated = (curChar == '^');
    if (negated) {
        advance();
    }

    final List parts = new Vector();
    do {
        final CharClass from = parseCharClassEscOrXmlChar();
        parts.add(from);
        if (curChar != '-') {
            continue;
        }
        advance();
        if (curChar == '[') {
            // "-[" introduces a subtraction; it is handled after the loop
            break;
        }
        final CharClass to = parseCharClassEscOrXmlChar();
        final int lo = from.singleChar();
        final int hi = to.singleChar();
        if (lo == Constants.STRING_NOT_FOUND || hi == Constants.STRING_NOT_FOUND) {
            throw makeException("multi_range");
        }
        if (lo > hi) {
            throw makeException("invalid_range");
        }
        // the lower bound added above is replaced by the complete range
        parts.set(parts.size() - 1, new CharRange(lo, hi));
        if (curChar == '-') {
            advance();
            expect('[');
            break;
        }
    } while (curChar != ']');

    CharClass parsed = (parts.size() == 1)
            ? (CharClass) parts.get(0)
            : new Union(parts);
    if (negated) {
        parsed = new Complement(parsed);
    }
    if (curChar == '[') {
        advance();
        parsed = new Subtraction(parsed, parseCharClassExpr());
        expect(']');
    }
    advance();
    return parsed;
}
/**
 * Parses one member of a character class expression: either an escape sequence
 * or a literal character (a surrogate pair counts as one character).
 *
 * @return the class for the escape, or a single-character class for a literal
 * @throws RegexSyntaxException if the input ends prematurely, if an unquoted
 *         '[', ']' or '-' is found ("should_quote"), or if a surrogate is
 *         unpaired or out of order ("invalid_surrogate")
 */
private CharClass parseCharClassEscOrXmlChar() throws RegexSyntaxException {
    final char first = curChar;
    if (first == EOS) {
        // only a genuine end of input is an error; expect(']') raises it
        if (eos) {
            expect(']');
        }
    } else if (first == '\\') {
        advance();
        return parseEsc();
    } else if (first == '[' || first == ']' || first == '-') {
        throw makeException("should_quote", String.valueOf(first));
    }

    if (!XMLChar.isSurrogate(first)) {
        final CharClass single = new SingleChar(first);
        advance();
        return single;
    }

    // a surrogate must be a high surrogate immediately followed by a low one
    if (!XMLChar.isHighSurrogate(first)) {
        throw makeException("invalid_surrogate");
    }
    advance();
    if (!XMLChar.isLowSurrogate(curChar)) {
        throw makeException("invalid_surrogate");
    }
    final CharClass wide = new WideSingleChar(XMLChar.supplemental(first, curChar));
    advance();
    return wide;
}
/**
 * Builds (but does not throw) a syntax exception whose message reports the
 * position <code>pos - 1</code> followed by the given text.
 *
 * @param key the description of the problem
 * @return the exception for the caller to throw
 */
private RegexSyntaxException makeException(String key) {
    final StringBuilder message = new StringBuilder("Error at character ");
    message.append(pos - 1);
    message.append(" in regular expression: ");
    message.append(key);
    return new RegexSyntaxException(message.toString());
}
/**
 * Builds (but does not throw) a syntax exception whose message reports the
 * position <code>pos - 1</code>, the given text, and a detail in parentheses.
 *
 * @param key the description of the problem
 * @param arg the detail appended in parentheses, e.g. the offending name
 * @return the exception for the caller to throw
 */
private RegexSyntaxException makeException(String key, String arg) {
    final StringBuilder message = new StringBuilder("Error at character ");
    message.append(pos - 1);
    message.append(" in regular expression: ");
    message.append(key);
    message.append(" (").append(arg).append(")");
    return new RegexSyntaxException(message.toString());
}
/**
 * Tells whether a character is one of the characters this translator treats
 * as a metacharacter of the Java regex syntax.
 *
 * @param c the character to test
 * @return true if the character is one of \ ^ ? * + ( ) { } | [ ] - &amp; $ .
 */
static private boolean isJavaMetaChar(char c) {
    final String metaChars = "\\^?*+(){}|[]-&$.";
    return metaChars.indexOf(c) >= 0;
}
/**
 * Returns the character class for the one-letter category at the given index
 * of <code>categories</code>, computing it on first use and storing it in
 * <code>categoryCharClasses</code>.
 *
 * @param ci index into <code>categories</code>
 * @return the stored (or freshly computed) class for that category
 */
static private synchronized CharClass getCategoryCharClass(int ci) {
    CharClass cached = categoryCharClasses[ci];
    if (cached == null) {
        cached = computeCategoryCharClass(categories.charAt(ci));
        categoryCharClasses[ci] = cached;
    }
    return cached;
}
/**
 * Returns the character class for the two-letter sub-category with the given
 * ordinal in <code>subCategories</code>, computing it on first use and storing
 * it in <code>subCategoryCharClasses</code>.
 *
 * @param sci ordinal of the sub-category (its string offset divided by two)
 * @return the stored (or freshly computed) class for that sub-category
 */
static private synchronized CharClass getSubCategoryCharClass(int sci) {
    CharClass cached = subCategoryCharClasses[sci];
    if (cached == null) {
        final int from = sci * 2;
        cached = computeSubCategoryCharClass(subCategories.substring(from, from + 2));
        subCategoryCharClasses[sci] = cached;
    }
    return cached;
}
// Corrections applied on top of the JDK's own category data when the category
// and sub-category classes are computed further down in this file.

// Single characters that are explicitly added to the Lu / Ll (and L) classes
// and excluded from the unassigned (Cn) class.
static private final char UNICODE_3_1_ADD_Lu = '\u03F4'; // added in 3.1
static private final char UNICODE_3_1_ADD_Ll = '\u03F5'; // added in 3.1
// 3 characters changed from No to Nl between 3.0 and 3.1
// (this range is added to the Nl class and subtracted from the No class)
static private final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
static private final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
// Explicit member lists for the Pi and Pf sub-categories; both lists are also
// added to the class computed for the general category P.
static private final String CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; // Java doesn't know about category Pi
static private final String CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; // Java doesn't know about category Pf
/**
 * Builds the character class for a one-letter general category.
 *
 * The class starts from the property of the same name and is extended with the
 * ranges in <code>CATEGORY_RANGES</code> for every entry of
 * <code>CATEGORY_NAMES</code> that contains the category letter, plus
 * category-specific additions for P, L and C.
 *
 * @param code the category letter
 * @return the property alone if nothing was added, otherwise a union of all parts
 */
static private CharClass computeCategoryCharClass(char code) {
    final List parts = new Vector();
    parts.add(new Property(String.valueOf(code)));

    // collect the extra ranges of every sub-category belonging to this category
    int at = CATEGORY_NAMES.indexOf(code);
    while (at >= 0) {
        final int[] extra = CATEGORY_RANGES[at / 2];
        for (int k = 0; k < extra.length; k += 2) {
            parts.add(new CharRange(extra[k], extra[k + 1]));
        }
        at = CATEGORY_NAMES.indexOf(code, at + 1);
    }

    switch (code) {
        case 'P':
            parts.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
            break;
        case 'L':
            parts.add(new SingleChar(UNICODE_3_1_ADD_Ll));
            parts.add(new SingleChar(UNICODE_3_1_ADD_Lu));
            break;
        case 'C': {
            // JDK 1.4 leaves Cn out of C?
            final CharClass[] addedLetters = new CharClass[]{
                    new SingleChar(UNICODE_3_1_ADD_Lu),
                    new SingleChar(UNICODE_3_1_ADD_Ll)};
            parts.add(new Subtraction(new Property("Cn"), new Union(addedLetters)));

            // everything outside the BMP that no range table claims
            final List assigned = new Vector();
            for (int r = 0; r < CATEGORY_RANGES.length; r++) {
                final int[] ranges = CATEGORY_RANGES[r];
                for (int k = 0; k < ranges.length; k += 2) {
                    assigned.add(new CharRange(ranges[k], ranges[k + 1]));
                }
            }
            parts.add(new Subtraction(new CharRange(NONBMP_MIN, NONBMP_MAX),
                    new Union(assigned)));
            break;
        }
        default:
            break;
    }

    return (parts.size() == 1) ? (CharClass) parts.get(0) : new Union(parts);
}
/**
 * Builds the character class for a two-letter sub-category.
 *
 * Names found in <code>CATEGORY_NAMES</code> get the property of the same name
 * extended with their <code>CATEGORY_RANGES</code> entry and the Unicode 3.1
 * corrections for Lu, Ll, Nl and No. Names not found there are returned as the
 * plain property, except Cn, Pi and Pf, which are built explicitly.
 *
 * @param name the two-letter sub-category name
 * @return the character class for that sub-category
 */
static private CharClass computeSubCategoryCharClass(String name) {
    final CharClass property = new Property(name);
    final int at = CATEGORY_NAMES.indexOf(name);

    if (at == Constants.STRING_NOT_FOUND) {
        if ("Cn".equals(name)) {
            // Unassigned
            final List assigned = new Vector();
            assigned.add(new SingleChar(UNICODE_3_1_ADD_Lu));
            assigned.add(new SingleChar(UNICODE_3_1_ADD_Ll));
            for (int r = 0; r < CATEGORY_RANGES.length; r++) {
                final int[] ranges = CATEGORY_RANGES[r];
                for (int k = 0; k < ranges.length; k += 2) {
                    assigned.add(new CharRange(ranges[k], ranges[k + 1]));
                }
            }
            final CharClass candidates = new Union(
                    new CharClass[]{property, new CharRange(NONBMP_MIN, NONBMP_MAX)});
            return new Subtraction(candidates, new Union(assigned));
        }
        if ("Pi".equals(name)) {
            return makeCharClass(CATEGORY_Pi);
        }
        if ("Pf".equals(name)) {
            return makeCharClass(CATEGORY_Pf);
        }
        return property;
    }

    final List parts = new Vector();
    parts.add(property);
    final int[] extra = CATEGORY_RANGES[at / 2];
    for (int k = 0; k < extra.length; k += 2) {
        parts.add(new CharRange(extra[k], extra[k + 1]));
    }

    if ("Lu".equals(name)) {
        parts.add(new SingleChar(UNICODE_3_1_ADD_Lu));
    } else if ("Ll".equals(name)) {
        parts.add(new SingleChar(UNICODE_3_1_ADD_Ll));
    } else if ("Nl".equals(name)) {
        parts.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    } else if ("No".equals(name)) {
        // the characters that moved to Nl must no longer match No
        final CharClass moved = new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
                UNICODE_3_1_CHANGE_No_to_Nl_MAX);
        return new Subtraction(new Union(parts), moved);
    }
    return new Union(parts);
}
/**
 * Builds a union containing one single-character class per character of the
 * given string, in string order.
 *
 * @param members the characters that make up the class
 * @return a union of the individual characters
 */
private static CharClass makeCharClass(String members) {
    final List singles = new Vector();
    for (char member : members.toCharArray()) {
        singles.add(new SingleChar(member));
    }
    return new Union(singles);
}
/**
 * Command-line helper: translates the regular expression given as the first
 * argument and prints the result to standard error. Characters outside the
 * printable ASCII range (0x20 to 0x7e) are printed as a backslash, the letter
 * u and four upper-case hexadecimal digits.
 *
 * The optional second argument selects the dialect: the literal "xpath"
 * enables XPath mode, any other value (or no value) selects XML Schema mode.
 * When no regular expression is supplied a usage line is printed instead of
 * failing with an ArrayIndexOutOfBoundsException.
 *
 * @param args the regular expression, optionally followed by "xpath"
 * @throws RegexSyntaxException if the regular expression cannot be translated
 */
public static void main(String[] args) throws RegexSyntaxException {
    if (args == null || args.length == 0) {
        System.err.println("Usage: RegexTranslator <regex> [xpath]");
        return;
    }
    final boolean xpath = args.length > 1 && "xpath".equals(args[1]);
    final String s = translate(args[0], xpath);

    final String hexDigits = "0123456789ABCDEF";
    for (int i = 0, len = s.length(); i < len; i++) {
        final char c = s.charAt(i);
        if (c >= 0x20 && c <= 0x7e) {
            System.err.print(c);
        } else {
            System.err.print("\\u");
            // emit the four nibbles of the char, most significant first
            for (int shift = 12; shift >= 0; shift -= 4) {
                System.err.print(hexDigits.charAt((c >> shift) & 0xF));
            }
        }
    }
    System.err.println();
}
//}
}
//
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
//
// The Original Code is: all this file except changes marked.
//
// The Initial Developer of the Original Code is James Clark
//
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
//
// Contributor(s): Michael Kay
//