package org.basex.query.util;
import static org.basex.query.util.Err.*;
import static org.basex.util.Token.*;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.basex.query.QueryException;
import org.basex.util.InputInfo;
import org.basex.util.TokenBuilder;
/**
 * Converts an XQuery regular expression and its modifier flags into a
 * compiled {@link Pattern}. Escapes are validated, Unicode block escapes
 * ({@code \p{IsName}} / {@code \P{IsName}}) that are listed in the block
 * table of this class are rewritten to explicit code point ranges, and
 * simple character class subtractions ({@code [abc-[b]]}) are resolved
 * before the pattern is handed to the Java regex engine.
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 */
public final class RegEx {
  /** Classes pattern: a letter range that directly follows an opening bracket. */
  private static final Pattern CLASSES =
    Pattern.compile(".*?\\[([a-zA-Z])-([a-zA-Z]).*");
  /** Excluded classes pattern: a character class subtraction. */
  private static final Pattern EXCLUDE =
    Pattern.compile(".*?\\[(.*?)-\\[(.*?)\\].*");
  /** Characters that are accepted after a backslash (besides {@code p/P}). */
  private static final String ESCAPES =
    "0123456789cCdDniIrsStwW|.-^$?*+{}()[]\\";

  /** Input info. */
  private final InputInfo input;
  /** Input pattern (never modified; all rewriting is done on local copies). */
  private final String pattern;

  /**
   * Constructor.
   * @param pat input pattern
   * @param ii input info
   */
  public RegEx(final String pat, final InputInfo ii) {
    pattern = pat;
    input = ii;
  }

  /**
   * Returns a regular expression pattern.
   * If the literal flag ({@code q}) is active, the input pattern is compiled
   * unchanged: whitespace stripping, escape validation and character class
   * rewriting only make sense for patterns that contain meta characters.
   * @param mod modifier item (may be {@code null})
   * @param ext XQuery 3.0 syntax (enables the {@code q} flag)
   * @return modified pattern
   * @throws QueryException query exception (unknown modifier, invalid escape,
   *   invalid character class or pattern that cannot be compiled)
   */
  public Pattern pattern(final byte[] mod,
      final boolean ext) throws QueryException {

    // process modifiers; whitespace stripping is deferred until all flags
    // are known, so that the result does not depend on the flag order
    int flags = Pattern.UNIX_LINES;
    boolean stripWs = false;
    if(mod != null) {
      for(final byte b : mod) {
        if(b == 'i') flags |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
        else if(b == 'm') flags |= Pattern.MULTILINE;
        else if(b == 's') flags |= Pattern.DOTALL;
        else if(b == 'q' && ext) flags |= Pattern.LITERAL;
        else if(b == 'x') stripWs = true;
        else REGMOD.thrw(input, (char) b);
      }
    }

    final boolean literal = (flags & Pattern.LITERAL) != 0;
    String pat = pattern;
    if(!literal) {
      if(stripWs) pat = stripWhitespace(pat);
      // raises query exceptions; must happen outside the try block below
      pat = escapes(pat);
    }

    try {
      if(!literal && pat.indexOf('[') != -1 && pat.indexOf('-') != -1) {
        // replace classes by single characters to support Unicode matches
        pat = expandRanges(pat);
        // remove excluded characters in classes
        pat = subtract(pat);
      }
      return Pattern.compile(pat, flags);
    } catch(final Exception ex) {
      // any failure of the Java regex engine is reported as invalid pattern
      throw REGINV.thrw(input, pat);
    }
  }

  /**
   * Removes whitespace from a pattern, except inside square brackets.
   * @param pat pattern
   * @return pattern without whitespace
   */
  private static String stripWhitespace(final String pat) {
    final StringBuilder sb = new StringBuilder();
    boolean cc = false;
    final int pl = pat.length();
    for(int s = 0; s < pl; s++) {
      final char c = pat.charAt(s);
      if(cc || !ws(c)) sb.append(c);
      if(c == '[') cc = true;
      else if(c == ']') cc = false;
    }
    return sb.toString();
  }

  /**
   * Validates all backslash escapes and rewrites known Unicode block escapes.
   * @param pat pattern
   * @return rewritten pattern
   * @throws QueryException query exception (invalid escape or class)
   */
  private String escapes(final String pat) throws QueryException {
    final StringBuilder sb = new StringBuilder();
    final int pl = pat.length();
    for(int i = 0; i < pl; ++i) {
      final char ch = pat.charAt(i);
      if(ch != '\\') {
        sb.append(ch);
        continue;
      }
      // backslash: inspect the escaped character (0 if the pattern ends)
      final char esc = next(pat, ++i);
      if(esc != 'p' && esc != 'P') {
        // plain escape: pass on unchanged if it is supported
        if(ESCAPES.indexOf(esc) == -1) REGESC.thrw(input, esc);
        sb.append(ch).append(esc);
      } else {
        // character class: \p{...} or \P{...}
        char c = next(pat, ++i);
        if(c != '{') REGCC.thrw(input, c);
        final StringBuilder name = new StringBuilder();
        while(true) {
          c = next(pat, ++i);
          // closing brace is missing
          if(c == 0) REGCC.thrw(input, c);
          if(c == '}') break;
          name.append(c);
        }
        sb.append(replace(name.toString(), esc));
      }
    }
    return sb.toString();
  }

  /**
   * Expands letter ranges that directly follow an opening bracket
   * (e.g. {@code [a-d}) to the enumerated characters ({@code [abcd}).
   * @param pat pattern
   * @return pattern with expanded ranges
   */
  private static String expandRanges(final String pat) {
    String p = pat;
    while(true) {
      final Matcher mt = CLASSES.matcher(p);
      if(!mt.matches()) return p;
      final char c1 = mt.group(1).charAt(0);
      final char c2 = mt.group(2).charAt(0);
      final TokenBuilder tb = new TokenBuilder("[");
      for(char c = c1; c <= c2; ++c) tb.add(c);
      // both range characters are ASCII letters: no quoting required
      p = p.replaceAll("\\[" + c1 + '-' + c2, tb.toString());
    }
  }

  /**
   * Resolves character class subtractions ({@code [abc-[b]]}) by removing
   * the excluded characters from the preceding character list.
   * @param pat pattern
   * @return pattern without subtractions
   */
  private static String subtract(final String pat) {
    String p = pat, old = "";
    // stop as soon as a replacement does not change the pattern anymore
    for(Matcher mt; (mt = EXCLUDE.matcher(p)).matches() && !old.equals(p);) {
      old = p;
      final String in = mt.group(1);
      final String ex = mt.group(2);
      String out = in;
      // literal removal: excluded characters may be regex meta characters
      for(int e = 0; e < ex.length(); ++e) {
        out = out.replace(ex.substring(e, e + 1), "");
      }
      // quote the class contents and the replacement, as both are plain text
      p = p.replaceAll("\\[" + Pattern.quote(in) + "-\\[.*?\\]",
          Matcher.quoteReplacement('[' + out));
    }
    return p;
  }

  /**
   * Returns the character at the specified position, or {@code 0} if the
   * position lies behind the end of the pattern.
   * @param pat pattern
   * @param pos position
   * @return next character
   */
  private static char next(final String pat, final int pos) {
    return pos < pat.length() ? pat.charAt(pos) : 0;
  }

  /**
   * Replaces a Unicode block escape with an explicit character class.
   * Names that do not start with {@code Is}, or that are not found in the
   * block table, are returned as the original escape sequence.
   * @param cls class name (text between the curly brackets)
   * @param incl flag for including/excluding characters ({@code p/P})
   * @return character class, or original escape sequence
   */
  private static String replace(final String cls, final char incl) {
    final int[] v = cls.startsWith("Is") ? MAP.get(cls.substring(2)) : null;
    if(v == null) return "\\" + incl + '{' + cls + '}';

    final TokenBuilder tb = new TokenBuilder().add('[');
    // Java negates a character class with a leading caret
    if(incl == 'P') tb.add('^');
    // the table stores pairs of range boundaries (first, last)
    for(int i = 0; i < v.length;) {
      tb.add(v[i++]);
      tb.add('-');
      tb.add(v[i++]);
    }
    return tb.add(']').toString();
  }

  /** Character class map: block name to code point range. */
  private static final HashMap<String, int[]> MAP =
    new HashMap<String, int[]>();

  /** Character classes: alternating block names and code point ranges. */
  private static final Object[] CLS = {
    "AegeanNumbers", new int[] { 0x10100, 0x1013F },
    "AlphabeticPresentationForms", new int[] { 0xFB00, 0xFB4F },
    "AncientGreekMusicalNotation", new int[] { 0x1D200, 0x1D24F },
    "AncientGreekNumbers", new int[] { 0x10140, 0x1018F },
    "AncientSymbols", new int[] { 0x10190, 0x101CF },
    "Arabic", new int[] { 0x0600, 0x06FF },
    "ArabicPresentationForms-A", new int[] { 0xFB50, 0xFDFF },
    "ArabicPresentationForms-B", new int[] { 0xFE70, 0xFEFF },
    "ArabicSupplement", new int[] { 0x0750, 0x077F },
    "Armenian", new int[] { 0x0530, 0x058F },
    "Arrows", new int[] { 0x2190, 0x21FF },
    "Avestan", new int[] { 0x10B00, 0x10B3F },
    "Balinese", new int[] { 0x1B00, 0x1B7F },
    "Bamum", new int[] { 0xA6A0, 0xA6FF },
    "BasicLatin", new int[] { 0x0000, 0x007F },
    "Bengali", new int[] { 0x0980, 0x09FF },
    "BlockElements", new int[] { 0x2580, 0x259F },
    "Bopomofo", new int[] { 0x3100, 0x312F },
    "BopomofoExtended", new int[] { 0x31A0, 0x31BF },
    "BoxDrawing", new int[] { 0x2500, 0x257F },
    "BraillePatterns", new int[] { 0x2800, 0x28FF },
    "Buginese", new int[] { 0x1A00, 0x1A1F },
    "Buhid", new int[] { 0x1740, 0x175F },
    "ByzantineMusicalSymbols", new int[] { 0x1D000, 0x1D0FF },
    "Carian", new int[] { 0x102A0, 0x102DF },
    "Cham", new int[] { 0xAA00, 0xAA5F },
    "Cherokee", new int[] { 0x13A0, 0x13FF },
    "CJKCompatibility", new int[] { 0x3300, 0x33FF },
    "CJKCompatibilityForms", new int[] { 0xFE30, 0xFE4F },
    "CJKCompatibilityIdeographs", new int[] { 0xF900, 0xFAFF },
    "CJKCompatibilityIdeographsSupplement", new int[] { 0x2F800, 0x2FA1F },
    "CJKRadicalsSupplement", new int[] { 0x2E80, 0x2EFF },
    "CJKStrokes", new int[] { 0x31C0, 0x31EF },
    "CJKSymbolsandPunctuation", new int[] { 0x3000, 0x303F },
    "CJKUnifiedIdeographs", new int[] { 0x4E00, 0x9FFF },
    "CJKUnifiedIdeographsExtensionA", new int[] { 0x3400, 0x4DBF },
    "CJKUnifiedIdeographsExtensionB", new int[] { 0x20000, 0x2A6DF },
    "CJKUnifiedIdeographsExtensionC", new int[] { 0x2A700, 0x2B73F },
    "CombiningDiacriticalMarks", new int[] { 0x0300, 0x036F },
    "CombiningDiacriticalMarksforSymbols", new int[] { 0x20D0, 0x20FF },
    "CombiningDiacriticalMarksSupplement", new int[] { 0x1DC0, 0x1DFF },
    "CombiningHalfMarks", new int[] { 0xFE20, 0xFE2F },
    "CombiningMarksforSymbols", new int[] { 0x20D0, 0x20FF },
    "CommonIndicNumberForms", new int[] { 0xA830, 0xA83F },
    "ControlPictures", new int[] { 0x2400, 0x243F },
    "Coptic", new int[] { 0x2C80, 0x2CFF },
    "CountingRodNumerals", new int[] { 0x1D360, 0x1D37F },
    "Cuneiform", new int[] { 0x12000, 0x123FF },
    "CuneiformNumbersandPunctuation", new int[] { 0x12400, 0x1247F },
    "CurrencySymbols", new int[] { 0x20A0, 0x20CF },
    "CypriotSyllabary", new int[] { 0x10800, 0x1083F },
    "Cyrillic", new int[] { 0x0400, 0x04FF },
    "CyrillicExtended-A", new int[] { 0x2DE0, 0x2DFF },
    "CyrillicExtended-B", new int[] { 0xA640, 0xA69F },
    "CyrillicSupplement", new int[] { 0x0500, 0x052F },
    "Deseret", new int[] { 0x10400, 0x1044F },
    "Devanagari", new int[] { 0x0900, 0x097F },
    "DevanagariExtended", new int[] { 0xA8E0, 0xA8FF },
    "Dingbats", new int[] { 0x2700, 0x27BF },
    "DominoTiles", new int[] { 0x1F030, 0x1F09F },
    "EgyptianHieroglyphs", new int[] { 0x13000, 0x1342F },
    "EnclosedAlphanumerics", new int[] { 0x2460, 0x24FF },
    "EnclosedAlphanumericSupplement", new int[] { 0x1F100, 0x1F1FF },
    "EnclosedCJKLettersandMonths", new int[] { 0x3200, 0x32FF },
    "EnclosedIdeographicSupplement", new int[] { 0x1F200, 0x1F2FF },
    "Ethiopic", new int[] { 0x1200, 0x137F },
    "EthiopicExtended", new int[] { 0x2D80, 0x2DDF },
    "EthiopicSupplement", new int[] { 0x1380, 0x139F },
    "GeneralPunctuation", new int[] { 0x2000, 0x206F },
    "GeometricShapes", new int[] { 0x25A0, 0x25FF },
    "Georgian", new int[] { 0x10A0, 0x10FF },
    "GeorgianSupplement", new int[] { 0x2D00, 0x2D2F },
    "Glagolitic", new int[] { 0x2C00, 0x2C5F },
    "Gothic", new int[] { 0x10330, 0x1034F },
    "Greek", new int[] { 0x0370, 0x03FF },
    "GreekandCoptic", new int[] { 0x0370, 0x03FF },
    "GreekExtended", new int[] { 0x1F00, 0x1FFF },
    "Gujarati", new int[] { 0x0A80, 0x0AFF },
    "Gurmukhi", new int[] { 0x0A00, 0x0A7F },
    "HalfwidthandFullwidthForms", new int[] { 0xFF00, 0xFFEF },
    "HangulCompatibilityJamo", new int[] { 0x3130, 0x318F },
    "HangulJamo", new int[] { 0x1100, 0x11FF },
    "HangulJamoExtended-A", new int[] { 0xA960, 0xA97F },
    "HangulJamoExtended-B", new int[] { 0xD7B0, 0xD7FF },
    "HangulSyllables", new int[] { 0xAC00, 0xD7AF },
    "Hanunoo", new int[] { 0x1720, 0x173F },
    "Hebrew", new int[] { 0x0590, 0x05FF },
    "HighPrivateUseSurrogates", new int[] { 0xDB80, 0xDBFF },
    "HighSurrogates", new int[] { 0xD800, 0xDB7F },
    "Hiragana", new int[] { 0x3040, 0x309F },
    "IdeographicDescriptionCharacters", new int[] { 0x2FF0, 0x2FFF },
    "ImperialAramaic", new int[] { 0x10840, 0x1085F },
    "InscriptionalPahlavi", new int[] { 0x10B60, 0x10B7F },
    "InscriptionalParthian", new int[] { 0x10B40, 0x10B5F },
    "IPAExtensions", new int[] { 0x0250, 0x02AF },
    "Javanese", new int[] { 0xA980, 0xA9DF },
    "Kaithi", new int[] { 0x11080, 0x110CF },
    "Kanbun", new int[] { 0x3190, 0x319F },
    "KangxiRadicals", new int[] { 0x2F00, 0x2FDF },
    "Kannada", new int[] { 0x0C80, 0x0CFF },
    "Katakana", new int[] { 0x30A0, 0x30FF },
    "KatakanaPhoneticExtensions", new int[] { 0x31F0, 0x31FF },
    "KayahLi", new int[] { 0xA900, 0xA92F },
    "Kharoshthi", new int[] { 0x10A00, 0x10A5F },
    "Khmer", new int[] { 0x1780, 0x17FF },
    "KhmerSymbols", new int[] { 0x19E0, 0x19FF },
    "Lao", new int[] { 0x0E80, 0x0EFF },
    "Latin-1Supplement", new int[] { 0x0080, 0x00FF },
    "LatinExtended-A", new int[] { 0x0100, 0x017F },
    "LatinExtendedAdditional", new int[] { 0x1E00, 0x1EFF },
    "LatinExtended-B", new int[] { 0x0180, 0x024F },
    "LatinExtended-C", new int[] { 0x2C60, 0x2C7F },
    "LatinExtended-D", new int[] { 0xA720, 0xA7FF },
    "Lepcha", new int[] { 0x1C00, 0x1C4F },
    "LetterlikeSymbols", new int[] { 0x2100, 0x214F },
    "Limbu", new int[] { 0x1900, 0x194F },
    "LinearBIdeograms", new int[] { 0x10080, 0x100FF },
    "LinearBSyllabary", new int[] { 0x10000, 0x1007F },
    "Lisu", new int[] { 0xA4D0, 0xA4FF },
    "LowSurrogates", new int[] { 0xDC00, 0xDFFF },
    "Lycian", new int[] { 0x10280, 0x1029F },
    "Lydian", new int[] { 0x10920, 0x1093F },
    "MahjongTiles", new int[] { 0x1F000, 0x1F02F },
    "Malayalam", new int[] { 0x0D00, 0x0D7F },
    "MathematicalAlphanumericSymbols", new int[] { 0x1D400, 0x1D7FF },
    "MathematicalOperators", new int[] { 0x2200, 0x22FF },
    "MeeteiMayek", new int[] { 0xABC0, 0xABFF },
    "MiscellaneousMathematicalSymbols-A", new int[] { 0x27C0, 0x27EF },
    "MiscellaneousMathematicalSymbols-B", new int[] { 0x2980, 0x29FF },
    "MiscellaneousSymbols", new int[] { 0x2600, 0x26FF },
    "MiscellaneousSymbolsandArrows", new int[] { 0x2B00, 0x2BFF },
    "MiscellaneousTechnical", new int[] { 0x2300, 0x23FF },
    "ModifierToneLetters", new int[] { 0xA700, 0xA71F },
    "Mongolian", new int[] { 0x1800, 0x18AF },
    "MusicalSymbols", new int[] { 0x1D100, 0x1D1FF },
    "Myanmar", new int[] { 0x1000, 0x109F },
    "MyanmarExtended-A", new int[] { 0xAA60, 0xAA7F },
    "NewTaiLue", new int[] { 0x1980, 0x19DF },
    "NKo", new int[] { 0x07C0, 0x07FF },
    "NumberForms", new int[] { 0x2150, 0x218F },
    "Ogham", new int[] { 0x1680, 0x169F },
    "OlChiki", new int[] { 0x1C50, 0x1C7F },
    "OldItalic", new int[] { 0x10300, 0x1032F },
    "OldPersian", new int[] { 0x103A0, 0x103DF },
    "OldSouthArabian", new int[] { 0x10A60, 0x10A7F },
    "OldTurkic", new int[] { 0x10C00, 0x10C4F },
    "OpticalCharacterRecognition", new int[] { 0x2440, 0x245F },
    "Oriya", new int[] { 0x0B00, 0x0B7F },
    "Osmanya", new int[] { 0x10480, 0x104AF },
    "Phags-pa", new int[] { 0xA840, 0xA87F },
    "PhaistosDisc", new int[] { 0x101D0, 0x101FF },
    "Phoenician", new int[] { 0x10900, 0x1091F },
    "PhoneticExtensions", new int[] { 0x1D00, 0x1D7F },
    "PhoneticExtensionsSupplement", new int[] { 0x1D80, 0x1DBF },
    "PrivateUse", new int[] { 0xE000, 0xF8FF },
    "PrivateUseArea", new int[] { 0xE000, 0xF8FF },
    "Rejang", new int[] { 0xA930, 0xA95F },
    "RumiNumeralSymbols", new int[] { 0x10E60, 0x10E7F },
    "Runic", new int[] { 0x16A0, 0x16FF },
    "Samaritan", new int[] { 0x0800, 0x083F },
    "Saurashtra", new int[] { 0xA880, 0xA8DF },
    "Shavian", new int[] { 0x10450, 0x1047F },
    "Sinhala", new int[] { 0x0D80, 0x0DFF },
    "SmallFormVariants", new int[] { 0xFE50, 0xFE6F },
    "SpacingModifierLetters", new int[] { 0x02B0, 0x02FF },
    "Specials", new int[] { 0xFFF0, 0xFFFF },
    "Sundanese", new int[] { 0x1B80, 0x1BBF },
    "SuperscriptsandSubscripts", new int[] { 0x2070, 0x209F },
    "SupplementalArrows-A", new int[] { 0x27F0, 0x27FF },
    "SupplementalArrows-B", new int[] { 0x2900, 0x297F },
    "SupplementalMathematicalOperators", new int[] { 0x2A00, 0x2AFF },
    "SupplementalPunctuation", new int[] { 0x2E00, 0x2E7F },
    "SupplementaryPrivateUseArea-A", new int[] { 0xF0000, 0xFFFFF },
    "SupplementaryPrivateUseArea-B", new int[] { 0x100000, 0x10FFFF },
    "SylotiNagri", new int[] { 0xA800, 0xA82F },
    "Syriac", new int[] { 0x0700, 0x074F },
    "Tagalog", new int[] { 0x1700, 0x171F },
    "Tagbanwa", new int[] { 0x1760, 0x177F },
    "Tags", new int[] { 0xE0000, 0xE007F },
    "TaiLe", new int[] { 0x1950, 0x197F },
    "TaiTham", new int[] { 0x1A20, 0x1AAF },
    "TaiViet", new int[] { 0xAA80, 0xAADF },
    "TaiXuanJingSymbols", new int[] { 0x1D300, 0x1D35F },
    "Tamil", new int[] { 0x0B80, 0x0BFF },
    "Telugu", new int[] { 0x0C00, 0x0C7F },
    "Thaana", new int[] { 0x0780, 0x07BF },
    "Thai", new int[] { 0x0E00, 0x0E7F },
    "Tibetan", new int[] { 0x0F00, 0x0FFF },
    "Tifinagh", new int[] { 0x2D30, 0x2D7F },
    "Ugaritic", new int[] { 0x10380, 0x1039F },
    "UnifiedCanadianAboriginalSyllabics", new int[] { 0x1400, 0x167F },
    "UnifiedCanadianAboriginalSyllabicsExtended", new int[] { 0x18B0, 0x18FF },
    "Vai", new int[] { 0xA500, 0xA63F },
    "VariationSelectors", new int[] { 0xFE00, 0xFE0F },
    "VariationSelectorsSupplement", new int[] { 0xE0100, 0xE01EF },
    "VedicExtensions", new int[] { 0x1CD0, 0x1CFF },
    "VerticalForms", new int[] { 0xFE10, 0xFE1F },
    "YijingHexagramSymbols", new int[] { 0x4DC0, 0x4DFF },
    "YiRadicals", new int[] { 0xA490, 0xA4CF },
    "YiSyllables", new int[] { 0xA000, 0xA48F },
  };

  // populate the lookup map from the name/range pairs of the table
  static {
    for(int s = 0; s < CLS.length; s += 2) {
      MAP.put((String) CLS[s], (int[]) CLS[s + 1]);
    }
  }
}