/**
 * Copyright (c) 2012-2016 André Bargull
 * Alle Rechte vorbehalten / All Rights Reserved.  Use is subject to license terms.
 *
 * <https://github.com/anba/es6draft>
 */
package com.github.anba.es6draft.regexp;

import static com.github.anba.es6draft.parser.Characters.*;

import java.util.Arrays;
import java.util.BitSet;
import java.util.regex.Pattern;

import org.joni.Config;

import com.github.anba.es6draft.parser.ParserException;
import com.github.anba.es6draft.parser.ParserException.ExceptionType;
import com.github.anba.es6draft.runtime.internal.CompatibilityOption;
import com.github.anba.es6draft.runtime.internal.Messages;

/**
 * <h1>21 Text Processing</h1><br>
 * <h2>21.2 RegExp (Regular Expression) Objects</h2>
 * <ul>
 * <li>21.2.1 Patterns
 * <li>21.2.2 Pattern Semantics
 * </ul>
 * <p>
 * Single-pass (with at most one restart) recursive-descent style parser which validates an ECMAScript regular
 * expression source string and, while doing so, writes a translated pattern into an internal buffer. The translated
 * pattern, the computed flags and the set of capturing groups closed inside a negative lookahead are handed to
 * {@code JoniRegExpMatcher} by {@link #parse(String, String, String, int, int, boolean)}.
 */
public final class RegExpParser {
    // Upper bound for back-reference numbers and for the number of capturing groups.
    private static final int BACKREF_LIMIT = 0xFFFF;
    // Upper bound for the nesting depth of groups.
    private static final int DEPTH_LIMIT = 0xFFFF;
    // Lookup table used by toHexDigit(); indexed by the low four bits of the argument.
    private static final char[] HEXDIGITS = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
            'C', 'D', 'E', 'F' };

    // CharacterClass \w ~ [a-zA-Z0-9_] and LATIN SMALL LETTER LONG S and KELVIN SIGN
    private static final String characterClass_wu = "a-zA-Z0-9_\\u017f\\u212a";
    // CharacterClass \W ~ [\u0000-\u002F\u003A-\u0040\u005B-\u005E\u0060\u007B-\x{10ffff}]
    private static final String characterClass_Wu = "\\u0000-\\u002F\\u003A-\\u0040\\u005B-\\u005E\\u0060\\u007B-\\x{10ffff}\\x53\\x73\\x4B\\x6B";
    // [] => matches nothing
    private static final String emptyCharacterClass = "(?:\\Z )";
    // [^] => matches everything
    private static final String emptyNegCharacterClass = "(?s:.)";

    // The regular expression source text and its cached length.
    private final String source;
    private final int length;
    // Source location of the regular expression; only used to build error positions.
    private final String sourceFile;
    private final int sourceLine;
    private final int sourceColumn;
    // Bit set of java.util.regex.Pattern flag constants, see toFlags().
    private final int flags;
    // Whether the web-compatibility pattern extensions are enabled.
    private final boolean webRegExp;
    // Receives the translated pattern.
    private final StringBuilder out;

    // Current source position.
    private int pos = 0;

    // Map of groups created within negative lookahead.
    private final BitSet negativeLAGroups = new BitSet();

    /**
     * Creates a new parser. The flags string is validated and converted immediately, so an invalid or duplicated flag
     * is reported from within the constructor.
     *
     * @param source
     *            the regular expression pattern source
     * @param flags
     *            the regular expression flags
     * @param sourceFile
     *            the source file name used in error reports
     * @param sourceLine
     *            the source line used in error reports
     * @param sourceColumn
     *            the source column of the pattern start; the current parse position is added to it in error reports
     * @param webRegExp
     *            {@code true} to enable the web-compatibility pattern extensions
     */
    private RegExpParser(String source, String flags, String sourceFile, int sourceLine, int sourceColumn,
            boolean webRegExp) {
        this.source = source;
        this.length = source.length();
        this.sourceFile = sourceFile;
        this.sourceLine = sourceLine;
        this.sourceColumn = sourceColumn;
        // Call after source information was set
        this.flags = toFlags(flags);
        this.webRegExp = webRegExp;
        this.out = new StringBuilder(length);
    }

    /**
     * Parses the pattern and flags and returns a matcher for the translated pattern.
     *
     * @param pattern
     *            the regular expression pattern source
     * @param flags
     *            the regular expression flags
     * @param sourceFile
     *            the source file name used in error reports
     * @param sourceLine
     *            the source line used in error reports
     * @param sourceColumn
     *            the source column used in error reports
     * @param webRegExp
     *            {@code true} to enable the web-compatibility pattern extensions
     * @return the matcher created from the translated pattern, the flags and the negative lookahead group set
     * @throws ParserException
     *             if the pattern or the flags are not valid
     */
    public static RegExpMatcher parse(String pattern, String flags, String sourceFile, int sourceLine,
            int sourceColumn, boolean webRegExp) throws ParserException {
        RegExpParser parser = new RegExpParser(pattern, flags, sourceFile, sourceLine, sourceColumn, webRegExp);
        parser.pattern();
        return new JoniRegExpMatcher(parser.out.toString(), parser.flags, parser.negativeLAGroups);
    }

    /**
     * Parses the pattern and flags only for their syntax checks; the translated pattern is discarded and no matcher
     * is created.
     *
     * @param pattern
     *            the regular expression pattern source
     * @param flags
     *            the regular expression flags
     * @param sourceFile
     *            the source file name used in error reports
     * @param sourceLine
     *            the source line used in error reports
     * @param sourceColumn
     *            the source column used in error reports
     * @param webRegExp
     *            {@code true} to enable the web-compatibility pattern extensions
     * @throws ParserException
     *             if the pattern or the flags are not valid
     */
    public static void syntaxParse(String pattern, String flags, String sourceFile, int sourceLine, int sourceColumn,
            boolean webRegExp) throws ParserException {
        RegExpParser parser = new RegExpParser(pattern, flags, sourceFile, sourceLine, sourceColumn, webRegExp);
        parser.pattern();
    }

    /**
     * Throws a syntax error located at the current parse position. This method never returns normally; the declared
     * return type only allows callers to write {@code throw error(...)}.
     *
     * @param messageKey
     *            the error message key
     * @param args
     *            the error message arguments
     * @return never returns
     */
    private ParserException error(Messages.Key messageKey, String... args) {
        throw new ParserException(ExceptionType.SyntaxError, sourceFile, sourceLine, sourceColumn + pos, messageKey,
                args);
    }

    /**
     * Throws a syntax error located {@code offset} characters after the current parse position, using the offending
     * character as the single message argument. This method never returns normally.
     *
     * @param messageKey
     *            the error message key
     * @param offset
     *            the offset added to the current position for the reported column
     * @param offending
     *            the character reported in the error message
     * @return never returns
     */
    private ParserException error(Messages.Key messageKey, int offset, char offending) {
        throw new ParserException(ExceptionType.SyntaxError, sourceFile, sourceLine, sourceColumn + pos + offset,
                messageKey, String.valueOf(offending));
    }

    /**
     * Validates the flags string and converts it into {@link Pattern} flag bits. Unknown and duplicated flags are
     * rejected. Only {@code i}, {@code u} and {@code m} contribute to the returned value; {@code g} and {@code y} are
     * accepted but do not set any bit here.
     *
     * @param flags
     *            the regular expression flags
     * @return bitwise combination of {@link Pattern#CASE_INSENSITIVE}, {@link Pattern#UNICODE_CASE} and
     *         {@link Pattern#MULTILINE}
     */
    private int toFlags(String flags) {
        // flags :: g | i | m | u | y
        final int global = 0b00001, ignoreCase = 0b00010, multiline = 0b00100, unicode = 0b01000, sticky = 0b10000;
        // flags seen so far, used to detect duplicates
        int mask = 0b00000;
        for (int i = 0, len = flags.length(); i < len; ++i) {
            char c = flags.charAt(i);
            int flag;
            String name;
            switch (c) {
            case 'g':
                flag = global;
                name = "global";
                break;
            case 'i':
                flag = ignoreCase;
                name = "ignoreCase";
                break;
            case 'm':
                flag = multiline;
                name = "multiline";
                break;
            case 'u':
                flag = unicode;
                name = "unicode";
                break;
            case 'y':
                flag = sticky;
                name = "sticky";
                break;
            default:
                throw error(Messages.Key.RegExpInvalidFlag, String.valueOf(c));
            }
            if ((mask & flag) == 0) {
                mask |= flag;
            } else {
                throw error(Messages.Key.RegExpDuplicateFlag, name);
            }
        }

        int iflags = 0;
        if ((mask & ignoreCase) != 0) {
            iflags |= Pattern.CASE_INSENSITIVE;
        }
        if ((mask & unicode) != 0) {
            iflags |= Pattern.UNICODE_CASE;
        }
        if ((mask & multiline) != 0) {
            iflags |= Pattern.MULTILINE;
        }
        return iflags;
    }

    /**
     * Returns {@code true} if regular expression patterns support the {@link CompatibilityOption#WebRegularExpressions}
     * extension.
     *
     * @return {@code true} if patterns are in web-compatibility mode
     */
    private boolean isWebRegularExpression() {
        return webRegExp;
    }

    /**
     * @return {@code true} if the {@code i} flag was given
     */
    private boolean isIgnoreCase() {
        return (flags & Pattern.CASE_INSENSITIVE) != 0;
    }

    /**
     * @return {@code true} if the {@code u} flag was given
     */
    private boolean isUnicode() {
        return (flags & Pattern.UNICODE_CASE) != 0;
    }

    /**
     * @return {@code true} if the {@code m} flag was given
     */
    private boolean isMultiline() {
        return (flags & Pattern.MULTILINE) != 0;
    }

    /**
     * Moves the parse position to {@code p}.
     *
     * @param p
     *            the new source position
     */
    private void reset(int p) {
        pos = p;
    }

    /**
     * Returns the character {@code i} positions ahead of the current position without consuming it.
     *
     * @param i
     *            the look-ahead distance
     * @return the character at that position, or {@code '\0'} if the position is at or past the end of the source
     */
    private char peek(int i) {
        return pos + i < length ? source.charAt(pos + i) : '\0';
    }

    /**
     * Consumes and returns the next character. No end-of-input check is performed here, callers are expected to have
     * checked {@link #eof()} or {@link #peek(int)} beforehand.
     *
     * @return the consumed character
     */
    private char get() {
        return source.charAt(pos++);
    }

    /**
     * Consumes and returns the next character. If {@code unicode} is {@code true} and the next two characters form a
     * surrogate pair, both are consumed and the combined code point is returned.
     *
     * @param unicode
     *            {@code true} to combine surrogate pairs into a single code point
     * @return the consumed character or code point
     */
    private int get(boolean unicode) {
        char c = source.charAt(pos++);
        if (unicode && Character.isHighSurrogate(c) && Character.isLowSurrogate(peek(0))) {
            return Character.toCodePoint(c, get());
        }
        return c;
    }

    /**
     * Consumes the next character if it is equal to {@code c}.
     *
     * @param c
     *            the expected character
     * @return {@code true} if the character was consumed
     */
    private boolean match(char c) {
        if (c == peek(0)) {
            get();
            return true;
        }
        return false;
    }

    /**
     * Consumes the next character and reports a syntax error if it is not equal to {@code c}.
     *
     * @param c
     *            the expected character
     * @return the character {@code c}
     */
    private char mustMatch(char c) {
        if (c != get()) {
            throw error(Messages.Key.RegExpUnexpectedCharacter, String.valueOf(c));
        }
        return c;
    }

    /**
     * @return {@code true} if the whole source has been consumed
     */
    private boolean eof() {
        return pos >= length;
    }

    /**
     * <pre>
     * DecimalDigits ::
     *     DecimalDigit
     *     DecimalDigits DecimalDigit
     * </pre>
     * <p>
     * Returns {@code -1} without consuming input if the next character is not a decimal digit. A syntax error is
     * reported as soon as the accumulated value reaches {@code 0xFFFF_FFFF}.
     *
     * @return the parsed decimal integer value
     */
    private long decimal() {
        if (!isDecimalDigit(peek(0))) {
            return -1;
        }
        long num = get() - '0';
        for (;;) {
            if (!isDecimalDigit(peek(0))) {
                return num;
            }
            num = num * 10 + (get() - '0');
            if (num >= 0xFFFF_FFFFL) {
                throw error(Messages.Key.RegExpInvalidQuantifier);
            }
        }
    }

    /**
     * Reads an octal escape sequence of one to three digits. The first character is consumed unconditionally, so the
     * caller must have verified that it is a digit. A third digit is only consumed when the value of the first two
     * digits does not exceed {@code 037}, which keeps the result within {@code 0377}.
     *
     * @return the numeric value of the escape sequence
     */
    private int readOctalEscapeSequence() {
        int num = get() - '0';
        if (isOctalDigit(peek(0))) {
            num = num * 8 + (get() - '0');
            if (num <= 037) {
                if (isOctalDigit(peek(0))) {
                    num = num * 8 + (get() - '0');
                }
            }
        }
        assert 0 <= num && num <= 0377;
        return num;
    }

    /**
     * Reads a decimal escape. The first character is consumed unconditionally. Once the accumulated value exceeds
     * {@link #BACKREF_LIMIT}, the value is clamped to the limit and no further digits are consumed.
     *
     * @return the numeric value of the escape, at most {@link #BACKREF_LIMIT}
     */
    private int readDecimalEscape() {
        int num = get() - '0';
        for (;;) {
            if (!isDecimalDigit(peek(0))) {
                break;
            }
            num = num * 10 + (get() - '0');
            if (num > BACKREF_LIMIT) {
                num = BACKREF_LIMIT;
                break;
            }
        }
        return num;
    }

    /**
     * Reads the two hexadecimal digits of a hex escape sequence. If the result is negative, the source position is
     * restored to where it was on entry.
     *
     * @return the value of the escape sequence, or a negative value if it is invalid
     */
    private int readHexEscapeSequence() {
        int start = pos;
        int c = hex2Digits();
        if (c < 0) {
            // invalid hex escape sequence, discard parsed characters
            reset(start);
        }
        return c;
    }

    /**
     * Reads the four hexadecimal digits of a unicode escape sequence. In unicode mode, a high surrogate which is
     * directly followed by an escaped low surrogate is combined with it into a single code point; otherwise any
     * characters consumed while looking for the low surrogate are given back.
     *
     * @return the code unit or code point, or a negative value if the escape sequence is invalid
     */
    private int readUnicodeEscapeSequence() {
        int start = pos;
        int c = hex4Digits();
        if (c < 0) {
            // invalid unicode escape sequence, discard parsed characters
            reset(start);
            return c;
        }
        if (isUnicode() && Character.isHighSurrogate((char) c)) {
            int startLow = pos;
            if (match('\\') && match('u')) {
                int d = hex4Digits();
                if (Character.isLowSurrogate((char) d)) {
                    return Character.toCodePoint((char) c, (char) d);
                }
            }
            // lone high surrogate, discard parsed characters
            reset(startLow);
        }
        return c;
    }

    /**
     * Reads the remainder of an extended unicode escape sequence after the opening <code>{</code> has been consumed,
     * up to and including the closing <code>}</code>. An empty sequence, a missing closing brace and a value which is
     * not a valid code point are reported as syntax errors.
     *
     * @return the code point value
     */
    private int readExtendedUnicodeEscapeSequence() {
        assert isUnicode();
        if (eof() || match('}')) {
            throw error(Messages.Key.InvalidUnicodeEscape);
        }
        int c = 0;
        for (char d; (d = get()) != '}';) {
            // NOTE(review): relies on hexDigit() returning a negative value for non-hex characters so that the
            // code point validity check below rejects them - confirm against Characters.hexDigit
            c = (c << 4) | hexDigit(d);
            if (!Character.isValidCodePoint(c) || eof()) {
                throw error(Messages.Key.InvalidUnicodeEscape);
            }
        }
        return c;
    }

    /**
     * Consumes two characters and combines their hexadecimal digit values.
     *
     * @return the combined value, or {@code -1} if fewer than two characters are left
     */
    private int hex2Digits() {
        if (pos + 2 <= length) {
            return (hexDigit(get()) << 4) | hexDigit(get());
        }
        return -1;
    }

    /**
     * Consumes four characters and combines their hexadecimal digit values.
     *
     * @return the combined value, or {@code -1} if fewer than four characters are left
     */
    private int hex4Digits() {
        if (pos + 4 <= length) {
            return (hexDigit(get()) << 12) | (hexDigit(get()) << 8) | (hexDigit(get()) << 4) | hexDigit(get());
        }
        return -1;
    }

    /**
     * <pre>
     * CharacterClass<span><sub>[U]</sub></span> ::
     *     [ [LA ∉ {<b>^</b>}] ClassRanges<span><sub>[?U]</sub></span> ]
     *     [ <b>^</b> ClassRanges<span><sub>[?U]</sub></span> ]
     * ClassRanges<span><sub>[U]</sub></span> ::
     *     [empty]
     *     NonemptyClassRanges<span><sub>[?U]</sub></span>
     * NonemptyClassRanges<span><sub>[U]</sub></span> ::
     *     ClassAtom<span><sub>[?U]</sub></span>
     *     ClassAtom<span><sub>[?U]</sub></span> NonemptyClassRangesNoDash<span><sub>[?U]</sub></span>
     *     ClassAtom<span><sub>[?U]</sub></span> <b>-</b> ClassAtom<span><sub>[?U]</sub></span> ClassRanges<span><sub>[?U]</sub></span>
     *
     *     <span><sub>[+U]</sub></span> ClassAtom<span><sub>[U]</sub></span> <b>-</b> ClassAtom<span><sub>[U]</sub></span> ClassRanges<span><sub>[U]</sub></span>
     *     <span><sub>[~U]</sub></span> ClassAtomInRange <b>-</b> ClassAtomInRange ClassRanges
     * NonemptyClassRangesNoDash<span><sub>[U]</sub></span> ::
     *     ClassAtom<span><sub>[?U]</sub></span>
     *     ClassAtomNoDash<span><sub>[?U]</sub></span> NonemptyClassRangesNoDash<span><sub>[?U]</sub></span>
     *     ClassAtomNoDash<span><sub>[?U]</sub></span> <b>-</b> ClassAtom<span><sub>[?U]</sub></span> ClassRanges<span><sub>[?U]</sub></span>
     *
     *     <span><sub>[+U]</sub></span> ClassAtomNoDash<span><sub>[U]</sub></span> <b>-</b> ClassAtom<span><sub>[U]</sub></span> ClassRanges<span><sub>[U]</sub></span>
     *     <span><sub>[~U]</sub></span> ClassAtomNoDashInRange <b>-</b> ClassAtomInRange ClassRanges
     * ClassAtom<span><sub>[U]</sub></span> ::
     *     <b>-</b>
     *     ClassAtomNoDash<span><sub>[?U]</sub></span>
     * ClassAtomNoDash<span><sub>[U]</sub></span> ::
     *     SourceCharacter <b>but not one of \ or ] or -</b>
     *     <b>\</b> ClassEscape<span><sub>[?U]</sub></span>
     * ClassAtomInRange ::
     *     <b>-</b>
     *     ClassAtomNoDashInRange
     * ClassAtomNoDashInRange ::
     *     SourceCharacter <b>but not one of \ or ] or -</b>
     *     <b>\</b> ClassEscape but only if ClassEscape evaluates to a CharSet with exactly one character
     *     <b>\</b> IdentityEscape
     * ClassEscape<span><sub>[U]</sub></span> ::
     *     DecimalEscape
     *     <b>b</b>
     *     CharacterEscape<span><sub>[?U]</sub></span>
     *     CharacterClassEscape
     *
     *     <span><sub>[+U]</sub></span> DecimalEscape
     *     <span><sub>[~U]</sub></span> DecimalEscape but only if integer value of DecimalEscape is &lt;= NCapturingParens
     *     <b>b</b>
     *     <span><sub>[+U]</sub></span> CharacterEscape<span><sub>[U]</sub></span>
     *     <span><sub>[+U]</sub></span> CharacterClassEscape
     *     <span><sub>[~U]</sub></span> CharacterClassEscape
     *     <span><sub>[~U]</sub></span> CharacterEscape
     * </pre>
     * <p>
     * Parses a character class after the opening {@code [} has been consumed and appends its translation to the
     * output, up to and including the closing {@code ]}.
     */
    private void characterClass() {
        final StringBuilder out = this.out;
        boolean negation = match('^');
        if (match(']')) {
            // empty character class
            out.append(!negation ? emptyCharacterClass : emptyNegCharacterClass);
            return;
        }
        out.append('[');
        if (negation) {
            out.append('^');
        }
        final boolean unicode = isUnicode();
        final boolean web = isWebRegularExpression();
        // character value of the atom which started the current range
        int rangeStartCV = 0;
        // inRange: the previous atom was followed by a range "-"
        // inCCRange: the current range involves a character class escape (only accepted in web-compat mode)
        boolean inRange = false, inCCRange = false;
        charclass: for (;;) {
            if (eof()) {
                throw error(Messages.Key.RegExpUnmatchedCharacter, "[");
            }
            // cv: character value of the current class atom; assigned exactly once on every path which reaches the
            // range handling below
            final int cv, c = get(unicode);
            classatom: switch (c) {
            case ']':
                out.append(']');
                return;
            case '\\': {
                // every branch of this nested switch leaves via a labeled break/continue or an exception
                switch (peek(0)) {
                case 'd':
                case 'D':
                case 's':
                case 'S':
                case 'w':
                case 'W': {
                    // ClassEscape :: CharacterClassEscape
                    char classEscape = get();
                    if (inRange) {
                        if (!web || unicode) {
                            throw error(Messages.Key.RegExpInvalidCharacterRange);
                        }
                        if (!inCCRange) {
                            // escape range character "-"
                            assert out.charAt(out.length() - 1) == '-';
                            out.setCharAt(out.length() - 1, '\\');
                            out.append('-');
                        }
                        inRange = inCCRange = false;
                    } else if (peek(0) == '-' && peek(1) != ']') {
                        if (!web || unicode) {
                            throw error(Messages.Key.RegExpInvalidCharacterRange);
                        }
                        inRange = inCCRange = true;
                    }
                    appendCharacterClassEscape(classEscape, true);
                    if (inRange) {
                        // emit the range "-" as an escaped, literal character
                        out.append('\\').append(mustMatch('-'));
                    }
                    continue charclass;
                }
                case '-': {
                    // ClassEscape :: [+U] -
                    out.append('\\').append(get());
                    cv = '-';
                    break classatom;
                }
                case 'b':
                    // ClassEscape :: b
                    mustMatch('b');
                    out.append('\u0008');
                    cv = 0x08;
                    break classatom;
                case 'f':
                    // CharacterEscape :: ControlEscape
                    out.append('\\').append(get());
                    cv = '\f';
                    break classatom;
                case 'n':
                    // CharacterEscape :: ControlEscape
                    out.append('\\').append(get());
                    cv = '\n';
                    break classatom;
                case 'r':
                    // CharacterEscape :: ControlEscape
                    out.append('\\').append(get());
                    cv = '\r';
                    break classatom;
                case 't':
                    // CharacterEscape :: ControlEscape
                    out.append('\\').append(get());
                    cv = '\t';
                    break classatom;
                case 'v':
                    // CharacterEscape :: ControlEscape
                    mustMatch('v');
                    out.append('\u000B');
                    cv = 0x0B;
                    break classatom;
                case 'c': {
                    // CharacterEscape :: c ControlLetter
                    char cc = peek(1);
                    if ((!web || unicode) ? isASCIIAlpha(cc) : isASCIIAlphaNumericUnderscore(cc)) {
                        // extended control letters with 0-9 and _ in web-compat mode
                        out.append('\\').append(get());
                        int d = get() & 0x1F;
                        out.append(toControlLetter(d));
                        cv = d;
                    } else if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, +2, cc);
                    } else {
                        // convert invalid ControlLetter to \\
                        out.append("\\\\");
                        cv = '\\';
                    }
                    break classatom;
                }
                case 'x': {
                    // CharacterEscape :: HexEscapeSequence
                    mustMatch('x');
                    int x = readHexEscapeSequence();
                    if (x >= 0x00 && x <= 0xff) {
                        appendByteCodeUnit(x);
                        cv = x;
                    } else if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, "x");
                    } else {
                        // invalid hex escape sequence, use "x"
                        out.append('x');
                        cv = 'x';
                    }
                    break classatom;
                }
                case 'u': {
                    // CharacterEscape :: RegExpUnicodeEscapeSequence
                    mustMatch('u');
                    if (unicode && match('{')) {
                        int u = readExtendedUnicodeEscapeSequence();
                        appendCodePoint(u);
                        cv = u;
                    } else {
                        int u = readUnicodeEscapeSequence();
                        if (u >= 0) {
                            if (Character.isBmpCodePoint(u)) {
                                appendCodeUnit(u);
                            } else {
                                appendCodePoint(u);
                            }
                            cv = u;
                        } else if (!web || unicode) {
                            throw error(Messages.Key.RegExpInvalidEscape, "u");
                        } else {
                            // invalid unicode escape sequence, use "u"
                            out.append('u');
                            cv = 'u';
                        }
                    }
                    break classatom;
                }
                case '0':
                    if (!isDecimalDigit(peek(1))) {
                        mustMatch('0');
                        out.append('\u0000');
                        cv = 0;
                        break classatom;
                    }
                    if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, +2, peek(1));
                    }
                    // intentional fall-through: "0" followed by a digit is an octal escape in web-compat mode
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7': {
                    if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, +1, peek(0));
                    }
                    int num = readOctalEscapeSequence();
                    appendByteCodeUnit(num);
                    cv = num;
                    break classatom;
                }
                case '8':
                case '9': {
                    if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, +1, peek(0));
                    }
                    // web-compat mode: the digit stands for itself
                    char d = get();
                    appendByteCodeUnit(d);
                    cv = d;
                    break classatom;
                }
                default: {
                    if (eof()) {
                        throw error(Messages.Key.RegExpTrailingSlash);
                    }
                    int d = get(unicode);
                    if (unicode ? !isSyntaxCharacterOrSlash(d) : !web && isUnicodeIDContinue(d)) {
                        throw error(Messages.Key.RegExpInvalidEscape, new String(Character.toChars(d)));
                    }
                    appendIdentityEscape(d);
                    cv = d;
                    break classatom;
                }
                }
            }
            case '-':
            case '[':
            case '&':
                // need to escape these characters for Java
                out.append('\\').append((char) c);
                cv = c;
                break classatom;
            default: {
                out.appendCodePoint(c);
                cv = c;
                break classatom;
            }
            }
            if (inRange) {
                // end range
                inRange = false;
                if (inCCRange) {
                    // no ordering check for ranges which started with a character class escape
                    inCCRange = false;
                    continue charclass;
                }
                if (cv < rangeStartCV) {
                    throw error(Messages.Key.RegExpInvalidCharacterRange);
                }
            } else if (peek(0) == '-' && peek(1) != ']') {
                // start range
                out.append(mustMatch('-'));
                inRange = true;
                rangeStartCV = cv;
            } else {
                // no range
            }
            continue charclass;
        }
    }

    /**
     * <pre>
     * Pattern<span><sub>[U]</sub></span> ::
     *     Disjunction<span><sub>[?U]</sub></span>
     * Disjunction<span><sub>[U]</sub></span> ::
     *     Alternative<span><sub>[?U]</sub></span>
     *     Alternative<span><sub>[?U]</sub></span> <b>|</b> Disjunction<span><sub>[?U]</sub></span>
     * Alternative<span><sub>[U]</sub></span> ::
     *     [empty]
     *     Alternative<span><sub>[?U]</sub></span> Term<span><sub>[?U]</sub></span>
     * Term<span><sub>[U]</sub></span> ::
     *     Assertion<span><sub>[?U]</sub></span>
     *     Atom<span><sub>[?U]</sub></span>
     *     Atom<span><sub>[?U]</sub></span> Quantifier
     *
     *     <span><sub>[~U]</sub></span> ExtendedTerm
     *     <span><sub>[+U]</sub></span> Assertion<span><sub>[U]</sub></span>
     *     <span><sub>[+U]</sub></span> Atom<span><sub>[U]</sub></span>
     *     <span><sub>[+U]</sub></span> Atom<span><sub>[U]</sub></span> Quantifier
     * ExtendedTerm ::
     *     Assertion
     *     AtomNoBrace Quantifier
     *     Atom
     *     QuantifiableAssertion Quantifier
     * Assertion<span><sub>[U]</sub></span> ::
     *     <b>^</b>
     *     <b>$</b>
     *     <b>\ b</b>
     *     <b>\ B</b>
     *     <b>( ? =</b> Disjunction<span><sub>[?U]</sub></span> <b>)</b>
     *     <b>( ? !</b> Disjunction<span><sub>[?U]</sub></span> <b>)</b>
     *
     *     <span><sub>[+U]</sub></span> <b>( ? =</b> Disjunction<span><sub>[?U]</sub></span> <b>)</b>
     *     <span><sub>[+U]</sub></span> <b>( ? !</b> Disjunction<span><sub>[?U]</sub></span> <b>)</b>
     *     <span><sub>[~U]</sub></span> QuantifiableAssertion
     * AtomNoBrace ::
     *     PatternCharacterNoBrace
     *     <b>.</b>
     *     <b>\</b> AtomEscape
     *     CharacterClass
     *     <b>(</b> Disjunction <b>)</b>
     *     <b>( ? :</b> Disjunction <b>)</b>
     * Atom<span><sub>[U]</sub></span> ::
     *     PatternCharacter
     *     <b>.</b>
     *     <b>\</b> AtomEscape<span><sub>[?U]</sub></span>
     *     CharacterClass<span><sub>[?U]</sub></span>
     *     <b>(</b> Disjunction<span><sub>[?U]</sub></span> <b>)</b>
     *     <b>( ? :</b> Disjunction<span><sub>[?U]</sub></span> <b>)</b>
     * SyntaxCharacter :: <b>one of</b>
     *     <b>^ $ \ . * + ? ( ) [ ] { } |</b>
     * PatternCharacterNoBrace ::
     *     SourceCharacter but not one of <b>^ $ \ . * + ? ( ) [ ] { } |</b>
     * PatternCharacter ::
     *     SourceCharacter but not SyntaxCharacter
     *
     *     SourceCharacter but not one of <b>^ $ \ . * + ? ( ) [ ] |</b>
     * QuantifiableAssertion ::
     *     <b>( ? =</b> Disjunction <b>)</b>
     *     <b>( ? !</b> Disjunction <b>)</b>
     * AtomEscape<span><sub>[U]</sub></span> ::
     *     DecimalEscape
     *     CharacterEscape<span><sub>[?U]</sub></span>
     *     CharacterClassEscape
     *
     *     <span><sub>[+U]</sub></span> DecimalEscape
     *     <span><sub>[~U]</sub></span> DecimalEscape but only if the integer value of DecimalEscape is &lt;= NCapturingParens
     *     <span><sub>[+U]</sub></span> CharacterEscape<span><sub>[U]</sub></span>
     *     <span><sub>[+U]</sub></span> CharacterClassEscape
     *     <span><sub>[~U]</sub></span> CharacterClassEscape
     *     <span><sub>[~U]</sub></span> CharacterEscape
     * CharacterEscape<span><sub>[U]</sub></span> ::
     *     ControlEscape
     *     <b>c</b> ControlLetter
     *     HexEscapeSequence
     *     RegExpUnicodeEscapeSequence<span><sub>[?U]</sub></span>
     *     IdentityEscape<span><sub>[?U]</sub></span>
     *
     *     <span><sub>[~U]</sub></span>LegacyOctalEscapeSequence
     * ControlEscape :: one of
     *     <b>f n r t v</b>
     * ControlLetter :: one of
     *     <b>a b c d e f g h i j k l m n o p q r s t u v w x y z</b>
     *     <b>A B C D E F G H I J K L M N O P Q R S T U V W X Y Z</b>
     * RegExpUnicodeEscapeSequence<span><sub>[U]</sub></span> ::
     *     <span><sub>[+U]</sub></span> <b>u</b> LeadSurrogate <b>\\u</b> TrailSurrogate
     *     <b>u</b> Hex4Digits
     *     <span><sub>[+U]</sub></span> <b>u {</b> HexDigits <b>} </b>
     * LeadSurrogate ::
     *     Hex4Digits <span><sub>[match only if the CV of Hex4Digits is in the inclusive range of 0xD800 and 0xDBFF]</sub></span>
     * TrailSurrogate ::
     *     Hex4Digits <span><sub>[match only if the CV of Hex4Digits is in the inclusive range of 0xDC00 and 0xDFFF]</sub></span>
     * IdentityEscape<span><sub>[U]</sub></span> ::
     *     <span><sub>[+U]</sub></span> SyntaxCharacter
     *     <span><sub>[~U]</sub></span> SourceCharacter but not IdentifierPart
     *     <span><sub>[~U]</sub></span> &lt;ZWJ&gt;
     *     <span><sub>[~U]</sub></span> &lt;ZWNJ&gt;
     *
     *     <span><sub>[~U]</sub></span> SourceCharacter <b>but not c</b>
     * DecimalEscape ::
     *     DecimalIntegerLiteral [LA ∉ DecimalDigit]
     * CharacterClassEscape :: one of
     *     <b>d D s S w W</b>
     * </pre>
     * <p>
     * Parses the complete pattern source and writes its translation to the output buffer. Because the total number of
     * capturing groups is only known at the end of the input, a decimal escape larger than the final group count
     * causes the output to be discarded and the pattern to be parsed a second time with the now known limit.
     */
    private void pattern() {
        final boolean unicode = isUnicode();
        final boolean web = isWebRegularExpression();
        final StringBuilder out = this.out;

        // map of valid groups
        BitSet validGroups = new BitSet();
        // number of groups
        int groups = 0;
        // maximum back-reference found
        int backrefmax = 0;
        // back-reference limit
        int backreflimit = BACKREF_LIMIT;
        // current depths
        int depth = 0;
        int negativedepth = 0;
        // map: depth -> negative
        BitSet negativeGroup = new BitSet();
        // map: depth -> positive
        BitSet positiveGroup = new BitSet();
        // map: depth -> capturing
        BitSet capturingGroup = new BitSet();
        // stack: groups
        int[] groupStack = new int[8];
        int groupStackSP = 0;

        term: for (;;) {
            if (eof()) {
                if (depth > 0) {
                    throw error(Messages.Key.RegExpUnmatchedCharacter, "(");
                }
                // the second condition ensures the restart happens at most once
                if (backrefmax > groups && backreflimit == BACKREF_LIMIT) {
                    // discard state and restart parsing
                    out.setLength(0);
                    pos = 0;
                    negativeLAGroups.clear();
                    // remember correct back reference limit
                    backreflimit = groups;
                    assert backreflimit != BACKREF_LIMIT;
                    // reset locals
                    validGroups.clear();
                    groups = 0;
                    backrefmax = 0;
                    // assert other locals don't carry any state
                    assert depth == 0;
                    assert negativedepth == 0;
                    assert negativeGroup.isEmpty();
                    assert positiveGroup.isEmpty();
                    assert capturingGroup.isEmpty();
                    assert groupStackSP == 0;
                    continue term;
                }
                assert backrefmax <= groups;
                return;
            }

            final int c = get(unicode);

            /* Disjunction, Assertion and Atom */
            // "continue term" skips the optional quantifier below, "break atom" proceeds to it
            atom: switch (c) {
            case '|':
                /* Disjunction */
                out.append((char) c);
                continue term;
            case '^':
                /* Assertion */
                if (isMultiline()) {
                    out.append((char) c);
                } else {
                    out.append("\\A");
                }
                continue term;
            case '$':
                /* Assertion */
                if (isMultiline()) {
                    out.append((char) c);
                } else {
                    out.append("\\z");
                }
                continue term;
            case '\\': {
                /* Assertion, AtomEscape */
                // every branch of this nested switch leaves via a labeled break/continue or an exception
                switch (peek(0)) {
                case 'b':
                case 'B':
                    // Assertion
                    out.append('\\').append(get());
                    continue term;
                case 'f':
                case 'n':
                case 'r':
                case 't':
                    // CharacterEscape :: ControlEscape
                    out.append('\\').append(get());
                    break atom;
                case 'v':
                    // CharacterEscape :: ControlEscape
                    mustMatch('v');
                    out.append('\u000B');
                    break atom;
                case 'c': {
                    // CharacterEscape :: c ControlLetter
                    if (isASCIIAlpha(peek(1))) {
                        out.append('\\').append(get()).append(toControlLetter(get()));
                    } else if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, +2, peek(1));
                    } else {
                        // convert invalid ControlLetter to \
                        out.append("\\\\");
                    }
                    break atom;
                }
                case 'x': {
                    // CharacterEscape :: HexEscapeSequence
                    mustMatch('x');
                    int x = readHexEscapeSequence();
                    if (x >= 0x00 && x <= 0xff) {
                        appendByteCodeUnit(x);
                    } else if (!web || unicode) {
                        throw error(Messages.Key.RegExpInvalidEscape, "x");
                    } else {
                        // invalid hex escape sequence, use "x"
                        out.append('x');
                    }
                    break atom;
                }
                case 'u': {
                    // CharacterEscape :: RegExpUnicodeEscapeSequence
                    mustMatch('u');
                    if (unicode && match('{')) {
                        int u = readExtendedUnicodeEscapeSequence();
                        appendCodePoint(u);
                    } else {
                        int u = readUnicodeEscapeSequence();
                        if (u >= 0) {
                            if (Character.isBmpCodePoint(u)) {
                                appendCodeUnit(u);
                            } else {
                                appendCodePoint(u);
                            }
                        } else if (!web || unicode) {
                            throw error(Messages.Key.RegExpInvalidEscape, "u");
                        } else {
                            // invalid unicode escape sequence, use "u"
                            out.append('u');
                        }
                    }
                    break atom;
                }
                case 'd':
                case 'D':
                case 'w':
                case 'W':
                case 's':
                case 'S':
                    // CharacterClassEscape
                    appendCharacterClassEscape(get(), false);
                    break atom;
                case '0':
                    // "\0" or octal sequence
                    if ((!web || unicode) && isDecimalDigit(peek(1))) {
                        throw error(Messages.Key.RegExpInvalidEscape, +2, peek(1));
                    }
                    appendByteCodeUnit(readOctalEscapeSequence());
                    break atom;
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9': {
                    // DecimalEscape - back-reference or invalid escape
                    int start = pos;
                    int num = readDecimalEscape();
                    if (num > backreflimit) {
                        // invalid backreference -> roll back to start of decimal escape
                        reset(start);
                        if (!web || unicode) {
                            throw error(Messages.Key.RegExpInvalidEscape, +1, peek(0));
                        }
                        if (peek(0) < '8') {
                            // case 1: octal escape sequence
                            appendByteCodeUnit(readOctalEscapeSequence());
                        } else {
                            // case 2 (\8 or \9): invalid octal escape sequence
                            appendByteCodeUnit(get());
                        }
                    } else {
                        if (num > backrefmax) {
                            backrefmax = num;
                        }
                        if (num <= groups && validGroups.get(num)) {
                            out.append('\\').append(num);
                        } else {
                            // omit forward reference or backward reference into capturing group
                            // from negative lookahead
                            out.append("(?:)");
                        }
                    }
                    break atom;
                }
                default: {
                    // CharacterEscape :: IdentityEscape
                    if (eof()) {
                        throw error(Messages.Key.RegExpTrailingSlash);
                    }
                    int d = get(unicode);
                    if (unicode ? !isSyntaxCharacterOrSlash(d) : !web && isUnicodeIDContinue(d)) {
                        throw error(Messages.Key.RegExpInvalidEscape, new String(Character.toChars(d)));
                    }
                    appendIdentityEscape(d);
                    break atom;
                }
                }
            }
            case '(': {
                boolean negative = false, positive = false, capturing = false;
                if (match('?')) {
                    // (?=X) or (?!X) or (?:X)
                    if (eof()) {
                        throw error(Messages.Key.RegExpUnexpectedCharacter, "?");
                    }
                    char d = get();
                    switch (d) {
                    case '!':
                        negative = true;
                        break;
                    case '=':
                        positive = true;
                        break;
                    case ':':
                        // non-capturing
                        break;
                    default:
                        throw error(Messages.Key.RegExpUnexpectedCharacter, String.valueOf(d));
                    }
                    out.append("(?").append(d);
                } else {
                    capturing = true;
                    out.append('(');
                }
                depth += 1;
                if (capturing) {
                    groups += 1;
                    capturingGroup.set(depth);
                } else if (negative) {
                    negativedepth += 1;
                    negativeGroup.set(depth);
                } else if (positive) {
                    positiveGroup.set(depth);
                }
                if (capturing || negative) {
                    // remember the group count at the time the group was opened; popped again in case ')'
                    if (groupStackSP == groupStack.length) {
                        groupStack = Arrays.copyOf(groupStack, groupStackSP << 1);
                    }
                    groupStack[groupStackSP++] = groups;
                }
                if (depth >= DEPTH_LIMIT || groups >= BACKREF_LIMIT) {
                    throw error(Messages.Key.RegExpPatternTooComplex);
                }
                continue term;
            }
            case ')': {
                out.append(')');
                if (depth == 0) {
                    throw error(Messages.Key.RegExpUnmatchedCharacter, ")");
                }
                boolean lookaround = false;
                if (capturingGroup.get(depth)) {
                    capturingGroup.clear(depth);
                    // update group information after parsing ")"
                    int g = groupStack[--groupStackSP];
                    validGroups.set(g);
                    if (negativedepth > 0) {
                        negativeLAGroups.set(g);
                    }
                } else if (negativeGroup.get(depth)) {
                    negativeGroup.clear(depth);
                    // invalidate all capturing groups created within the negative lookahead
                    int g = groupStack[--groupStackSP];
                    for (int v = groups; v != g; --v) {
                        validGroups.clear(v);
                    }
                    negativedepth -= 1;
                    lookaround = true;
                } else if (positiveGroup.get(depth)) {
                    positiveGroup.clear(depth);
                    lookaround = true;
                }
                depth -= 1;
                // lookarounds only accept a quantifier in web-compat, non-unicode mode
                if (lookaround && (!web || unicode)) {
                    continue term;
                }
                break atom;
            }
            case '[': {
                // CharacterClass
                characterClass();
                break atom;
            }
            case '*':
            case '+':
            case '?':
                // quantifier without applicable atom
                throw error(Messages.Key.RegExpInvalidQuantifier);
            case '{': {
                if (quantifier((char) c)) {
                    // quantifier without applicable atom
                    throw error(Messages.Key.RegExpInvalidQuantifier);
                }
                // fall-through
            }
            case ']':
            case '}':
                if (unicode || !web) {
                    throw error(Messages.Key.RegExpUnexpectedCharacter, String.valueOf((char) c));
                }
                // web-compat mode: treat the brace or bracket as a literal character
                out.append('\\').append((char) c);
                break atom;
            case '.':
                out.append('.');
                break atom;
            default: {
                out.appendCodePoint(c);
                break atom;
            }
            }

            /* Quantifier (optional) */
            switch (peek(0)) {
            case '*':
            case '+':
            case '?':
            case '{':
                if (!quantifier(get())) {
                    // only reachable for "{", the other quantifier characters always succeed
                    if (unicode || !web) {
                        throw error(Messages.Key.RegExpUnexpectedCharacter, "{");
                    }
                    // give the "{" back so that the next iteration handles it as an atom
                    reset(pos - 1);
                }
            }
            continue term;
        }
    }

    /**
     * <pre>
     * Quantifier ::
     *     QuantifierPrefix
     *     QuantifierPrefix <b>?</b>
     * QuantifierPrefix ::
     *     <b>*</b>
     *     <b>+</b>
     *     <b>?</b>
     *     <b>{</b> DecimalDigits <b>}</b>
     *     <b>{</b> DecimalDigits <b>, }</b>
     *     <b>{</b> DecimalDigits <b>,</b> DecimalDigits <b>}</b>
     * </pre>
     * <p>
     * The start character must already have been consumed. If a brace quantifier cannot be parsed, the source
     * position is restored to the character after the opening brace and nothing is appended to the output.
     *
     * @param c
     *            the start character of the quantifier
     * @return {@code true} if the input could be parsed as a quantifier
     */
    private boolean quantifier(char c) {
        StringBuilder out = this.out;

        // Greedy/Reluctant quantifiers
        quantifier: switch (c) {
        case '*':
        case '+':
        case '?':
            out.append(c);
            break quantifier;
        case '{': {
            int start = pos;
            long min = decimal();
            if (min < 0) {
                reset(start);
                return false;
            }
            boolean comma;
            // -1 denotes "no upper bound given"
            long max = -1;
            if ((comma = match(',')) && peek(0) != '}') {
                max = decimal();
                if (max < 0) {
                    reset(start);
                    return false;
                }
            }
            if (!match('}')) {
                reset(start);
                return false;
            }
            if (max != -1 && min > max) {
                throw error(Messages.Key.RegExpInvalidQuantifier);
            }

            // output result
            // repeat counts are capped at Config.MAX_REPEAT_NUM
            out.append('{').append((int) Math.min(min, Config.MAX_REPEAT_NUM));
            if (comma) {
                if (max != -1) {
                    out.append(',').append((int) Math.min(max, Config.MAX_REPEAT_NUM));
                } else {
                    out.append(',');
                }
            }
            out.append('}');
            break quantifier;
        }
        default:
            throw new AssertionError("unreachable");
        }

        // Reluctant quantifiers
        if (match('?')) {
            out.append('?');
        }

        return true;
    }

    /**
     * Appends the translation of a character class escape ({@code d D s S w W}). For {@code w} and {@code W} with
     * both the ignore-case and the unicode flag set, an explicit character list is emitted; in every other case a
     * property escape of the form <code>\p{Name}</code> respectively <code>\P{Name}</code> is emitted.
     *
     * @param c
     *            the character class escape letter
     * @param cclass
     *            {@code true} if the escape appears within a character class, in which case the explicit character
     *            list is not wrapped in its own brackets
     */
    private void appendCharacterClassEscape(char c, boolean cclass) {
        if ((c == 'w' || c == 'W') && isIgnoreCase() && isUnicode()) {
            if (!cclass) {
                out.append('[');
            }
            out.append(c == 'w' ? characterClass_wu : characterClass_Wu);
            if (!cclass) {
                out.append(']');
            }
        } else {
            // bit 0x20 is the ASCII lower-case bit: yields 'p' for d/w/s and 'P' for D/W/S
            char mod = (char) ('P' | (c & 0x20));
            String propertyName = getCharacterClassPropertyName(c);
            out.append('\\').append(mod).append('{').append(propertyName).append('}');
        }
    }

    /**
     * Returns the property name which is used to translate the character class escape.
     *
     * @param c
     *            the character class escape letter
     * @return the property name
     */
    private static String getCharacterClassPropertyName(char c) {
        switch (c) {
        case 'd':
        case 'D':
            return "Digit";
        case 'w':
        case 'W':
            return "Word";
        case 's':
        case 'S':
            return "Space";
        default:
            throw new AssertionError("unreachable");
        }
    }

    /**
     * Appends the translation of an identity escape for the given character.
     *
     * @param ch
     *            the escaped character or code point
     */
    private void appendIdentityEscape(int ch) {
        if (isASCIIAlpha(ch)) {
            // Don't escape ASCII alpha characters to avoid turning them into flags.
            out.append((char) ch);
        } else if (ch <= 0x7f) {
            // Apply identity escape for other ASCII characters.
            out.append('\\').append((char) ch);
        } else if (ch < 0x100) {
            appendByteCodeUnit(ch);
        } else if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            appendCodeUnit(ch);
        } else {
            appendCodePoint(ch);
        }
    }

    /**
     * Appends an escape sequence for a code unit in the range {@code 0x00-0xFF}. In unicode mode a four digit
     * {@code u}-escape is written, otherwise the two hex escapes {@code \x00\xHH}.
     * <p>
     * NOTE(review): the two-escape form presumably matches the byte layout expected by the matcher for non-unicode
     * patterns - confirm against JoniRegExpMatcher.
     *
     * @param codeUnit
     *            the code unit value
     */
    private void appendByteCodeUnit(int codeUnit) {
        assert codeUnit >>> 8 == 0;
        if (isUnicode()) {
            out.append("\\u00").append(toHexDigit(codeUnit >> 4)).append(toHexDigit(codeUnit >> 0));
        } else {
            out.append("\\x00").append("\\x").append(toHexDigit(codeUnit >> 4)).append(toHexDigit(codeUnit >> 0));
        }
    }

    /**
     * Appends a four digit {@code u}-escape for a code unit from the basic multilingual plane.
     *
     * @param codeUnit
     *            the code unit value
     */
    private void appendCodeUnit(int codeUnit) {
        assert Character.isBmpCodePoint(codeUnit);
        out.append("\\u").append(toHexDigit(codeUnit >> 12)).append(toHexDigit(codeUnit >> 8))
                .append(toHexDigit(codeUnit >> 4)).append(toHexDigit(codeUnit >> 0));
    }

    /**
     * Appends an escape of the form <code>\x{H...}</code> for the code point.
     *
     * @param codePoint
     *            the code point value
     */
    private void appendCodePoint(int codePoint) {
        out.append("\\x{").append(Integer.toHexString(codePoint)).append('}');
    }

    /**
     * Maps the low five bits of {@code c} to the character which is that many positions after {@code '@'}, i.e.
     * {@code 1} maps to {@code 'A'}.
     *
     * @param c
     *            the control letter or control value
     * @return the control letter used in the translated pattern
     */
    private static char toControlLetter(int c) {
        return (char) ('A' - 1 + (c & 0x1f));
    }

    /**
     * Returns the upper-case hexadecimal digit for the low four bits of {@code c}.
     *
     * @param c
     *            the value whose low four bits are converted
     * @return the hexadecimal digit
     */
    private static char toHexDigit(int c) {
        return HEXDIGITS[c & 0xf];
    }

    /**
     * <pre>
     * SyntaxCharacter :: <b>one of</b>
     *     <b>^ $ \ . * + ? ( ) [ ] { } |</b>
     * </pre>
     *
     * @param c
     *            the character to inspect
     * @return {@code true} if the character is a syntax character or a forward slash ({@code /})
     */
    private static boolean isSyntaxCharacterOrSlash(int c) {
        switch (c) {
        case '^':
        case '$':
        case '\\':
        case '.':
        case '*':
        case '+':
        case '?':
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case '|':
        case '/':
            return true;
        default:
            return false;
        }
    }
}