/* * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * * Free Software Foundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Scott Ferguson */ /* * XXX: anchored expressions should have flags for quick matching. */ package com.caucho.quercus.lib.regexp; import java.util.*; import java.util.concurrent.*; import java.util.logging.*; import com.caucho.quercus.env.ConstStringValue; import com.caucho.quercus.env.StringValue; import com.caucho.quercus.env.StringBuilderValue; import com.caucho.util.*; /** * Regular expression compilation. 
*/ class Regcomp { private static final Logger log = Logger.getLogger(Regcomp.class.getName()); private static final L10N L = new L10N(RegexpNode.class); // #2526, JIT issues with Integer.MAX_VALUE private static final int INTEGER_MAX = Integer.MAX_VALUE - 1; static final int MULTILINE = 0x1; static final int SINGLE_LINE = 0x2; static final int IGNORE_CASE = 0x4; static final int IGNORE_WS = 0x8; static final int GLOBAL = 0x10; static final int ANCHORED = 0x20; static final int END_ONLY = 0x40; static final int UNGREEDY = 0x80; static final int STRICT = 0x100; static final int UTF8 = 0x200; static final HashMap<String,Integer> _characterClassMap = new HashMap<String,Integer>(); static final ConcurrentHashMap<String,RegexpSet> _unicodeBlockMap = new ConcurrentHashMap<String,RegexpSet>(); private PeekStream _pattern; int _nGroup; int _nLoop; int _maxGroup; int _flags; HashMap<Integer,StringValue> _groupNameMap = new HashMap<Integer,StringValue>(); HashMap<StringValue,Integer> _groupNameReverseMap = new HashMap<StringValue,Integer>(); ArrayList<RegexpNode.Recursive> _recursiveList = new ArrayList<RegexpNode.Recursive>(); RegexpNode _groupTail; boolean _isLookbehind; boolean _isOr; Regcomp(int flags) { _flags = flags; } boolean isGreedy() { return (_flags & UNGREEDY) != UNGREEDY; } boolean isIgnoreCase() { return (_flags & IGNORE_CASE) == IGNORE_CASE; } boolean isIgnoreWs() { return (_flags & IGNORE_WS) == IGNORE_WS; } boolean isMultiline() { return (_flags & MULTILINE) == MULTILINE; } boolean isDollarEndOnly() { return (_flags & END_ONLY) == END_ONLY; } int nextLoopIndex() { return _nLoop++; } RegexpNode parse(PeekStream pattern) throws IllegalRegexpException { _pattern = pattern; _nGroup = 1; RegexpNode begin = null; if ((_flags & ANCHORED) != 0) begin = RegexpNode.ANCHOR_BEGIN_RELATIVE; RegexpNode value = parseRec(pattern, begin); int ch; while ((ch = pattern.read()) == '|') { value = RegexpNode.Or.create(value, parseRec(pattern, begin)); } value = value != null ? 
value.getHead() : RegexpNode.N_END; if (_maxGroup < _nGroup) _maxGroup = _nGroup; for (RegexpNode.Recursive rec : _recursiveList) { RegexpNode top = value; if (top instanceof RegexpNode.Concat) { RegexpNode.Concat topConcat = (RegexpNode.Concat) top; if (topConcat.getConcatHead() instanceof RegexpNode.AnchorBegin || topConcat.getConcatHead() instanceof RegexpNode .AnchorBeginRelative) { top = topConcat.getConcatNext(); } } rec.setTop(top); } if (log.isLoggable(Level.FINEST)) log.finest("regexp[] " + value); return value; } /** * Recursively compile a RegexpNode. * * first -- The first node of this sub-RegexpNode * prev -- The previous node of this sub-RegexpNode * last_begin -- When the last grouping began * last_end -- When the last grouping ended * * head -> node * v -- rest * ... * v -- rest * node * * last -> node * v -- rest * ... * v -- rest * node */ private RegexpNode parseRec(PeekStream pattern, RegexpNode tail) throws IllegalRegexpException { int ch = pattern.read(); RegexpNode next; RegexpNode groupTail; switch (ch) { case -1: return tail != null ? tail.getHead() : null; case '?': if (tail == null) throw error(L.l("'?' requires a preceeding regexp")); tail = createLoop(pattern, tail, 0, 1); return parseRec(pattern, tail.getTail()); case '*': if (tail == null) throw error(L.l("'*' requires a preceeding regexp")); tail = createLoop(pattern, tail, 0, INTEGER_MAX); return parseRec(pattern, tail.getTail()); case '+': if (tail == null) throw error(L.l("'+' requires a preceeding regexp")); tail = createLoop(pattern, tail, 1, INTEGER_MAX); return parseRec(pattern, tail.getTail()); case '{': if (tail == null || ! 
('0' <= pattern.peek() && pattern.peek() <= '9')) { next = parseString('{', pattern); return concat(tail, parseRec(pattern, next)); } return parseRec(pattern, parseBrace(pattern, tail).getTail()); case '.': if ((_flags & SINGLE_LINE) == 0) next = RegexpNode.DOT; else next = RegexpNode.ANY_CHAR; return concat(tail, parseRec(pattern, next)); case '|': pattern.ungetc(ch); if (_groupTail != null) return concat(tail, _groupTail); else return tail.getHead(); case '(': { switch (pattern.peek()) { case '?': pattern.read(); switch (pattern.peek()) { case ':': pattern.read(); return parseGroup(pattern, tail, 0, _flags); case '#': parseCommentGroup(pattern); return parseRec(pattern, tail); case '(': return parseConditional(pattern, tail); case '=': case '!': ch = pattern.read(); boolean isPositive = (ch == '='); groupTail = _groupTail; _groupTail = null; next = parseRec(pattern, null); while ((ch = pattern.read()) == '|') { RegexpNode nextHead = parseRec(pattern, null); next = next.createOr(nextHead); } if (isPositive) next = new RegexpNode.Lookahead(next); else next = new RegexpNode.NotLookahead(next); if (ch != ')') throw error(L.l("expected ')' at '{0}'", String.valueOf((char) ch))); _groupTail = groupTail; return concat(tail, parseRec(pattern, next)); case '<': pattern.read(); switch (pattern.read()) { case '=': isPositive = true; break; case '!': isPositive = false; break; default: throw error(L.l("expected '=' or '!'")); } groupTail = _groupTail; _groupTail = null; next = parseRec(pattern, null); if (next == null) { } else if (isPositive) next = new RegexpNode.Lookbehind(next); else next = new RegexpNode.NotLookbehind(next); while ((ch = pattern.read()) == '|') { RegexpNode second = parseRec(pattern, null); if (second == null) { } else if (isPositive) second = new RegexpNode.Lookbehind(second); else second = new RegexpNode.NotLookbehind(second); if (second != null) next = next.createOr(second); } if (ch != ')') throw error(L.l("expected ')' at '{0}'", 
String.valueOf((char) ch))); _groupTail = groupTail; return concat(tail, parseRec(pattern, next)); // XXX: once-only subpatterns (mostly an optimization feature) case '>': pattern.read(); return parseGroup(pattern, tail, 0, _flags); case 'P': pattern.read(); return parseNamedGroup(pattern, tail); case 'R': pattern.read(); RegexpNode.Recursive rec = new RegexpNode.Recursive(); _recursiveList.add(rec); ch = pattern.read(); if (ch != ')') throw error(L.l("expected ')' at '{0}'", String.valueOf((char) ch))); return concat(tail, parseRec(pattern, rec)); case 'm': case 's': case 'i': case 'x': case 'g': case 'U': case 'X': { int flags = _flags; while ((ch = pattern.read()) > 0 && ch != ')') { switch (ch) { case 'm': _flags |= MULTILINE; break; case 's': _flags |= SINGLE_LINE; break; case 'i': _flags |= IGNORE_CASE; break; case 'x': _flags |= IGNORE_WS; break; case 'g': _flags |= GLOBAL; break; case 'U': _flags |= UNGREEDY; break; case 'X': _flags |= STRICT; break; case ':': { return parseGroup(pattern, tail, 0, flags); } default: throw error( L.l("'{0}' is an unknown (? code", String.valueOf((char) ch))); } } if (ch != ')') throw error(L.l("expected ')' at '{0}'", String.valueOf((char) ch))); RegexpNode node = parseRec(pattern, tail); _flags = flags; return node; } default: throw error(L.l("'{0}' is an unknown (? 
code", String.valueOf((char) pattern.peek()))); } default: return parseGroup(pattern, tail, _nGroup++, _flags); } } case ')': pattern.ungetc(ch); if (_groupTail != null) return concat(tail, _groupTail); else return tail; case '[': next = parseSet(pattern); return concat(tail, parseRec(pattern, next)); case '\\': next = parseSlash(pattern); return concat(tail, parseRec(pattern, next)); case '^': if (isMultiline()) next = RegexpNode.ANCHOR_BEGIN_OR_NEWLINE; else next = RegexpNode.ANCHOR_BEGIN; return concat(tail, parseRec(pattern, next)); case '$': if (isMultiline()) next = RegexpNode.ANCHOR_END_OR_NEWLINE; else if (isDollarEndOnly()) next = RegexpNode.ANCHOR_END_ONLY; else next = RegexpNode.ANCHOR_END; return concat(tail, parseRec(pattern, next)); case ' ': case '\n': case '\t': case '\r': if (isIgnoreWs()) { while (Character.isSpace((char) pattern.peek())) pattern.read(); return parseRec(pattern, tail); } else { next = parseString(ch, pattern); return concat(tail, parseRec(pattern, next)); } case '#': if (isIgnoreWs()) { while ((ch = pattern.read()) > 0 && ch != '\n') { } return parseRec(pattern, tail); } else { next = parseString(ch, pattern); return concat(tail, parseRec(pattern, next)); } default: next = parseString(ch, pattern); return concat(tail, parseRec(pattern, next)); } } private void parseCommentGroup(PeekStream pattern) { int ch; // (?#...) 
Comment while ((ch = pattern.read()) >= 0 && ch != ')') { } } private RegexpNode parseNamedGroup(PeekStream pattern, RegexpNode tail) throws IllegalRegexpException { int ch = pattern.read(); if (ch == '=') { StringBuilder sb = new StringBuilder(); while ((ch = pattern.read()) != ')' && ch >= 0) { sb.append((char) ch); } if (ch != ')') throw error(L.l("expected ')'")); String name = sb.toString(); Integer v = _groupNameReverseMap.get(new ConstStringValue(name)); if (v != null) { RegexpNode next = new RegexpNode.GroupRef(v); return concat(tail, parseRec(pattern, next)); } else throw error(L.l("'{0}' is an unknown regexp group", name)); } else if (ch == '<') { StringBuilder sb = new StringBuilder(); while ((ch = pattern.read()) != '>' && ch >= 0) { sb.append((char) ch); } if (ch != '>') throw error(L.l("expected '>'")); String name = sb.toString(); int group = _nGroup++; _groupNameMap.put(group, new StringBuilderValue(name)); _groupNameReverseMap.put(new StringBuilderValue(name), group); return parseGroup(pattern, tail, group, _flags); } else throw error(L.l("Expected '(?:P=name' or '(?:P<name' for named group")); } private RegexpNode parseConditional(PeekStream pattern, RegexpNode tail) throws IllegalRegexpException { int ch = pattern.read(); if (ch != '(') throw error(L.l("expected '('")); RegexpNode.ConditionalHead groupHead = null;; RegexpNode groupTail = null; if ('1' <= (ch = pattern.peek()) && ch <= '9') { int value = 0; while ('0' <= (ch = pattern.read()) && ch <= '9') { value = 10 * value + ch - '0'; } if (ch != ')') throw error(L.l("expected ')'")); if (_nGroup <= value) throw error(L.l("conditional value less than number of groups")); groupHead = new RegexpNode.ConditionalHead(value); groupTail = groupHead.getTail(); } else throw error(L.l("conditional requires number")); RegexpNode oldTail = _groupTail; _groupTail = groupTail; RegexpNode first = parseRec(pattern, null); RegexpNode second = null; if ((ch = pattern.read()) == '|') { second = 
parseRec(pattern, null); ch = pattern.read(); } if (ch != ')') throw error(L.l("expected ')' at '{0}'", String.valueOf((char) ch))); _groupTail = oldTail; groupHead.setFirst(first); groupHead.setSecond(second); return concat(tail, parseRec(pattern, groupHead)); } private RegexpNode parseGroup(PeekStream pattern, RegexpNode tail, int group, int oldFlags) throws IllegalRegexpException { RegexpNode.GroupHead groupHead = new RegexpNode.GroupHead(group); RegexpNode groupTail = groupHead.getTail(); RegexpNode oldTail = _groupTail; _groupTail = groupTail; RegexpNode body = parseRec(pattern, null); int ch; while ((ch = pattern.read()) == '|') { RegexpNode nextBody = parseRec(pattern, null); body = body.createOr(nextBody); } if (ch != ')') throw error(L.l("expected ')'")); _flags = oldFlags; _groupTail = oldTail; groupHead.setNode(body.getHead()); return concat(tail, parseRec(pattern, groupTail).getHead()); } private void expect(char expected, int value) throws IllegalRegexpException { if (expected != value) throw error(L.l("expected '{0}'", String.valueOf(expected))); } private IllegalRegexpException error(String msg) { return new IllegalRegexpException(msg + " " + _pattern.getPattern()); } /** * Parse the repetition construct. 
* * {n} -- exactly n * {n,} -- at least n * {n,m} -- from n to m * {,m} -- at most m */ private RegexpNode parseBrace(PeekStream pattern, RegexpNode node) throws IllegalRegexpException { int ch; int min = 0; int max = INTEGER_MAX; while ((ch = pattern.read()) >= '0' && ch <= '9') { min = 10 * min + ch - '0'; } if (ch == ',') { while ('0' <= (ch = pattern.read()) && ch <= '9') { if (max == INTEGER_MAX) max = 0; max = 10 * max + ch - '0'; } } else max = min; if (ch != '}') throw error(L.l("Expected '}'")); return createLoop(pattern, node, min, max); } private RegexpNode createLoop(PeekStream pattern, RegexpNode node, int min, int max) { if (pattern.peek() == '+') { pattern.read(); return node.createPossessiveLoop(min, max); } else if (pattern.peek() == '?') { pattern.read(); if (isGreedy()) return node.createLoopUngreedy(this, min, max); else return node.createLoop(this, min, max); } else { if (isGreedy()) return node.createLoop(this, min, max); else return node.createLoopUngreedy(this, min, max); } } static RegexpNode concat(RegexpNode prev, RegexpNode next) { if (prev != null) { return prev.concat(next).getHead(); } else return next; } private String hex(int value) { CharBuffer cb = new CharBuffer(); for (int b = 3; b >= 0; b--) { int v = (value >> (4 * b)) & 0xf; if (v < 10) cb.append((char) (v + '0')); else cb.append((char) (v - 10 + 'a')); } return cb.toString(); } private String badChar(int ch) { if (0x20 <= ch && ch <= 0x7f) return "'" + (char) ch + "'"; else if ((ch & 0xffff) == 0xffff) return "end of expression"; else return "'" + (char) ch + "' (\\u" + hex(ch) + ")"; } /** * Collect the characters in a set, e.g. [a-z@@^!"] * * Variables: * * last -- Contains last read character. * lastdash -- Contains character before dash or -1 if not after dash. 
*/
  private RegexpNode parseSet(PeekStream pattern)
    throws IllegalRegexpException
  {
    int first = pattern.peek();
    boolean isNot = false;

    if (first == '^') {
      pattern.read();
      isNot = true;
    }

    RegexpSet set = new RegexpSet();

    int last = -1;
    int lastdash = -1;
    int ch;

    int charRead = 0;

    // Unicode property nodes, which are OR-ed with the set at the end.
    ArrayList<RegexpNode> nodeList = null;

    while ((ch = pattern.read()) >= 0) {
      charRead++;

      // php/4e3o
      // first literal closing bracket need not be escaped
      if (ch == ']') {
        if (charRead == 1) {
          // push it back and pretend it was preceded by a backslash
          pattern.ungetc(ch);
          ch = '\\';
        }
        else
          break;
      }

      boolean isChar = true;
      boolean isDash = ch == '-';

      if (ch == '\\') {
        isChar = false;

        switch ((ch = pattern.read())) {
        case 's':
          set.mergeOr(RegexpSet.SPACE);
          break;

        case 'S':
          set.mergeOrInv(RegexpSet.SPACE);
          break;

        case 'd':
          set.mergeOr(RegexpSet.DIGIT);
          break;

        case 'D':
          set.mergeOrInv(RegexpSet.DIGIT);
          break;

        case 'w':
          set.mergeOr(RegexpSet.WORD);
          break;

        case 'W':
          set.mergeOrInv(RegexpSet.WORD);
          break;

        case 'p':
          int ch2 = pattern.read();

          if (ch2 != '{') {
            if (nodeList == null)
              nodeList = new ArrayList<RegexpNode>();

            nodeList.add(parseUnicodeProperty(ch2, false));
          }
          else {
            StringBuilder sb = new StringBuilder();

            int ch3;
            while ((ch3 = pattern.read()) >= 0 && ch3 != '}') {
              sb.append((char) ch3);
            }

            String name = sb.toString();

            if (ch3 != '}')
              throw new IllegalRegexpException(
                L.l("expected '}' at " + badChar(ch3)));

            int len = name.length();

            // One- and two-letter names are general categories; anything
            // else is looked up as a Unicode block name.
            if (len == 1) {
              if (nodeList == null)
                nodeList = new ArrayList<RegexpNode>();

              nodeList.add(parseUnicodeProperty(name.charAt(0), false));
            }
            else if (len == 2) {
              if (nodeList == null)
                nodeList = new ArrayList<RegexpNode>();

              nodeList.add(parseUnicodeProperty(name.charAt(0),
                                                name.charAt(1),
                                                false));
            }
            else {
              set.mergeOr(getUnicodeSet(name));
            }
          }

          break;

        case 'b':
          ch = '\b';
          isChar = true;
          break;
        case 'n':
          ch = '\n';
          isChar = true;
          break;
        case 't':
          ch = '\t';
          isChar = true;
          break;
        case 'r':
          ch = '\r';
          isChar = true;
          break;
        case 'f':
          ch = '\f';
          isChar = true;
          break;

        case 'x':
          ch = parseHex(pattern);
          isChar = true;
          break;

        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          ch = parseOctal(ch, pattern);
          isChar = true;
          break;

        default:
          // any other escaped character stands for itself
          isChar = true;
        }
      }
      else if (ch == '[') {
        if (pattern.peek() == ':') {
          isChar = false;
          pattern.read();
          set.mergeOr(parseCharacterClass(pattern));
        }
      }

      if (isDash && last != -1 && lastdash == -1) {
        // a dash following a character may start a range
        lastdash = last;
      }
      // c1-c2
      else if (isChar && lastdash != -1) {
        if (lastdash > ch)
          throw new IllegalRegexpException("expected increasing range at "
                                           + badChar(ch));

        setRange(set, lastdash, ch);

        last = -1;
        lastdash = -1;
      }
      else if (lastdash != -1) {
        // the dash was followed by a class rather than a character, so
        // both the pending character and the dash are literals
        setRange(set, lastdash, lastdash);
        setRange(set, '-', '-');

        last = -1;
        lastdash = -1;
      }
      else if (last != -1) {
        setRange(set, last, last);

        if (isChar)
          last = ch;
      }
      else if (isChar)
        last = ch;
    }

    // Dash at end of set: [a-z1-]
    if (lastdash != -1) {
      setRange(set, lastdash, lastdash);
      setRange(set, '-', '-');
    }
    else if (last != -1) {
      setRange(set, last, last);
    }

    if (ch != ']')
      throw error(L.l("Expected ']'"));

    if (nodeList == null) {
      if (isNot)
        return set.createNotNode();
      else
        return set.createNode();
    }
    else {
      RegexpNode setNode = set.createNode();

      for (RegexpNode node : nodeList) {
        setNode = setNode.createOr(node);
      }

      if (isNot)
        return setNode.createNot();
      else
        return setNode;
    }
  }

  /**
   * Adds the range a..b to the set. When case is ignored and both ends
   * are lower case (or both upper case), the range between the
   * case-converted ends is added as well.
   */
  private void setRange(RegexpSet set, int a, int b)
  {
    set.setRange(a, b);

    if (isIgnoreCase()) {
      if (Character.isLowerCase(a) && Character.isLowerCase(b)) {
        set.setRange(Character.toUpperCase(a), Character.toUpperCase(b));
      }

      if (Character.isUpperCase(a) && Character.isUpperCase(b)) {
        set.setRange(Character.toLowerCase(a), Character.toLowerCase(b));
      }
    }
  }

  /**
   * Returns the set of characters below 65536 that belong to the named
   * Unicode block, computing and caching it on first use. Also turns on
   * the UTF8 flag.
   *
   * @param name a block name accepted by Character.UnicodeBlock.forName
   * @return the cached or newly built set
   * @throws IllegalRegexpException if the name is not a known block
   */
  private RegexpSet getUnicodeSet(String name)
    throws IllegalRegexpException
  {
    _flags |= UTF8;

    RegexpSet set = _unicodeBlockMap.get(name);

    if (set == null) {
      Character.UnicodeBlock block;

      try {
        block = Character.UnicodeBlock.forName(name);
      } catch (IllegalArgumentException e) {
        // forName() signals an unknown name by throwing rather than by
        // returning null, so translate that into the compile error
        // callers expect.
        throw new IllegalRegexpException(
          L.l("'{0}' is an unknown unicode block", name));
      }

      set = new RegexpSet();

      for (int ch = 0; ch < 65536; ch++) {
        if (Character.UnicodeBlock.of(ch) == block) {
          set.setRange(ch, ch);
        }
      }

      _unicodeBlockMap.put(name, set);
    }

    return set;
  }

  /**
   * Returns a node for sequences starting with a backslash.
   *
   * The backslash itself has already been consumed by the caller.
   *
   * @param pattern the pattern characters, positioned after the backslash
   * @return the node for the escape
   * @throws IllegalRegexpException if the escape is malformed, or is
   *         unrecognized while the STRICT flag is set
   */
  private RegexpNode parseSlash(PeekStream pattern)
    throws IllegalRegexpException
  {
    int ch;
    switch (ch = pattern.read()) {
    case 's':
      return RegexpNode.SPACE;

    case 'S':
      return RegexpNode.NOT_SPACE;

    case 'd':
      return RegexpNode.DIGIT;

    case 'D':
      return RegexpNode.NOT_DIGIT;

    case 'w':
      return RegexpNode.S_WORD;

    case 'W':
      return RegexpNode.NOT_S_WORD;

    case 'b':
      return RegexpNode.WORD;

    case 'B':
      return RegexpNode.NOT_WORD;

    case 'A':
      return RegexpNode.STRING_BEGIN;

    case 'z':
      return RegexpNode.STRING_END;

    case 'Z':
      return RegexpNode.STRING_NEWLINE;

    case 'G':
      return RegexpNode.STRING_FIRST;

    case 'a':
      return parseString('\u0007', pattern);

    case 'c':
      // control character: upper-case the next character and flip bit 6
      ch = pattern.read();

      ch = Character.toUpperCase(ch);
      ch ^= 0x40;

      return parseString(ch, pattern);

    case 'e':
      return parseString('\u001B', pattern, true);
    case 'n':
      return parseString('\n', pattern, true);
    case 'r':
      return parseString('\r', pattern, true);
    case 'f':
      return parseString('\f', pattern, true);
    case 't':
      return parseString('\t', pattern, true);

    case 'x':
      int hex = parseHex(pattern);
      return parseString(hex, pattern, true);

    case '0':
      int oct = parseOctal(ch, pattern);
      return parseString(oct, pattern, true);

    case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      return parseBackReference(ch, pattern);

    case 'p':
      return parseUnicodeProperty(pattern, false);
    case 'P':
      return parseUnicodeProperty(pattern, true);

    case 'Q':
      return parseQuotedString(pattern);

    case '#':
      return parseString('#', pattern, true);

    default:
      if ((_flags & STRICT) != 0)
        throw new IllegalRegexpException("unrecognized escape at "
                                         + badChar(ch));
      return parseString(ch, pattern);
    }
  }

  /**
   * Returns a node for sequences starting with a '[:'.
*/ private RegexpSet parseCharacterClass(PeekStream pattern) throws IllegalRegexpException { StringBuilder sb = new StringBuilder(); int ch; while ((ch = pattern.read()) != ':' && ch >= 0) { sb.append((char)ch); } if (ch != ':') { throw new IllegalRegexpException( "expected character class closing colon ':' at " + badChar(ch)); } if ((ch = pattern.read()) != ']') { throw new IllegalRegexpException( "expected character class closing bracket ']' at " + badChar(ch)); } String name = sb.toString(); RegexpSet set = RegexpSet.CLASS_MAP.get(name); if (set == null) { throw new IllegalRegexpException("unrecognized POSIX character class " + name); } return set; } private int parseHex(PeekStream pattern) throws IllegalRegexpException { int ch = pattern.read(); int hex = 0; StringBuilder sb = new StringBuilder(); if (ch == '{') { while ((ch = pattern.read()) != '}') { if (ch < 0) throw new IllegalRegexpException("no more input; expected '}'"); sb.append((char)ch); } } else { if (ch < 0) throw new IllegalRegexpException("expected hex digit at " + badChar(ch)); sb.append((char)ch); ch = pattern.read(); if (ch < 0) { throw new IllegalRegexpException("expected hex digit at " + badChar(ch)); } sb.append((char)ch); } int len = sb.length(); for (int i = 0; i < len; i++) { ch = sb.charAt(i); if ('0' <= ch && ch <= '9') hex = hex * 16 + ch - '0'; else if ('a' <= ch && ch <= 'f') hex = hex * 16 + ch - 'a' + 10; else if ('A' <= ch && ch <= 'F') hex = hex * 16 + ch - 'A' + 10; else throw new IllegalRegexpException("expected hex digit at " + badChar(ch)); } return hex; } private RegexpNode parseBackReference(int ch, PeekStream pattern) throws IllegalRegexpException { int value = ch - '0'; int ch2 = pattern.peek(); if ('0' <= ch2 && ch2 <= '9') { pattern.read(); value = value * 10 + ch2 - '0'; } int ch3 = pattern.peek(); if (value < 10 || value <= _nGroup && ! ('0' <= ch3 && ch3 <= '7')) { return new RegexpNode.GroupRef(value); } else if (! ('0' <= ch2 && ch2 <= '7') && ! 
('0' <= ch3 && ch3 <= '7')) throw new IllegalRegexpException( "back referencing to a non-existent group: " + value); if (value > 10) pattern.ungetc(ch2); if (ch == '8' || ch == '9' || '0' <= ch3 && ch3 <= '9' && value * 10 + ch3 - '0' > 0xFF) { //out of byte range or not an octal, //need to parse backslash as the NULL character pattern.ungetc(ch); return parseString('\u0000', pattern); } int oct = parseOctal(ch, pattern); return parseString(oct, pattern, true); //return createString((char) oct); } private RegexpNode parseString(int ch, PeekStream pattern) throws IllegalRegexpException { return parseString(ch, pattern, false); } /** * parseString */ private RegexpNode parseString(int ch, PeekStream pattern, boolean isEscaped) throws IllegalRegexpException { CharBuffer cb = new CharBuffer(); cb.append((char) ch); for (ch = pattern.read(); ch >= 0; ch = pattern.read()) { switch (ch) { case ' ': case '\t': case '\n': case '\r': if (! isIgnoreWs() || isEscaped) cb.append((char) ch); break; case '#': if (! 
isIgnoreWs() || isEscaped) cb.append((char) ch); else { while ((ch = pattern.read()) != '\n' && ch >= 0) { } } break; case '(': case ')': case '[': case '+': case '?': case '*': case '.': case '$': case '^': case '|': pattern.ungetc(ch); return createString(cb); case '{': if ('0' <= pattern.peek() && pattern.peek() <= '9') { pattern.ungetc(ch); return createString(cb); } cb.append('{'); break; case '\\': ch = pattern.read(); switch (ch) { case -1: cb.append('\\'); return createString(cb); case 's': case 'S': case 'd': case 'D': case 'w': case 'W': case 'b': case 'B': case 'A': case 'z': case 'Z': case 'G': case 'p': case 'P': pattern.ungetc(ch); pattern.ungetc('\\'); return createString(cb); case 'a': cb.append('\u0007'); break; case 'c': ch = pattern.read(); ch = Character.toUpperCase(ch); ch ^= 0x40; cb.append((char) ch); break; case 'e': cb.append('\u001b'); break; case 't': cb.append('\t'); break; case 'f': cb.append('\f'); break; case 'n': cb.append('\n'); break; case 'r': cb.append('\r'); break; case 'x': int hex = parseHex(pattern); cb.append((char) hex); break; case 'Q': while ((ch = pattern.read()) >= 0) { if (ch == '\\' && pattern.peek() == 'E') { pattern.read(); break; } cb.append((char) ch); } break; case '0': int oct = parseOctal(ch, pattern); cb.append((char) oct); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (ch - '0' <= _nGroup) { pattern.ungetc(ch); pattern.ungetc('\\'); return createString(cb); } else { oct = parseOctal(ch, pattern); cb.append((char) oct); } break; case '#': cb.append('#'); break; default: if ((_flags & STRICT) != 0) throw error(L.l("unrecognized escape at " + badChar(ch))); cb.append((char) ch); break; } break; default: cb.append((char) ch); } } return createString(cb); } private RegexpNode parseQuotedString(PeekStream pattern) { CharBuffer cb = new CharBuffer(); int ch; while ((ch = pattern.read()) >= 0) { if (ch == '\\' && pattern.peek() == 'E') { pattern.read(); break; } 
cb.append((char) ch); } return createString(cb); } private RegexpNode createString(CharBuffer cb) { if (isIgnoreCase()) return new RegexpNode.StringIgnoreCase(cb); else return new RegexpNode.StringNode(cb); } private RegexpNode createString(char ch) { if (isIgnoreCase()) return new RegexpNode.StringIgnoreCase(ch); else return new RegexpNode.StringNode(ch); } private int parseOctal(int ch, PeekStream pattern) throws IllegalRegexpException { if ('0' > ch || ch > '7') throw new IllegalRegexpException("expected octal digit at " + badChar(ch)); int oct = ch - '0'; int ch2 = pattern.peek(); if ('0' <= ch2 && ch2 <= '7') { pattern.read(); oct = oct * 8 + ch2 - '0'; ch = pattern.peek(); if ('0' <= ch && ch <= '7') { pattern.read(); oct = oct * 8 + ch - '0'; } } return oct; } private RegexpNode parseUnicodeProperty(PeekStream pattern, boolean isNegated) throws IllegalRegexpException { int ch = pattern.read(); boolean isBraced = false; if (ch == '{') { isBraced = true; ch = pattern.read(); if (ch == '^') { isNegated = ! isNegated; ch = pattern.read(); } } RegexpNode node; if (isBraced) { int ch2 = pattern.read(); if (ch2 == '}') node = parseUnicodeProperty(ch, isNegated); else { node = parseUnicodeProperty(ch, ch2, isNegated); expect('}', pattern.read()); } } else node = parseUnicodeProperty(ch, isNegated); return node; } private RegexpNode parseUnicodeProperty(int ch, int ch2, boolean isNegated) throws IllegalRegexpException { byte category = 0; switch (ch) { case 'C': switch (ch2) { case 'c': return isNegated ? RegexpNode.PROP_NOT_Cc : RegexpNode.PROP_Cc; case 'f': return isNegated ? RegexpNode.PROP_NOT_Cf : RegexpNode.PROP_Cf; case 'n': return isNegated ? RegexpNode.PROP_NOT_Cn : RegexpNode.PROP_Cn; case 'o': return isNegated ? RegexpNode.PROP_NOT_Co : RegexpNode.PROP_Co; case 's': return isNegated ? 
RegexpNode.PROP_NOT_Cs : RegexpNode.PROP_Cs; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } case 'L': switch (ch2) { case 'l': return isNegated ? RegexpNode.PROP_NOT_Ll : RegexpNode.PROP_Ll; case 'm': return isNegated ? RegexpNode.PROP_NOT_Lm : RegexpNode.PROP_Lm; case 'o': return isNegated ? RegexpNode.PROP_NOT_Lo : RegexpNode.PROP_Lo; case 't': return isNegated ? RegexpNode.PROP_NOT_Lt : RegexpNode.PROP_Lt; case 'u': return isNegated ? RegexpNode.PROP_NOT_Lu : RegexpNode.PROP_Lu; case '}': return isNegated ? RegexpNode.PROP_NOT_L : RegexpNode.PROP_L; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } case 'M': switch (ch2) { case 'c': return isNegated ? RegexpNode.PROP_NOT_Mc : RegexpNode.PROP_Mc; case 'e': return isNegated ? RegexpNode.PROP_NOT_Me : RegexpNode.PROP_Me; case 'n': return isNegated ? RegexpNode.PROP_NOT_Mn : RegexpNode.PROP_Mn; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } case 'N': switch (ch2) { case 'd': return isNegated ? RegexpNode.PROP_NOT_Nd : RegexpNode.PROP_Nd; case 'l': return isNegated ? RegexpNode.PROP_NOT_Nl : RegexpNode.PROP_Nl; case 'o': return isNegated ? RegexpNode.PROP_NOT_No : RegexpNode.PROP_No; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } case 'P': switch (ch2) { case 'c': return isNegated ? RegexpNode.PROP_NOT_Pc : RegexpNode.PROP_Pc; case 'd': return isNegated ? RegexpNode.PROP_NOT_Pd : RegexpNode.PROP_Pd; case 'e': return isNegated ? RegexpNode.PROP_NOT_Pe : RegexpNode.PROP_Pe; case 'f': return isNegated ? RegexpNode.PROP_NOT_Pf : RegexpNode.PROP_Pf; case 'i': return isNegated ? RegexpNode.PROP_NOT_Pi : RegexpNode.PROP_Pi; case 'o': return isNegated ? RegexpNode.PROP_NOT_Po : RegexpNode.PROP_Po; case 's': return isNegated ? 
RegexpNode.PROP_NOT_Ps : RegexpNode.PROP_Ps; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } case 'S': switch (ch2) { case 'c': return isNegated ? RegexpNode.PROP_NOT_Sc : RegexpNode.PROP_Sc; case 'k': return isNegated ? RegexpNode.PROP_NOT_Sk : RegexpNode.PROP_Sk; case 'm': return isNegated ? RegexpNode.PROP_NOT_Sm : RegexpNode.PROP_Sm; case 'o': return isNegated ? RegexpNode.PROP_NOT_So : RegexpNode.PROP_So; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } case 'Z': switch (ch2) { case 'l': return isNegated ? RegexpNode.PROP_NOT_Zl : RegexpNode.PROP_Zl; case 'p': return isNegated ? RegexpNode.PROP_NOT_Zp : RegexpNode.PROP_Zp; case 's': return isNegated ? RegexpNode.PROP_NOT_Zs : RegexpNode.PROP_Zs; default: throw error(L.l("invalid Unicode category {0}{1}", badChar(ch), badChar(ch2))); } } throw new UnsupportedOperationException(); } private RegexpNode parseUnicodeProperty(int ch, boolean isNegated) throws IllegalRegexpException { switch (ch) { case 'C': return isNegated ? RegexpNode.PROP_NOT_C : RegexpNode.PROP_C; case 'L': return isNegated ? RegexpNode.PROP_NOT_L : RegexpNode.PROP_L; case 'M': return isNegated ? RegexpNode.PROP_NOT_M : RegexpNode.PROP_M; case 'N': return isNegated ? RegexpNode.PROP_NOT_N : RegexpNode.PROP_N; case 'P': return isNegated ? RegexpNode.PROP_NOT_P : RegexpNode.PROP_P; case 'S': return isNegated ? RegexpNode.PROP_NOT_S : RegexpNode.PROP_S; case 'Z': return isNegated ? 
RegexpNode.PROP_NOT_Z : RegexpNode.PROP_Z; default: throw new IllegalRegexpException("invalid Unicode property " + badChar(ch)); } } /* static { _characterClassMap.put("alnum", RegexpNode.RC_ALNUM); _characterClassMap.put("alpha", RegexpNode.RC_ALPHA); _characterClassMap.put("blank", RegexpNode.RC_BLANK); _characterClassMap.put("cntrl", RegexpNode.RC_CNTRL); _characterClassMap.put("digit", RegexpNode.RC_DIGIT); _characterClassMap.put("graph", RegexpNode.RC_GRAPH); _characterClassMap.put("lower", RegexpNode.RC_LOWER); _characterClassMap.put("print", RegexpNode.RC_PRINT); _characterClassMap.put("punct", RegexpNode.RC_PUNCT); _characterClassMap.put("space", RegexpNode.RC_SPACE); _characterClassMap.put("upper", RegexpNode.RC_UPPER); _characterClassMap.put("xdigit", RegexpNode.RC_XDIGIT); } */ }