/*
 * reserved comment block
 * DO NOT REMOVE OR ALTER!
 */
/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sun.org.apache.regexp.internal;

import com.sun.org.apache.regexp.internal.RE;
import java.util.Hashtable;

/**
 * A regular expression compiler class.  This class compiles a pattern string into a
 * regular expression program interpretable by the RE evaluator class.  The 'recompile'
 * command line tool uses this compiler to pre-compile regular expressions for use
 * with RE.  For a description of the syntax accepted by RECompiler and what you can
 * do with regular expressions, see the documentation for the RE matcher class.
 *
 * <p>The compiler keeps all of its parsing state (pattern, index, instruction buffer,
 * bracket stacks) in instance fields, so a single instance compiles one pattern at a
 * time.</p>
 *
 * @see RE
 * @see recompile
 *
 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
 * @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a>
 * @version $Id: RECompiler.java,v 1.1.2.1 2005/08/01 00:02:55 jeffsuttor Exp $
 */
public class RECompiler {
    // The compiled program
    char[] instruction;                                 // The compiled RE 'program' instruction buffer
    int lenInstruction;                                 // The amount of the program buffer currently in use

    // Input state for compiling regular expression
    String pattern;                                     // Input string
    int len;                                            // Length of the pattern string
    int idx;                                            // Current input index into pattern
    int parens;                                         // Total number of paren pairs

    // Node flags
    static final int NODE_NORMAL   = 0;                 // No flags (nothing special)
    static final int NODE_NULLABLE = 1;                 // True if node is potentially null
    static final int NODE_TOPLEVEL = 2;                 // True if top level expr

    // Special types of 'escapes'
    static final int ESC_MASK      = 0xffff0;           // Escape complexity mask
    static final int ESC_BACKREF   = 0xfffff;           // Escape is really a backreference
    static final int ESC_COMPLEX   = 0xffffe;           // Escape isn't really a true character
    static final int ESC_CLASS     = 0xffffd;           // Escape represents a whole class of characters

    // {m,n} stacks
    int maxBrackets = 10;                               // Maximum number of bracket pairs
    static final int bracketUnbounded = -1;             // Unbounded value
    int brackets = 0;                                   // Number of bracket sets
    int[] bracketStart = null;                          // Starting point
    int[] bracketEnd = null;                            // Ending point
    int[] bracketMin = null;                            // Minimum number of matches
    int[] bracketOpt = null;                            // Additional optional matches

    // Lookup table for POSIX character class names: name -> Character(RE.POSIX_CLASS_*)
    static Hashtable hashPOSIX = new Hashtable();
    static {
        hashPOSIX.put("alnum",     new Character(RE.POSIX_CLASS_ALNUM));
        hashPOSIX.put("alpha",     new Character(RE.POSIX_CLASS_ALPHA));
        hashPOSIX.put("blank",     new Character(RE.POSIX_CLASS_BLANK));
        hashPOSIX.put("cntrl",     new Character(RE.POSIX_CLASS_CNTRL));
        hashPOSIX.put("digit",     new Character(RE.POSIX_CLASS_DIGIT));
        hashPOSIX.put("graph",     new Character(RE.POSIX_CLASS_GRAPH));
        hashPOSIX.put("lower",     new Character(RE.POSIX_CLASS_LOWER));
        hashPOSIX.put("print",     new Character(RE.POSIX_CLASS_PRINT));
        hashPOSIX.put("punct",     new Character(RE.POSIX_CLASS_PUNCT));
        hashPOSIX.put("space",     new Character(RE.POSIX_CLASS_SPACE));
        hashPOSIX.put("upper",     new Character(RE.POSIX_CLASS_UPPER));
        hashPOSIX.put("xdigit",    new Character(RE.POSIX_CLASS_XDIGIT));
        hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
        hashPOSIX.put("javapart",  new Character(RE.POSIX_CLASS_JPART));
    }

    /**
     * Constructor.  Creates (initially empty) storage for a regular expression program.
     */
    public RECompiler() {
        // Start off with a generous, yet reasonable, initial size
        instruction = new char[128];
        lenInstruction = 0;
    }

    /**
     * Ensures that n more characters can fit in the program buffer.
     * If n more can't fit, then the size is doubled until it can.
     * The check uses {@code >=}, so the buffer always keeps at least one spare slot.
     *
     * @param n Number of additional characters to ensure will fit.
     */
    void ensure(int n) {
        // Get current program length
        int curlen = instruction.length;

        // If the current length + n more is too much
        if (lenInstruction + n >= curlen) {
            // Double the size of the program array until n more will fit
            while (lenInstruction + n >= curlen) {
                curlen *= 2;
            }

            // Allocate new program array and move data into it
            char[] newInstruction = new char[curlen];
            System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
            instruction = newInstruction;
        }
    }

    /**
     * Emit a single character into the program stream.
     *
     * @param c Character to add
     */
    void emit(char c) {
        // Make room for character
        ensure(1);

        // Add character
        instruction[lenInstruction++] = c;
    }

    /**
     * Inserts a node with a given opcode and opdata at insertAt.  The node relative next
     * pointer is initialized to 0.  Everything from insertAt to the end of the program
     * is shifted up by RE.nodeSize elements.
     *
     * @param opcode Opcode for new node
     * @param opdata Opdata for new node (only the low 16 bits are currently used)
     * @param insertAt Index at which to insert the new node in the program
     */
    void nodeInsert(char opcode, int opdata, int insertAt) {
        // Make room for a new node
        ensure(RE.nodeSize);

        // Move everything from insertAt to the end down nodeSize elements
        System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
        instruction[insertAt + RE.offsetOpcode] = opcode;
        instruction[insertAt + RE.offsetOpdata] = (char)opdata;
        instruction[insertAt + RE.offsetNext] = 0;
        lenInstruction += RE.nodeSize;
    }

    /**
     * Appends a node to the end of a node chain.
     *
     * @param node Start of node chain to traverse
     * @param pointTo Node to have the tail of the chain point to
     */
    void setNextOfEnd(int node, int pointTo) {
        // Traverse the chain until the next offset is 0
        int next = instruction[node + RE.offsetNext];

        // while the 'node' is not the last in the chain
        // and the 'node' is not the last in the program.
        while (next != 0 && node < lenInstruction) {
            // if the node we are supposed to point to is in the chain then
            // point to the end of the program instead.
            // Michael McCallum <gholam@xtra.co.nz>
            // FIXME:
            // This is a _hack_ to stop infinite programs.
            // I believe that the implementation of the reluctant matches is wrong but
            // have not worked out a better way yet.
            if (node == pointTo) {
                pointTo = lenInstruction;
            }
            node += next;
            next = instruction[node + RE.offsetNext];
        }

        // if we have reached the end of the program then don't set the pointTo.
        // I'm not sure if this will break anything but it passes all the tests.
        if (node < lenInstruction) {
            // Point the last node in the chain to pointTo.
            // The offset is stored as a signed 16-bit value inside a char.
            instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
        }
    }

    /**
     * Adds a new node at the end of the program.
     *
     * @param opcode Opcode for node
     * @param opdata Opdata for node (only the low 16 bits are currently used)
     * @return Index of new node in program
     */
    int node(char opcode, int opdata) {
        // Make room for a new node
        ensure(RE.nodeSize);

        // Add new node at end
        instruction[lenInstruction + RE.offsetOpcode] = opcode;
        instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
        instruction[lenInstruction + RE.offsetNext] = 0;
        lenInstruction += RE.nodeSize;

        // Return index of new node
        return lenInstruction - RE.nodeSize;
    }

    /**
     * Throws a new internal error exception.  This method never returns normally.
     *
     * @exception Error Thrown in the event of an internal error.
     */
    void internalError() throws Error {
        throw new Error("Internal error!");
    }

    /**
     * Throws a new syntax error exception.  This method never returns normally.
     *
     * @param s Description of the syntax error
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    void syntaxError(String s) throws RESyntaxException {
        throw new RESyntaxException(s);
    }

    /**
     * Allocate storage for brackets only as needed.
     */
    void allocBrackets() {
        // Allocate bracket stacks if not already done
        if (bracketStart == null) {
            // Allocate storage
            bracketStart = new int[maxBrackets];
            bracketEnd   = new int[maxBrackets];
            bracketMin   = new int[maxBrackets];
            bracketOpt   = new int[maxBrackets];

            // Initialize to invalid values
            for (int i = 0; i < maxBrackets; i++) {
                bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
            }
        }
    }

    /**
     * Enlarge storage for brackets only as needed.  Doubles the capacity of all four
     * bracket stacks, preserving the first {@code brackets} entries.
     */
    synchronized void reallocBrackets() {
        // trick the tricky
        if (bracketStart == null) {
            allocBrackets();
        }

        int new_size = maxBrackets * 2;
        int[] new_bS = new int[new_size];
        int[] new_bE = new int[new_size];
        int[] new_bM = new int[new_size];
        int[] new_bO = new int[new_size];

        // Initialize the unused tail to invalid values
        for (int i = brackets; i < new_size; i++) {
            new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1;
        }
        System.arraycopy(bracketStart, 0, new_bS, 0, brackets);
        System.arraycopy(bracketEnd,   0, new_bE, 0, brackets);
        System.arraycopy(bracketMin,   0, new_bM, 0, brackets);
        System.arraycopy(bracketOpt,   0, new_bO, 0, brackets);
        bracketStart = new_bS;
        bracketEnd   = new_bE;
        bracketMin   = new_bM;
        bracketOpt   = new_bO;
        maxBrackets  = new_size;
    }

    /**
     * Match bracket {m,n} expression put results in bracket member variables.
     * On success, bracketMin[brackets] holds m and bracketOpt[brackets] holds the number
     * of additional optional matches (n - m, 0 for {m}, or bracketUnbounded for {m,}).
     *
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    void bracket() throws RESyntaxException {
        // Current character must be a '{'
        if (idx >= len || pattern.charAt(idx++) != '{') {
            internalError();
        }

        // Next char must be a digit
        if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
            syntaxError("Expected digit");
        }

        // Get min ('m' of {m,n}) number
        StringBuffer number = new StringBuffer();
        while (idx < len && Character.isDigit(pattern.charAt(idx))) {
            number.append(pattern.charAt(idx++));
        }
        try {
            bracketMin[brackets] = Integer.parseInt(number.toString());
        } catch (NumberFormatException e) {
            syntaxError("Expected valid number");
        }

        // If out of input, fail
        if (idx >= len) {
            syntaxError("Expected comma or right bracket");
        }

        // If end of expr, optional limit is 0
        if (pattern.charAt(idx) == '}') {
            idx++;
            bracketOpt[brackets] = 0;
            return;
        }

        // Must have at least {m,} and maybe {m,n}.
        if (idx >= len || pattern.charAt(idx++) != ',') {
            syntaxError("Expected comma");
        }

        // If out of input, fail
        if (idx >= len) {
            syntaxError("Expected comma or right bracket");
        }

        // If {m,} max is unlimited
        if (pattern.charAt(idx) == '}') {
            idx++;
            bracketOpt[brackets] = bracketUnbounded;
            return;
        }

        // Next char must be a digit
        if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
            syntaxError("Expected digit");
        }

        // Get max number
        number.setLength(0);
        while (idx < len && Character.isDigit(pattern.charAt(idx))) {
            number.append(pattern.charAt(idx++));
        }
        try {
            bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
        } catch (NumberFormatException e) {
            syntaxError("Expected valid number");
        }

        // Optional repetitions must be >= 0
        if (bracketOpt[brackets] < 0) {
            syntaxError("Bad range");
        }

        // Must have close brace
        if (idx >= len || pattern.charAt(idx++) != '}') {
            syntaxError("Missing close brace");
        }
    }

    /**
     * Match an escape sequence.  Handles quoted chars and octal escapes as well
     * as normal escape characters.  Always advances the input stream by the
     * right amount. This code "understands" the subtle difference between an
     * octal escape and a backref.  You can access the type of ESC_CLASS or
     * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
     *
     * @return ESC_* code or character if simple escape
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int escape() throws RESyntaxException {
        // "Shouldn't" happen
        if (pattern.charAt(idx) != '\\') {
            internalError();
        }

        // Escape shouldn't occur as last character in string!
        if (idx + 1 == len) {
            syntaxError("Escape terminates string");
        }

        // Switch on character after backslash
        idx += 2;
        char escapeChar = pattern.charAt(idx - 1);
        switch (escapeChar) {
            case RE.E_BOUND:
            case RE.E_NBOUND:
                return ESC_COMPLEX;

            case RE.E_ALNUM:
            case RE.E_NALNUM:
            case RE.E_SPACE:
            case RE.E_NSPACE:
            case RE.E_DIGIT:
            case RE.E_NDIGIT:
                return ESC_CLASS;

            case 'u':
            case 'x': {
                // Exact required hex digits for escape type
                int hexDigits = (escapeChar == 'u' ? 4 : 2);

                // Countdown of digits still to read; kept separate from hexDigits so
                // the error message below reports the required count, not what is left.
                int remaining = hexDigits;

                // Parse up to hexDigits characters from input.  If the pattern ends
                // first, the loop stops and the digits read so far are returned.
                int val = 0;
                for ( ; idx < len && remaining-- > 0; idx++) {
                    // Get char
                    char c = pattern.charAt(idx);

                    // If it's a hexadecimal digit (0-9)
                    if (c >= '0' && c <= '9') {
                        // Compute new value
                        val = (val << 4) + c - '0';
                    } else {
                        // If it's a hexadecimal letter (a-f)
                        c = Character.toLowerCase(c);
                        if (c >= 'a' && c <= 'f') {
                            // Compute new value
                            val = (val << 4) + (c - 'a') + 10;
                        } else {
                            // If it's not a valid digit or hex letter, the escape must be invalid
                            // because hexDigits of input have not been absorbed yet.
                            syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
                        }
                    }
                }
                return val;
            }

            case 't':
                return '\t';

            case 'n':
                return '\n';

            case 'r':
                return '\r';

            case 'f':
                return '\f';

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                // An octal escape starts with a 0 or has two digits in a row
                if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') {
                    // Handle \nnn octal escapes
                    int val = escapeChar - '0';
                    if (idx < len && Character.isDigit(pattern.charAt(idx))) {
                        val = ((val << 3) + (pattern.charAt(idx++) - '0'));
                        if (idx < len && Character.isDigit(pattern.charAt(idx))) {
                            val = ((val << 3) + (pattern.charAt(idx++) - '0'));
                        }
                    }
                    return val;
                }

                // It's actually a backreference (\[1-9]), not an escape
                return ESC_BACKREF;

            default:
                // Simple quoting of a character
                return escapeChar;
        }
    }

    /**
     * Compile a character class.
     *
     * @return Index of class node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int characterClass() throws RESyntaxException {
        // Check for bad calling or empty class
        if (pattern.charAt(idx) != '[') {
            internalError();
        }

        // Check for unterminated or empty class (note: advances idx past the '[')
        if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
            syntaxError("Empty or unterminated class");
        }

        // Check for POSIX character class
        if (idx < len && pattern.charAt(idx) == ':') {
            // Skip colon
            idx++;

            // POSIX character classes are denoted with lowercase ASCII strings
            int idxStart = idx;
            while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') {
                idx++;
            }

            // Should be a ":]" to terminate the POSIX character class
            if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') {
                // Get character class
                String charClass = pattern.substring(idxStart, idx);

                // Select the POSIX class id
                Character i = (Character)hashPOSIX.get(charClass);
                if (i != null) {
                    // Move past colon and right bracket
                    idx += 2;

                    // Return new POSIX character class node
                    return node(RE.OP_POSIXCLASS, i.charValue());
                }
                syntaxError("Invalid POSIX character class '" + charClass + "'");
            }
            syntaxError("Invalid POSIX character class syntax");
        }

        // Try to build a class.  Create OP_ANYOF node
        int ret = node(RE.OP_ANYOF, 0);

        // Parse class declaration
        char CHAR_INVALID = Character.MAX_VALUE;
        char last = CHAR_INVALID;
        char simpleChar = 0;
        boolean include = true;
        boolean definingRange = false;
        int idxFirst = idx;
        char rangeStart = Character.MIN_VALUE;
        char rangeEnd;
        RERange range = new RERange();
        while (idx < len && pattern.charAt(idx) != ']') {

            switchOnCharacter:

            // Switch on character
            switch (pattern.charAt(idx)) {
                case '^':
                    include = !include;
                    if (idx == idxFirst) {
                        // Leading '^': start from everything, then carve members out
                        range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
                    }
                    idx++;
                    continue;

                case '\\': {
                    // Escape always advances the stream
                    int c;
                    switch (c = escape ()) {
                        case ESC_COMPLEX:
                        case ESC_BACKREF:

                            // Word boundaries and backrefs not allowed in a character class!
                            syntaxError("Bad character class");
                            // syntaxError() always throws; the fall through is never taken

                        case ESC_CLASS:

                            // Classes can't be an endpoint of a range
                            if (definingRange) {
                                syntaxError("Bad character class");
                            }

                            // Handle specific type of class (some are ok)
                            switch (pattern.charAt(idx - 1)) {
                                case RE.E_NSPACE:
                                case RE.E_NDIGIT:
                                case RE.E_NALNUM:
                                    syntaxError("Bad character class");
                                    // syntaxError() always throws; the fall through is never taken

                                case RE.E_SPACE:
                                    range.include('\t', include);
                                    range.include('\r', include);
                                    range.include('\f', include);
                                    range.include('\n', include);
                                    range.include('\b', include);
                                    range.include(' ', include);
                                    break;

                                case RE.E_ALNUM:
                                    range.include('a', 'z', include);
                                    range.include('A', 'Z', include);
                                    range.include('_', include);

                                    // Fall through!

                                case RE.E_DIGIT:
                                    range.include('0', '9', include);
                                    break;
                            }

                            // Make last char invalid (can't be a range start)
                            last = CHAR_INVALID;
                            break;

                        default:

                            // Escape is simple so treat as a simple char
                            simpleChar = (char) c;
                            break switchOnCharacter;
                    }
                }
                continue;

                case '-':

                    // Start a range if one isn't already started
                    if (definingRange) {
                        syntaxError("Bad class range");
                    }
                    definingRange = true;

                    // If no last character, start of range is 0
                    rangeStart = (last == CHAR_INVALID ? 0 : last);

                    // Premature end of range. define up to Character.MAX_VALUE
                    // (note: ++idx only happens when another character is available)
                    if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
                        simpleChar = Character.MAX_VALUE;
                        break;
                    }
                    continue;

                default:
                    simpleChar = pattern.charAt(idx++);
                    break;
            }

            // Handle simple character simpleChar
            if (definingRange) {
                // if we are defining a range make it now
                rangeEnd = simpleChar;

                // Actually create a range if the range is ok
                if (rangeStart >= rangeEnd) {
                    syntaxError("Bad character class");
                }
                range.include(rangeStart, rangeEnd, include);

                // We are done defining the range
                last = CHAR_INVALID;
                definingRange = false;
            } else {
                // If simple character and not start of range, include it
                if (idx >= len || pattern.charAt(idx) != '-') {
                    range.include(simpleChar, include);
                }
                last = simpleChar;
            }
        }

        // Shouldn't be out of input
        if (idx == len) {
            syntaxError("Unterminated character class");
        }

        // Absorb the ']' end of class marker
        idx++;

        // Emit character class definition: range count in opdata, then (min, max) pairs
        instruction[ret + RE.offsetOpdata] = (char)range.num;
        for (int i = 0; i < range.num; i++) {
            emit((char)range.minRange[i]);
            emit((char)range.maxRange[i]);
        }
        return ret;
    }

    /**
     * Absorb an atomic character string.  This method is a little tricky because
     * it can un-include the last character of string if a closure operator follows.
     * This is correct because *+? have higher precedence than concatenation (thus
     * ABC* means AB(C*) and NOT (ABC)*).
     *
     * @return Index of new atom node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int atom() throws RESyntaxException {
        // Create a string node
        int ret = node(RE.OP_ATOM, 0);

        // Length of atom
        int lenAtom = 0;

        // Loop while we've got input

        atomLoop:

        while (idx < len) {
            // Is there a next char?
            if ((idx + 1) < len) {
                char c = pattern.charAt(idx + 1);

                // If the next 'char' is an escape, look past the whole escape
                if (pattern.charAt(idx) == '\\') {
                    int idxEscape = idx;
                    escape();
                    if (idx < len) {
                        c = pattern.charAt(idx);
                    }
                    idx = idxEscape;
                }

                // Switch on next char
                switch (c) {
                    case '{':
                    case '?':
                    case '*':
                    case '+':

                        // If the next character is a closure operator and our atom is non-empty, the
                        // current character should bind to the closure operator rather than the atom
                        if (lenAtom != 0) {
                            break atomLoop;
                        }
                }
            }

            // Switch on current char
            switch (pattern.charAt(idx)) {
                case ']':
                case '^':
                case '$':
                case '.':
                case '[':
                case '(':
                case ')':
                case '|':
                    break atomLoop;

                case '{':
                case '?':
                case '*':
                case '+':

                    // We should have an atom by now
                    if (lenAtom == 0) {
                        // No atom before closure
                        syntaxError("Missing operand to closure");
                    }
                    break atomLoop;

                case '\\': {
                    // Get the escaped character (advances input automatically)
                    int idxBeforeEscape = idx;
                    int c = escape();

                    // Check if it's a simple escape (as opposed to, say, a backreference)
                    if ((c & ESC_MASK) == ESC_MASK) {
                        // Not a simple escape, so backup to where we were before the escape.
                        idx = idxBeforeEscape;
                        break atomLoop;
                    }

                    // Add escaped char to atom
                    emit((char) c);
                    lenAtom++;
                }
                break;

                default:

                    // Add normal character to atom
                    emit(pattern.charAt(idx++));
                    lenAtom++;
                    break;
            }
        }

        // This "shouldn't" happen
        if (lenAtom == 0) {
            internalError();
        }

        // Emit the atom length into the program
        instruction[ret + RE.offsetOpdata] = (char)lenAtom;
        return ret;
    }

    /**
     * Match a terminal node.
     *
     * @param flags Flags
     * @return Index of terminal node (closeable)
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int terminal(int[] flags) throws RESyntaxException {
        // Note: syntaxError() and internalError() always throw, so the case labels
        // that follow such a call are never reached by falling through.
        switch (pattern.charAt(idx)) {
            case RE.OP_EOL:
            case RE.OP_BOL:
            case RE.OP_ANY:
                return node(pattern.charAt(idx++), 0);

            case '[':
                return characterClass();

            case '(':
                return expr(flags);

            case ')':
                syntaxError("Unexpected close paren");

            case '|':
                internalError();

            case ']':
                syntaxError("Mismatched class");

            case 0:
                syntaxError("Unexpected end of input");

            case '?':
            case '+':
            case '{':
            case '*':
                syntaxError("Missing operand to closure");

            case '\\': {
                // Don't forget, escape() advances the input stream!
                int idxBeforeEscape = idx;

                // Switch on escaped character
                switch (escape()) {
                    case ESC_CLASS:
                    case ESC_COMPLEX:
                        flags[0] &= ~NODE_NULLABLE;
                        return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));

                    case ESC_BACKREF: {
                        char backreference = (char)(pattern.charAt(idx - 1) - '0');
                        if (parens <= backreference) {
                            syntaxError("Bad backreference");
                        }
                        flags[0] |= NODE_NULLABLE;
                        return node(RE.OP_BACKREF, backreference);
                    }

                    default:

                        // We had a simple escape and we want to have it end up in
                        // an atom, so we back up and fall though to the default handling
                        idx = idxBeforeEscape;
                        flags[0] &= ~NODE_NULLABLE;
                        break;
                }
            }
        }

        // Everything above either fails or returns.
        // If it wasn't one of the above, it must be the start of an atom.
        flags[0] &= ~NODE_NULLABLE;
        return atom();
    }

    /**
     * Compile a possibly closured terminal.
     *
     * @param flags Flags passed by reference
     * @return Index of closured node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int closure(int[] flags) throws RESyntaxException {
        // Before terminal
        int idxBeforeTerminal = idx;

        // Values to pass by reference to terminal()
        int[] terminalFlags = { NODE_NORMAL };

        // Get terminal symbol
        int ret = terminal(terminalFlags);

        // Or in flags from terminal symbol
        flags[0] |= terminalFlags[0];

        // Advance input, set NODE_NULLABLE flag and do sanity checks
        if (idx >= len) {
            return ret;
        }
        boolean greedy = true;
        char closureType = pattern.charAt(idx);
        switch (closureType) {
            case '?':
            case '*':

                // The current node can be null
                flags[0] |= NODE_NULLABLE;

                // Fall through!

            case '+':

                // Eat closure character
                idx++;

                // Fall through!

            case '{':

                // Don't allow blatant stupidity
                int opcode = instruction[ret + RE.offsetOpcode];
                if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
                    syntaxError("Bad closure operand");
                }
                if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
                    syntaxError("Closure operand can't be nullable");
                }
                break;
        }

        // If the next character is a '?', make the closure non-greedy (reluctant)
        if (idx < len && pattern.charAt(idx) == '?') {
            idx++;
            greedy = false;
        }

        if (greedy) {
            // Actually do the closure now
            switch (closureType) {
                case '{': {
                    // We look for our bracket in the list
                    boolean found = false;
                    int i;
                    allocBrackets();
                    for (i = 0; i < brackets; i++) {
                        if (bracketStart[i] == idx) {
                            found = true;
                            break;
                        }
                    }

                    // If its not in the list we parse the {m,n}
                    if (!found) {
                        if (brackets >= maxBrackets) {
                            reallocBrackets();
                        }
                        bracketStart[brackets] = idx;
                        bracket();
                        bracketEnd[brackets] = idx;
                        i = brackets++;
                    }

                    // Process min first
                    if (bracketMin[i]-- > 0) {
                        if (bracketMin[i] > 0 || bracketOpt[i] != 0) {
                            // Rewind stream and run it through again - more matchers coming
                            for (int j = 0; j < brackets; j++) {
                                if (j != i && bracketStart[j] < idx
                                        && bracketStart[j] >= idxBeforeTerminal) {
                                    brackets--;
                                    bracketStart[j] = bracketStart[brackets];
                                    bracketEnd[j] = bracketEnd[brackets];
                                    bracketMin[j] = bracketMin[brackets];
                                    bracketOpt[j] = bracketOpt[brackets];
                                }
                            }

                            idx = idxBeforeTerminal;
                        } else {
                            // Bug #1030: No optional matches - no need to rewind
                            idx = bracketEnd[i];
                        }
                        break;
                    }

                    // Do the right thing for maximum ({m,})
                    if (bracketOpt[i] == bracketUnbounded) {
                        // Drop through now and closure expression.
                        // We are done with the {m,} expr, so skip rest
                        closureType = '*';
                        bracketOpt[i] = 0;
                        idx = bracketEnd[i];
                    } else if (bracketOpt[i]-- > 0) {
                        if (bracketOpt[i] > 0) {
                            // More optional matchers - 'play it again sam!'
                            idx = idxBeforeTerminal;
                        } else {
                            // Bug #1030: We are done - this one is last and optional
                            idx = bracketEnd[i];
                        }
                        // Drop through to optionally close
                        closureType = '?';
                    } else {
                        // Rollback terminal - neither min nor opt matchers present
                        lenInstruction = ret;
                        node(RE.OP_NOTHING, 0);

                        // We are done. skip the rest of {m,n} expr
                        idx = bracketEnd[i];
                        break;
                    }
                }

                // Fall through!

                case '?':
                case '*':

                    if (!greedy) {
                        break;
                    }

                    if (closureType == '?') {
                        // X? is compiled as (X|)
                        nodeInsert(RE.OP_BRANCH, 0, ret);                 // branch before X
                        setNextOfEnd(ret, node (RE.OP_BRANCH, 0));        // inserted branch to option
                        int nothing = node (RE.OP_NOTHING, 0);            // which is OP_NOTHING
                        setNextOfEnd(ret, nothing);                       // point (second) branch to OP_NOTHING
                        setNextOfEnd(ret + RE.nodeSize, nothing);         // point the end of X to OP_NOTHING node
                    }

                    if (closureType == '*') {
                        // X* is compiled as (X{gotoX}|)
                        nodeInsert(RE.OP_BRANCH, 0, ret);                         // branch before X
                        setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0));   // end of X points to an option
                        setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0));     // to goto
                        setNextOfEnd(ret + RE.nodeSize, ret);                     // the start again
                        setNextOfEnd(ret, node(RE.OP_BRANCH, 0));                 // the other option is
                        setNextOfEnd(ret, node(RE.OP_NOTHING, 0));                // OP_NOTHING
                    }
                    break;

                case '+': {
                    // X+ is compiled as X({gotoX}|)
                    int branch;
                    branch = node(RE.OP_BRANCH, 0);                   // a new branch
                    setNextOfEnd(ret, branch);                        // is added to the end of X
                    setNextOfEnd(node(RE.OP_GOTO, 0), ret);           // one option is to go back to the start
                    setNextOfEnd(branch, node(RE.OP_BRANCH, 0));      // the other option
                    setNextOfEnd(ret, node(RE.OP_NOTHING, 0));        // is OP_NOTHING
                }
                break;
            }
        } else {
            // Add end after closured subexpr
            setNextOfEnd(ret, node(RE.OP_END, 0));

            // Actually do the closure now
            switch (closureType) {
                case '?':
                    nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
                    break;

                case '*':
                    nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
                    break;

                case '+':
                    nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
                    break;
            }

            // Point to the expr after the closure
            setNextOfEnd(ret, lenInstruction);
        }
        return ret;
    }

    /**
     * Compile one branch of an or operator (implements concatenation).
     *
     * @param flags Flags passed by reference
     * @return Pointer to branch node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int branch(int[] flags) throws RESyntaxException {
        // Get each possibly closured piece and concat
        int node;
        int ret = node(RE.OP_BRANCH, 0);
        int chain = -1;
        int[] closureFlags = new int[1];
        boolean nullable = true;
        while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') {
            // Get new node
            closureFlags[0] = NODE_NORMAL;
            node = closure(closureFlags);
            if (closureFlags[0] == NODE_NORMAL) {
                nullable = false;
            }

            // If there's a chain, append to the end
            if (chain != -1) {
                setNextOfEnd(chain, node);
            }

            // Chain starts at current
            chain = node;
        }

        // If we don't run loop, make a nothing node
        if (chain == -1) {
            node(RE.OP_NOTHING, 0);
        }

        // Set nullable flag for this branch
        if (nullable) {
            flags[0] |= NODE_NULLABLE;
        }
        return ret;
    }

    /**
     * Compile an expression with possible parens around it.  Paren matching
     * is done at this level so we can tie the branch tails together.
     *
     * @param flags Flag value passed by reference
     * @return Node index of expression in instruction array
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int expr(int[] flags) throws RESyntaxException {
        // Create open paren node unless we were called from the top level (which has no parens)
        int paren = -1;                 // -1: no parens, 1: capturing group, 2: (?: cluster
        int ret = -1;
        int closeParens = parens;
        if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
            // if its a cluster ( rather than a proper subexpression ie with backrefs )
            if (idx + 2 < len && pattern.charAt(idx + 1) == '?' && pattern.charAt(idx + 2) == ':') {
                paren = 2;
                idx += 3;
                ret = node(RE.OP_OPEN_CLUSTER, 0);
            } else {
                paren = 1;
                idx++;
                ret = node(RE.OP_OPEN, parens++);
            }
        }
        flags[0] &= ~NODE_TOPLEVEL;

        // Create a branch node
        int branch = branch(flags);
        if (ret == -1) {
            ret = branch;
        } else {
            setNextOfEnd(ret, branch);
        }

        // Loop through branches
        while (idx < len && pattern.charAt(idx) == '|') {
            idx++;
            branch = branch(flags);
            setNextOfEnd(ret, branch);
        }

        // Create an ending node (either a close paren or an OP_END)
        int end;
        if (paren > 0) {
            if (idx < len && pattern.charAt(idx) == ')') {
                idx++;
            } else {
                syntaxError("Missing close paren");
            }
            if (paren == 1) {
                end = node(RE.OP_CLOSE, closeParens);
            } else {
                end = node(RE.OP_CLOSE_CLUSTER, 0);
            }
        } else {
            end = node(RE.OP_END, 0);
        }

        // Append the ending node to the ret nodelist
        setNextOfEnd(ret, end);

        // Hook the ends of each branch to the end node
        int currentNode = ret;
        int nextNodeOffset = instruction[currentNode + RE.offsetNext];

        // while there is a next node and we are still inside the program
        while (nextNodeOffset != 0 && currentNode < lenInstruction) {
            // If branch, make the end of the branch's operand chain point to the end node.
            if (instruction[currentNode + RE.offsetOpcode] == RE.OP_BRANCH) {
                setNextOfEnd(currentNode + RE.nodeSize, end);
            }
            nextNodeOffset = instruction[currentNode + RE.offsetNext];
            currentNode += nextNodeOffset;
        }

        // Return the node list
        return ret;
    }

    /**
     * Compiles a regular expression pattern into a program runnable by the pattern
     * matcher class 'RE'.
     *
     * @param pattern Regular expression pattern to compile (see RECompiler class
     * for details).
     * @return A compiled regular expression program.
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     * @see RECompiler
     * @see RE
     */
    public REProgram compile(String pattern) throws RESyntaxException {
        // Initialize variables for compilation
        this.pattern = pattern;                         // Save pattern in instance variable
        len = pattern.length();                         // Precompute pattern length for speed
        idx = 0;                                        // Set parsing index to the first character
        lenInstruction = 0;                             // Set emitted instruction count to zero
        parens = 1;                                     // Set paren level to 1 (the implicit outer parens)
        brackets = 0;                                   // No bracketed closures yet

        // Initialize pass by reference flags value
        int[] flags = { NODE_TOPLEVEL };

        // Parse expression
        expr(flags);

        // Should be at end of input
        if (idx != len) {
            if (pattern.charAt(idx) == ')') {
                syntaxError("Unmatched close paren");
            }
            syntaxError("Unexpected input remains");
        }

        // Return the result (a copy trimmed to the used part of the buffer)
        char[] ins = new char[lenInstruction];
        System.arraycopy(instruction, 0, ins, 0, lenInstruction);
        return new REProgram(parens, ins);
    }

    /**
     * Local, nested class for maintaining character ranges for character classes.
     * Ranges are stored as parallel min/max arrays; merge() coalesces overlapping
     * ranges so that stored ranges do not overlap one another.
     */
    class RERange {
        int size = 16;                      // Capacity of current range arrays
        int[] minRange = new int[size];     // Range minima
        int[] maxRange = new int[size];     // Range maxima
        int num = 0;                        // Number of range array elements in use

        /**
         * Deletes the range at a given index from the range lists.  Entries after
         * the deleted one are shifted down by one slot.
         *
         * @param index Index of range to delete from minRange and maxRange arrays.
         */
        void delete(int index) {
            // Return if no elements left or index is out of range
            if (num == 0 || index >= num) {
                return;
            }

            // Move elements down
            while (++index < num) {
                if (index - 1 >= 0) {
                    minRange[index - 1] = minRange[index];
                    maxRange[index - 1] = maxRange[index];
                }
            }

            // One less element now
            num--;
        }

        /**
         * Merges a range into the range list, coalescing ranges if possible.
         *
         * @param min Minimum end of range
         * @param max Maximum end of range
         */
        void merge(int min, int max) {
            // Loop through ranges
            for (int i = 0; i < num; i++) {
                // Min-max is subsumed by minRange[i]-maxRange[i]
                if (min >= minRange[i] && max <= maxRange[i]) {
                    return;
                }

                // Min-max subsumes minRange[i]-maxRange[i]
                else if (min <= minRange[i] && max >= maxRange[i]) {
                    delete(i);
                    merge(min, max);
                    return;
                }

                // Min is in the range, but max is outside
                else if (min >= minRange[i] && min <= maxRange[i]) {
                    // Capture the bound BEFORE deleting: delete(i) shifts the next
                    // range into slot i, so reading afterwards picks up the wrong range.
                    min = minRange[i];
                    delete(i);
                    merge(min, max);
                    return;
                }

                // Max is in the range, but min is outside
                else if (max >= minRange[i] && max <= maxRange[i]) {
                    // Same ordering requirement as above: read, then delete.
                    max = maxRange[i];
                    delete(i);
                    merge(min, max);
                    return;
                }
            }

            // Must not overlap any other ranges: grow storage if full, then append
            if (num >= size) {
                size *= 2;
                int[] newMin = new int[size];
                int[] newMax = new int[size];
                System.arraycopy(minRange, 0, newMin, 0, num);
                System.arraycopy(maxRange, 0, newMax, 0, num);
                minRange = newMin;
                maxRange = newMax;
            }
            minRange[num] = min;
            maxRange[num] = max;
            num++;
        }

        /**
         * Removes a range by deleting or shrinking all other ranges that it touches.
         *
         * @param min Minimum end of range
         * @param max Maximum end of range
         */
        void remove(int min, int max) {
            // Loop through ranges; a removal may touch several stored ranges, so keep
            // scanning after each adjustment instead of stopping at the first hit.
            for (int i = 0; i < num; i++) {
                // minRange[i]-maxRange[i] is subsumed by min-max
                if (minRange[i] >= min && maxRange[i] <= max) {
                    delete(i);

                    // delete() shifted the next range into slot i; re-examine it
                    i--;
                }

                // min-max is subsumed by minRange[i]-maxRange[i]
                else if (min >= minRange[i] && max <= maxRange[i]) {
                    int minr = minRange[i];
                    int maxr = maxRange[i];
                    delete(i);
                    if (minr < min) {
                        merge(minr, min - 1);
                    }
                    if (max < maxr) {
                        merge(max + 1, maxr);
                    }

                    // min-max lay entirely inside this one range, and stored ranges
                    // do not overlap, so no other range can be affected.
                    return;
                }

                // minRange is in the range, but maxRange is outside
                else if (minRange[i] >= min && minRange[i] <= max) {
                    minRange[i] = max + 1;
                }

                // maxRange is in the range, but minRange is outside
                else if (maxRange[i] >= min && maxRange[i] <= max) {
                    maxRange[i] = min - 1;
                }
            }
        }

        /**
         * Includes (or excludes) the range from min to max, inclusive.
         *
         * @param min Minimum end of range
         * @param max Maximum end of range
         * @param include True if range should be included.  False otherwise.
         */
        void include(int min, int max, boolean include) {
            if (include) {
                merge(min, max);
            } else {
                remove(min, max);
            }
        }

        /**
         * Includes a range with the same min and max.
         *
         * @param minmax Minimum and maximum end of range (inclusive)
         * @param include True if range should be included.  False otherwise.
         */
        void include(char minmax, boolean include) {
            include(minmax, minmax, include);
        }
    }
}