/** * This file Copyright (c) 2005-2008 Aptana, Inc. This program is * dual-licensed under both the Aptana Public License and the GNU General * Public license. You may elect to use one or the other of these licenses. * * This program is distributed in the hope that it will be useful, but * AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or * NONINFRINGEMENT. Redistribution, except as permitted by whichever of * the GPL or APL you select, is prohibited. * * 1. For the GPL license (GPL), you can redistribute and/or modify this * program under the terms of the GNU General Public License, * Version 3, as published by the Free Software Foundation. You should * have received a copy of the GNU General Public License, Version 3 along * with this program; if not, write to the Free Software Foundation, Inc., 51 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Aptana provides a special exception to allow redistribution of this file * with certain other free and open source software ("FOSS") code and certain additional terms * pursuant to Section 7 of the GPL. You may view the exception and these * terms on the web at http://www.aptana.com/legal/gpl/. * * 2. For the Aptana Public License (APL), this program and the * accompanying materials are made available under the terms of the APL * v1.0 which accompanies this distribution, and is available at * http://www.aptana.com/legal/apl/. * * You may view the GPL, Aptana's exception and additional terms, and the * APL in the file titled license.html at the root of the corresponding * plugin containing this source file. * * Any modifications to this file must keep this entire header intact. */ package com.aptana.ide.regex; import java.text.MessageFormat; import java.text.ParseException; import java.util.Stack; import com.aptana.ide.regex.dfa.DFAGraph; import com.aptana.ide.regex.inputs.AnyInput; import com.aptana.ide.regex.inputs.CharacterClassInput; import com.aptana.ide.regex.inputs.CharacterInput; import com.aptana.ide.regex.inputs.DigitInput; import com.aptana.ide.regex.inputs.Input; import com.aptana.ide.regex.inputs.WhiteSpaceInput; import com.aptana.ide.regex.inputs.WordInput; import com.aptana.ide.regex.nfa.NFAGraph; /** * Converts a regular expression expressed as a string into a state machine. This state machine can be used to recognize * text patterns as described by the original regular expression * <p> * <code> * This parser recognizes the following grammar: * <p> * Expression * : OrExpression * | '^' OrExpression * | OrExpression '$' * ; * OrExpression * : OrExpression '|' AndExpression * | AndExpression * ; * AndExpression * : AndExpression Factor * | Factor * ; * Factor * : Term '*' * | Term '+' * | Term '?' * | Term * ; * Term * : '[' string ']' * | '[^' string ']' * | '[]' * | '[^]' * | '.' * | * character * | * '(' Expression ')' * ; * </code> * * @author Kevin Lindsey */ public class RegexParser { private char[] _regex; private int _index; Stack<NFAGraph> _nfaStack; int _acceptState; DFAGraph _dfa; /** * @return The current character being processed in the regular expression */ private char getCurrentChar() { char result; if (this.getEOS() == false) { result = this._regex[this._index]; } else { result = '\0'; } return result; } /** * Determine if the specified digit is a hexadecimal digit * * @return Returns true if the character is a valid hexadecimal digit */ private boolean isHexDigit() { char digit = this.getCurrentChar(); return ('0' <= digit && digit <= '9' || 'A' <= digit && digit <= 'F' || 'a' <= digit && digit <= 'f'); } /** * Return the DFA that recognizes the parsed regular expression * * @return The DFA that recognizes the parsed regular expression */ public DFAGraph getDFAGraph() { return this._dfa; } /** * Determines if we have reached the end of the regular expression string * * @return A boolean that returns true once we have processed the entire regular expression string */ private boolean getEOS() { return (this._index >= this._regex.length); } /** * Get the resulting NFA graph associated with the regex * * @return Returns the NFA Graph associated with this regex */ public NFAGraph getNFAGraph() { NFAGraph result = null; if (this._nfaStack.size() > 0) { result = this._nfaStack.peek(); } return result; } /** * Get the stack of NFA machines created by this parser * * @return The NFA stack */ public Stack<NFAGraph> getNFAStack() { return this._nfaStack; } /** * Determines if the current character is a valid character to start a new And expression * * @return Returns true if the current character can start an And expression */ private boolean isFirstInAndExpression() { boolean result = true; if (this.getEOS() == false) { switch (this.getCurrentChar()) { case '\0': case '|': case ')': case '*': case '+': case '?': case '^': case '$': result = false; break; default: break; } } else { result = false; } return result; } /* * Constructors */ /** * Create a new instance of RegexParser */ public RegexParser() { this._nfaStack = new Stack<NFAGraph>(); } /* * Methods */ /** * Advance to the next non-whitespace character */ private void advance() { int length = this._regex.length; if (this._index < length) { // advance at least one character this._index++; // now skip whitespace while (this._index < length && Character.isWhitespace(this._regex[this._index])) { this._index++; } } } /** * Convert a regular expression expressed as a string into a DFA that recognizes the text pattern described by the * regular expression. This method will associate a token index, lexer state, and new lexer state with this regular * expression when it matches * * @param regex * The regex to parse * @param acceptState * The accept state to associate with this regex * @throws ParseException */ public void parse(String regex, int acceptState) throws ParseException { if (regex == null) { throw new NullPointerException(Messages.RegexParser_Undefined); } if (regex.length() == 0) { throw new ParseException(Messages.RegexParser_Empty, 0); } this._regex = regex.toCharArray(); this._index = -1; this._acceptState = acceptState; // prime current character this.advance(); // parse regex if (this.parseExpression() == false) { Object[] messageArgs = new Object[] { regex, Integer.toString(this._index) }; String message = MessageFormat.format(Messages.RegexParser_Parse_Error, messageArgs); throw new ParseException(message, 0); } } /** * Parse a regular expression * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private boolean parseExpression() { boolean success = true; if (this.getCurrentChar() == '^') { // advance over '^' this.advance(); // tag to anchor at the beginning of a line } while (success && this.getEOS() == false) { success = this.parseOrExpression(); if (this._index == this._regex.length - 1 && this.getCurrentChar() == '$') { // advance over '$' this.advance(); // tag to anchor at the end of a line } } return success; } /** * Parse a regular expression OR expression * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private boolean parseOrExpression() { boolean success = true; if (this.parseAndExpression()) { while (this.getCurrentChar() == '|') { // advance over '|' this.advance(); // parse |'s right hand side if (this.parseAndExpression()) { // build OR machine NFAGraph rhs = this._nfaStack.pop(); NFAGraph lhs = this._nfaStack.peek(); lhs.orMachines(rhs); } else { success = false; break; } } } else { success = false; } return success; } /** * Parse a regular expression AND expression * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private boolean parseAndExpression() { boolean success = true; if (this.isFirstInAndExpression()) { // get left-hand side this.parseFactor(); // get remaining right-hand sides while (this.isFirstInAndExpression()) { if (this.parseFactor()) { // build AND machine NFAGraph rhs = this._nfaStack.pop(); NFAGraph lhs = this._nfaStack.peek(); lhs.andMachines(rhs); } else { success = false; break; } } } else { success = false; } return success; } /** * Parse a regular expression factor * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private boolean parseFactor() { boolean success = true; if (this.parseTerm()) { NFAGraph nfa = this._nfaStack.peek(); switch (this.getCurrentChar()) { case '*': // advance over '*' and build kleene closure this.advance(); nfa.kleeneClosure(); break; case '+': // advance over '+' and build positive closure this.advance(); nfa.positiveClosure(); break; case '?': // advance over '?' and build option this.advance(); nfa.option(); break; default: break; } } else { success = false; } return success; } /** * Parse a regular expression term * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private boolean parseTerm() { boolean success = true; if (this.getCurrentChar() == '(') { // parse parenthetical sub-expression success = this.parseSubExpression(); } else { NFAGraph newState = new NFAGraph(this._acceptState); switch (this.getCurrentChar()) { case '.': // advance over '.' and add input type this.advance(); newState.add(new AnyInput()); break; case '\\': // parse escaped term newState.add(this.parseEscapedTerm()); break; case '[': // parse character class newState.add(this.parseCharacterClass()); break; default: // add character state newState.add(new CharacterInput(this.getCurrentChar())); // advance over character this.advance(); } if (success) { this._nfaStack.push(newState); } } return success; } /** * Parse a character class */ private Input parseCharacterClass() { // advance over '[' this.advance(); CharacterClassInput cci = new CharacterClassInput(); char last = '\0'; if (this.getCurrentChar() == '^') { // advance over '^' this.advance(); // find complement of character class cci.setComplement(true); } if (this.getCurrentChar() == '-') { // add dash as input cci.addInput('-'); // advance over '-' this.advance(); } while (this.getEOS() == false && this.getCurrentChar() != ']') { switch (this.getCurrentChar()) { case '-': // advance over '-' this.advance(); if (last != '\0') { // build character set cci.addInputs(last, this.getCurrentChar()); // advance over character this.advance(); // rest last to catch hyphen errors last = '\0'; } else { // error break; } break; case '\\': Input input = this.parseEscapedTerm(); cci.addInputs(input.getCharacters()); break; default: last = this.getCurrentChar(); cci.addInput(last); // advance over character this.advance(); } // if (this.getCurrentChar() == '-') // { // // advance over '-' // this.advance(); // if (last != '\0') // { // // build character set // cci.addInputs(last, this.getCurrentChar()); // // advance over character // this.advance(); // // rest last to catch hyphen errors // last = '\0'; // } // else // { // // error break; // } // } // else if (this.getCurrentChar() == '\\') // { // Input input = this.parseEscapedTerm(); // cci.addInputs(input.getCharacters()); // } // else // { // last = this.getCurrentChar(); // cci.addInput(last); // // advance over character // this.advance(); // } } if (this.getCurrentChar() == ']') { // advance over ']' this.advance(); // save input // newState.addState(cci); } else { // success = false } return cci; } /** * Parse a term escaped with a backslash * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private Input parseEscapedTerm() { Input result = null; // advance over '\' // this.advance(); // NOTE: we can't use advance because it might skip over the next // character // if it is whitespace this._index++; switch (this.getCurrentChar()) { case 'd': result = new DigitInput(); break; case 'D': result = new DigitInput(); result.setComplement(true); break; case 'f': result = new CharacterInput('\f'); break; case 'n': result = new CharacterInput('\n'); break; case 'r': result = new CharacterInput('\r'); break; case 's': result = new WhiteSpaceInput(); break; case 'S': result = new WhiteSpaceInput(); result.setComplement(true); break; case 't': result = new CharacterInput('\t'); break; case 'v': result = new CharacterInput('\u000B'); break; case 'w': result = new WordInput(); break; case 'W': result = new WordInput(); result.setComplement(true); break; case 'x': int hi = 0; int lo = 0; this._index++; if (this.isHexDigit()) { hi = Character.digit(getCurrentChar(), 16); // NOTE: we can't use advance because it might skip over the next // character // if it is whitespace this._index++; if (this.isHexDigit()) { lo = Character.digit(getCurrentChar(), 16); } else { throw new IllegalStateException(Messages.RegexParser_Malformed_Hex); } } else { throw new IllegalStateException(Messages.RegexParser_Malformed_Hex); } result = new CharacterInput((char) (hi * 16 + lo)); break; default: result = new CharacterInput(this.getCurrentChar()); break; } // advance over term this.advance(); return result; } /** * Parse an expression inside of a parenthetical expression * * @return A boolean that indicates whether the expression was successfully parsed or not. */ private boolean parseSubExpression() { boolean success = true; // advance over '(' this.advance(); if (this.parseOrExpression()) { if (this.getCurrentChar() == ')') { // advance over ')' this.advance(); } else { success = false; } } else { success = false; } return success; } /** * Reset the parser in preparation for a new parse */ public void reset() { NFAGraph.reset(); this._nfaStack.clear(); } }