LL1Parser.java example

Explorer
GATECH-master
package project.phase2.ll1parsergenerator;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import project.phase2.ll1parsergenerator.dfastuff.Parser;

/**
 * The representation of an LL1Parser.
 * 
 */
public class LL1Parser {

	/**
	 * Name of the epsilon (empty) terminal. The value is {@code null}, which
	 * is why this class tests for it with {@code ==} instead of
	 * {@code equals}.
	 */
	public static final String EPSILON = null;

	/**
	 * Token name of the built-in regular-expression literal. The constructor
	 * registers it with the scanner as text enclosed in single quotes.
	 */
	public static final String REG_EX = "REGEX";

	/**
	 * Token name of the built-in ASCII string literal. The constructor
	 * registers it with the scanner as text enclosed in double quotes.
	 */
	public static final String ASCII = "ASCII-STR";

	/**
	 * Token name of the built-in identifier: a letter followed by at most nine
	 * letters, digits or underscores (see the constructor).
	 */
	public static final String ID = "ID";

	/**
	 * Whether or not the scanning DFA should be minimized. This will
	 * significantly increase preprocessing time.
	 * <p>
	 * The flag is read when a parser is constructed and handed to the scanner
	 * at that point.
	 */
	public static boolean MINIMIZE_SCANNER = true;

	/**
	 * The ending token used to tell us that we are done parsing. The character
	 * reader substitutes this character for the end of the input stream, and
	 * a terminal rule with this name sits at the bottom of the parse stack.
	 */
	public static final String END = "$";

	/**
	 * The rules representation: every non-terminal added through
	 * {@link #addRule(Rule)}, keyed by rule name.
	 */
	private Map<String, Rule> mRules;

	/**
	 * The transition table. The outer key is a rule name, the inner key a
	 * token name, and the value the index of the production to use within
	 * that rule's {@code getRules()} array.
	 */
	private Map<String, Map<String, Integer>> mParseTable;

	/**
	 * The valid tokens scanner for this parser. It is used to label the next
	 * token when a production has to be chosen.
	 */
	private DFAScanner mScanner;

	/**
	 * The name of the start rule, as recorded by
	 * {@link #setStartRule(Rule)}.
	 */
	private String mStartRule;

	//
	// CTOR
	//
	/**
	 * Creates a parser with no rules and an empty parse table, and registers
	 * the built-in token patterns with a fresh scanner in this order: ASCII
	 * string, regular expression, identifier, end marker.
	 */
	public LL1Parser() {
		mParseTable = new HashMap<String, Map<String, Integer>>();
		mRules = new HashMap<String, Rule>();
		mScanner = new DFAScanner(MINIMIZE_SCANNER);

		final String letterClass = "[a-zA-Z]";
		final String digitClass = "[0-9]";
		// Character class spanning space (0x20) through tilde (0x7e).
		final String printable = "[" + (char) 0x20 + "-" + (char) 0x7e + "]";

		// Quoted literals: any printable character except the quote itself,
		// enclosed in double quotes (ASCII) or single quotes (regex).
		mScanner.addRegex("\\\"([^\"] IN " + printable + ")*\\\"", ASCII);
		mScanner.addRegex("\\\'([^\'] IN " + printable + ")*\\\'", REG_EX);

		// Identifier: one letter, then nine nested optional groups of
		// letter | digit | underscore. Every group opened in the first loop
		// is closed with an empty alternative in the second.
		final String openGroup = "((" + letterClass + "|" + digitClass + "|_)";
		final int optionalChars = 9;

		StringBuilder idPattern = new StringBuilder(letterClass);
		for (int depth = 0; depth < optionalChars; depth++) {
			idPattern.append(openGroup);
		}
		for (int depth = 0; depth < optionalChars; depth++) {
			idPattern.append("|)");
		}
		mScanner.addRegex(idPattern.toString(), ID);

		mScanner.addRegex(END, END);
	}

	//
	// PUBLIC METHODS
	//
	/**
	 * Registers a rule with the parser.
	 * <p>
	 * A terminal is never stored as a rule. Unless it is one of the built-in
	 * tokens (epsilon, ASCII string, end marker, identifier, regular
	 * expression), its name is handed to the scanner as a pattern in which a
	 * backslash has been inserted before every character that appears in
	 * {@code Parser.escape}. A non-terminal is stored together with an empty
	 * parse-table row, unless a rule with the same name is already present.
	 * 
	 * @param rule
	 *            the rule to add.
	 * @return the rule's name when a new non-terminal was stored;
	 *         {@code null} for every terminal and for a non-terminal whose
	 *         name is already registered.
	 */
	public String addRule(Rule rule) {
		final String name = rule.getName();

		if (!rule.isTerminal()) {
			if (mRules.containsKey(name)) {
				return null;
			}

			mRules.put(name, rule);
			mParseTable.put(name, new HashMap<String, Integer>());
			return name;
		}

		// Built-in tokens already have their patterns registered.
		final boolean builtIn = name == EPSILON || name.equals(ASCII)
				|| name.equals(END) || name.equals(ID)
				|| name.equals(REG_EX);
		if (builtIn) {
			return null;
		}

		// Start from the raw name and splice a backslash in front of each
		// character that needs escaping. "inserted" tracks how far the
		// already-added backslashes have shifted positions to the right.
		StringBuilder pattern = new StringBuilder(name);
		int inserted = 0;

		for (int pos = 0; pos < name.length(); pos++) {
			final char current = name.charAt(pos);
			for (int e = 0; e < Parser.escape.length; e++) {
				if (Parser.escape[e] == current) {
					pattern.insert(pos + inserted, "\\");
					inserted++;
				}
			}
		}

		mScanner.addRegex(pattern.toString(), name);
		return null;
	}

	/**
	 * Records in the parse table which production a rule should use for a
	 * given token.
	 * <p>
	 * The selection is silently ignored when its rule name has no parse-table
	 * row or when its production index is outside the rule's productions. If
	 * the row already has an entry for the token, a warning is printed to
	 * standard output and the entry is overwritten.
	 * 
	 * @param selection
	 *            the (rule name, token, production index) entry to add.
	 */
	public void addRuleSelection(RuleSelection selection) {
		final String ruleName = selection.mRuleName;
		if (!mParseTable.containsKey(ruleName)) {
			return;
		}

		final Rule rule = mRules.get(ruleName);
		final int index = selection.mRule;
		if (index < 0 || index >= rule.getRules().length) {
			return;
		}

		final String token = selection.mToken;
		final Map<String, Integer> row = mParseTable.get(ruleName);

		if (row.containsKey(token)) {
			System.out.println("Warning: Ambiguous grammar detected on: ("
					+ ruleName + ", \"" + token
					+ "\").  Results may not be correct to grammar specifications.");
		}

		row.put(token, index);
	}

	/**
	 * Marks the given rule as the one parsing starts from. Only the rule's
	 * name is recorded.
	 * 
	 * @param rule
	 *            the rule whose name becomes the start rule.
	 */
	public void setStartRule(Rule rule) {
		final String startName = rule.getName();
		mStartRule = startName;
	}

	/**
	 * Parses the content of the given stream and returns the abstract syntax
	 * tree built from it. Each call works on its own fresh parse stack.
	 * 
	 * @param stream
	 *            the stream to parse.
	 * @return an abstract syntax tree.
	 * @throws IOException
	 *             if reading from the stream fails.
	 * @throws ParseException
	 *             if the parse stack rejects the input.
	 */
	public AST<String> parse(InputStream stream) throws IOException,
			ParseException {
		return new ParseStack().parse(stream);
	}

	/**
	 * Returns a string representation of this parser.
	 * <p>
	 * The text has two sections. {@code Rules:[...]} lists every production
	 * as {@code name->elt elt ...}, separated by {@code ",\n"}.
	 * {@code Parse Table:[...]} lists one {@code name: {name-token->index, ...}}
	 * entry per rule, also separated by {@code ",\n"}. An element or token
	 * whose name is {@code null} is printed as {@code null}.
	 * <p>
	 * Separators are written between entries rather than stripped afterwards,
	 * so a parser without productions, a rule without transitions, or an
	 * empty parse table is rendered with its brackets intact.
	 * 
	 * @return a string representation.
	 */
	@Override
	public String toString() {
		StringBuilder out = new StringBuilder("LL1Parser:(\nRules:[\n");

		boolean firstProduction = true;
		for (Rule rule : mRules.values()) {
			String currName = rule.getName();
			for (Rule[] prod : rule.getRules()) {
				if (!firstProduction) {
					out.append(",\n");
				}
				firstProduction = false;

				out.append(currName).append("->");
				for (int i = 0; i < prod.length; i++) {
					if (i > 0) {
						out.append(' ');
					}
					out.append(prod[i].getName());
				}
			}
		}

		out.append("],\n");
		out.append("\nParse Table:[\n");

		boolean firstRow = true;
		for (Map.Entry<String, Map<String, Integer>> row : mParseTable
				.entrySet()) {
			if (!firstRow) {
				out.append(",\n");
			}
			firstRow = false;

			String currName = row.getKey();
			out.append(currName).append(": {");

			boolean firstTransition = true;
			for (Map.Entry<String, Integer> transition : row.getValue()
					.entrySet()) {
				if (!firstTransition) {
					out.append(", ");
				}
				firstTransition = false;

				out.append(currName).append('-').append(transition.getKey())
						.append("->").append(transition.getValue());
			}

			out.append('}');
		}

		out.append("])");

		return out.toString();
	}

	//
	// INNER CLASS
	//
	/**
	 * One entry of the parse table: for a rule and a token, the index of the
	 * production that should be applied.
	 */
	public static class RuleSelection {
		/** Name of the rule whose parse-table row this entry belongs to. */
		private String mRuleName;

		/** Name of the token that selects the production. */
		private String mToken;

		/** Index of the selected production within the rule's productions. */
		private int mRule;

		/**
		 * Creates a parse-table entry.
		 * 
		 * @param name
		 *            the rule name this selection applies to.
		 * @param token
		 *            the token to transition on.
		 * @param rule
		 *            the index of the production to transition to.
		 */
		public RuleSelection(String name, String token, int rule) {
			this.mRule = rule;
			this.mToken = token;
			this.mRuleName = name;
		}
	}

	/**
	 * Helper class used during parsing.
	 */
	private class ParseStack extends InputStream {
		//
		// CLASS/INSTANCE DATA
		//
		/**
		 * The back-end stack that this class manages. It holds the rules that
		 * still have to be matched, with the next one to match on top.
		 */
		private Stack<Rule> mStack;

		/**
		 * The abstract syntax tree that this class builds during parsing.
		 */
		private AST<String> mResult;

		/**
		 * The reader wrapped around the stream being parsed.
		 */
		private InputStreamReader mStream;

		/**
		 * Push-back buffer, used in the case that characters need to be put
		 * back into the stream. Characters are prepended by
		 * {@code replaceCharacter} and consumed from the front by
		 * {@code getNextCharacter} before the underlying stream is read
		 * again, so they come back out in the reverse of the order in which
		 * they were put back. Once the stream is exhausted the buffer also
		 * holds a {@code (char) -1} sentinel that is never consumed.
		 */
		private String mBuffer;

		/**
		 * The character location in the stream: incremented for each
		 * character handed out and decremented for each one put back. It is
		 * used as the error offset of the parse exceptions thrown here.
		 */
		private int mLocation;

		/**
		 * The line location in the stream. It is adjusted whenever a newline
		 * is handed out or put back, but is not read anywhere in this class,
		 * hence the suppressed warning.
		 */
		@SuppressWarnings("unused")
		private int mLine;

		//
		// PUBLIC METHODS
		//
		/**
		 * Parses the given input stream and returns the abstract syntax tree.
		 * <p>
		 * The stack is seeded with an end-marker terminal below the start
		 * rule, the start rule is parsed recursively, and the remaining input
		 * is then required to be nothing but whitespace followed by the end
		 * marker.
		 * 
		 * @param stream
		 *            the stream to parse.
		 * @return the resultant tree.
		 * @throws IOException
		 *             if reading from the stream fails.
		 * @throws ParseException
		 *             if the input does not match the grammar or content is
		 *             left over after the start rule has been matched.
		 * @throws IllegalStateException
		 *             if no rule is registered under the start rule name.
		 */
		public AST<String> parse(InputStream stream) throws IOException,
				ParseException {
			// Fail early with a clear message instead of pushing null onto
			// the stack and crashing on it later.
			Rule startRule = mRules.get(mStartRule);
			if (startRule == null) {
				throw new IllegalStateException(
						"No start rule registered under the name: "
								+ mStartRule);
			}

			mStream = new InputStreamReader(stream);
			mBuffer = "";
			mStack = new Stack<Rule>();

			Rule endRule = new Rule(END);
			endRule.setTerminal(true);

			mStack.push(endRule);
			mStack.push(startRule);

			mResult = new AST<String>();

			mResult.setRoot(parse());

			// Whitespace after the last token (for example a final newline)
			// is not content; skip it before looking for the end marker,
			// just as is done in front of every other terminal.
			char next = getNextCharacter();
			while (Character.isWhitespace(next)) {
				next = getNextCharacter();
			}

			Rule r = mStack.pop();
			boolean atEnd = END.equals(Character.toString(next));

			if (atEnd && END.equals(r.getName())) {
				return mResult;
			} else if (!atEnd || mStream.ready()) {
				// A character other than the end marker is itself leftover
				// content, whether or not the reader has more buffered.
				throw new ParseException(
						"Finished parsing with content remaining in the file.",
						mLocation);
			} else if (!mStack.empty()) {
				throw new ParseException("Input ended unexpectedly.", mLocation);
			} else {
				throw new ParseException("An unexpected error occured.",
						mLocation);
			}
		}

		//
		// PRIVATE METHODS
		//
		/**
		 * Hands out the next input character, taking it from the push-back
		 * buffer when that is non-empty and from the stream otherwise.
		 * <p>
		 * When the stream reports its end, the end-marker character
		 * {@code '$'} is returned once and a {@code (char) -1} sentinel is
		 * left in the buffer; from then on the sentinel itself is returned
		 * without being consumed and without touching the counters.
		 * 
		 * @return the next character, {@code '$'} at the end of the stream,
		 *         or {@code (char) -1} once the end has been passed.
		 * @throws IOException
		 *             if reading from the stream fails.
		 * @throws ParseException
		 *             declared for callers; not thrown by this method itself.
		 */
		private char getNextCharacter() throws IOException, ParseException {
			final char sentinel = (char) -1;
			char result;

			if (mBuffer.length() == 0) {
				result = (char) mStream.read();
				if (result == sentinel) {
					// End of stream: report the end marker now and remember
					// that the stream is exhausted.
					result = '$';
					mBuffer = sentinel + mBuffer;
				}
			} else {
				result = mBuffer.charAt(0);
				if (result == sentinel) {
					return result;
				}
				mBuffer = mBuffer.substring(1);
			}

			if (result == '\n') {
				mLine++;
			}
			mLocation++;

			return result;
		}

		/**
		 * Puts a character back in front of the remaining input and rewinds
		 * the character counter (and the line counter for a newline).
		 * 
		 * @param c
		 *            the character to put back.
		 */
		private void replaceCharacter(char c) {
			if (c == '\n') {
				mLine--;
			}
			mLocation--;

			mBuffer = c + mBuffer;
		}

		/**
		 * Pops the rule on top of the stack and parses the input it
		 * describes.
		 * <p>
		 * A terminal consumes characters directly and yields a leaf node
		 * holding the matched text. A non-terminal looks up the production
		 * for the next token type, pushes the production's elements and
		 * parses each of them recursively as a child.
		 * 
		 * @return the node for the popped rule, or {@code null} when the
		 *         popped rule is the end marker.
		 * @throws IOException
		 *             if reading from the stream fails.
		 * @throws ParseException
		 *             if the input does not match the popped rule.
		 */
		private ASTNode<String> parse() throws IOException, ParseException {
			Rule currRule = mStack.pop();

			if (currRule.isTerminal()) {
				return parseTerminal(currRule.getName());
			}

			return parseNonTerminal(currRule);
		}

		/**
		 * Matches one terminal against the input, skipping leading whitespace
		 * first (epsilon consumes nothing at all).
		 */
		private ASTNode<String> parseTerminal(String currName)
				throws IOException, ParseException {
			if (currName == EPSILON) {
				return terminalNode(EPSILON);
			}

			// Remove leading whitespace.
			char next = getNextCharacter();
			while (Character.isWhitespace(next)) {
				next = getNextCharacter();
			}

			if (currName.equals(END)) {
				if (!END.equals(Character.toString(next))) {
					throw new ParseException(
							"Expected end of input. Instead got: " + next,
							mLocation - 1);
				}

				return null;
			}

			if (currName.equals(ASCII)) {
				return terminalNode(readQuoted(next, '\"',
						"Expected ASCII string enclosed with \"\"\"."));
			}

			if (currName.equals(REG_EX)) {
				String reg = readQuoted(next, '\'',
						"Expected regular expression enclosed with \"\'\".");

				// Building the DFA is only used to validate the expression;
				// the result is discarded.
				try {
					Parser.fromString(reg);
				} catch (Exception e) {
					ParseException failure = new ParseException(
							"An error occured while parsing the given regular expression: "
									+ reg, mLocation - (reg.length() + 1));
					// ParseException has no cause constructor; attach the
					// original failure so it is not lost.
					failure.initCause(e);
					throw failure;
				}

				return terminalNode(reg);
			}

			if (currName.equals(ID)) {
				return terminalNode(readIdentifier(next));
			}

			return terminalNode(readLiteral(next, currName));
		}

		/**
		 * Creates a leaf node flagged as terminal that holds the given value.
		 */
		private ASTNode<String> terminalNode(String value) {
			ASTNode<String> node = new ASTNode<String>(value);
			node.setTerminal(true);

			return node;
		}

		/**
		 * Reads the text between two occurrences of {@code quote}, where
		 * {@code first} is the already-consumed character that must be the
		 * opening quote; fails with {@code message} if it is not or if the
		 * input ends before the closing quote.
		 */
		private String readQuoted(char first, char quote, String message)
				throws IOException, ParseException {
			if (first != quote) {
				throw new ParseException(message, mLocation - 1);
			}

			StringBuilder content = new StringBuilder();
			char next;

			while ((next = getNextCharacter()) != quote
					&& next != (char) -1) {
				content.append(next);
			}

			if (next == (char) -1) {
				// Point the error back at the opening quote.
				throw new ParseException(message, mLocation
						- (content.length() + 1));
			}

			return content.toString();
		}

		/**
		 * Reads an identifier of at most ten characters that starts with the
		 * already-consumed letter {@code first}, putting back the first
		 * character that does not belong to it.
		 */
		private String readIdentifier(char first) throws IOException,
				ParseException {
			if (!Character.isLetter(first)) {
				throw new ParseException(
						"Identifiers must begin with a letter.",
						mLocation - 1);
			}

			StringBuilder id = new StringBuilder().append(first);
			char next = first;

			while ((id.length() < 10)
					&& isIdentifierPart(next = getNextCharacter())) {
				id.append(next);
			}

			// Replace a character if we determined it was not part of our
			// identifier. When the length limit ended the loop, "next" is
			// still the last accepted character and nothing is put back.
			if (!isIdentifierPart(next)) {
				replaceCharacter(next);
			}

			return id.toString();
		}

		/**
		 * Tells whether a character may follow the first letter of an
		 * identifier: a letter, a digit or an underscore.
		 */
		private boolean isIdentifierPart(char c) {
			return Character.isLetterOrDigit(c) || c == '_';
		}

		/**
		 * Reads as many characters as {@code expected} is long, starting with
		 * the already-consumed {@code first}, and requires them to equal
		 * {@code expected}.
		 */
		private String readLiteral(char first, String expected)
				throws IOException, ParseException {
			StringBuilder read = new StringBuilder().append(first);

			while (read.length() < expected.length()) {
				read.append(getNextCharacter());
			}

			String curr = read.toString();

			if (!curr.equals(expected)) {
				throw new ParseException("Unexpected token encounterd: "
						+ curr + "; Expected: " + expected, mLocation
						- curr.length());
			}

			return curr;
		}

		/**
		 * Expands a non-terminal: selects its production from the parse table
		 * by the type of the next token and parses every element of that
		 * production as a child of the returned node.
		 */
		private ASTNode<String> parseNonTerminal(Rule currRule)
				throws IOException, ParseException {
			String type = determineNextTokenType();

			// A rule without a table row is reported the same way as a row
			// without an entry for the token, rather than failing with a
			// NullPointerException.
			Map<String, Integer> row = mParseTable.get(currRule.getName());
			if (row == null || !row.containsKey(type)) {
				throw new ParseException(
						"No Grammar Rule Found for Token Type: " + type,
						mLocation);
			}

			int rule = row.get(type);
			Rule[] prodRule = currRule.getRules()[rule];

			// Push in reverse so that the first element ends up on top.
			for (int i = prodRule.length - 1; i >= 0; i--) {
				mStack.push(prodRule[i]);
			}

			ASTNode<String> ret = new ASTNode<String>(currRule.getName());
			for (int i = 0; i < prodRule.length; i++) {
				ret.insert(parse());
			}

			checkReplaceSemantics(currRule, prodRule, ret);

			return ret;
		}

		/**
		 * Applies the two hard-coded checks tied to specific rule names: a
		 * {@code <file-names>} node must not name the same source and
		 * destination, and a production starting with
		 * {@code recursivereplace} must not replace a text with itself.
		 */
		private void checkReplaceSemantics(Rule currRule, Rule[] prodRule,
				ASTNode<String> ret) throws ParseException {
			if ("<file-names>".equals(currRule.getName())) {
				// Compares the first grandchild under child 0 with the one
				// under child 2; presumably child 1 is a separator token -
				// verify against the grammar.
				String src = ret.getChildren().get(0).getChildren().get(0)
						.getValue();
				String dest = ret.getChildren().get(2).getChildren().get(0)
						.getValue();

				if (src.equals(dest)) {
					throw new ParseException(
							"Source and destination file same in Replace or Recursive Replace.",
							mLocation);
				}
			} else if (prodRule.length > 0
					&& "recursivereplace".equals(prodRule[0].getName())) {
				String regex = ret.getChildren().get(1).getValue();
				String ascii = ret.getChildren().get(3).getValue();

				if (regex.equals(ascii)) {
					throw new ParseException(
							"Replace detection and replace target are same in Recursive Replace.",
							mLocation);
				}
			}
		}

		/**
		 * Determines the type of the next token without consuming it.
		 * <p>
		 * Leading whitespace is skipped, the scanner labels the token by
		 * reading from this object, and everything the scanner reports in
		 * its buffer is then put back. When the scanner answers with several
		 * labels joined by {@code '+'}, the first label other than
		 * {@link LL1Parser#ID} is used, falling back to the first label.
		 * 
		 * @return the selected token type.
		 * @throws IOException
		 *             if reading from the stream fails.
		 * @throws ParseException
		 *             if the scanner does not recognize the token; the
		 *             scanner's exception is attached as the cause.
		 */
		private String determineNextTokenType() throws IOException,
				ParseException {
			// Remove leading whitespace.
			char next = getNextCharacter();
			while (Character.isWhitespace(next)) {
				next = getNextCharacter();
			}

			replaceCharacter(next);

			String nextToken;

			try {
				nextToken = mScanner.labelToken(this);
			} catch (ParseException ex) {
				// Restore the input before building the exception so that
				// the reported location is the start of the token.
				pushBack(mScanner.getBuffer());

				ParseException failure = new ParseException(
						"Token Unrecognized by Scanner: " + ex.getMessage(),
						mLocation);
				failure.initCause(ex);
				throw failure;
			}

			pushBack(mScanner.getBuffer());

			String[] identifiers = nextToken.split("\\+");
			if (identifiers.length > 1) {
				String chosen = identifiers[0];
				for (String s : identifiers) {
					if (!s.equals(ID)) {
						chosen = s;
						break;
					}
				}

				nextToken = chosen;
			}

			return nextToken;
		}

		/**
		 * Puts the given characters back in front of the remaining input,
		 * last character first, so that they are read again in their
		 * original order.
		 */
		private void pushBack(String consumed) {
			for (int i = consumed.length() - 1; i >= 0; i--) {
				replaceCharacter(consumed.charAt(i));
			}
		}

		/**
		 * Supplies the next character of the input as an {@code int}, so
		 * that this object can be read as an {@link java.io.InputStream}.
		 * <p>
		 * NOTE(review): the value is a {@code char} widened to {@code int},
		 * so the end of the input shows up as {@code '$'} followed by 0xFFFF
		 * rather than as -1 - confirm that the scanner expects this.
		 * 
		 * @return the next character.
		 * @throws IOException
		 *             if reading fails, or wrapping a {@link ParseException}
		 *             raised while fetching the character.
		 * @see java.io.InputStream#read()
		 */
		@Override
		public int read() throws IOException {
			final char next;

			try {
				next = getNextCharacter();
			} catch (ParseException cause) {
				throw new IOException(
						"Error while attempting to determine next token.",
						cause);
			}

			return next;
		}
	}
}