// Parser.java example
//
// Explorer
// craken-master
/*****************************************************************************
 * Copyright (C) Codehaus.org                                                *
 * ------------------------------------------------------------------------- *
 * Licensed under the Apache License, Version 2.0 (the "License");           *
 * you may not use this file except in compliance with the License.          *
 * You may obtain a copy of the License at                                   *
 *                                                                           *
 * http://www.apache.org/licenses/LICENSE-2.0                                *
 *                                                                           *
 * Unless required by applicable law or agreed to in writing, software       *
 * distributed under the License is distributed on an "AS IS" BASIS,         *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
 * See the License for the specific language governing permissions and       *
 * limitations under the License.                                            *
 *****************************************************************************/
package net.ion.rosetta;

import static net.ion.rosetta.util.Checks.checkArgument;

import java.io.IOException;
import java.nio.CharBuffer;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;

import net.ion.rosetta.annotations.Private;
import net.ion.rosetta.error.ParserException;
import net.ion.rosetta.functors.Map;
import net.ion.rosetta.functors.Map2;
import net.ion.rosetta.functors.Maps;
import net.ion.rosetta.util.Checks;

/**
 * Defines grammar and encapsulates parsing logic. A {@link Parser} takes as input a {@link CharSequence} source and parses it when the {@link #parse(CharSequence)} method is called. A value of type {@code T} will be returned if parsing succeeds, or a {@link ParserException} is thrown to indicate parsing error. For example:
 * 
 * <pre>{@code
 * Parser<String> scanner = Scanners.IDENTIFIER;
 * assertEquals("foo", scanner.parse("foo"));
 * }</pre>
 * 
 * <p>
 * {@code Parser}s are immutable and inherently covariant on the type parameter {@code T}. Because Java generics have no native support for covariant type parameters, a workaround is to use the {@link Parser#cast()} method to explicitly force covariance whenever needed.
 * 
 * <p>
 * {@code Parser}s run either on character level to scan the source, or on token level to parse a list of {@link Token} objects returned from another parser. This other parser that returns the list of tokens for token level parsing is hooked up via the {@link #from(Parser)} or {@link #from(Parser, Parser)} method.
 * 
 * <p>
 * The following are important naming conventions used throughout the library:
 * <ul>
 * <li>A character level parser object that recognizes a single lexical word is called a scanner.
 * <li>A scanner that translates the recognized lexical word into a token is called a tokenizer.
 * <li>A character level parser object that does lexical analysis and returns a list of {@link Token} is called a lexer.
 * <li>All {@code index} parameters are 0-based indexes in the original source.
 * </ul>
 * 
 * @author Ben Yu
 */
public abstract class Parser<T> {

	Parser() {
	}

	/**
	 * An atomic mutable reference to {@link Parser}. Is useful to work around circular dependency between parser objects.
	 * 
	 * <p>
	 * Example usage:
	 * 
	 * <pre>
	 * Parser.Reference<Foo> ref = Parser.newReference();
	 * ...
	 * Parser<Bar> barParser = barParser(ref.lazy());
	 * Parser<Foo> fooParser = fooParser(barParser);
	 * ref.set(fooParser);
	 * </pre>
	 */
	public static final class Reference<T> extends AtomicReference<Parser<T>> {
		private static final long serialVersionUID = -8778697271614979497L;

		private final Parser<T> lazy = new LazyParser<T>(this);

		/**
		 * A {@link Parser} that delegates to the parser object referenced by {@code this} during parsing time.
		 */
		public Parser<T> lazy() {
			return lazy;
		}
	}

	/** Creates a new instance of {@link Reference}. */
	public static <T> Reference<T> newReference() {
		return new Reference<T>();
	}

	/**
	 * A {@link Parser} that executes {@code this}, and returns {@code value} if succeeds.
	 */
	public final <R> Parser<R> retn(R value) {
		return next(Parsers.constant(value));
	}

	/**
	 * A {@link Parser} that sequentially executes {@code this} and then {@code parser}. The return value of {@code parser} is preserved.
	 */
	public final <R> Parser<R> next(Parser<R> parser) {
		return Parsers.sequence(this, parser);
	}

	/**
	 * A {@link Parser} that executes {@code this}, maps the result using {@code map} to another {@code Parser} object to be executed as the next step.
	 */
	public final <To> Parser<To> next(Map<? super T, ? extends Parser<? extends To>> map) {
		return new BindNextParser<T, To>(this, map);
	}

	/**
	 * A {@link Parser} that sequentially executes {@code this} and then {@code parser}, whose return value is ignored.
	 */
	public final Parser<T> followedBy(Parser<?> parser) {
		return Parsers.sequence(this, parser, InternalFunctors.<T, Object> firstOfTwo());
	}

	/**
	 * A {@link Parser} that succeeds if {@code this} succeeds and the pattern recognized by {@code parser} isn't following.
	 */
	public final Parser<T> notFollowedBy(Parser<?> parser) {
		return followedBy(parser.not());
	}

	/**
	 * {@code p.many()} is equivalent to {@code p*} in EBNF. The return values are collected and returned in a {@link List}.
	 */
	public final Parser<List<T>> many() {
		return atLeast(0);
	}

	/**
	 * {@code p.skipMany()} is equivalent to {@code p*} in EBNF. The return values are discarded.
	 */
	public final Parser<Void> skipMany() {
		return skipAtLeast(0);
	}

	/**
	 * {@code p.many1()} is equivalent to {@code p+} in EBNF. The return values are collected and returned in a {@link List}.
	 */
	public final Parser<List<T>> many1() {
		return atLeast(1);
	}

	/**
	 * {@code p.skipMany1()} is equivalent to {@code p+} in EBNF. The return values are discarded.
	 */
	public final Parser<Void> skipMany1() {
		return skipAtLeast(1);
	}

	/**
	 * A {@link Parser} that runs {@code this} parser greedily for at least {@code min} times. The return values are collected and returned in a {@link List}.
	 */
	public final Parser<List<T>> atLeast(int min) {
		return new RepeatAtLeastParser<T>(this, Checks.checkMin(min));
	}

	/**
	 * A {@link Parser} that runs {@code this} parser greedily for at least {@code min} times and ignores the return values.
	 */
	public final Parser<Void> skipAtLeast(int min) {
		return new SkipAtLeastParser(this, Checks.checkMin(min));
	}

	/**
	 * A {@link Parser} that sequentially runs {@code this} for {@code n} times and ignores the return values.
	 */
	public final Parser<Void> skipTimes(int n) {
		return skipTimes(n, n);
	}

	/**
	 * A {@link Parser} that runs {@code this} for {@code n} times and collects the return values in a {@link List}.
	 */
	public final Parser<List<T>> times(int n) {
		return times(n, n);
	}

	/**
	 * A {@link Parser} that runs {@code this} parser for at least {@code min} times and up to {@code max} times. The return values are collected and returned in {@link List}.
	 */
	public final Parser<List<T>> times(int min, int max) {
		Checks.checkMinMax(min, max);
		return new RepeatTimesParser<T>(this, min, max);
	}

	/**
	 * A {@link Parser} that runs {@code this} parser for at least {@code min} times and up to {@code max} times, with all the return values ignored.
	 */
	public final Parser<Void> skipTimes(int min, int max) {
		Checks.checkMinMax(min, max);
		return new SkipTimesParser(this, min, max);
	}

	/**
	 * A {@link Parser} that runs {@code this} parser and transforms the return value using {@code map}.
	 */
	public final <R> Parser<R> map(Map<? super T, ? extends R> map) {
		return new MapParser<T, R>(this, map);
	}

	/**
	 * {@code p1.or(p2)} is equivalent to {@code p1 | p2} in EBNF.
	 * 
	 * @param alternative
	 *            the alternative parser to run if this fails.
	 */
	@SuppressWarnings("unchecked")
	public final Parser<T> or(Parser<? extends T> alternative) {
		return Parsers.or(this, alternative);
	}

	/**
	 * {@code p.optional()} is equivalent to {@code p?} in EBNF. {@code null} is the result when {@code this} fails with no partial match.
	 */
	public final Parser<T> optional() {
		return Parsers.plus(this, Parsers.<T> always());
	}

	/**
	 * A {@link Parser} that returns {@code defaultValue} if {@code this} fails with no partial match.
	 */
	public final Parser<T> optional(T defaultValue) {
		return Parsers.plus(this, Parsers.constant(defaultValue));
	}

	/**
	 * A {@link Parser} that fails if {@code this} succeeds. Any input consumption is undone.
	 */
	public final Parser<?> not() {
		return not(toString());
	}

	/**
	 * A {@link Parser} that fails if {@code this} succeeds. Any input consumption is undone.
	 * 
	 * @param unexpected
	 *            the name of what we don't expect.
	 */
	public final Parser<?> not(String unexpected) {
		return peek().ifelse(Parsers.unexpected(unexpected), Parsers.always());
	}

	/**
	 * A {@link Parser} that runs {@code this} and undoes any input consumption if succeeds.
	 */
	public final Parser<T> peek() {
		return new PeekParser<T>(this);
	}

	/** A {@link Parser} that undoes any partial match if {@code this} fails. */
	public final Parser<T> atomic() {
		return new AtomicParser<T>(this);
	}

	/**
	 * A {@link Parser} that runs {@code this} parser and sets the number of logical steps explicitly to {@code n}.
	 */
	final Parser<T> step(int n) {
		checkArgument(n >= 0, "step < 0");
		return new StepParser<T>(this, n);
	}

	/**
	 * A {@link Parser} that returns {@code true} if {@code this} succeeds, {@code false} otherwise.
	 */
	public final Parser<Boolean> succeeds() {
		return ifelse(Parsers.TRUE, Parsers.FALSE);
	}

	/**
	 * A {@link Parser} that returns {@code true} if {@code this} fails, {@code false} otherwise.
	 */
	public final Parser<Boolean> fails() {
		return ifelse(Parsers.FALSE, Parsers.TRUE);
	}

	/**
	 * A {@link Parser} that runs {@code consequence} if {@code this} succeeds, or {@code alternative} otherwise.
	 */
	public final <R> Parser<R> ifelse(Parser<? extends R> consequence, Parser<? extends R> alternative) {
		return ifelse(Maps.constant(consequence), alternative);
	}

	/**
	 * A {@link Parser} that runs {@code consequence} if {@code this} succeeds, or {@code alternative} otherwise.
	 */
	public final <R> Parser<R> ifelse(Map<? super T, ? extends Parser<? extends R>> consequence, Parser<? extends R> alternative) {
		return new IfElseParser<R, T>(this, consequence, alternative);
	}

	/**
	 * A {@link Parser} that reports reports an error about {@code name} expected, if {@code this} fails with no partial match.
	 */
	public final Parser<T> label(String name) {
		return Parsers.plus(this, Parsers.<T> expect(name));
	}

	/**
	 * Casts {@code this} to a {@link Parser} of type {@code R}. Use it only if you know the parser actually returns value of type {@code R}.
	 */
	@SuppressWarnings("unchecked")
	public final <R> Parser<R> cast() {
		return (Parser<R>) this;
	}

	/**
	 * A {@link Parser} that runs {@code this} between {@code before} and {@code after}. The return value of {@code this} is preserved.
	 * 
	 * <p>
	 * Equivalent to {@link Parsers#between(Parser, Parser, Parser)}, which preserves the natural order of the parsers in the argument list, but is a bit more verbose.
	 */
	public final Parser<T> between(Parser<?> before, Parser<?> after) {
		return before.next(followedBy(after));
	}

	/**
	 * A {@link Parser} that runs {@code this} 1 or more times separated by {@code delim}.
	 * 
	 * <p>
	 * The return values are collected in a {@link List}.
	 */
	public final Parser<List<T>> sepBy1(Parser<?> delim) {
		final Parser<T> afterFirst = delim.step(0).next(this);
		Map<T, Parser<List<T>>> binder = new Map<T, Parser<List<T>>>() {
			public Parser<List<T>> map(T firstValue) {
				return new RepeatAtLeastParser<T>(afterFirst, 0, ListFactories.arrayListFactoryWithFirstElement(firstValue));
			}
		};
		return next(binder);
	}

	/**
	 * A {@link Parser} that runs {@code this} 0 or more times separated by {@code delim}.
	 * 
	 * <p>
	 * The return values are collected in a {@link List}.
	 */
	public final Parser<List<T>> sepBy(Parser<?> delim) {
		return Parsers.plus(sepBy1(delim), EmptyListParser.<T> instance());
	}

	/**
	 * A {@link Parser} that runs {@code this} for 0 or more times delimited and terminated by {@code delim}.
	 * 
	 * <p>
	 * The return values are collected in a {@link List}.
	 */
	public final Parser<List<T>> endBy(Parser<?> delim) {
		return followedBy(delim).many();
	}

	/**
	 * A {@link Parser} that runs {@code this} for 1 or more times delimited and terminated by {@code delim}.
	 * 
	 * <p>
	 * The return values are collected in a {@link List}.
	 */
	public final Parser<List<T>> endBy1(Parser<?> delim) {
		return followedBy(delim).many1();
	}

	/**
	 * A {@link Parser} that runs {@code this} for 1 ore more times separated and optionally terminated by {@code delim}. For example: {@code "foo;foo;foo"} and {@code "foo;foo;"} both matches {@code foo.sepEndBy1(semicolon)}.
	 * 
	 * <p>
	 * The return values are collected in a {@link List}.
	 */
	public final Parser<List<T>> sepEndBy1(final Parser<?> delim) {
		return next(new Map<T, Parser<List<T>>>() {
			public Parser<List<T>> map(T first) {
				return new DelimitedListParser<T>(Parser.this, delim, ListFactories.arrayListFactoryWithFirstElement(first));
			}
		});
	}

	/**
	 * A {@link Parser} that runs {@code this} for 0 ore more times separated and optionally terminated by {@code delim}. For example: {@code "foo;foo;foo"} and {@code "foo;foo;"} both matches {@code foo.sepEndBy(semicolon)}.
	 * 
	 * <p>
	 * The return values are collected in a {@link List}.
	 */
	public final Parser<List<T>> sepEndBy(Parser<?> delim) {
		return Parsers.plus(sepEndBy1(delim), EmptyListParser.<T> instance());
	}

	/**
	 * A {@link Parser} that runs {@code op} for 0 or more times greedily, then runs {@code this}. The {@link Map} objects returned from {@code op} are applied from right to left to the return value of {@code p}.
	 * 
	 * <p>
	 * {@code p.prefix(op)} is equivalent to {@code op* p} in EBNF.
	 */
	@SuppressWarnings("unchecked")
	public final Parser<T> prefix(Parser<? extends Map<? super T, ? extends T>> op) {
		return Parsers.sequence(op.many(), this, Parsers.PREFIX_OPERATOR_MAP2);
	}

	/**
	 * A {@link Parser} that runs {@code this} and then runs {@code op} for 0 or more times greedily. The {@link Map} objects returned from {@code op} are applied from left to right to the return value of p.
	 * 
	 * <p>
	 * {@code p.postfix(op)} is equivalent to {@code p op*} in EBNF.
	 */
	@SuppressWarnings("unchecked")
	public final Parser<T> postfix(Parser<? extends Map<? super T, ? extends T>> op) {
		return Parsers.sequence(this, op.many(), Parsers.POSTFIX_OPERATOR_MAP2);
	}

	/**
	 * A {@link Parser} that parses non-associative infix operator. Runs {@code this} for the left operand, and then runs {@code op} and {@code this} for the operator and the right operand optionally. The {@link Map2} objects returned from {@code op} are applied to the return values of the two operands, if any.
	 * 
	 * <p>
	 * {@code p.infixn(op)} is equivalent to {@code p (op p)?} in EBNF.
	 */
	public final Parser<T> infixn(Parser<? extends Map2<? super T, ? super T, ? extends T>> op) {
		return Parsers.infixn(this, op);
	}

	/**
	 * A {@link Parser} for left-associative infix operator. Runs {@code this} for the left operand, and then runs {@code op} and {@code this} for the operator and the right operand for 0 or more times greedily. The {@link Map2} objects returned from {@code op} are applied from left to right to the return values of {@code this}, if any. For example: {@code a + b + c + d} is evaluated as
	 * {@code (((a + b)+c)+d)}.
	 * 
	 * <p>
	 * {@code p.infixl(op)} is equivalent to {@code p (op p)*} in EBNF.
	 */
	public final Parser<T> infixl(Parser<? extends Map2<? super T, ? super T, ? extends T>> op) {
		// somehow generics doesn't work if we inline the code here.
		return Parsers.infixl(this, op);
	}

	/**
	 * A {@link Parser} for right-associative infix operator. Runs {@code this} for the left operand, and then runs {@code op} and {@code this} for the operator and the right operand for 0 or more times greedily. The {@link Map2} objects returned from {@code op} are applied from right to left to the return values of {@code this}, if any. For example: {@code a + b + c + d} is evaluated as
	 * {@code a + (b + (c + d))}.
	 * 
	 * <p>
	 * {@code p.infixr(op)} is equivalent to {@code p (op p)*} in EBNF.
	 */
	public final Parser<T> infixr(Parser<? extends Map2<? super T, ? super T, ? extends T>> op) {
		return Parsers.infixr(this, op);
	}

	/**
	 * A {@link Parser} that runs {@code this} and wraps the return value in a {@link Token}.
	 * 
	 * <p>
	 * It is normally not necessary to call this method explicitly. {@link #lexer(Parser)} and {@link #from(Parser, Parser)} both do the conversion automatically.
	 */
	public final Parser<Token> token() {
		return new ToTokenParser(this);
	}

	/** A {@link Parser} that returns the matched string in the original source. */
	public final Parser<String> source() {
		return new ReturnSourceParser(this);
	}

	/**
	 * A {@link Parser} that takes as input the {@link Token} collection returned by {@code lexer}, and runs {@code this} to parse the tokens.
	 * 
	 * <p>
	 * {@code this} must be a token level parser.
	 */
	public final Parser<T> from(Parser<? extends Collection<Token>> lexer) {
		return Parsers.nested(Parsers.tokens(lexer), followedBy(Parsers.EOF));
	}

	/**
	 * A {@link Parser} that takes as input the tokens returned by {@code tokenizer} delimited by {@code delim}, and runs {@code this} to parse the tokens.
	 * 
	 * <p>
	 * {@code this} must be a token level parser.
	 */
	public final Parser<T> from(Parser<?> tokenizer, Parser<Void> delim) {
		return from(tokenizer.lexer(delim));
	}

	/**
	 * A {@link Parser} that greedily runs {@code this} repeatedly, and ignores the pattern recognized by {@code delim} before and after each occurrence. The result tokens are wrapped in {@link Token} and are collected and returned in a {@link List}.
	 * 
	 * <p>
	 * It is normally not necessary to call this method explicitly. {@link #from(Parser, Parser)} is more convenient for simple uses that just need to connect a token level parser with a lexer that produces the tokens. When more flexible control over the token list is needed, for example, to parse indentation sensitive language, a pre-processor of the token list may be needed.
	 * 
	 * <p>
	 * {@code this} must be a tokenizer that returns a token value.
	 */
	public Parser<List<Token>> lexer(Parser<?> delim) {
		return delim.optional().next(token().sepEndBy(delim));
	}

	/**
	 * Parses a source string.
	 * 
	 * @param source
	 *            the source string
	 * @param moduleName
	 *            the name of the module, this name appears in error message
	 * @param sourceLocator
	 *            maps an index of char into line and column numbers
	 * @return the result
	 */
	final T parse(CharSequence source, String moduleName, SourceLocator sourceLocator) {
		return Parsers.parse(source, followedBy(Parsers.EOF), sourceLocator, moduleName);
	}

	/**
	 * Parses {@code source}.
	 * 
	 * @param source
	 *            the source string
	 * @param moduleName
	 *            the name of the module, this name appears in error message
	 * @return the result
	 */
	public final T parse(CharSequence source, String moduleName) {
		return parse(source, moduleName, new DefaultSourceLocator(source));
	}

	/** Parses {@code source}. */
	public final T parse(CharSequence source) {
		return parse(source, null);
	}

	/** Parses source read from {@code readable}. */
	public final T parse(Readable readable) throws IOException {
		return parse(readable, null);
	}

	/**
	 * Parses source read from {@code readable}.
	 * 
	 * @param readable
	 *            where the source is read from
	 * @param moduleName
	 *            the name of the module, this name appears in error message
	 * @return the result
	 */
	public final T parse(Readable readable, String moduleName) throws IOException {
		StringBuilder builder = new StringBuilder();
		copy(readable, builder);
		return parse(builder, moduleName);
	}

	/** Copies all content from {@code from} to {@code to}. */
	@Private
	static void copy(Readable from, Appendable to) throws IOException {
		CharBuffer buf = CharBuffer.allocate(2048);
		for (;;) {
			int r = from.read(buf);
			if (r == -1)
				break;
			buf.flip();
			to.append(buf, 0, r);
		}
	}

	@SuppressWarnings("unchecked")
	final T getReturn(ParseContext ctxt) {
		return (T) ctxt.result;
	}

	private ParserException asParserException(Throwable e, ParseContext ctxt) {
		if (e instanceof ParserException)
			return (ParserException) e;
		return new ParserException(e, null, ctxt.module, ctxt.locator.locate(ctxt.getIndex()));
	}

	final boolean run(ParseContext ctxt) {
		try {
			return apply(ctxt);
		} catch (RuntimeException e) {
			throw asParserException(e, ctxt);
		}
	}

	abstract boolean apply(ParseContext ctxt);
}