//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.gdata.util.parser;
import java.io.*;
/**
* The com.google.gdata.util.parser package provides a framework for creating recursive
* descent parsers. A fairly straightforward transformation exists between EBNF
* (extended Backus-Naur form) grammars and code used to construct a parser
* using this framework that will match the grammar. The intention of this
* package is to obviate the need to create small mini-parsers for tasks that
* don't feel large enough for a standard compiler-compiler (like JavaCC) but
* still need more formalism than simple string tokenization.
*
* The basic approach this framework takes to parsing is to define several
* types of leaf parsers which know how to parse a particular type of object
* (character set, string literal, etc.) and to then combine them together in
* interesting ways. For example, a parser for a comma separated list of
* integers would look like:
*
* Parser p = Chset.DIGIT.plus().list(new Chset(','));
*
* The EBNF this represents is:
*
* sent: [0-9]+ ("," [0-9]+)*
*
* The Parser.list() functionality is sometimes represented as the '%' operator
* in EBNF extensions. It performs the transformation:
*
* a % b --> a (b a)*
*
* The leaf parsers that are currently defined are:
*
* {@link Chset}, {@link Strlit} and {@link Strcaselit}.
*
* The operators which combine 1 or 2 parsers together are:
*
* {@link Action}, {@link Alternative}, {@link Difference},
* {@link Intersection}, {@link Repeat}, {@link Rule} and {@link Sequence}.
*
* In general, it isn't necessary to create an operator-type parser directly as
* an appropriate member function usually exists in Parser for creating
* them. Note that these are purely convenience routines.
*
* In general, the parsers are greedy. For example, the Sequence parser will
* match as much as possible with the left sub-parser before trying the right
* sub-parser. This behavior can normally be avoided by using a recursive
* grammar. Consider the following grammar:
*
* token: foo* bar
* foo: [a-z]+
* bar: foo [0-9]+
*
* This grammar will fail to parse the string "aa0" because the 'foo*' rule
* will consume all of the letters and not leave one left for the 'bar'
* rule. An alternate definition of 'token' can prevent this behavior:
*
* token: (foo token) | bar
*
* The parsers created by this parser framework use infinite lookahead. In
* extreme cases, a parser can be constructed which scans over the parse buffer
* many times trying to find a match. In practice, this doesn't happen very
* often.
*
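* @param <T> The type of the user defined object that is handed to
* <code>parse</code> and passed on to <code>Callback.handle</code> when an
* <code>Action</code> fires.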
*
*/
public abstract class Parser<T> {

  /**
   * Sentinel result used to signal that no match was found.
   */
  public static final int NO_MATCH = -1;

  /**
   * The parse interface that subclasses must implement.
   *
   * @param buf The character array to match against.
   *
   * @param start The start offset of data within the character array to match
   * against.
   *
   * @param end The end offset of data within the character array to match
   * against.
   *
   * @param udata User defined object that is passed to
   * <code>Callback.handle</code> when an <code>Action</code> fires.
   *
   * @return The outcome of the match attempt, as defined by the implementing
   * subclass. NOTE(review): presumably the number of characters matched, or
   * <code>NO_MATCH</code> on failure -- confirm against the subclasses.
   */
  public abstract int parse(char[] buf, int start, int end, T udata);

  /**
   * Convenience routine to parse a character array. Equivalent to
   * <code>parse(buf, 0, buf.length, udata)</code>.
   *
   * @param buf The character array to match against, in full.
   *
   * @param udata User defined object forwarded to the underlying parse.
   *
   * @return Whatever <code>parse(char[], int, int, T)</code> returns.
   */
  public final int parse(char[] buf, T udata) {
    return parse(buf, 0, buf.length, udata);
  }

  /**
   * Convenience routine to parse a string. The string's characters are copied
   * into a new array which is then parsed in full.
   *
   * @param str The string to match against.
   *
   * @param udata User defined object forwarded to the underlying parse.
   *
   * @return Whatever <code>parse(char[], int, int, T)</code> returns.
   */
  public final int parse(String str, T udata) {
    return parse(str.toCharArray(), udata);
  }

  /**
   * Convenience routine to parse a <code>java.io.Reader</code>. The reader is
   * drained into memory until it reports end of stream and the accumulated
   * characters are then parsed in full. The reader is not closed by this
   * method.
   *
   * @param reader The source of the characters to match against.
   *
   * @param udata User defined object forwarded to the underlying parse.
   *
   * @return Whatever <code>parse(char[], int, int, T)</code> returns for the
   * characters read, or <code>NO_MATCH</code> if reading from
   * <code>reader</code> fails with an <code>IOException</code>.
   */
  public final int parse(Reader reader, T udata) {
    CharArrayWriter writer = new CharArrayWriter();
    try {
      char[] buf = new char[1024];
      int count;
      while ((count = reader.read(buf)) >= 0) {
        writer.write(buf, 0, count);
      }
    } catch (IOException e) {
      // A partially read buffer must not be parsed as though it were the
      // complete input: a truncated prefix could match by accident. The
      // signature cannot carry a checked exception, so report a non-match.
      return NO_MATCH;
    }
    return parse(writer.toCharArray(), udata);
  }

  /**
   * Creates a <code>Repeat</code> parser that matches <code>this</code>
   * exactly <code>count</code> times.
   *
   * <code>this{count}</code>
   *
   * @param count The number of times <code>this</code> must match in sequence.
   *
   * @return A new <code>Repeat</code> wrapping <code>this</code> with both
   * bounds set to <code>count</code>.
   */
  public final Parser<T> repeat(int count) {
    return new Repeat<T>(this, count, count);
  }

  /**
   * Creates a <code>Repeat</code> parser that matches <code>this</code> at
   * least <code>min</code> times and not more than <code>max</code> times.
   *
   * <code>this{min,max}</code>
   *
   * @param min The minimum number of times <code>this</code> must match in
   * sequence.
   *
   * @param max The maximum number of times <code>this</code> is allowed to
   * match in sequence.
   *
   * @return A new <code>Repeat</code> wrapping <code>this</code>.
   */
  public final Parser<T> repeat(int min, int max) {
    return new Repeat<T>(this, min, max);
  }

  /**
   * Creates a <code>Repeat</code> parser that matches <code>this</code> 0 or
   * more times.
   *
   * <code>this*</code>
   *
   * @return A new <code>Repeat</code> wrapping <code>this</code> with a
   * minimum of 0 and no explicit maximum.
   */
  public final Parser<T> star() {
    return new Repeat<T>(this, 0);
  }

  /**
   * Creates a <code>Repeat</code> parser that matches <code>this</code> 1 or
   * more times.
   *
   * <code>this+</code>
   *
   * @return A new <code>Repeat</code> wrapping <code>this</code> with a
   * minimum of 1 and no explicit maximum.
   */
  public final Parser<T> plus() {
    return new Repeat<T>(this, 1);
  }

  /**
   * Creates a <code>Repeat</code> parser that matches <code>this</code> either
   * 0 or 1 times.
   *
   * <code>this?</code>
   *
   * @return The same parser as <code>repeat(0, 1)</code>.
   */
  public final Parser<T> optional() {
    return repeat(0, 1);
  }

  /**
   * Creates a <code>Parser</code> that matches a sequence of <code>this</code>
   * parsers separated by <code>sep</code> parsers. These sequences occur
   * often: space separated words, comma separated words, etc.
   *
   * <code>this (sep this)*</code>
   *
   * @param sep The parser which separates instances of <code>this</code>.
   *
   * @return A sequence of <code>this</code> followed by zero or more
   * repetitions of the sequence <code>sep this</code>.
   */
  public final Parser<T> list(Parser<? super T> sep) {
    return Parser.<T>sequence(this, Parser.<T>sequence(sep, this).star());
  }

  /**
   * Creates a <code>Action</code> that will fire and call
   * <code>Callback.handle</code> whenever <code>this</code> matches.
   *
   * @param <U> The user data type accepted by <code>callback</code>; must be
   * <code>T</code> or a subtype of it.
   *
   * @param callback The <code>Callback</code> to call when
   * <code>this</code> matches.
   *
   * @return A new <code>Action</code> wrapping <code>this</code> and
   * <code>callback</code>.
   */
  public final <U extends T> Parser<U> action(Callback<U> callback) {
    return new Action<T, U>(this, callback);
  }

  /**
   * Creates an <code>Alternative</code> parser from the <code>left</code> and
   * <code>right</code> sub-parsers.
   *
   * <code>left | right</code>
   *
   * @param left The left-hand sub-parser.
   *
   * @param right The right-hand sub-parser.
   *
   * @return A new <code>Alternative</code> over the two sub-parsers.
   */
  public static <T> Parser<T> alternative(Parser<? super T> left,
                                          Parser<? super T> right) {
    return new Alternative<T>(left, right);
  }

  /**
   * Creates an <code>Intersection</code> parser from the <code>left</code> and
   * <code>right</code> sub-parsers.
   *
   * <code>left &amp; right</code>
   *
   * @param left The left-hand sub-parser.
   *
   * @param right The right-hand sub-parser.
   *
   * @return A new <code>Intersection</code> over the two sub-parsers.
   */
  public static <T> Parser<T> intersection(Parser<? super T> left,
                                           Parser<? super T> right) {
    return new Intersection<T>(left, right);
  }

  /**
   * Creates a <code>Difference</code> parser from the <code>left</code> and
   * <code>right</code> sub-parsers.
   *
   * <code>left - right</code>
   *
   * @param left The left-hand sub-parser.
   *
   * @param right The right-hand sub-parser.
   *
   * @return A new <code>Difference</code> over the two sub-parsers.
   */
  public static <T> Parser<T> difference(Parser<? super T> left,
                                         Parser<? super T> right) {
    return new Difference<T>(left, right);
  }

  /**
   * Creates a <code>Sequence</code> parser from the <code>left</code> and
   * <code>right</code> sub-parsers.
   *
   * <code>left right</code>
   *
   * @param left The sub-parser for the first part of the sequence.
   *
   * @param right The sub-parser for the second part of the sequence.
   *
   * @return A new <code>Sequence</code> over the two sub-parsers.
   */
  public static <T> Parser<T> sequence(Parser<? super T> left,
                                       Parser<? super T> right) {
    return new Sequence<T>(left, right);
  }

  /**
   * Creates a <code>Sequence</code> parser from parsers <code>one</code>,
   * <code>two</code> and <code>three</code> sub-parsers. Equivalent to calling
   * Parser.sequence(one, Parser.sequence(two, three)).
   *
   * @param one The first sub-parser.
   *
   * @param two The second sub-parser.
   *
   * @param three The third sub-parser.
   *
   * @return The right-nested sequence of the three sub-parsers.
   */
  public static <T> Parser<T> sequence(Parser<? super T> one,
                                       Parser<? super T> two,
                                       Parser<? super T> three) {
    return Parser.<T>sequence(one, Parser.<T>sequence(two, three));
  }

  /**
   * Creates a sequence of four parsers, nested to the right in the same way
   * as the three-parser overload.
   *
   * @param one The first sub-parser.
   *
   * @param two The second sub-parser.
   *
   * @param three The third sub-parser.
   *
   * @param four The fourth sub-parser.
   *
   * @return The right-nested sequence of the four sub-parsers.
   *
   * @see #sequence(Parser,Parser,Parser)
   */
  public static <T> Parser<T> sequence(Parser<? super T> one,
                                       Parser<? super T> two,
                                       Parser<? super T> three,
                                       Parser<? super T> four) {
    return Parser.<T>sequence(
        one, Parser.<T>sequence(
            two, Parser.<T>sequence(three, four)));
  }

  /**
   * Creates a sequence of five parsers, nested to the right in the same way
   * as the three-parser overload.
   *
   * @param one The first sub-parser.
   *
   * @param two The second sub-parser.
   *
   * @param three The third sub-parser.
   *
   * @param four The fourth sub-parser.
   *
   * @param five The fifth sub-parser.
   *
   * @return The right-nested sequence of the five sub-parsers.
   *
   * @see #sequence(Parser,Parser,Parser)
   */
  public static <T> Parser<T> sequence(Parser<? super T> one,
                                       Parser<? super T> two,
                                       Parser<? super T> three,
                                       Parser<? super T> four,
                                       Parser<? super T> five) {
    return Parser.<T>sequence(
        one, Parser.<T>sequence(
            two, Parser.<T>sequence(
                three, Parser.<T>sequence(four, five))));
  }
}