/* Generated By:JavaCC: Do not edit this line. FreeformQueryParser.java */
package org.cdlib.xtf.textEngine.freeform;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
/**
* A grammar-based parser for "freeform queries", constructed with JavaCC.
*
* Designed to parse a query language similar to that of "gaggle", a little
* query language used at CDL, which is in turn designed to act much like
* Google.
*
* <p> Uses a tokenizer that should be good for most European-language queries.
*/
@SuppressWarnings("unused")
public class FreeformQueryParser implements FreeformQueryParserConstants {
/**
* Simple command-line test driver. Reads queries from standard input until a
* blank line is entered, printing the parsed XML (or a parse error) for each.
*/
public static void main(String[] args) throws IOException
{
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
while (true)
{
System.out.print("Enter query: ");
String text = in.readLine();
if (text == null || text.length() == 0)
break;
FreeformQueryParser parser = new FreeformQueryParser(new StringReader(text));
try {
FNode query = parser.Query();
System.out.println(query.toXML());
}
catch (ParseException e) {
System.out.println("Parse error: " + e);
}
}
}
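/*
 * Illustrative example (not part of the generated parser; assumes the token
 * manager classifies plain words as BASIC tokens): for the input
 *
 *   cat dog
 *
 * Query() should produce XML along these lines:
 *
 *   <query>
 *     <and field="serverChoice">
 *       <term>cat</term>
 *       <term>dog</term>
 *     </and>
 *   </query>
 */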
/**
* The result of a parse. A very simple hierarchical structure, basically
* mirroring the XML that would be generated for an XTF query.
*/
public class FNode
{
public String name; // Name of the element, such as "query", "and", "term", etc.
public String text; // Text of a term element
public String field; // Field name; null if the field is carried by the parent; "serverChoice" if none was given
public ArrayList<FNode> children = new ArrayList<FNode>(); // Sub-elements
/** Package-private constructor */
FNode(String n) { name = n; }
/** Package-private constructor */
FNode(String n, String t) { name = n; text = t; }
/** Generate XML for this node and its descendants. */
public String toXML()
{
StringBuffer buf = new StringBuffer();
toXML(0, buf);
return buf.toString();
}
/** Workhorse XML generator */
private void toXML(int level, StringBuffer buf)
{
buf.append(indent(level) + "<" + name);
if (field != null)
buf.append(" field=\"" + field + "\"");
if (text != null && children.isEmpty())
buf.append(">" + text + "</" + name + ">\n");
else
{
buf.append(">\n");
level++;
if (text != null)
buf.append(indent(level) + text + "\n");
for (FNode kid : children)
kid.toXML(level, buf);
--level;
buf.append(indent(level) + "</" + name + ">\n");
}
}
/** Convert the query to something more compact than XML */
public String toString()
{
StringBuffer buf = new StringBuffer();
buf.append(name + "(");
if (field != null)
buf.append(field + ": ");
if (text != null)
buf.append("\"" + text + "\"");
boolean first = true;
for (FNode kid : children) {
if (!first)
buf.append(", ");
first = false;
buf.append(kid.toString());
}
buf.append(")");
return buf.toString();
}
/** Return a string with two spaces per level, used for indenting XML. */
private String indent(int level)
{
StringBuffer buf = new StringBuffer();
for (int i=0; i<level; i++)
buf.append("  ");
return buf.toString();
}
/** Add a child to this node */
private void add(FNode n)
{
children.add(n);
}
/** If we only have one child, return it. Else, return 'this'. */
private FNode promoteSingle()
{
if (children.size() == 1)
return children.get(0);
return this;
}
/** Clear the 'field' on this node and all descendants */
private void clearFields()
{
field = null;
for (FNode kid : children)
kid.clearFields();
}
/**
* Carry field identifiers to the right. If all fields at one level are
* the same, move them up to the parent.
*/
private void resolveFields(int level)
{
String f = null;
// If a field is specified on the parent, ignore specs on children
if (this.field != null) {
for (FNode kid : children)
kid.clearFields();
}
else
{
// Propagate field names to the right like Google does
for (FNode kid : children) {
if (kid.field != null)
f = kid.field;
else if (f != null)
kid.field = f;
}
// If any kid has a field specifier, force all of them to.
if (f != null)
{
// If all kids have the same field name, propagate it up.
boolean anyDiff = false;
for (FNode kid : children) {
if (!f.equals(kid.field))
anyDiff = true;
}
if (!anyDiff) {
for (FNode kid : children)
kid.field = null;
this.field = f;
}
// Otherwise, assign "serverChoice" to kids that don't have a field.
else {
for (FNode kid : children) {
if (kid.field == null)
kid.field = "serverChoice";
}
}
}
// Recursively process descendants
for (FNode kid : children)
kid.resolveFields(level+1);
}
// If no fields anywhere, assign one at the top level.
if (level == 0 && this.field == null && f == null)
field = "serverChoice";
}
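/*
 * Illustrative example (assuming "title" and the terms tokenize as BASIC):
 * for "title:hamlet macbeth", the field "title" first propagates right onto
 * "macbeth"; since both terms then carry the same field, it is lifted onto
 * the enclosing node, e.g. and(title: term("hamlet"), term("macbeth")) in
 * toString() notation.
 */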
/**
* In XTF, "not" is always implemented as AND-NOT. So make sure that
* every NOT is part of an AND, if necessary pairing it with an
* &lt;allDocs&gt; query.
*/
private void fixNots()
{
// Recursively fix nots below here
for (FNode kid : children)
kid.fixNots();
// Now scan for unmatched nots at this level
for (int i = 0; i < children.size(); i++)
{
FNode kid = children.get(i);
if (!kid.name.equals("not"))
continue;
// If the parent isn't an "and", change it.
if (!name.equals("and") && !name.equals("query"))
name = "and";
// Within an AND, we check if there's anything else (positive)
// with the same field.
//
boolean found = false;
for (FNode k2 : children) {
if (k2 == kid || k2.name.equals("not"))
continue;
// Null-safe field comparison (field may legitimately be null)
if (k2.field == null ? kid.field == null : k2.field.equals(kid.field))
found = true;
}
// If nothing to match against, add something.
if (!found) {
FNode all = new FNode("allDocs");
FNode and = new FNode("and");
and.add(all);
and.add(kid);
children.set(i, and);
}
}
}
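/*
 * Illustrative example: a bare "not cat" has nothing positive to subtract
 * from, so fixNots() pairs it with an allDocs query, yielding roughly
 * query(and(allDocs(), not(serverChoice: term("cat")))) in toString() notation.
 */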
}
/*****************************************************************************
* Parser begins here. The grammar builds from the bottom up, beginning with
* a Term, followed by things that use Term, etc. The root of the grammar
* is Query, at the very end.
****************************************************************************/
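/*
 * Production hierarchy, from the root down:
 *   Query -> SubQuery -> ANDs -> ORs -> Component -> Term | Phrase | ParenSeq | Not
 * (ParenSeq recurses back into SubQuery, and Not recurses back into Component.)
 */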
/**
* In general a term is just a single word. But it can also be an email
* address, symbol, number, etc.
*/
final public FNode Term() throws ParseException {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case BASIC:
jj_consume_token(BASIC);
break;
case APOSTROPHE:
jj_consume_token(APOSTROPHE);
break;
case ACRONYM:
jj_consume_token(ACRONYM);
break;
case COMPANY:
jj_consume_token(COMPANY);
break;
case EMAIL:
jj_consume_token(EMAIL);
break;
case HOST:
jj_consume_token(HOST);
break;
case NUM:
jj_consume_token(NUM);
break;
case SYMBOL:
jj_consume_token(SYMBOL);
break;
case CJK:
jj_consume_token(CJK);
break;
default:
jj_la1[0] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
{if (true) return new FNode("term", token.image);}
throw new Error("Missing return statement in function");
}
/**
* A phrase is a quoted string of terms (but we also take care not to barf on
* reserved words).
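* <p> Operator tokens (AND, OR, NOT, "+", ":", and parentheses) that appear
* inside the quotes are consumed and discarded; only term tokens become
* children of the phrase.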
*/
final public FNode Phrase() throws ParseException {
FNode phrase = new FNode("phrase");
FNode term;
jj_consume_token(QUOTE);
label_1:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case AND:
case OR:
case NOT:
case PLUS:
case COLON:
case OPEN_PAREN:
case CLOSE_PAREN:
case BASIC:
case APOSTROPHE:
case ACRONYM:
case COMPANY:
case EMAIL:
case HOST:
case NUM:
case SYMBOL:
case CJK:
;
break;
default:
jj_la1[1] = jj_gen;
break label_1;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case BASIC:
case APOSTROPHE:
case ACRONYM:
case COMPANY:
case EMAIL:
case HOST:
case NUM:
case SYMBOL:
case CJK:
term = Term();
phrase.add(term);
break;
case AND:
case OR:
case NOT:
case PLUS:
case COLON:
case OPEN_PAREN:
case CLOSE_PAREN:
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case AND:
jj_consume_token(AND);
break;
case OR:
jj_consume_token(OR);
break;
case NOT:
jj_consume_token(NOT);
break;
case PLUS:
jj_consume_token(PLUS);
break;
case COLON:
jj_consume_token(COLON);
break;
case OPEN_PAREN:
jj_consume_token(OPEN_PAREN);
break;
case CLOSE_PAREN:
jj_consume_token(CLOSE_PAREN);
break;
default:
jj_la1[2] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
break;
default:
jj_la1[3] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
jj_consume_token(QUOTE);
{if (true) return phrase;}
throw new Error("Missing return statement in function");
}
/**
* You can stick "not" in front of something to negate it. There is post-
* processing in the Query() production (at the end) to guarantee that each
* NOT is actually part of an AND-NOT.
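* <p> A doubled "not" cancels itself out (handled explicitly below).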
*/
final public FNode Not() throws ParseException {
FNode node;
FNode kid;
jj_consume_token(NOT);
kid = Component();
// Handle double-not
if (kid.name.equals("not")) {
assert kid.children.size() == 1;
{if (true) return kid.children.get(0);}
}
node = new FNode("not");
node.add(kid);
{if (true) return node;}
throw new Error("Missing return statement in function");
}
/**
* We allow parenthesized sub-expressions for grouping
*/
final public FNode ParenSeq() throws ParseException {
FNode node;
jj_consume_token(OPEN_PAREN);
node = SubQuery();
jj_consume_token(CLOSE_PAREN);
{if (true) return node;}
throw new Error("Missing return statement in function");
}
/**
* A component of a query is a phrase, term, parenthesized sequence, or a
* "not" clause. It can be preceded by an optional field specification.
*/
final public FNode Component() throws ParseException {
String field = null;
FNode node;
label_2:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case PLUS:
;
break;
default:
jj_la1[4] = jj_gen;
break label_2;
}
jj_consume_token(PLUS);
}
label_3:
while (true) {
if (jj_2_1(2)) {
;
} else {
break label_3;
}
node = Term();
jj_consume_token(COLON);
field = node.text;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case QUOTE:
node = Phrase();
break;
case BASIC:
case APOSTROPHE:
case ACRONYM:
case COMPANY:
case EMAIL:
case HOST:
case NUM:
case SYMBOL:
case CJK:
node = Term();
break;
case OPEN_PAREN:
node = ParenSeq();
break;
case NOT:
node = Not();
break;
default:
jj_la1[5] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
if (field != null)
node.field = field;
{if (true) return node;}
throw new Error("Missing return statement in function");
}
/**
* A sequence of components, separated by "OR" or "|"
*/
final public FNode ORs() throws ParseException {
FNode node = new FNode("or");
FNode kid;
kid = Component();
node.add(kid);
label_4:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case OR:
;
break;
default:
jj_la1[6] = jj_gen;
break label_4;
}
jj_consume_token(OR);
kid = Component();
node.add(kid);
}
{if (true) return node.promoteSingle();}
throw new Error("Missing return statement in function");
}
/**
* A sequence of OR-groups (optionally separated by "AND" or "&amp;") is AND-ed together.
* As in Google, "AND" binds more loosely than "OR", so that A AND B OR C should
* be grouped like this: A AND (B OR C).
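* <p> For instance, "cat AND dog OR mouse" should parse as and(cat, or(dog, mouse)).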
*/
final public FNode ANDs() throws ParseException {
FNode node = new FNode("and");
FNode kid;
kid = ORs();
node.add(kid);
label_5:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case AND:
case NOT:
case PLUS:
case QUOTE:
case OPEN_PAREN:
case BASIC:
case APOSTROPHE:
case ACRONYM:
case COMPANY:
case EMAIL:
case HOST:
case NUM:
case SYMBOL:
case CJK:
;
break;
default:
jj_la1[7] = jj_gen;
break label_5;
}
label_6:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case AND:
;
break;
default:
jj_la1[8] = jj_gen;
break label_6;
}
jj_consume_token(AND);
}
kid = ORs();
node.add(kid);
}
{if (true) return node.promoteSingle();}
throw new Error("Missing return statement in function");
}
/**
* A single sub-query (can be contained in a paren expr)
*/
final public FNode SubQuery() throws ParseException {
FNode node;
node = ANDs();
{if (true) return node;}
throw new Error("Missing return statement in function");
}
/**
* The entire query, which consists of a single sub-query. We apply additional
* processing to ensure proper structure.
*/
final public FNode Query() throws ParseException {
FNode sub;
sub = SubQuery();
// Propagate field names from left to right, and from children to parent.
// Also assign "serverChoice" at the highest level where no explicit field was given.
//
sub.resolveFields(0);
// Create the final wrapper node.
FNode query = new FNode("query");
query.add(sub);
// Guarantee that every NOT is part of an AND-NOT
query.fixNots();
// All done!
{if (true) return query;}
throw new Error("Missing return statement in function");
}
final private boolean jj_2_1(int xla) {
jj_la = xla; jj_lastpos = jj_scanpos = token;
try { return !jj_3_1(); }
catch(LookaheadSuccess ls) { return true; }
finally { jj_save(0, xla); }
}
final private boolean jj_3R_7() {
Token xsp;
xsp = jj_scanpos;
if (jj_scan_token(9)) {
jj_scanpos = xsp;
if (jj_scan_token(10)) {
jj_scanpos = xsp;
if (jj_scan_token(11)) {
jj_scanpos = xsp;
if (jj_scan_token(12)) {
jj_scanpos = xsp;
if (jj_scan_token(13)) {
jj_scanpos = xsp;
if (jj_scan_token(14)) {
jj_scanpos = xsp;
if (jj_scan_token(15)) {
jj_scanpos = xsp;
if (jj_scan_token(16)) {
jj_scanpos = xsp;
if (jj_scan_token(22)) return true;
}
}
}
}
}
}
}
}
return false;
}
final private boolean jj_3_1() {
if (jj_3R_7()) return true;
if (jj_scan_token(COLON)) return true;
return false;
}
public FreeformQueryParserTokenManager token_source;
SimpleCharStream jj_input_stream;
public Token token, jj_nt;
private int jj_ntk;
private Token jj_scanpos, jj_lastpos;
private int jj_la;
public boolean lookingAhead = false;
private boolean jj_semLA;
private int jj_gen;
final private int[] jj_la1 = new int[9];
static private int[] jj_la1_0;
static {
jj_la1_0();
}
private static void jj_la1_0() {
jj_la1_0 = new int[] {0x41fe00,0x41ffde,0x1de,0x41ffde,0x10,0x41fea8,0x4,0x41feba,0x2,};
}
final private JJCalls[] jj_2_rtns = new JJCalls[1];
private boolean jj_rescan = false;
private int jj_gc = 0;
public FreeformQueryParser(java.io.InputStream stream) {
this(stream, null);
}
public FreeformQueryParser(java.io.InputStream stream, String encoding) {
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
token_source = new FreeformQueryParserTokenManager(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 9; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
public void ReInit(java.io.InputStream stream) {
ReInit(stream, null);
}
public void ReInit(java.io.InputStream stream, String encoding) {
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
token_source.ReInit(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 9; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
public FreeformQueryParser(java.io.Reader stream) {
jj_input_stream = new SimpleCharStream(stream, 1, 1);
token_source = new FreeformQueryParserTokenManager(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 9; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
public void ReInit(java.io.Reader stream) {
jj_input_stream.ReInit(stream, 1, 1);
token_source.ReInit(jj_input_stream);
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 9; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
public FreeformQueryParser(FreeformQueryParserTokenManager tm) {
token_source = tm;
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 9; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
public void ReInit(FreeformQueryParserTokenManager tm) {
token_source = tm;
token = new Token();
jj_ntk = -1;
jj_gen = 0;
for (int i = 0; i < 9; i++) jj_la1[i] = -1;
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
}
final private Token jj_consume_token(int kind) throws ParseException {
Token oldToken;
if ((oldToken = token).next != null) token = token.next;
else token = token.next = token_source.getNextToken();
jj_ntk = -1;
if (token.kind == kind) {
jj_gen++;
if (++jj_gc > 100) {
jj_gc = 0;
for (int i = 0; i < jj_2_rtns.length; i++) {
JJCalls c = jj_2_rtns[i];
while (c != null) {
if (c.gen < jj_gen) c.first = null;
c = c.next;
}
}
}
return token;
}
token = oldToken;
jj_kind = kind;
throw generateParseException();
}
static private final class LookaheadSuccess extends java.lang.Error { }
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
final private boolean jj_scan_token(int kind) {
if (jj_scanpos == jj_lastpos) {
jj_la--;
if (jj_scanpos.next == null) {
jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
} else {
jj_lastpos = jj_scanpos = jj_scanpos.next;
}
} else {
jj_scanpos = jj_scanpos.next;
}
if (jj_rescan) {
int i = 0; Token tok = token;
while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
if (tok != null) jj_add_error_token(kind, i);
}
if (jj_scanpos.kind != kind) return true;
if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
return false;
}
final public Token getNextToken() {
if (token.next != null) token = token.next;
else token = token.next = token_source.getNextToken();
jj_ntk = -1;
jj_gen++;
return token;
}
final public Token getToken(int index) {
Token t = lookingAhead ? jj_scanpos : token;
for (int i = 0; i < index; i++) {
if (t.next != null) t = t.next;
else t = t.next = token_source.getNextToken();
}
return t;
}
final private int jj_ntk() {
if ((jj_nt=token.next) == null)
return (jj_ntk = (token.next=token_source.getNextToken()).kind);
else
return (jj_ntk = jj_nt.kind);
}
private java.util.Vector jj_expentries = new java.util.Vector();
private int[] jj_expentry;
private int jj_kind = -1;
private int[] jj_lasttokens = new int[100];
private int jj_endpos;
private void jj_add_error_token(int kind, int pos) {
if (pos >= 100) return;
if (pos == jj_endpos + 1) {
jj_lasttokens[jj_endpos++] = kind;
} else if (jj_endpos != 0) {
jj_expentry = new int[jj_endpos];
for (int i = 0; i < jj_endpos; i++) {
jj_expentry[i] = jj_lasttokens[i];
}
boolean exists = false;
for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) {
int[] oldentry = (int[])(e.nextElement());
if (oldentry.length == jj_expentry.length) {
exists = true;
for (int i = 0; i < jj_expentry.length; i++) {
if (oldentry[i] != jj_expentry[i]) {
exists = false;
break;
}
}
if (exists) break;
}
}
if (!exists) jj_expentries.addElement(jj_expentry);
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
}
}
public ParseException generateParseException() {
jj_expentries.removeAllElements();
boolean[] la1tokens = new boolean[25];
for (int i = 0; i < 25; i++) {
la1tokens[i] = false;
}
if (jj_kind >= 0) {
la1tokens[jj_kind] = true;
jj_kind = -1;
}
for (int i = 0; i < 9; i++) {
if (jj_la1[i] == jj_gen) {
for (int j = 0; j < 32; j++) {
if ((jj_la1_0[i] & (1<<j)) != 0) {
la1tokens[j] = true;
}
}
}
}
for (int i = 0; i < 25; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;
jj_expentries.addElement(jj_expentry);
}
}
jj_endpos = 0;
jj_rescan_token();
jj_add_error_token(0, 0);
int[][] exptokseq = new int[jj_expentries.size()][];
for (int i = 0; i < jj_expentries.size(); i++) {
exptokseq[i] = (int[])jj_expentries.elementAt(i);
}
return new ParseException(token, exptokseq, tokenImage);
}
final public void enable_tracing() {
}
final public void disable_tracing() {
}
final private void jj_rescan_token() {
jj_rescan = true;
for (int i = 0; i < 1; i++) {
try {
JJCalls p = jj_2_rtns[i];
do {
if (p.gen > jj_gen) {
jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
switch (i) {
case 0: jj_3_1(); break;
}
}
p = p.next;
} while (p != null);
} catch(LookaheadSuccess ls) { }
}
jj_rescan = false;
}
final private void jj_save(int index, int xla) {
JJCalls p = jj_2_rtns[index];
while (p.gen > jj_gen) {
if (p.next == null) { p = p.next = new JJCalls(); break; }
p = p.next;
}
p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
}
static final class JJCalls {
int gen;
Token first;
int arg;
JJCalls next;
}
}