/*
* xtc - The eXTensible Compiler
* Copyright (C) 2007-2008 Robert Grimm
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
* USA.
*/
package xtc.parser;
import java.util.ArrayList;
import java.util.List;
import xtc.Constants;
import xtc.tree.Visitor;
import xtc.type.AST;
import xtc.util.Runtime;
/**
* Visitor to recognize token-level productions.
*
* <p />This visitor recognizes the boundary between hierarchical and
* lexical syntax:
*
* <li>This visitor recognizes all lexical productions. A production
* is lexical if it is text-only or if it is void and only references
* other lexical productions (if any). As a result, a lexical
* production may not contain parser actions, semantic actions that
* reference {@link CodeGenerator#VALUE}, or bindings to {@link
* CodeGenerator#VALUE}.</li>
*
* <li>This visitor traverses the grammar, starting with its public
* productions. When encountering a lexical production it does not
* traverse into the production; rather, if the lexical production
* also consumes the input, this visitor marks the production as
* token-level.</li>
*
* <li>This visitor ensures that all void productions are correctly
* annotated with the {@link Properties#CONSUMER} property, indicating
* whether they consume the input.</li>
*
* </ol>
*
* <p />This visitor does <em>not</em> change a token-level
* production's type, so that later parser generator phases can still
* distinguish between productions that used to be text-only and that
* used to be void. It does however, remove any {@link
* Properties#TEXT_ONLY} property.
*
* <p />This visitor assumes that the entire grammar is contained in a
* single module and that text-only productions have been marked as
* such. It may perform faster if the grammar has been annotated with
* its real root.
*
* @see TextTester
* @see RootFinder
*
* @author Robert Grimm
* @version $Revision: 1.12 $
*/
public class Tokenizer extends GrammarVisitor {
/** Visitor to determine which productions are lexical. */
public static class Tester extends Visitor {
/** The runtime. */
protected final Runtime runtime;
/** The analyzer utility. */
protected final Analyzer analyzer;
/** The flag for whether the current production is lexical. */
protected boolean isLexical;
/**
* Create a new lexical tester.
*
* @param runtime The runtime.
* @param analyzer The analyzer utility.
*/
public Tester(Runtime runtime, Analyzer analyzer) {
this.runtime = runtime;
this.analyzer = analyzer;
}
/**
* Mark the specified production as lexical.
*
* @param p The production.
*/
protected void mark(Production p) {
if (runtime.test("optionVerbose")) {
System.err.println("[Recognizing " + p.qName + " as lexical syntax]");
}
p.setProperty(Properties.LEXICAL, Boolean.TRUE);
}
/** Visit the specified grammar. */
public void visit(Module m) {
// Initialize the per-grammar state.
analyzer.register(this);
analyzer.init(m);
// Process the productions.
for (Production p : m.productions) {
// Make sure that the production has not been processed
// already and that it returns a string.
if (analyzer.isProcessed(p.qName)) {
continue;
} else if (p.getBooleanProperty(Properties.TEXT_ONLY)) {
mark(p);
analyzer.processed(p.qName);
continue;
} else if (! AST.isVoid(p.type)) {
analyzer.processed(p.qName);
continue;
}
// Clear the per-production state.
isLexical = true;
// Process the production.
analyzer.process(p);
// Tabulate the results.
if (isLexical) {
// All visited productions are guaranteed to be lexical.
for (NonTerminal nt : analyzer.working()) {
// This lookup is guaranteed to work, as the production's
// fully qualified name was added by visit(Production).
Production p2 = analyzer.lookup(nt);
mark(p2);
analyzer.processed(p2.qName);
}
} else {
// We only know that the current production is not lexical.
analyzer.processed(p.qName);
}
}
}
/** Visit the specified production. */
public void visit(Production p) {
Object closure = analyzer.enter(p);
analyzer.workingOn(p.qName);
dispatch(p.choice);
analyzer.exit(closure);
}
/** Visit the specified ordered choice. */
public void visit(OrderedChoice c) {
for (Sequence alt : c.alternatives) {
dispatch(alt);
if (! isLexical) {
// We don't need to look any further.
return;
}
}
}
/** Visit the specified sequence. */
public void visit(Sequence s) {
for (Element e : s.elements) {
dispatch(e);
if (! isLexical) {
// We don't need to look any further.
return;
}
}
}
/** Visit the specified semantic predicate. */
public void visit(SemanticPredicate p) {
// Ignore the semantic action.
}
/** Visit the specified binding. */
public void visit(Binding b) {
// We allow bindings in lexical productions, so that they can
// contain semantic predicates. However, we disallow a binding to
// CodeGenerator.VALUE.
if (CodeGenerator.VALUE.equals(b.name)) {
isLexical = false;
} else {
dispatch(b.element);
}
}
/** Visit the specified nonterminal. */
public void visit(NonTerminal nt) {
Production p;
try {
p = analyzer.lookup(nt);
} catch (IllegalArgumentException x) {
// Too many productions. We assume the worst.
isLexical = false;
return;
}
if (null == p) {
// No such production. We assume the worst.
isLexical = false;
} else if (analyzer.isProcessed(p.qName)) {
// If the corresponding production has already been processed,
// make sure it is lexical.
if (! p.getBooleanProperty(Properties.LEXICAL)) {
isLexical = false;
}
} else if (! analyzer.isBeingWorkedOn(p.qName)) {
// The production has not been processed and is not yet under
// consideration. If is text-only, accept it. If it is void,
// check it out.
if (p.getBooleanProperty(Properties.TEXT_ONLY)) {
// Nothing to do.
} else if (AST.isVoid(p.type)) {
dispatch(p);
} else {
isLexical = false;
}
}
}
/** Visit the specified character case. */
public void visit(CharCase c) {
dispatch(c.element);
}
/** Visit the specified character switch. */
public void visit(CharSwitch s) {
for (CharCase kase : s.cases) {
dispatch(kase);
if (! isLexical) {
// We don't need to look any further.
return;
}
}
dispatch(s.base);
}
/** Visit the specified terminal. */
public void visit(Terminal t) {
// Nothing to do. Terminals are lexical.
}
/**
* Visit the specified unary operator. This method provides the
* default implementation for repetitions, options, syntactic
* predicates, voided elements, and string matches.
*/
public void visit(UnaryOperator op) {
dispatch(op.element);
}
/** Visit the specified null literal. */
public void visit(NullLiteral l) {
// Nothing to do.
}
/** Visit the specified node marker. */
public void visit(NodeMarker m) {
isLexical = false;
}
/** Visit the specified action. */
public void visit(Action a) {
if (a.setsValue()) isLexical = false;
}
/** Visit the specified parser action. */
public void visit(ParserAction pa) {
isLexical = false;
}
/**
* Visit the specified element. This method provides the default
* implementation for parse tree nodes and value elements.
*/
public void visit(Element e) {
isLexical = false;
}
}
// ==========================================================================
/**
* Create a new tokenizer.
*
* @param runtime The runtime.
* @param analyzer The analyzer utility.
*/
public Tokenizer(Runtime runtime, Analyzer analyzer) {
super(runtime, analyzer);
}
/** Visit the specified grammar. */
public Object visit(Module m) {
// Recognize lexical syntax first.
new Tester(runtime, analyzer).dispatch(m);
// Initialize the per-grammar state.
analyzer.register(this);
analyzer.init(m);
// Make sure that all lexical and void productions are tested for
// whether they consume the input.
for (Production p : m.productions) {
if (p.getBooleanProperty(Properties.LEXICAL) || AST.isVoid(p.type)) {
analyzer.notWorkingOnAny();
analyzer.consumesInput(p.qName);
}
}
// Determine which productions to process.
List<Production> todo;
if (m.hasProperty(Properties.ROOT)) {
todo = new ArrayList<Production>(1);
todo.add(analyzer.lookup((NonTerminal)m.getProperty(Properties.ROOT)));
} else {
todo = m.productions;
}
// Process the productions.
for (Production p : todo) {
// Skip processed or non-public productions.
if (analyzer.isProcessed (p.qName) ||
(! p.hasAttribute(Constants.ATT_PUBLIC))) {
continue;
}
// Mark production as processed to avoid recursive processing.
analyzer.processed(p.qName);
if (p.getBooleanProperty(Properties.LEXICAL)) {
// We have reached a lexical production. If it consumes the
// input, we mark it as token-level.
analyzer.notWorkingOnAny();
if (analyzer.consumesInput(p.qName)) {
markToken(p, runtime.test("optionVerbose"));
}
} else {
// Recurse into the production.
analyzer.process(p);
}
}
// Done.
return null;
}
/** Visit the specified nonterminal. */
public Element visit(NonTerminal nt) {
FullProduction p = analyzer.lookup(nt);
if (! analyzer.isProcessed(p.qName)) {
analyzer.processed(p.qName);
if (p.getBooleanProperty(Properties.LEXICAL)) {
if (analyzer.consumesInput(nt)) {
markToken(p, runtime.test("optionVerbose"));
}
} else {
dispatch(p);
}
}
return nt;
}
/**
* Mark the specified production as token-level. This method sets
* the specified production's {@link Properties#TOKEN} property and
* removes any {@link Properties#TEXT_ONLY} property. It does,
* however, <em>not</em> adjust the production's type to
* <code>Token</code>.
*
* @param p The production.
*/
public static void markToken(Production p, boolean verbose) {
if (verbose) {
System.err.println("[Recognizing " + p.qName + " as token-level]");
}
p.setProperty(Properties.TOKEN, Boolean.TRUE);
p.removeProperty(Properties.TEXT_ONLY);
}
}