/*
* Copyright 2007-2013
* Licensed under GNU Lesser General Public License
*
* This file is part of EpochX: genetic programming software for research
*
* EpochX is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* EpochX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with EpochX. If not, see <http://www.gnu.org/licenses/>.
*
* The latest version is available from: http://www.epochx.org
*/
package org.epochx.grammar;
import java.io.*;
import java.util.*;
import org.epochx.Config.ConfigKey;
/**
* A grammar object is constructed from a context-free language grammar. In the
* grammar guided representations, a <code>Grammar</code> instance defines the
* valid syntax of a program's source being evolved. As well as defining the
* syntax of solutions, the grammar also essentially determines the function
* and terminal sets which are explicitly defined in tree GP.
*
* <p>
* A file or string containing a grammar in Backus Naur Form (BNF) is parsed
* on construction and a form of derivation tree with all possible options is
* created. The root of this tree is determined as the first rule in the grammar
* string and is retrievable with a call to the <code>getStartRule()</code>
* method, which always returns a <code>GrammarRule</code>. The nodes that make
* up a production may be either a <code>GrammarLiteral</code> or a
* <code>GrammarRule</code>. Terminal symbols
* are represented as <code>GrammarLiterals</code> and simply consist of a
* string value which matches the string from the BNF grammar. It is these
* terminals that will form the source code of any program that uses it.
* Non-literals are represented as <code>GrammarRules</code> and each has a set
* of <code>GrammarProductions</code>. Each production describes a valid syntax
* for that non-terminal rule.
*
* <p>
* Productions can also be provided with attributes. Attributes are simply
* key/value pairs which are then stored in the resultant
* <code>GrammarProduction</code> instance. They provide a basic mechanism to
* implement semantic constraints such as those used in attribute grammars or to
* otherwise provide meta-data about a production, for example, weights. The
* format to provide attributes is to include within the production a special
* grammar rule. The rule should begin and end with a question mark character
* '?'. Between those question marks a series of key/values, made up of the key
* and an equals character '=' followed by the value. Multiple key/values can be
* provided separated by a semi-colon ';' character.
*
* <p>
* An example rule:
* </p>
*
* <blockquote><code>
* &lt;example-rule&gt; ::= abc &lt;?key1=32;key2=true?&gt; | cde | &lt;ruleA&gt; &lt;ruleB&gt;
* </code></blockquote>
*
* <p>
* Most of the features known as EBNF (extended BNF) are not currently
* supported.
*/
public class Grammar {
/**
* The key for setting and retrieving a language grammar.
*/
public static final ConfigKey<Grammar> GRAMMAR = new ConfigKey<Grammar>();
// Literals (terminal symbols) created while parsing, keyed by their string
// value so that repeated occurrences share one GrammarLiteral instance.
private final Map<String, GrammarLiteral> literals;
// Rules (non-terminal symbols) created while parsing, keyed by rule name so
// that every reference to a name resolves to the same GrammarRule instance.
private final Map<String, GrammarRule> rules;
// The starting rule - the first left-hand side encountered by the parser.
// Remains null until a rule has been parsed.
private GrammarRule start;
/**
 * Creates a <code>Grammar</code> by parsing the supplied BNF grammar text.
 *
 * @param grammarStr a <code>String</code> holding a BNF language grammar.
 * @throws MalformedGrammarException if the supplied text is not a validly
 *         formatted grammar.
 */
public Grammar(String grammarStr) {
    this.literals = new HashMap<String, GrammarLiteral>();
    this.rules = new HashMap<String, GrammarRule>();
    this.parseGrammar(grammarStr);
}
/**
* Constructs a <code>Grammar</code> with the given file as a reference to a
* text file containing a BNF grammar, which will be read and parsed.
*
* @param grammarFile a <code>File</code> pointing to a text file containing
* a BNF language grammar.
* @throws IOException if there was a problem reading the file.
* @throws MalformedGrammarException if the given grammar string is not in a
* valid format.
*/
public Grammar(File grammarFile) throws IOException {
String grammar = readGrammarFile(grammarFile);
literals = new HashMap<String, GrammarLiteral>();
rules = new HashMap<String, GrammarRule>();
parseGrammar(grammar);
}
/**
 * Returns the rule at the root of the grammar parse tree. This is the first
 * rule that was defined in the parsed grammar.
 *
 * @return the <code>GrammarRule</code> at the root of the grammar parse
 *         tree.
 */
public GrammarRule getStartRule() {
    return this.start;
}
/*
 * Loads the whole of the given file and hands it back as one String. The
 * file is read a line at a time and each line is followed by the platform's
 * line separator in the result. The reader is always closed, even if reading
 * fails part way through.
 */
private String readGrammarFile(File grammarFile) throws IOException {
    final BufferedReader reader = new BufferedReader(new FileReader(grammarFile));
    final StringBuilder contents = new StringBuilder();
    try {
        for (String row = reader.readLine(); row != null; row = reader.readLine()) {
            contents.append(row).append(System.getProperty("line.separator"));
        }
    } finally {
        reader.close();
    }
    return contents.toString();
}
/*
 * The modes of the grammar parser's state machine.
 */
private enum State {
    // Nothing read yet; waiting for the '<' that opens the first rule.
    START,
    // Reading a rule name on the left-hand side, up to the closing '>'.
    START_RULE,
    // Rule name read; waiting for the '::=' token.
    LHS_READ,
    // Reading the symbols that make up a production.
    PRODUCTION,
    // A production was ended by a newline; next comes either '|' (another
    // production of the same rule) or '<' (a new rule).
    START_OF_LINE
}
/**
 * Parses the given grammar string into a grammar parse tree, rooted at the
 * start rule and made up of rules, literals and productions.
 *
 * <p>
 * The string is processed one character at a time by a small state machine
 * (see <code>State</code>). Rules and literals are registered in the
 * <code>rules</code> and <code>literals</code> maps as they are met, and the
 * first left-hand side becomes the start rule. Once the whole string has been
 * consumed the recursive flags and minimum depths of the rules are set, and
 * every rule is checked to have at least one production and not to be
 * infinitely recursive.
 *
 * <p>
 * This is based upon the grammar parser found in the
 * Mapper.ContextFreeGrammar class of GEVA v.1.0.
 *
 * @param grammar the BNF grammar text to parse.
 * @throws MalformedGrammarException if the text is not a validly formatted
 *         grammar.
 */
protected void parseGrammar(String grammar) {
    State state = State.START;
    StringBuilder buffer = new StringBuilder();
    GrammarRule lhs = null;
    GrammarProduction grammarProduction = new GrammarProduction();
    // True while inside a double-quoted section of a production.
    boolean quoted = false;
    // False while between the '<' and '>' of a non-terminal in a production.
    boolean terminal = true;
    // True while inside the '?' ... '?' of a special (attribute) rule.
    boolean special = false;

    // Append a \n to ensure we detect the end of file.
    grammar += '\n';

    for (int i = 0; i < grammar.length(); i++) {
        char ch = grammar.charAt(i);

        if (ch == '\\') {
            // Start of an escape sequence - test next character.
            i++;
            if (i >= grammar.length()) {
                // Escape sequence as last char is invalid.
                throw new MalformedGrammarException("Escape sequence as last char is invalid");
            }
            char escaped = grammar.charAt(i);
            if ((!terminal) && (escaped != '\n') && (escaped != '\r')) {
                // Only an escaped newline is allowed inside a non-terminal.
                // '\r' is let through so that a DOS newline (\r\n) can be
                // escaped too; the '\r' case below insists on the '\n'.
                throw new MalformedGrammarException("Only escaped newline allowed inside non-terminal");
            }

            boolean skip = false;
            switch (escaped) {
                case '\'': // Single quote
                    ch = '\'';
                    break;
                case '"': // Double quote
                    ch = '"';
                    break;
                case '\\': // Backslash
                    ch = '\\';
                    break;
                case '0': // Null character
                    ch = '\0';
                    break;
                case 'a': // Audible bell
                    ch = '\007';
                    break;
                case 'b': // Backspace
                    ch = '\b';
                    break;
                case 'f': // Formfeed
                    ch = '\f';
                    break;
                case 'n': // Newline
                    ch = '\n';
                    break;
                case 'r': // Carriage return
                    ch = '\r';
                    break;
                case 't': // Horizontal tab
                    ch = '\t';
                    break;
                case 'v': // Vertical tab
                    ch = '\013';
                    break;
                case '\n': // Escaped newline
                    skip = true; // Ignore newline
                    break;
                case '\r': // Escaped DOS return
                    skip = true; // Ignore newline
                    // Always in bounds: the appended '\n' is the last char,
                    // so a '\r' can never be the final character.
                    if (grammar.charAt(++i) != '\n') {
                        throw new MalformedGrammarException("No newline");
                    }
                    break;
                default: // Normal character
                    ch = escaped;
                    break;
            }
            if (!skip) {
                buffer.append(ch);
            }
            continue;
        }

        switch (state) {
            case START:
                if (ch == '\r') {
                    // Ignore DOS newline.
                } else if (ch == '#') {
                    // Comment - skip to end of line.
                    while ((i < grammar.length()) && (grammar.charAt(i) != '\n')) {
                        i++;
                    }
                } else if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
                    // Ignore whitespace, tabs and newlines.
                } else if (ch == '<') {
                    state = State.START_RULE;
                } else {
                    // No other valid chars in this state.
                    throw new MalformedGrammarException("Illegal character: " + ch);
                }
                break;
            case START_RULE:
                if (ch == '\r') {
                    // Ignore DOS newline.
                } else if (ch == '\n') {
                    // Newlines are illegal here.
                    throw new MalformedGrammarException("Misplaced newline");
                } else if (ch == '>') {
                    // Possible end of non-terminal.
                    String symbolName = buffer.toString();
                    if (!rules.containsKey(symbolName)) {
                        lhs = new GrammarRule(symbolName);
                        rules.put(symbolName, lhs);
                    } else {
                        lhs = rules.get(symbolName);
                        // LHS might have been a production already, but
                        // shouldn't have been a LHS.
                        if (lhs.getNoProductions() > 0) {
                            throw new MalformedGrammarException("Duplicate rule: " + symbolName);
                        }
                    }
                    // The first LHS becomes the start symbol.
                    if (start == null) {
                        start = lhs;
                    }
                    // Clear the buffer.
                    buffer = new StringBuilder();
                    // Move to next stage.
                    state = State.LHS_READ;
                } else if ((ch == '"') || (ch == '|') || (ch == '<')) {
                    throw new MalformedGrammarException("Non-escaped special char: " + ch);
                } else {
                    // Append char to buffer.
                    buffer.append(ch);
                }
                break;
            case LHS_READ:
                if (ch == '\r') {
                    // Ignore DOS newline.
                } else if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
                    // Ignore whitespace, tabs and newlines.
                } else if (ch == ':') {
                    // Part of ::= token.
                    buffer.append(ch);
                } else if (ch == '=') {
                    // Should be end of ::= token.
                    buffer.append(ch);
                    if (!buffer.toString().equals("::=")) {
                        throw new MalformedGrammarException("Expected '::=' but found: " + buffer.toString());
                    }
                    // Clear the buffer.
                    buffer = new StringBuilder();
                    // Move onto the next stage.
                    state = State.PRODUCTION;
                } else {
                    throw new MalformedGrammarException("Illegal character: " + ch);
                }
                break;
            case PRODUCTION:
                if (ch == '\r') {
                    // Ignore DOS newline.
                } else if ((ch == '|') && quoted) {
                    buffer.append(ch);
                } else if ((ch == '\n') || (ch == '|')) {
                    // End of production and possibly rule.
                    if (buffer.length() != 0) {
                        String symbolName = buffer.toString();
                        if (terminal) {
                            GrammarLiteral newSymbol = literals.get(symbolName);
                            if (newSymbol == null) {
                                newSymbol = new GrammarLiteral(symbolName);
                                literals.put(symbolName, newSymbol);
                            }
                            grammarProduction.addGrammarNode(newSymbol);
                        } else {
                            // A '<' was opened but never closed with '>'.
                            throw new MalformedGrammarException("Unterminated non-terminal");
                        }
                        buffer = new StringBuilder();
                    }
                    // Add the production to the current rule's LHS.
                    lhs.addProduction(grammarProduction);
                    grammarProduction = new GrammarProduction();
                    // Move onto next stage - either another production or
                    // new rule.
                    if (ch == '\n') {
                        state = State.START_OF_LINE;
                    }
                } else if (ch == '<') {
                    if (quoted) {
                        buffer.append(ch);
                    } else if (buffer.length() == 0) {
                        terminal = false;
                    } else {
                        throw new MalformedGrammarException("Non-escaped special char: " + ch);
                    }
                } else if (ch == '?') {
                    if (special) {
                        // This should be the closing '?'.
                        String specialCommand = buffer.toString().trim();
                        // Parse and process the command.
                        processSpecialRule(specialCommand, grammarProduction);
                    } else if (!terminal) {
                        // This should be the opening '?'.
                        special = true;
                    } else {
                        // Otherwise is outside <> so treat as normal char.
                        buffer.append(ch);
                    }
                } else if (ch == '>') {
                    if (quoted) {
                        buffer.append(ch);
                    } else if (special) {
                        // End of special - no symbol to save.
                        special = false;
                        terminal = true;
                        buffer = new StringBuilder();
                    } else if (!terminal) {
                        // End of non-terminal.
                        String symbolName = buffer.toString();
                        GrammarRule newSymbol = rules.get(symbolName);
                        if (newSymbol == null) {
                            newSymbol = new GrammarRule(symbolName);
                            rules.put(symbolName, newSymbol);
                        }
                        grammarProduction.addGrammarNode(newSymbol);
                        terminal = true;
                        // Clear buffer.
                        buffer = new StringBuilder();
                    } else {
                        throw new MalformedGrammarException("Non-escaped special char: " + ch);
                    }
                } else if ((ch == ' ') || (ch == '\t')) {
                    if (quoted || !terminal) {
                        buffer.append(ch);
                    } else if (buffer.length() != 0) {
                        // Token separator.
                        String symbolName = buffer.toString();
                        GrammarLiteral newSymbol = literals.get(symbolName);
                        if (newSymbol == null) {
                            newSymbol = new GrammarLiteral(symbolName);
                            literals.put(symbolName, newSymbol);
                        }
                        grammarProduction.addGrammarNode(newSymbol);
                        // Clear buffer.
                        buffer = new StringBuilder();
                    } else {
                        // Excess whitespace, ignore.
                    }
                } else if (ch == '"') {
                    // Start or end quoted section.
                    quoted = !quoted;
                } else {
                    // All other characters just append to buffer to become
                    // symbol.
                    buffer.append(ch);
                }
                break;
            case START_OF_LINE:
                if (ch == '#') {
                    // Comment - skip to end of line.
                    while ((i < grammar.length()) && (grammar.charAt(i) != '\n')) {
                        i++;
                    }
                } else if (ch == '\r') {
                    // Ignore DOS newline.
                } else if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
                    // Ignore whitespace, tabs and newlines.
                } else if (ch == '|') {
                    // Start of new production.
                    state = State.PRODUCTION;
                } else if (ch == '<') {
                    // Start of LHS of new rule.
                    state = State.START_RULE;
                } else {
                    throw new MalformedGrammarException("Illegal character: " + ch);
                }
                break;
            default:
                throw new MalformedGrammarException("Impossible error, quit program now.");
        }
    }

    setRecursiveness();

    // Check the validity of the grammar.
    // Test a start rule was found.
    if (start == null) {
        throw new MalformedGrammarException("No valid rules found in grammar string");
    }

    // Setup and validate each rule - done together for efficiency only.
    Collection<GrammarRule> ruleList = rules.values();
    for (GrammarRule rule: ruleList) {
        // Calculate and set the minimum depths of all rules.
        rule.setMinDepth(getMinDepth(new ArrayList<GrammarRule>(), rule));

        // Test that all rules have at least one valid production.
        if (rule.getNoProductions() == 0) {
            throw new MalformedGrammarException("Grammar rule " + rule.getName() + " has no productions");
        }

        // Test that all rules have a potential exit point.
        if (isInfinitelyRecursive(rule)) {
            throw new MalformedGrammarException("Grammar rule " + rule.getName() + " is infinitely recursive");
        }
    }
}
/**
 * Tests whether the given <code>GrammarRule</code> is infinitely recursive.
 * A rule is infinitely recursive when every one of its productions contains
 * either a recursive reference to the rule itself, or a reference to another
 * rule all of whose productions contain such a reference.
 *
 * @param rule the rule to test for infinite recursion.
 * @return <code>true</code> if the given rule is infinitely recursive,
 *         <code>false</code> otherwise.
 */
protected boolean isInfinitelyRecursive(GrammarRule rule) {
    // A rule that is not flagged recursive cannot be infinitely recursive.
    if (!rule.isRecursive()) {
        return false;
    }
    return isInfinitelyRecursive(rule, rule, new ArrayList<GrammarRule>());
}
/*
 * Recursive worker for isInfinitelyRecursive(GrammarRule).
 *
 * Returns true only when every production of currentRule "loops". A
 * production loops if it contains the rule under test directly, or if it
 * contains a rule that is already on the path being explored or that is
 * itself found to loop. The path is extended with currentRule for the
 * duration of the call and restored before returning.
 */
private boolean isInfinitelyRecursive(GrammarRule rule, GrammarRule currentRule,
        List<GrammarRule> path) {
    path.add(currentRule);

    boolean everyProductionLoops = true;
    for (GrammarProduction production: currentRule.getProductions()) {
        List<GrammarNode> symbols = production.getGrammarNodes();

        // A direct reference to the rule under test settles it at once.
        boolean loops = symbols.contains(rule);
        if (!loops) {
            for (GrammarNode symbol: symbols) {
                if (!(symbol instanceof GrammarRule)) {
                    continue;
                }
                GrammarRule child = (GrammarRule) symbol;
                if (path.contains(child) || isInfinitelyRecursive(rule, child, path)) {
                    loops = true;
                    break;
                }
            }
        }

        if (!loops) {
            // Found a production offering a way out - no need to look on.
            everyProductionLoops = false;
            break;
        }
    }

    // Remove the last element from the path.
    path.remove(path.size() - 1);
    return everyProductionLoops;
}
/*
 * Walks the grammar from the start rule looking for recursion, and sets the
 * recursive flag on the rules accordingly. A rule is recursive if any of
 * its productions contain a reference to itself, or if any of the
 * productions in the tree below that rule contain a reference to that rule.
 * Does nothing if no start rule has been set.
 */
private void setRecursiveness() {
    if (start == null) {
        return;
    }
    setRecursiveness(new ArrayList<GrammarRule>(), start);
}
/*
 * Recursive worker for setRecursiveness(). 'path' holds the rules visited on
 * the way down to 'current'. Each branch below 'current' is explored with
 * its own copy of the path.
 *
 * NOTE(review): when a cycle is found every rule on the path is flagged,
 * including any rules that precede the point where the cycle is entered -
 * confirm this is intended before relying on the flag meaning "is on a
 * cycle".
 */
private void setRecursiveness(List<GrammarRule> path, GrammarRule current) {
    if (path.contains(current)) {
        // Come back round to a rule already visited - flag the whole path.
        for (GrammarRule visited: path) {
            visited.setRecursive(true);
        }
        return;
    }

    path.add(current);
    for (int prodIdx = 0; prodIdx < current.getNoProductions(); prodIdx++) {
        GrammarProduction production = current.getProduction(prodIdx);
        for (int nodeIdx = 0; nodeIdx < production.getNoGrammarNodes(); nodeIdx++) {
            GrammarNode node = production.getGrammarNode(nodeIdx);
            // Only non-terminal symbols can lead any further down.
            if (node instanceof GrammarRule) {
                setRecursiveness(new ArrayList<GrammarRule>(path), (GrammarRule) node);
            }
        }
    }
}
/*
 * Processes a special rule. Currently the only supported special rule is
 * a list of key=value pairs separated by ';' characters, each of which is
 * stored as an attribute on the given production.
 *
 * The key is the text before the first '=' and the value is everything
 * after it, so a value may itself contain '=' characters and may be empty.
 * Empty segments (an empty command, or stray ';' separators) are ignored.
 *
 * Throws MalformedGrammarException if a non-empty segment contains no '='.
 */
private void processSpecialRule(String command, GrammarProduction production) {
    String[] commands = command.split(";");
    for (String c: commands) {
        if (c.length() == 0) {
            // Nothing between two separators (or an empty command).
            continue;
        }

        int separator = c.indexOf('=');
        if (separator < 0) {
            throw new MalformedGrammarException("Expected key=value attribute but found: " + c);
        }

        String key = c.substring(0, separator);
        String value = c.substring(separator + 1);
        production.setAttribute(key, value);
    }
}
/*
 * Recursive worker that computes the minimum depth of the given symbol.
 *
 * Anything that is not a rule has a minimum depth of 0. A rule that is
 * already on the path yields Integer.MAX_VALUE, so that a recursive route is
 * never chosen as the shallowest. Otherwise a production's depth is the
 * largest depth among its symbols, and the rule's depth is one more than the
 * smallest of its productions' depths.
 */
private int getMinDepth(List<GrammarRule> path, GrammarNode currentSymbol) {
    if (!(currentSymbol instanceof GrammarRule)) {
        return 0;
    }

    GrammarRule rule = (GrammarRule) currentSymbol;
    if (path.contains(rule)) {
        // Recursive route - report an impossibly high depth.
        return Integer.MAX_VALUE;
    }

    // Carry on down this route looking for a smaller minimum depth.
    path.add(rule);
    int shallowest = Integer.MAX_VALUE;
    for (int prodIdx = 0; prodIdx < rule.getNoProductions(); prodIdx++) {
        GrammarProduction production = rule.getProduction(prodIdx);

        // The deepest symbol dictates the depth of the production.
        int productionDepth = -1;
        for (int nodeIdx = 0; nodeIdx < production.getNoGrammarNodes(); nodeIdx++) {
            int symbolDepth = getMinDepth(new ArrayList<GrammarRule>(path),
                    production.getGrammarNode(nodeIdx));
            productionDepth = Math.max(productionDepth, symbolDepth);
        }

        // The shallowest production dictates the depth of the rule.
        shallowest = Math.min(shallowest, productionDepth);
    }

    // Count this rule itself, unless that would overflow.
    return (shallowest == Integer.MAX_VALUE) ? shallowest : shallowest + 1;
}
/**
 * Returns the terminal symbols of this grammar - its literal values. The
 * returned list is a new list on each call.
 *
 * @return a complete list of the literals in this grammar.
 */
public List<GrammarLiteral> getGrammarLiterals() {
    List<GrammarLiteral> snapshot = new ArrayList<GrammarLiteral>();
    snapshot.addAll(literals.values());
    return snapshot;
}
/**
 * Looks up a single terminal of this grammar by the name label of the
 * symbol.
 *
 * @param name the label identifying the grammar literal to return.
 * @return the grammar terminal with the given name label, or
 *         <code>null</code> if the grammar has no terminal with that name.
 */
public GrammarLiteral getGrammarLiteral(String name) {
    final GrammarLiteral match = this.literals.get(name);
    return match;
}
/**
 * Returns the grammar rules that make up this grammar. The returned list is
 * a new list on each call.
 *
 * @return a complete list of the non-literals in this grammar.
 */
public List<GrammarRule> getGrammarRules() {
    List<GrammarRule> snapshot = new ArrayList<GrammarRule>();
    snapshot.addAll(rules.values());
    return snapshot;
}
/**
 * Looks up a single <code>GrammarRule</code> of this grammar by the name
 * label of the rule.
 *
 * @param name the label identifying the grammar rule to return.
 * @return the <code>GrammarRule</code> with the given name label, or
 *         <code>null</code> if the grammar has no rule with that name.
 */
public GrammarRule getGrammarRule(String name) {
    final GrammarRule match = this.rules.get(name);
    return match;
}
/**
 * Returns the minimum depth of this grammar. The minimum depth of a grammar
 * is the minimum depth of its start rule, that is, the minimum depth needed
 * to resolve to only grammar literals.
 *
 * @return the minimum depth required by this grammar to resolve to only
 *         grammar literals.
 */
public int getMinimumDepth() {
    assert start != null;
    final int depth = start.getMinDepth();
    return depth;
}
}