// This file is part of AceWiki.
// Copyright 2008-2013, AceWiki developers.
//
// AceWiki is free software: you can redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// AceWiki is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
// even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License along with AceWiki. If
// not, see http://www.gnu.org/licenses/.

package ch.uzh.ifi.attempto.chartparser;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import ch.uzh.ifi.attempto.base.PredictiveParser;

/**
 * This is a chart parser (concretely an Earley parser) that implements the predictive parser
 * interface.
 *
 * @see Grammar
 * @author Tobias Kuhn
 */
public class ChartParser implements PredictiveParser {

    private final org.slf4j.Logger log = org.slf4j.LoggerFactory.getLogger(this.getClass());

    private final Grammar grammar;
    private final String startCategoryName;
    private final Nonterminal[] context;
    private DynamicLexicon dynLexicon;
    private final Chart chart;
    private final List<String> tokens = new ArrayList<String>();
    // One entry per text position (0 .. tokens.size()); null means "not yet calculated":
    private final List<CPNextTokenOptions> options = new ArrayList<CPNextTokenOptions>();
    // One entry per token:
    private final List<List<FeatureMap>> backwardReferences = new ArrayList<List<FeatureMap>>();
    private ParseTree parseTree;
    private Map<String, Integer> progressTable;
    private boolean recalculateParseTree = true;
    private String positionIdentifierPrefix = "#";
    private boolean debug;

    /**
     * Creates a new chart parser for the given grammar. The grammar must not be changed afterwards.
     *
     * @param grammar The grammar to be used by the chart parser.
     * @param startCategoryName The name of the start category.
     * @param context A list of forward references and scope openers that define the context.
     *     Null is treated like an empty list.
     */
    public ChartParser(Grammar grammar, String startCategoryName, List<Nonterminal> context) {
        this.grammar = grammar;
        this.startCategoryName = startCategoryName;
        if (context == null) {
            this.context = new Nonterminal[0];
        } else {
            this.context = context.toArray(new Nonterminal[0]);
        }
        this.chart = new Chart(grammar);
        options.add(null);
        init();
        runParsingSteps();
    }

    /**
     * Creates a new chart parser for the given grammar. The grammar must not be changed afterwards.
     *
     * @param grammar The grammar to be used by the chart parser.
     * @param startCategoryName The name of the start category.
     */
    public ChartParser(Grammar grammar, String startCategoryName) {
        this(grammar, startCategoryName, null);
    }

    /**
     * This method can be used to switch on/off debug mode (default is off). In debug mode, messages
     * about the actions of the chart parser are written to the logger of this class at debug level.
     *
     * @param debug true to switch debug mode on, or false to switch it off.
     */
    public void debug(boolean debug) {
        this.debug = debug;
    }

    /**
     * Sets the dynamic lexicon. The concrete options at the end position are recalculated so that
     * they reflect the new lexicon.
     *
     * @param dynLexicon The dynamic lexicon.
     */
    public void setDynamicLexicon(DynamicLexicon dynLexicon) {
        this.dynLexicon = dynLexicon;
        updateConcreteOptions(tokens.size());
    }

    /**
     * Sets the prefix for the position identifiers that are assigned to the variables of the
     * position operator "#". The default prefix is "#" so that the position identifiers are "#0",
     * "#1", "#2" and so on.
     *
     * @param prefix The new prefix. Must not be null.
     * @throws NullPointerException if the prefix is null.
     */
    public void setPositionIdentifierPrefix(String prefix) {
        if (prefix == null) {
            throw new NullPointerException("prefix must not be null");
        }
        this.positionIdentifierPrefix = prefix;
    }

    /**
     * Adds the given token to the end of the token sequence and runs the parsing steps for the
     * new end position.
     *
     * @param token The token to be added.
     */
    public void addToken(String token) {
        chart.addEdge(new Edge(tokens.size(), new Terminal(token)));

        List<LexicalRule> lexRules;
        if (dynLexicon == null) {
            lexRules = grammar.lexRulesByWord(token);
        } else {
            lexRules = new ArrayList<LexicalRule>();
            lexRules.addAll(grammar.lexRulesByWord(token));
            lexRules.addAll(dynLexicon.getLexRules(token));
        }

        // add edges for applicable lexical rules:
        for (LexicalRule lexRule : lexRules) {
            Edge edge = new Edge(tokens.size(), lexRule.deepCopy());
            chart.addEdge(edge);
            if (debug) log.debug("SCANNER: {}", edge);
        }

        runParsingSteps();

        // add the token to the list of tokens:
        tokens.add(token);
        if (debug) {
            log.debug("ADD TOKEN: {}", token);
            log.debug("TOKEN LIST: {}", tokens);
        }
        options.add(null);
        backwardReferences.add(new ArrayList<FeatureMap>());
        // The progress counters refer to the previous end position and must be reset:
        progressTable = null;

        runParsingSteps();
        recalculateParseTree = true;
    }

    /**
     * Adds the given tokens, in order, to the end of the token sequence.
     *
     * @param tokens The tokens to be added.
     */
    public void addTokens(List<String> tokens) {
        for (String t : tokens) {
            addToken(t);
        }
    }

    /**
     * Removes the last token of the token sequence.
     *
     * @throws IndexOutOfBoundsException if the token sequence is empty.
     */
    public void removeToken() {
        // Fail before touching the chart: removing the edges that end at position 0 would
        // discard the initial edges and leave the parser in an unusable state.
        if (tokens.isEmpty()) {
            throw new IndexOutOfBoundsException("There is no token to remove");
        }
        int lastPos = tokens.size();
        chart.removeEdgesWithEndPos(lastPos);
        backwardReferences.remove(lastPos - 1);
        options.remove(lastPos);
        tokens.remove(lastPos - 1);
        progressTable = null;
        updateConcreteOptions(tokens.size());
        recalculateParseTree = true;
        if (debug) {
            log.debug("REMOVE LAST TOKEN.");
            log.debug("TOKEN LIST: {}", tokens);
        }
    }

    /**
     * Removes all tokens and re-initializes the chart.
     */
    public void removeAllTokens() {
        if (debug) log.debug("REMOVE ALL TOKENS");
        tokens.clear();
        options.clear();
        options.add(null);
        backwardReferences.clear();
        progressTable = null;
        chart.clear();
        init();
        runParsingSteps();
        recalculateParseTree = true;
    }

    /**
     * Replaces the current token sequence by the given one.
     *
     * @param tokens The new token sequence.
     */
    public void setTokens(List<String> tokens) {
        removeAllTokens();
        addTokens(tokens);
    }

    /**
     * Returns a copy of the current token sequence.
     *
     * @return A new list containing the tokens.
     */
    public List<String> getTokens() {
        return new ArrayList<String>(tokens);
    }

    /**
     * Returns the number of tokens in the current token sequence.
     *
     * @return The number of tokens.
     */
    public int getTokenCount() {
        return tokens.size();
    }

    /**
     * This method returns the token number to which the token at the given position refers, if it
     * is a reference. -1 is returned if the given token is not a reference.
     *
     * @param pos The position of the token for which the reference should be returned.
     * @return The token number to which the token at the given position refers, or -1.
     */
    public int getReference(int pos) {
        int ref = -1;
        for (FeatureMap f : getBackwardReferences(pos)) {
            String s = f.getFeature("*pos").getString();
            if (s != null) {
                int i = Integer.parseInt(s) - 1;
                if (i > -1) {
                    ref = i;
                }
            }
        }
        return ref;
    }

    /**
     * Returns the token number to which the last token refers, if it is a reference. -1 is
     * returned if the last token is not a reference or if there is no token.
     *
     * @return The token number to which the last token refers, or -1.
     */
    public int getReference() {
        return getReference(tokens.size()-1);
    }

    /**
     * Return a list of feature maps that show how the backward references at the given position
     * in the text can be resolved. These feature maps contain special features of the form "*pos"
     * that denote the textual position of the respective forward references.
     *
     * @param pos The position of the backward reference.
     * @return The list of feature maps. An empty list is returned if the position lies outside
     *     of the token sequence.
     */
    public List<FeatureMap> getBackwardReferences(int pos) {
        if (pos < 0 || pos >= tokens.size()) {
            return new ArrayList<FeatureMap>();
        }
        return backwardReferences.get(pos);
    }

    /**
     * Returns a list of feature maps that show how the backward references at the end of the
     * token sequence can be resolved.
     *
     * @return The list of feature maps.
     */
    public List<FeatureMap> getBackwardReferences() {
        return getBackwardReferences(tokens.size()-1);
    }

    /**
     * Checks whether the current token sequence is a complete statement according to the grammar
     * and the start category.
     *
     * @return true if there is a passive edge for the start category that spans all tokens.
     */
    public boolean isComplete() {
        return findCompleteEdge(startCategoryName) != null;
    }

    /**
     * Returns the parse tree of the parsed text if it is a complete statement according to the
     * given grammar and category. Null is returned if the text is not a complete statement.
     *
     * @param categoryName The category name.
     * @return The parse tree.
     */
    public ParseTree getParseTree(String categoryName) {
        Edge e = findCompleteEdge(categoryName);
        if (e == null) {
            return null;
        }
        return new ParseTree(e);
    }

    /**
     * Returns the parse tree of the parsed text if it is a complete statement according to the
     * given grammar and start category. Null is returned if the text is not a complete statement.
     * The result is cached until the token sequence changes.
     *
     * @return The parse tree.
     */
    public ParseTree getParseTree() {
        if (recalculateParseTree) {
            parseTree = getParseTree(startCategoryName);
        }
        recalculateParseTree = false;
        return parseTree;
    }

    /**
     * Looks for the first passive edge that spans the complete token sequence and whose head has
     * the given category name.
     *
     * @param categoryName The category name.
     * @return The matching edge, or null if there is none.
     */
    private Edge findCompleteEdge(String categoryName) {
        for (Edge e : chart.getEdgesByEndPos(tokens.size())) {
            if (e.getStartPos() != 0) continue;
            if (e.isActive()) continue;
            if (!e.getHead().getName().equals(categoryName)) continue;
            return e;
        }
        return null;
    }

    /**
     * This method shows the possible tokens that could be used to continue the text at the given
     * position.
     *
     * @param position The position at which the possible next tokens should be found.
     * @return The options describing the possible next tokens.
     */
    public CPNextTokenOptions getNextTokenOptions(int position) {
        createOptions(position);
        return options.get(position);
    }

    /**
     * This method shows the possible tokens that could be used to continue the text at the end
     * position.
     *
     * @return The options describing the possible next tokens.
     */
    public CPNextTokenOptions getNextTokenOptions() {
        return getNextTokenOptions(tokens.size());
    }

    /**
     * This method returns a set of abstract options describing the possible next tokens at the
     * given position in an abstract way.
     *
     * @param position The position at which the possible next tokens should be found.
     * @return The set of abstract options describing the possible next tokens.
     */
    public Set<CPAbstractOption> getAbstractOptions(int position) {
        createOptions(position);
        return options.get(position).getAbstractOptions();
    }

    /**
     * This method returns a set of abstract options describing the possible next tokens at the end
     * position in an abstract way.
     *
     * @return The set of abstract options describing the possible next tokens.
     */
    public Set<CPAbstractOption> getAbstractOptions() {
        return getAbstractOptions(tokens.size());
    }

    /**
     * This method returns a set of concrete options describing the possible next tokens at the
     * given position in a concrete way.
     *
     * @param position The position at which the possible next tokens should be found.
     * @return The set of concrete options describing the possible next tokens.
     */
    public Set<CPConcreteOption> getConcreteOptions(int position) {
        createOptions(position);
        return options.get(position).getConcreteOptions();
    }

    /**
     * This method returns a set of concrete options describing the possible next tokens at the end
     * position in a concrete way.
     *
     * @return The set of concrete options describing the possible next tokens.
     */
    public Set<CPConcreteOption> getConcreteOptions() {
        return getConcreteOptions(tokens.size());
    }

    /**
     * Checks whether the given token is a possible continuation of the current token sequence.
     * Apart from the calculated next token options, the dynamic lexicon (if one is set) is asked
     * for lexical rules for the token.
     *
     * @param token The token to be checked.
     * @return true if the token is a possible next token.
     */
    public boolean isPossibleNextToken(String token) {
        CPNextTokenOptions nextOptions = getNextTokenOptions();
        if (nextOptions.containsToken(token)) return true;
        // Without a dynamic lexicon there is nothing else that could license the token:
        if (dynLexicon == null) return false;
        for (LexicalRule lr : dynLexicon.getLexRules(token)) {
            if (!lr.getWord().getName().equals(token)) continue;
            if (nextOptions.containsCategory(lr.getCategory())) return true;
        }
        return false;
    }

    /**
     * Creates the abstract and concrete options at the given position. The options are cached.
     *
     * @param position The position for which the options should be calculated.
     */
    private void createOptions(int position) {
        if (options.get(position) == null) {
            Set<CPAbstractOption> aOptions = createAbstractOptions(position);
            Set<CPConcreteOption> cOptions = createConcreteOptions(position, aOptions);
            options.set(position, new CPNextTokenOptions(aOptions, cOptions));
        }
    }

    /**
     * Recalculates the concrete options at the given position while reusing the cached abstract
     * options. If nothing is cached yet, both kinds of options are created.
     *
     * @param position The position for which the concrete options should be updated.
     */
    private void updateConcreteOptions(int position) {
        if (options.get(position) == null) {
            createOptions(position);
        } else {
            Set<CPAbstractOption> aOptions = options.get(position).getAbstractOptions();
            Set<CPConcreteOption> cOptions = createConcreteOptions(position, aOptions);
            options.set(position, new CPNextTokenOptions(aOptions, cOptions));
        }
    }

    /**
     * Calculates the set of abstract options for the given position.
     *
     * @param position The position for which the abstract options should be calculated.
     * @return The set of abstract options.
     */
    private Set<CPAbstractOption> createAbstractOptions(int position) {
        Set<CPAbstractOption> aOptions = new HashSet<CPAbstractOption>();
        for (Edge e : chart.getEdgesByEndPos(position)) {
            if (!e.isActive()) continue;
            if (e.getNextActive() instanceof Nonterminal) continue;

            // Look at the first nonterminal that follows the next active category: it may be a
            // backward reference, or (only if it follows immediately) a negative backward
            // reference.
            BackrefCategory backref = null;
            Nonterminal negbackref = null;
            int refpos = 0;
            Category[] body = e.getBody();
            int p = e.getProgress();
            for (int i = p + 1 ; i < body.length ; i++) {
                Category c = body[i];
                if (!(c instanceof Nonterminal)) continue;
                if (c instanceof BackrefCategory) {
                    backref = (BackrefCategory) c;
                    refpos = i;
                } else if (i == (p+1) && c.getName().equals("/<")) {
                    negbackref = (Nonterminal) c;
                    refpos = i;
                }
                break;
            }

            if (backref != null) {
                // For edges with backwards references, the possible bindings have to be performed:
                for (int i = e.getCombinedAnteList().length - 1 ; i >= 0 ; i--) {
                    if (e.getCombinedAnteList()[i].getName().equals("//")) continue;

                    int posrefsCount = backref.getPosFeatureMaps().size();
                    int negrefsCount = backref.getNegFeatureMaps().size();
                    List<Category> exceptions = null;
                    boolean makeRestriction = true;

                    if (refpos == (p+1)) {
                        exceptions = new ArrayList<Category>();
                        for (int j = 0 ; j < negrefsCount ; j++) {
                            Edge eC = e.deepCopy();
                            try {
                                FeatureMap backrefFm =
                                        ((BackrefCategory) eC.getBody()[refpos]).getNegFeatureMaps().get(j);
                                eC.getCombinedAnteList()[i].getFeatureMap().unify(backrefFm);
                                if (eC.getNextActive() instanceof Terminal) {
                                    makeRestriction = false;
                                    break;
                                } else {
                                    exceptions.add(eC.getNextActive());
                                }
                            } catch (UnificationFailedException ignored) {
                                // This negative feature map does not apply to the antecedent.
                            }
                        }
                    }
                    if (!makeRestriction) break;

                    for (int j = 0 ; j < posrefsCount ; j++) {
                        Edge eC = e.deepCopy();
                        try {
                            FeatureMap backrefFm =
                                    ((BackrefCategory) eC.getBody()[refpos]).getPosFeatureMaps().get(j);
                            eC.getCombinedAnteList()[i].getFeatureMap().unify(backrefFm);
                            if (exceptions != null) {
                                aOptions.add(new CPAbstractOption(
                                        grammar,
                                        eC.getNextActive(),
                                        copyExceptionsList(exceptions)
                                    ));
                            } else {
                                aOptions.add(new CPAbstractOption(grammar, eC.getNextActive()));
                            }
                        } catch (UnificationFailedException ignored) {
                            // This positive feature map does not apply to the antecedent.
                        }
                    }
                }
            } else if (negbackref != null) {
                List<Category> exceptions = new ArrayList<Category>();
                // Edges with negative backwards references lead to exceptions:
                boolean makeRestriction = true;
                for (int i = 0 ; i < e.getCombinedAnteList().length ; i++) {
                    if (e.getCombinedAnteList()[i].getName().equals("//")) continue;
                    Edge eC = e.deepCopy();
                    try {
                        eC.getCombinedAnteList()[i].getFeatureMap().unify(
                                eC.getBody()[refpos].getFeatureMap());
                        if (eC.getNextActive() instanceof Terminal) {
                            makeRestriction = false;
                            break;
                        } else {
                            exceptions.add(eC.getNextActive());
                        }
                    } catch (UnificationFailedException ignored) {
                        // The antecedent does not match the negative backward reference.
                    }
                }
                if (makeRestriction) {
                    aOptions.add(new CPAbstractOption(grammar, e.getNextActive().deepCopy(), exceptions));
                }
            } else {
                aOptions.add(new CPAbstractOption(grammar, e.getNextActive().deepCopy()));
            }
        }
        if (debug) {
            for (CPAbstractOption o : aOptions) {
                log.debug("LOOKING FORWARD: {}", o);
            }
        }
        return aOptions;
    }

    /**
     * Calculates the set of concrete options for the given position on the basis of a set of
     * abstract options.
     *
     * @param position The position for which the concrete options should be calculated.
     * @param aOptions The set of abstract options.
     * @return The set of concrete options.
     */
    private Set<CPConcreteOption> createConcreteOptions(int position, Set<CPAbstractOption> aOptions) {
        Set<CPConcreteOption> cOptions = new HashSet<CPConcreteOption>();
        for (CPAbstractOption ao : aOptions) {
            if (ao.getCategory() instanceof Preterminal) {
                List<LexicalRule> lexRules;
                if (dynLexicon == null) {
                    lexRules = grammar.lexRulesByCat(ao.getCategory().getName());
                } else {
                    lexRules = new ArrayList<LexicalRule>();
                    lexRules.addAll(grammar.lexRulesByCat(ao.getCategory().getName()));
                    lexRules.addAll(dynLexicon.getLexRules(ao));
                }
                for (LexicalRule lexRule : lexRules) {
                    if (ao.isFulfilledBy(lexRule.getCategory())) {
                        cOptions.add(new CPConcreteOption(grammar, lexRule.deepCopy()));
                    }
                }
            } else if (ao.getCategory() instanceof Terminal) {
                cOptions.add(new CPConcreteOption(grammar, (Terminal) ao.getCategory(), null));
            }
        }
        return cOptions;
    }

    /**
     * Runs the initialization step of the Earley parsing algorithm.
     */
    private void init() {
        for (GrammarRule rule : grammar.rulesByHeadName(startCategoryName)) {
            Edge edge = new Edge(0, rule.deepCopy(), context);
            chart.addEdge(edge);
            if (debug) log.debug("INIT: {} ---> {}", rule, edge);
        }
    }

    /**
     * Runs the main parsing steps of the Earley algorithm. These parsing steps consist of the
     * completion/prediction/resolution loop.
     */
    private void runParsingSteps() {
        // Run completion/prediction/resolution until neither of them generates a new edge:
        int chartSize = 0;
        int step = 0;
        int idleSteps = 0;
        if (progressTable == null) {
            progressTable = new HashMap<String, Integer>();
            progressTable.put("prediction", 0);
            progressTable.put("completion", 0);
            progressTable.put("resolution", 0);
        }
        while (true) {
            step++;
            chartSize = chart.getSize();
            if (step == 1) {
                predict(progressTable);
            } else if (step == 2) {
                resolve(progressTable);
            } else {
                complete(progressTable);
                step = 0;
            }
            if (chartSize == chart.getSize()) {
                idleSteps++;
            } else {
                idleSteps = 0;
            }
            // Stop as soon as three steps in a row did not change the size of the chart:
            if (idleSteps > 2) {
                break;
            }
        }
    }

    /**
     * Runs the prediction step of the Earley parsing algorithm.
     *
     * @param progressTable This table captures the progress state in order to prevent from
     *     checking the same edges more than once.
     */
    private void predict(Map<String, Integer> progressTable) {
        List<Edge> l = chart.getEdgesByEndPos(tokens.size());
        for (int i = progressTable.get("prediction") ; i < l.size() ; i++) {
            // During this loop, elements might be added to the end of the list l.

            Edge existingEdge = l.get(i);
            Category category = existingEdge.getNextActive();
            if (category == null) continue;
            if (category instanceof Terminal) continue;
            if (category.isSpecialCategory()) continue;
            if (debug) log.debug("PREDICTION FOR CATEGORY: {}", category);

            for (GrammarRule rule : grammar.rulesByHeadName(category.getName())) {
                try {
                    if (!category.isSimilar(rule.getHead())) continue;
                    Edge edgeC = existingEdge.deepCopy();
                    GrammarRule ruleC = rule.deepCopy();
                    edgeC.getNextActive().unify(ruleC.getHead());
                    Edge edge = new Edge(tokens.size(), ruleC, edgeC.getCombinedAnteList());
                    boolean isNewEdge = chart.addEdge(edge);
                    if (debug) log.debug("PREDICT ({}): {} ---> {}", (isNewEdge ? "NEW" : "KNOWN"), rule, edge);
                } catch (UnificationFailedException ex) {
                    // The rule is not applicable for this category.
                    continue;
                }
            }
        }
        progressTable.put("prediction", l.size());
    }

    /**
     * Runs the completion step of the Earley parsing algorithm.
     *
     * @param progressTable This table captures the progress state in order to prevent from
     *     checking the same edges more than once.
     */
    private void complete(Map<String, Integer> progressTable) {
        List<Edge> l1 = chart.getEdgesByEndPos(tokens.size());
        for (int i1 = 0 ; i1 < l1.size() ; i1++) {
            // During this loop, elements might be added to the end of the list l1.

            Edge passiveEdge = l1.get(i1);
            if (passiveEdge.isActive()) continue;
            if (debug) log.debug("COMPLETION FOR EDGE: {}", passiveEdge);

            List<Edge> l2 = chart.getEdgesByEndPos(passiveEdge.getStartPos());

            // Edges that have been processed before continue where they stopped last time:
            int start = 0;
            if (i1 < progressTable.get("completion")) {
                Integer progress = progressTable.get("completion " + i1);
                if (progress != null) {
                    start = progress;
                }
            }

            for (int i2 = start ; i2 < l2.size() ; i2++) {
                // During this loop, elements might be added to the end of the list l2.

                Edge edge = l2.get(i2);
                if (!edge.isActive()) continue;
                if (!edge.getNextActive().getName().equals(passiveEdge.getHead().getName())) continue;

                try {
                    if (!passiveEdge.getHead().isSimilar(edge.getNextActive())) continue;
                    Edge passiveEdgeC = passiveEdge.deepCopy();
                    Edge edgeC = edge.deepCopy();
                    passiveEdgeC.getHead().unify(edgeC.getNextActive());
                    if (!passiveEdge.carriesAntecedentInformation()) {
                        // Antecedent lists have to match:
                        Category[] al1 = edgeC.getCombinedAnteList();
                        Category[] al2 = passiveEdgeC.getExternalAnteList();
                        if (al1.length != al2.length) throw new UnificationFailedException();
                        for (int i = 0 ; i < al1.length ; i++) {
                            al1[i].unify(al2[i]);
                        }
                    }
                    edgeC.step(tokens.size(), passiveEdgeC);
                    boolean isNewEdge = chart.addEdge(edgeC);
                    if (debug) log.debug("COMPLETE ({}): {} ---> {}", (isNewEdge ? "NEW" : "KNOWN"), edge, edgeC);
                } catch (UnificationFailedException ex) {
                    // The two edges cannot be combined.
                    continue;
                }
            }
            progressTable.put("completion " + i1, l2.size());
        }
        progressTable.put("completion", l1.size());
    }

    /**
     * Runs the resolution step, which is an extension of the standard Earley algorithm.
     *
     * @param progressTable This table captures the progress state in order to prevent from
     *     checking the same edges more than once.
     */
    private void resolve(Map<String, Integer> progressTable) {
        List<Edge> l1 = chart.getEdgesByEndPos(tokens.size());
        for (int i1 = progressTable.get("resolution") ; i1 < l1.size() ; i1++) {
            // During this loop, elements might be added to the end of the list l1.

            Edge edge = l1.get(i1);
            if (!edge.isActive()) continue;

            String n = edge.getNextActive().getName();
            List<Edge> newEdges = new ArrayList<Edge>();
            if (n.equals("#")) {
                Edge edgeC = edge.deepCopy();
                try {
                    String posId = positionIdentifierPrefix + tokens.size();
                    edgeC.getNextActive().getFeature("pos").unify(new StringRef(posId));
                    newEdges.add(edgeC);
                } catch (UnificationFailedException ignored) {
                    // The position is already bound to a different value; no new edge.
                }
            } else if (n.equals(">") || n.equals(">>") || n.equals("//")) {
                Edge edgeC = edge.deepCopy();
                edgeC.getNextActive().setFeature("*pos", tokens.size() + "");
                edgeC.addAntecedents(edgeC.getNextActive());
                newEdges.add(edgeC);
            } else if (n.equals("<")) {
                BackrefCategory bwrefCat = (BackrefCategory) edge.getNextActive();
                Category[] ante = edge.getCombinedAnteList();
                for (int i = ante.length-1 ; i >= 0 ; i--) {
                    if (ante[i].getName().equals("//")) continue;

                    boolean negMatch = false;
                    for (FeatureMap negfm : bwrefCat.getNegFeatureMaps()) {
                        if (ante[i].getFeatureMap().canUnify(negfm)) {
                            negMatch = true;
                            break;
                        }
                    }
                    if (negMatch) continue;

                    boolean posMatch = false;
                    List<FeatureMap> posfms = bwrefCat.getPosFeatureMaps();
                    for (int j = 0 ; j < posfms.size() ; j++) {
                        if (ante[i].getFeatureMap().canUnify(posfms.get(j))) {
                            try {
                                Edge edgeC = edge.deepCopy();
                                edgeC.getExternalAnteList()[i].getFeatureMap().unify(
                                        ((BackrefCategory) edgeC.getNextActive()).getPosFeatureMaps().get(j)
                                    );
                                // Backward references are recorded per token. At position 0
                                // there is no token the reference could be attached to.
                                if (!tokens.isEmpty()) {
                                    backwardReferences.get(tokens.size()-1).add(
                                            edgeC.getExternalAnteList()[i].getFeatureMap().deepCopy()
                                        );
                                }
                                newEdges.add(edgeC);
                                posMatch = true;
                            } catch (UnificationFailedException ignored) {
                                // The antecedent cannot be bound to this feature map.
                            }
                        }
                    }
                    if (posMatch) break;
                }
            } else if (n.equals("/<")) {
                Edge edgeC = edge.deepCopy();
                for (Category c : edge.getCombinedAnteList()) {
                    if (c.getName().equals("//")) continue;
                    if (c.getFeatureMap().canUnify(edge.getNextActive().getFeatureMap())) {
                        edgeC = null;
                        break;
                    }
                }
                if (edgeC != null) {
                    newEdges.add(edgeC);
                }
            } else {
                continue;
            }

            if (newEdges.isEmpty()) {
                if (debug) log.debug("CANNOT RESOLVE: {}", edge);
            }
            for (Edge newEdge : newEdges) {
                newEdge.step();
                boolean isNewEdge = chart.addEdge(newEdge);
                if (debug) log.debug("RESOLVE ({}): {} ---> {}", (isNewEdge ? "NEW" : "KNOWN"), edge, newEdge);
            }
        }
        progressTable.put("resolution", l1.size());
    }

    /**
     * Creates a new list that contains deep copies of the categories of the given list.
     *
     * @param list The list to be copied.
     * @return The new list with the copied categories.
     */
    private static List<Category> copyExceptionsList(List<Category> list) {
        List<Category> listCopy = new ArrayList<Category>();
        for (Category x : list) {
            listCopy.add(x.deepCopy());
        }
        return listCopy;
    }

}