package org.maltparser.core.syntaxgraph.ds2ps; import java.util.SortedMap; import org.maltparser.core.exception.MaltChainedException; import org.maltparser.core.helper.SystemLogger; import org.maltparser.core.io.dataformat.ColumnDescription; import org.maltparser.core.io.dataformat.DataFormatInstance; import org.maltparser.core.symbol.SymbolTable; import org.maltparser.core.symbol.SymbolTableHandler; import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; import org.maltparser.core.syntaxgraph.edge.Edge; import org.maltparser.core.syntaxgraph.headrules.HeadRules; import org.maltparser.core.syntaxgraph.node.DependencyNode; import org.maltparser.core.syntaxgraph.node.NonTerminalNode; import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; /** * * * @author Johan Hall */ public class LosslessMapping implements Dependency2PhraseStructure { private String DEPREL = "DEPREL"; private String PHRASE = "PHRASE"; private String HEADREL = "HEADREL"; private String ATTACH = "ATTACH"; private String CAT = "CAT"; private String EDGELABEL; private final char EMPTY_SPINE = '*'; private final String EMPTY_LABEL = "??"; private final char SPINE_ELEMENT_SEPARATOR = '|'; private final char LABEL_ELEMENT_SEPARATOR = '~'; private final char QUESTIONMARK = '?'; private String optionString; private HeadRules headRules; private DataFormatInstance dependencyDataFormatInstance; private DataFormatInstance phraseStructuretDataFormatInstance; private boolean lockUpdate = false; private int nonTerminalCounter; private StringBuilder deprel; private StringBuilder headrel; private StringBuilder phrase; public LosslessMapping(DataFormatInstance dependencyDataFormatInstance, DataFormatInstance phraseStructuretDataFormatInstance) { setDependencyDataFormatInstance(dependencyDataFormatInstance); setPhraseStructuretDataFormatInstance(phraseStructuretDataFormatInstance); deprel = new StringBuilder(); headrel = new StringBuilder(); phrase = new StringBuilder(); if (phraseStructuretDataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet().size() == 1) { for (ColumnDescription column : phraseStructuretDataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) { EDGELABEL = column.getName(); } } clear(); } public void clear() { nonTerminalCounter = 0; } public String getOptionString() { return optionString; } public void setOptionString(String optionString) { this.optionString = optionString; } public DataFormatInstance getDependencyDataFormatInstance() { return dependencyDataFormatInstance; } public void setDependencyDataFormatInstance( DataFormatInstance dependencyDataFormatInstance) { this.dependencyDataFormatInstance = dependencyDataFormatInstance; } public DataFormatInstance getPhraseStructuretDataFormatInstance() { return phraseStructuretDataFormatInstance; } public void setPhraseStructuretDataFormatInstance( DataFormatInstance phraseStructuretDataFormatInstance) { this.phraseStructuretDataFormatInstance = phraseStructuretDataFormatInstance; } public void update(MappablePhraseStructureGraph graph, Edge e, Object arg) throws MaltChainedException { if (lockUpdate == false) { // if (e.getType() == Edge.PHRASE_STRUCTURE_EDGE && e.getSource() instanceof NonTerminalNode && lockUpdate == false) { // if(e.getTarget() instanceof TerminalNode) { // PhraseStructureNode top = (PhraseStructureNode)e.getTarget(); // while (top.getParent() != null && ((NonTerminalNode)top.getParent()).getLexicalHead() == (PhraseStructureNode)e.getTarget()) { // top = top.getParent(); // } // updateDependenyGraph(graph, top); // } // else if (e.getSource().isRoot()) { // updateDependenyGraph(graph, graph.getPhraseStructureRoot()); // } // } if (e.getType() == Edge.DEPENDENCY_EDGE && e.getSource() instanceof DependencyNode && e.getTarget() instanceof DependencyNode) { if (e.isLabeled() && e.getLabelSet().size() == 4) { updatePhraseStructureGraph(graph, (Edge) e, false); } } } } public void updateDependenyGraph(MappablePhraseStructureGraph graph, PhraseStructureNode top) throws MaltChainedException { if (graph.nTokenNode() == 1 && graph.nNonTerminals() == 0) { // Special case when the root dominates direct a single terminal node Edge e = graph.addDependencyEdge(graph.getDependencyRoot(), graph.getDependencyNode(1)); e.addLabel(graph.getSymbolTables().getSymbolTable(DEPREL), graph.getDefaultRootEdgeLabelSymbol(graph.getSymbolTables().getSymbolTable(DEPREL))); e.addLabel(graph.getSymbolTables().getSymbolTable(HEADREL), graph.getDefaultRootEdgeLabelSymbol(graph.getSymbolTables().getSymbolTable(HEADREL))); e.addLabel(graph.getSymbolTables().getSymbolTable(PHRASE), "*"); // e.addLabel(graph.getSymbolTables().getSymbolTable(PHRASE), graph.getDefaultRootEdgeLabelSymbol(graph.getSymbolTables().getSymbolTable(PHRASE))); e.addLabel(graph.getSymbolTables().getSymbolTable(ATTACH), graph.getDefaultRootEdgeLabelSymbol(graph.getSymbolTables().getSymbolTable(ATTACH))); } else { updateDependencyEdges(graph, top); updateDependenyLabels(graph); } } private void updateDependencyEdges(MappablePhraseStructureGraph graph, PhraseStructureNode top) throws MaltChainedException { if (top == null) { return; } DependencyNode head; DependencyNode dependent = null; if (top instanceof NonTerminalNode) { for (PhraseStructureNode node : ((NonTerminalNode) top).getChildren()) { if (node instanceof NonTerminalNode) { updateDependencyEdges(graph, node); } else { head = ((NonTerminalNode) top).getLexicalHead(headRules); dependent = (DependencyNode) node; if (head != null && dependent != null && head != dependent) { lockUpdate = true; if (!dependent.hasHead()) { graph.addDependencyEdge(head, dependent); } else if (head != dependent.getHead()) { graph.moveDependencyEdge(head, dependent); } lockUpdate = false; } } } } head = null; if (top.getParent() != null) { head = ((NonTerminalNode) top.getParent()).getLexicalHead(headRules); } else if (top.isRoot()) { head = (DependencyNode) top; } if (top instanceof NonTerminalNode) { dependent = ((NonTerminalNode) top).getLexicalHead(headRules); } else if (!top.isRoot()) { dependent = (DependencyNode) top; } if (head != null && dependent != null && head != dependent) { lockUpdate = true; if (!dependent.hasHead()) { graph.addDependencyEdge(head, dependent); } else if (head != dependent.getHead()) { graph.moveDependencyEdge(head, dependent); } lockUpdate = false; } } private void updateDependenyLabels(MappablePhraseStructureGraph graph) throws MaltChainedException { for (int index : graph.getTokenIndices()) { PhraseStructureNode top = (PhraseStructureNode) graph.getTokenNode(index); while (top != null && top.getParent() != null && graph.getTokenNode(index) == ((NonTerminalNode) top.getParent()).getLexicalHead(headRules)) { top = top.getParent(); } lockUpdate = true; labelDependencyEdge(graph, graph.getTokenNode(index).getHeadEdge(), top); lockUpdate = false; } } // private void updateDependenyLabels(MappablePhraseStructureGraph graph, PhraseStructureNode top) throws MaltChainedException { // if (top == null) { // return; // } // DependencyNode head = null; // DependencyNode dependent = null; // if (top instanceof NonTerminalNode) { // for (PhraseStructureNode node : ((NonTerminalNode)top).getChildren()) { // if (node instanceof NonTerminalNode) { // updateDependenyLabels(graph, node); // } else { // head = ((NonTerminalNode)top).getLexicalHead(headRules); // dependent = (DependencyNode)node; // if (head != null && dependent != null && head != dependent) { // lockUpdate = true; // if (dependent.hasHead()) { // Edge e = dependent.getHeadEdge(); // labelDependencyEdge(graph, e, node); // } // lockUpdate = false; // } // } // } // } // // dependent = null; // if (top instanceof NonTerminalNode) { // dependent = ((NonTerminalNode)top).getLexicalHead(headRules); // } // // if (dependent != null) { // lockUpdate = true; // if (dependent.hasHead()) { // Edge e = dependent.getHeadEdge(); // labelDependencyEdge(graph, e, top); // } // lockUpdate = false; // } // } private void labelDependencyEdge(MappablePhraseStructureGraph graph, Edge e, PhraseStructureNode top) throws MaltChainedException { if (e == null) { return; } SymbolTableHandler symbolTables = graph.getSymbolTables(); deprel.setLength(0); phrase.setLength(0); headrel.setLength(0); e.removeLabel(symbolTables.getSymbolTable(DEPREL)); e.removeLabel(symbolTables.getSymbolTable(HEADREL)); e.removeLabel(symbolTables.getSymbolTable(PHRASE)); e.removeLabel(symbolTables.getSymbolTable(ATTACH)); int i = 0; SortedMap<String, SymbolTable> edgeLabelSymbolTables = phraseStructuretDataFormatInstance.getPhraseStructureEdgeLabelSymbolTables(); SortedMap<String, SymbolTable> nodeLabelSymbolTables = phraseStructuretDataFormatInstance.getPhraseStructureNodeLabelSymbolTables(); if (!top.isRoot()) { for (String name : edgeLabelSymbolTables.keySet()) { if (top.hasParentEdgeLabel(symbolTables.getSymbolTable(name))) { deprel.append(top.getParentEdgeLabelSymbol(symbolTables.getSymbolTable(name))); } else { deprel.append(EMPTY_LABEL); } i++; if (i < edgeLabelSymbolTables.size()) { deprel.append(LABEL_ELEMENT_SEPARATOR); } } if (deprel.length() != 0) { e.addLabel(symbolTables.getSymbolTable(DEPREL), deprel.toString()); } } else { String deprelDefaultRootLabel = graph.getDefaultRootEdgeLabelSymbol(symbolTables.getSymbolTable(DEPREL)); if (deprelDefaultRootLabel != null) { e.addLabel(symbolTables.getSymbolTable(DEPREL), deprelDefaultRootLabel); } else { e.addLabel(symbolTables.getSymbolTable(DEPREL), EMPTY_LABEL); } } PhraseStructureNode tmp = (PhraseStructureNode) e.getTarget(); while (tmp != top && tmp.getParent() != null) { // && !tmp.getParent().isRoot()) { i = 0; for (String name : edgeLabelSymbolTables.keySet()) { if (tmp.hasParentEdgeLabel(symbolTables.getSymbolTable(name))) { headrel.append(tmp.getParentEdgeLabelSymbol(symbolTables.getSymbolTable(name))); } else { headrel.append(EMPTY_LABEL); } i++; if (i < edgeLabelSymbolTables.size()) { headrel.append(LABEL_ELEMENT_SEPARATOR); } } i = 0; headrel.append(SPINE_ELEMENT_SEPARATOR); for (String name : nodeLabelSymbolTables.keySet()) { if (tmp.getParent().hasLabel(symbolTables.getSymbolTable(name))) { phrase.append(tmp.getParent().getLabelSymbol(symbolTables.getSymbolTable(name))); } else { if (tmp.getParent().isRoot()) { String deprelDefaultRootLabel = graph.getDefaultRootEdgeLabelSymbol(symbolTables.getSymbolTable(PHRASE)); if (deprelDefaultRootLabel != null) { phrase.append(deprelDefaultRootLabel); } else { phrase.append(EMPTY_LABEL); } } else { phrase.append(EMPTY_LABEL); } } i++; if (i < nodeLabelSymbolTables.size()) { phrase.append(LABEL_ELEMENT_SEPARATOR); } } phrase.append(SPINE_ELEMENT_SEPARATOR); tmp = tmp.getParent(); } if (phrase.length() == 0) { headrel.append(EMPTY_SPINE); phrase.append(EMPTY_SPINE); } else { headrel.setLength(headrel.length() - 1); phrase.setLength(phrase.length() - 1); } e.addLabel(symbolTables.getSymbolTable(HEADREL), headrel.toString()); e.addLabel(symbolTables.getSymbolTable(PHRASE), phrase.toString()); int a = 0; tmp = (PhraseStructureNode) e.getSource(); while (top.getParent() != null && tmp.getParent() != null && tmp.getParent() != top.getParent()) { a++; tmp = tmp.getParent(); } e.addLabel(symbolTables.getSymbolTable(ATTACH), Integer.toString(a)); } public void connectUnattachedSpines(MappablePhraseStructureGraph graph) throws MaltChainedException { connectUnattachedSpines(graph, graph.getDependencyRoot()); if (!graph.getPhraseStructureRoot().isLabeled()) { graph.getPhraseStructureRoot().addLabel(graph.getSymbolTables().addSymbolTable(CAT), graph.getDefaultRootEdgeLabelSymbol(graph.getSymbolTables().getSymbolTable(PHRASE))); } } private void connectUnattachedSpines(MappablePhraseStructureGraph graph, DependencyNode depNode) throws MaltChainedException { if (!depNode.isRoot()) { PhraseStructureNode dependentSpine = (PhraseStructureNode) depNode; while (dependentSpine.getParent() != null) { dependentSpine = dependentSpine.getParent(); } if (!dependentSpine.isRoot()) { updatePhraseStructureGraph(graph, depNode.getHeadEdge(), true); } } for (int i = 0; i < depNode.getLeftDependentCount(); i++) { connectUnattachedSpines(graph, depNode.getLeftDependent(i)); } for (int i = depNode.getRightDependentCount() - 1; i >= 0; i--) { connectUnattachedSpines(graph, depNode.getRightDependent(i)); } } public void updatePhraseStructureGraph(MappablePhraseStructureGraph graph, Edge depEdge, boolean attachHeadSpineToRoot) throws MaltChainedException { PhraseStructureNode dependentSpine = (PhraseStructureNode) depEdge.getTarget(); if (((PhraseStructureNode) depEdge.getTarget()).getParent() == null) { // Restore dependent spine String phraseSpineLabel = null; String edgeSpineLabel = null; int empty_label = 0; if (depEdge.hasLabel(graph.getSymbolTables().getSymbolTable(PHRASE))) { phraseSpineLabel = depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(PHRASE)); } if (depEdge.hasLabel(graph.getSymbolTables().getSymbolTable(HEADREL))) { edgeSpineLabel = depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(HEADREL)); } if (phraseSpineLabel != null && phraseSpineLabel.length() > 0 && phraseSpineLabel.charAt(0) != EMPTY_SPINE) { int ps = 0, es = 0, i = 0, j = 0, n = phraseSpineLabel.length() - 1, m = edgeSpineLabel.length() - 1; PhraseStructureNode child = (PhraseStructureNode) depEdge.getTarget(); while (true) { while (i <= n && phraseSpineLabel.charAt(i) != SPINE_ELEMENT_SEPARATOR) { if (phraseSpineLabel.charAt(i) == QUESTIONMARK) { empty_label++; } else { empty_label = 0; } i++; } if (depEdge.getSource().isRoot() && i >= n) { dependentSpine = graph.getPhraseStructureRoot(); } else { dependentSpine = graph.addNonTerminalNode(++nonTerminalCounter); } if (empty_label != 2 && ps != i) { dependentSpine.addLabel(graph.getSymbolTables().addSymbolTable(CAT), phraseSpineLabel.substring(ps, i)); } empty_label = 0; if (edgeSpineLabel != null) { while (j <= m && edgeSpineLabel.charAt(j) != SPINE_ELEMENT_SEPARATOR) { if (edgeSpineLabel.charAt(j) == QUESTIONMARK) { empty_label++; } else { empty_label = 0; } j++; } } lockUpdate = true; Edge e = graph.addPhraseStructureEdge(dependentSpine, child); if (empty_label != 2 && es != j && edgeSpineLabel != null && e != null) { e.addLabel(graph.getSymbolTables().addSymbolTable(EDGELABEL), edgeSpineLabel.substring(es, j)); } else if (es == j) { e.addLabel(graph.getSymbolTables().addSymbolTable(EDGELABEL), EMPTY_LABEL); } lockUpdate = false; child = dependentSpine; if (i >= n) { break; } empty_label = 0; ps = i = i + 1; es = j = j + 1; } } // Recursively attach the dependent spines to target node. DependencyNode target = (DependencyNode) depEdge.getTarget(); for (int i = 0; i < target.getLeftDependentCount(); i++) { updatePhraseStructureGraph(graph, target.getLeftDependent(i).getHeadEdge(), attachHeadSpineToRoot); } for (int i = target.getRightDependentCount() - 1; i >= 0; i--) { updatePhraseStructureGraph(graph, target.getRightDependent(i).getHeadEdge(), attachHeadSpineToRoot); } } else { // If dependent spine already exist, then set dependentSpine to the highest nonterminal // of the dependent spine. while (dependentSpine.getParent() != null && !dependentSpine.getParent().isRoot()) { dependentSpine = dependentSpine.getParent(); } } PhraseStructureNode headSpine; if (((PhraseStructureNode) depEdge.getSource()).getParent() != null) { // If head spine exist, then attach dependent spine to the head spine at the attachment level a. int a = 0; headSpine = ((PhraseStructureNode) depEdge.getSource()).getParent(); if (depEdge.hasLabel(graph.getSymbolTables().getSymbolTable(ATTACH))) { try { a = Integer.parseInt((depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(ATTACH)))); } catch (NumberFormatException e) { throw new MaltChainedException(e.getMessage()); } } for (int i = 0; i < a && headSpine != null; i++) { headSpine = headSpine.getParent(); } if ((headSpine == null || headSpine == dependentSpine) && attachHeadSpineToRoot) { headSpine = graph.getPhraseStructureRoot(); } if (headSpine != null) { lockUpdate = true; Edge e = graph.addPhraseStructureEdge(headSpine, dependentSpine); if (depEdge.hasLabel(graph.getSymbolTables().getSymbolTable(DEPREL)) && !depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(DEPREL)).equals(EMPTY_LABEL) & e != null) { e.addLabel(graph.getSymbolTables().addSymbolTable(EDGELABEL), depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(DEPREL))); } lockUpdate = false; } } else if (depEdge.getSource().isRoot() && !depEdge.isLabeled()) { headSpine = graph.getPhraseStructureRoot(); lockUpdate = true; Edge e = graph.addPhraseStructureEdge(headSpine, dependentSpine); if (depEdge.hasLabel(graph.getSymbolTables().getSymbolTable(DEPREL)) && !depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(DEPREL)).equals(EMPTY_LABEL) & e != null) { e.addLabel(graph.getSymbolTables().addSymbolTable(EDGELABEL), depEdge.getLabelSymbol(graph.getSymbolTables().getSymbolTable(DEPREL))); } else { e.addLabel(graph.getSymbolTables().addSymbolTable(EDGELABEL), graph.getDefaultRootEdgeLabelSymbol(graph.getSymbolTables().getSymbolTable(DEPREL))); } lockUpdate = false; // Recursively attach the dependent spines to target node. DependencyNode target = (DependencyNode) depEdge.getTarget(); for (int i = 0; i < target.getLeftDependentCount(); i++) { updatePhraseStructureGraph(graph, target.getLeftDependent(i).getHeadEdge(), attachHeadSpineToRoot); } for (int i = target.getRightDependentCount() - 1; i >= 0; i--) { updatePhraseStructureGraph(graph, target.getRightDependent(i).getHeadEdge(), attachHeadSpineToRoot); } } } public HeadRules getHeadRules() { return headRules; } public void setHeadRules(HeadRules headRules) { this.headRules = headRules; } public void setHeadRules(String headRulesURL) throws MaltChainedException { if (headRulesURL != null && headRulesURL.length() > 0 && !headRulesURL.equals("*")) { headRules = new HeadRules(SystemLogger.logger(), phraseStructuretDataFormatInstance); headRules.parseHeadRules(headRulesURL); } } }