/** * Copyright (c) 2010, Regents of the University of Colorado All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. Redistributions in binary * form must reproduce the above copyright notice, this list of conditions and * the following disclaimer in the documentation and/or other materials provided * with the distribution. Neither the name of the University of Colorado at * Boulder nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package clear.treebank; import clear.dep.DepLib; import clear.dep.DepNode; import clear.dep.DepTree; import com.carrotsearch.hppc.IntObjectOpenHashMap; import java.util.ArrayList; /** * This class provides APIs to convert phrase structure trees to dependency * trees in English. * * @author Jinho D. Choi <b>Last update:</b> 9/1/2010 */ public class TBKrConvert extends AbstractTBConvert { public TBKrConvert(TBHeadRules headrules) { g_headrules = headrules; } /** * @return a dependency tree converted from * <code>pTree</cdoe>. */ @Override public DepTree toDepTree(TBTree pTree) { p_tree = pTree; d_tree = new DepTree(); initDepTree(pTree.getRootNode()); setDepHeads(pTree.getRootNode()); configureConjunctions(); setDepRoot(); d_tree.checkTree(); return d_tree; } /** * Initializes * <code>tree</code> using the subtree of * <code>curr</code>. */ private void initDepTree(TBNode curr) { curr.pos = curr.pos.split(";")[0]; if (curr.isPhrase()) { for (TBNode child : curr.getChildren()) { initDepTree(child); } } else { DepNode node = new DepNode(); node.id = curr.terminalId + 1; node.lemma = curr.form; node.pos = curr.pos; d_tree.add(node); } } /** * Finds heads for all phrases. */ private void setDepHeads(TBNode curr) { if (!curr.isPhrase()) { return; } // traverse all subtrees for (TBNode child : curr.getChildren()) { setDepHeads(child); } // top-level constituent if (curr.isPos(TBLib.POS_TOP)) { return; } // find heads of all subtrees findHeads(curr); setDepHeadsAux(curr); } /** * Finds heads of all phrases under * <code>curr</code> using * <code>headrules</code>. * <code>beginId</code> inclusive, * <code>endId</code> exclusive. */ private void findHeads(TBNode curr) { TBHeadRule headrule = g_headrules.getHeadRule(curr.pos); ArrayList<TBNode> children = curr.getChildren(); if (children.size() == 1) { curr.headId = children.get(0).headId; return; } if (findHeadByTag(curr, headrule)) { return; } for (int i = 0; i < 4; i++) { if (findHeadByRule(curr, headrule, i)) { break; } } } private boolean findHeadByTag(TBNode curr, TBHeadRule headrule) { if (curr.tags == null) { return false; } ArrayList<TBNode> children = curr.getChildren(); ArrayList<TBNode> tNodes = new ArrayList<>(); String tag = (String) curr.tags.toArray()[0]; TBNode child; int i, size = children.size(); if (size == 2) { child = children.get(1); if (child.isPos(TBKrLib.POS_X + "|" + TBKrLib.POS_S) && child.isTag(tag) && !child.isPhrase()) { curr.headId = children.get(0).headId; return true; } } // curr.tag == child.tag for (i = size - 1; i >= 0; i--) { child = children.get(i); if (child.isTag(tag)) { if (child.isPos(curr.pos)) { curr.headId = child.headId; return true; } if (!child.isPos(TBKrLib.POS_X + "|" + TBKrLib.POS_L + "|" + TBKrLib.POS_R)) { tNodes.add(child); } } } // only one child with the same tag if ((size = tNodes.size()) == 1) { curr.headId = tNodes.get(0).headId; return true; } if (size > 1) { for (String rule : headrule.rules) { if (headrule.dir == -1) { for (i = size - 1; i >= 0; i--) { if (findHeadAux(curr, tNodes.get(i), rule, 3)) { return true; } } } else { for (i = 0; i < size; i++) { if (findHeadAux(curr, tNodes.get(i), rule, 3)) { return true; } } } } } return false; } private boolean findHeadByRule(TBNode curr, TBHeadRule headrule, int isSkip) { ArrayList<TBNode> children = curr.getChildren(); int i, size = children.size(); for (String rule : headrule.rules) { if (headrule.dir == -1) { for (i = 0; i < size; i++) { if (findHeadAux(curr, children.get(i), rule, isSkip)) { return true; } } } else { for (i = size - 1; i >= 0; i--) { if (findHeadAux(curr, children.get(i), rule, isSkip)) { return true; } } } } return false; } /** * This method is called by {@link TBKrConvert#findHeads(TBNode, TBHeadRules)} * and {@link TBKrConvert#findGapHeads(TBNode, TBHeadRules)}. */ private boolean findHeadAux(TBNode curr, TBNode child, String rule, int isSkip) { if (isSkip == 0) { if (child.isTag(TBKrLib.TAG_CNJ + "|" + TBKrLib.TAG_MOD) || child.isTag(TBKrLib.TAG_PRN) || child.isPos(TBKrLib.POS_L + "|" + TBKrLib.POS_R) || (child.isPos(TBKrLib.POS_X + "|" + TBKrLib.POS_S) && !child.isPhrase())) { return false; } } else if (isSkip == 1) { if (child.isTag(TBKrLib.TAG_PRN) || child.isPos(TBKrLib.POS_L + "|" + TBKrLib.POS_R) || (child.isPos(TBKrLib.POS_X + "|" + TBKrLib.POS_S) && !child.isPhrase())) { return false; } } else if (isSkip == 2) { if (child.isPos(TBKrLib.POS_L + "|" + TBKrLib.POS_R) || (child.isPos(TBKrLib.POS_X + "|" + TBKrLib.POS_S) && !child.isPhrase())) { return false; } } if (child.isRule(rule)) { curr.headId = child.headId; return true; } return false; } private void configureConjunctions() { IntObjectOpenHashMap<ArrayList<DepNode>> map = new IntObjectOpenHashMap<>(); ArrayList<DepNode> list; DepNode curr; for (int i = 1; i < d_tree.size(); i++) { curr = d_tree.get(i); if (curr.isDeprel(TBKrLib.TAG_CNJ)) { if (map.containsKey(curr.headId)) { list = map.get(curr.headId); } else { list = new ArrayList<>(); map.put(curr.headId, list); } list.add(curr); } } for (int key : map.keys().toArray()) { list = map.get(key); for (int i = 0; i < list.size() - 1; i++) { list.get(i).headId = list.get(i + 1).id; } } } /** * Assigns the root of the dependency tree. */ private void setDepRoot() { for (int i = 1; i < d_tree.size(); i++) { DepNode node = d_tree.get(i); if (node.headId == DepLib.NULL_HEAD_ID) { node.setHead(DepLib.ROOT_ID, DepLib.DEPREL_ROOT, 0); } } } private void setDepHeadsAux(TBNode curr) { ArrayList<TBNode> children = curr.getChildren(); TBNode child; for (int i = 0; i < children.size(); i++) { child = children.get(i); if (child.headId == curr.headId) { continue; } // if (hasHead(child.headId)) continue; String deprel = getDeprel(curr, child); setDependency(child.headId, curr.headId, deprel); } } private String getDeprel(TBNode parent, TBNode child) { String deprel; TBNode p = p_tree.getTerminalNodes().get(parent.headId); TBNode c = p_tree.getTerminalNodes().get(child.headId); if (child.isPos(TBKrLib.POS_L + "|" + TBKrLib.POS_R) || TBKrLib.isPunctuation(c.form)) { return TBKrLib.DEP_P; } if ((deprel = getXDeprel(child)) != null) { return deprel; } if ((deprel = getTagDeprel(child)) != null) { return deprel; } if ((deprel = getInferredDeprel(p, c)) != null) { return deprel; } return DepLib.DEPREL_DEP; } private String getXDeprel(TBNode child) { if (child.isPos(TBKrLib.POS_X + "|" + TBKrLib.POS_S) && !child.isPhrase()) { String tag = getTagDeprel(child); if (tag != null) { return TBKrLib.POS_X + "_" + tag; } else { return TBKrLib.POS_X; } } return null; } private String getTagDeprel(TBNode child) { if (child.tags != null) { String deprel = (String) child.tags.toArray()[0]; return deprel; } return null; } private String getInferredDeprel(TBNode p, TBNode c) { if (c.isPos(TBKrLib.POS_AP)) { return TBKrLib.DEP_ADV; } if (p.isPos(TBKrLib.POS_AP)) { return TBKrLib.DEP_AMOD; } if (p.isPos(TBKrLib.POS_DP)) { return TBKrLib.DEP_DMOD; } if (p.isPos(TBKrLib.POS_NP)) { return TBKrLib.DEP_NMOD; } if (p.isPos(TBKrLib.POS_VP + "|" + TBKrLib.POS_VNP + "|" + TBKrLib.POS_IP)) { return TBKrLib.DEP_VMOD; } // if (p.isPos(TBKrLib.POS_Q)) // return TBKrLib.DEP_QMOD; return null; } /** * Assigns the dependency head of the current node. */ private void setDependency(int currId, int headId, String deprel) { d_tree.setHead(currId + 1, headId + 1, deprel, 1); } TBNode getTagNode(TBNode root, TBNode c, String tag) { if (c.isTag(tag)) { return c; } TBNode parent = c.getParent(); while (parent != null && !parent.equals(root)) { if (parent.isTag(tag)) { return parent; } parent = parent.getParent(); } return null; } /** * @return true if the current node already has its dependency head. */ boolean hasHead(int currId) { return d_tree.get(currId + 1).hasHead; } }