/** * Copyright (c) 2010, Regents of the University of Colorado All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. Redistributions in binary * form must reproduce the above copyright notice, this list of conditions and * the following disclaimer in the documentation and/or other materials provided * with the distribution. Neither the name of the University of Colorado at * Boulder nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package clear.treebank; import clear.dep.DepLib; import clear.dep.DepNode; import clear.dep.DepTree; import clear.dep.srl.SRLHead; import clear.dep.srl.SRLInfo; import clear.morph.MorphEnAnalyzer; import clear.propbank.PBLoc; import com.carrotsearch.hppc.IntIntOpenHashMap; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; /** * This class provides APIs to convert phrase structure trees to dependency * trees in English. * * @author Jinho D. Choi <b>Last update:</b> 9/1/2010 */ public class TBEnConvert extends AbstractTBConvert { private boolean b_funcTag; private boolean b_ec; private boolean b_reverseVC; private MorphEnAnalyzer g_morph; public TBEnConvert(TBHeadRules headrules, MorphEnAnalyzer morph, boolean funcTag, boolean ec, boolean reverseVC) { g_headrules = headrules; g_morph = morph; b_funcTag = funcTag; b_ec = ec; b_reverseVC = reverseVC; } /** * @return a dependency tree converted from * <code>pTree</cdoe>. */ @Override public DepTree toDepTree(TBTree pTree) { p_tree = pTree; d_tree = new DepTree(); initDepTree(pTree.getRootNode()); setDepHeads(pTree.getRootNode()); setDepRoot(); remapEmptyCategory(); DepTree copy = removeEmptyCategories(); if (b_ec) { lemmatizeEC(copy); } // if (b_ec) relocatePROs(copy); copy.projectizePunc(); copy.checkTree(); return copy; } /** * @return a semantic role labeling tree converted from * <code>pTree</code>. */ public DepTree toSRLTree(TBTree pTree) { p_tree = pTree; d_tree = new DepTree(); reconfigureAntecedents(); initSRLTree(pTree.getRootNode()); setDepHeads(pTree.getRootNode()); setDepRoot(); remapEmptyCategory(); p_tree.mapSRLTree(d_tree); // mapPBLocToDep(); DepTree copy = removeEmptyCategories(); // if (b_ec) relocatePROs(copy); if (b_ec) { lemmatizeEC(copy); } postProp(copy); copy.projectizePunc(); copy.checkTree(); return copy; } /** * Initializes * <code>tree</code> using the subtree of * <code>curr</code>. */ private void initDepTree(TBNode curr) { if (curr.isPhrase()) { for (TBNode child : curr.getChildren()) { initDepTree(child); } } else { DepNode node = new DepNode(); node.id = curr.terminalId + 1; node.form = curr.form; node.pos = curr.pos; node.lemma = (g_morph != null) ? g_morph.getLemma(node.form, curr.pos) : node.form.toLowerCase(); d_tree.add(node); } } private void initSRLTree(TBNode curr) { initDepTree(curr); DepNode node; for (int i = 1; i < d_tree.size(); i++) { node = d_tree.get(i); node.srlInfo = new SRLInfo(); node.pbLoc = new PBLoc[2]; node.pbLoc[0] = new PBLoc("", -1, -1); node.pbLoc[1] = new PBLoc("", -1, -1); } } /** * Finds heads for all phrases. */ private void setDepHeads(TBNode curr) { if (!curr.isPhrase()) { return; } // traverse all subtrees for (TBNode child : curr.getChildren()) { setDepHeads(child); } // top-level constituent if (curr.isPos(TBLib.POS_TOP)) { return; } // find heads of all subtrees findHeads(curr); if (isCoordination(curr)) { setCoordination(curr); } else if (curr.isPos(TBEnLib.POS_NP + "|" + TBEnLib.POS_NX + "|" + TBEnLib.POS_NML)) { setApposition(curr); } setGap(curr); reconfigureHead(curr); setDepHeadsAux(curr); } /** * Finds heads of all phrases under * <code>curr</code> using * <code>headrules</code>. * <code>beginId</code> inclusive, * <code>endId</code> exclusive. */ private void findHeads(TBNode curr) { TBHeadRule headrule = g_headrules.getHeadRule(curr.pos); ArrayList<TBNode> children = curr.getChildren(); if (children.size() == 1) { curr.headId = children.get(0).headId; return; } if (headrule == null) { System.err.println("Rules not found for [POS=" + curr.pos + "]"); return; } for (String rule : headrule.rules) { if (headrule.dir == -1) { for (int i = 0; i < children.size(); i++) { if (findHeadsAux(curr, children.get(i), rule)) { return; } } } else { for (int i = children.size() - 1; i >= 0; i--) { if (findHeadsAux(curr, children.get(i), rule)) { return; } } } } // head not found (because all children are either empty-category or punctuation if (curr.headId < 0) { if (headrule.dir == -1) { curr.headId = children.get(0).headId; } else { curr.headId = children.get(children.size() - 1).headId; } } } /** * This method is called by {@link TBEnConvert#findHeads(TBNode, TBHeadRules)} * and {@link TBEnConvert#findGapHeads(TBNode, TBHeadRules)}. */ private boolean findHeadsAux(TBNode curr, TBNode child, String rule) { if (curr.isPos(TBEnLib.POS_NP) && (child.isTag(TBEnLib.TAG_BNF) || child.isTag(TBEnLib.TAG_DIR) || child.isTag(TBEnLib.TAG_LOC) || child.isTag(TBEnLib.TAG_MNR) || child.isTag(TBEnLib.TAG_PRP) || child.isTag(TBEnLib.TAG_TMP))) { return false; } if (child.isRule(rule) && !TBEnLib.isPunctuation(child.pos) && !child.isEmptyCategoryRec() && !child.isPos(TBEnLib.POS_EDITED) && !isAuxMod(curr, child)) { curr.headId = child.headId; return true; } return false; } private boolean isAuxMod(TBNode curr, TBNode child) { if (b_reverseVC && child.form != null && (TBEnLib.isAux(child.form) || child.isPos(TBEnLib.POS_MD))) { ArrayList<TBNode> children = curr.getChildren(); for (int i = child.childId + 1; i < children.size(); i++) { if (children.get(i).isPos(TBEnLib.POS_VP)) { return true; } } } return false; } /** * @return true if * <code>curr</code> consists of coordination structure. */ private boolean isCoordination(TBNode curr) { return curr.isPos(TBEnLib.POS_UCP) || curr.containsPos(TBEnLib.POS_CC) || curr.containsPos(TBEnLib.POS_CONJP) || curr.containsTag(TBEnLib.TAG_ETC); } /** * Reconstructs heads for coordinations. */ private void setCoordination(TBNode curr) { ArrayList<TBNode> children = curr.getChildren(); for (int i = children.size() - 2; i >= 0; i--) { TBNode conj = children.get(i); if (!TBEnLib.isConjunction(conj.pos)) { continue; } TBNode prev = getConjunct(children, i, false, -1); TBNode next = getConjunct(children, i, false, 1); if (prev == null) { break; } if (next == null) { continue; } if (!setCoordinationAux(curr, conj, prev, next)) { prev = getConjunct(children, i, true, -1); next = getConjunct(children, i, true, 1); if (prev == null) { break; } if (next == null) { continue; } setCoordinationAux(curr, conj, prev, next); } i = prev.childId; } } private TBNode getConjunct(ArrayList<TBNode> children, int id, boolean more, int dir) { String skip1 = TBEnLib.POS_PRN + "|" + TBEnLib.POS_INTJ + "|" + TBEnLib.POS_EDITED + "|" + TBEnLib.POS_META + "|" + TBEnLib.POS_CODE; String skip2 = TBEnLib.POS_ADVP + "|" + TBEnLib.POS_SBAR; for (int i = id + dir; 0 <= i && i < children.size(); i += dir) { TBNode node = children.get(i); if (!TBEnLib.isConjunction(node.pos) && !TBEnLib.isPunctuation(node.pos) && !node.isEmptyCategoryRec() && !node.isPos(skip1) && !(more && node.isPos(skip2))) { return node; } } return null; } /** * Set dependencies for coordination structure. */ private boolean setCoordinationAux(TBNode curr, TBNode conj, TBNode prev, TBNode next) { ArrayList<TBNode> children = curr.getChildren(); if (curr.isPos(TBEnLib.POS_UCP) || prev.isPos(next.pos) || next.isTag(TBEnLib.TAG_ETC) || (TBEnLib.isWordConjunction(conj.pos) && next.childId == children.size() - 1) || (TBEnLib.isNounLike(prev.pos) && TBEnLib.isNounLike(next.pos)) || (TBEnLib.isAdjectiveLike(prev.pos) && TBEnLib.isAdjectiveLike(next.pos)) || (curr.isPos(TBEnLib.POS_WHADVP) && TBEnLib.isWhAdverbLike(prev.pos) && TBEnLib.isWhAdverbLike(next.pos))) { for (int i = prev.childId + 1; i <= conj.childId; i++) { TBNode node = children.get(i); setDependency(node.headId, prev.headId, getDeprel(curr, node)); if (TBEnLib.isWordConjunction(node.pos)) { prev = node; } } for (int i = conj.childId + 1; i <= next.childId - 1; i++) { TBNode node = children.get(i); setDependency(node.headId, next.headId, getDeprel(curr, node)); } DepNode dNode = d_tree.get(next.headId + 1); if (dNode.deprel.startsWith(DepLib.DEPREL_GAP)) { if (TBEnLib.isWordConjunction(prev.pos)) { setDependency(prev.headId, dNode.headId - 1, DepLib.DEPREL_COORD); setDependency(next.headId, prev.headId, dNode.deprel); } } else { setDependency(next.headId, prev.headId, DepLib.DEPREL_CONJ); } return true; } return false; } private void setGap(TBNode curr) { ArrayList<TBNode> children = curr.getChildren(); outer: for (int i = children.size() - 1; i >= 0; i--) { TBNode child = children.get(i); if (child.gapIndex == -1) { continue; } for (int j = i - 1; j >= 0; j--) { TBNode head = children.get(j); if (head.coIndex == child.gapIndex || head.gapIndex == child.gapIndex) { DepNode dNode = d_tree.get(child.headId + 1); if (dNode.isDeprel(DepLib.DEPREL_CONJ)) { dNode.deprel = DepLib.DEPREL_GAP; } else { setDependency(child.headId, head.headId, DepLib.DEPREL_GAP); } continue outer; } } ArrayList<TBNode> siblings = curr.getParent().getChildren(); for (int j = curr.childId - 1; j >= 0; j--) { TBNode head; if ((head = siblings.get(j).getGapNode(child.gapIndex)) != null) { String deprel = getTagDeprel(p_tree.getTerminalNodes().get(curr.headId).getParent(), null); if (deprel == null || !isFuncTag(deprel)) { deprel = ""; } else { deprel = "-" + deprel; } setDependency(curr.headId, head.headId, DepLib.DEPREL_GAP + deprel); return; } } } } private void setApposition(TBNode curr) { ArrayList<TBNode> children = curr.getChildren(); for (int i = children.size() - 3; i >= 0; i--) { TBNode fst = children.get(i); TBNode mid = children.get(i + 1); TBNode lst = children.get(i + 2); if (fst.isPos(TBEnLib.POS_NP) && mid.isPos(TBEnLib.POS_COMMA) && lst.isPos(TBEnLib.POS_NP)) { setDependency(lst.headId, fst.headId, DepLib.DEPREL_APPO); } } } /** * Assigns the root of the dependency tree. */ private void setDepRoot() { for (int i = 1; i < d_tree.size(); i++) { DepNode node = d_tree.get(i); if (node.headId == DepLib.NULL_HEAD_ID) { node.setHead(DepLib.ROOT_ID, DepLib.DEPREL_ROOT, 0); } } } private void reconfigureHead(TBNode curr) { BitSet set = curr.getSubTerminalBitSet(); DepNode tmp = d_tree.get(curr.headId + 1); while (tmp.hasHead && set.get(tmp.headId - 1)) { tmp = d_tree.get(tmp.headId); } curr.headId = tmp.id - 1; } private void setDepHeadsAux(TBNode curr) { ArrayList<TBNode> children = curr.getChildren(); TBNode child, prev; int i, j; outer: for (i = 0; i < children.size(); i++) { child = children.get(i); if (child.headId == curr.headId) { continue; } if (hasHead(child.headId)) { continue; } if (child.isPos(TBEnLib.POS_HYPH)) { for (j = i - 1; j >= 0; j--) { prev = children.get(j); if (!prev.isEmptyCategoryRec() && !TBEnLib.isPunctuation(prev.pos)) { setDependency(child.headId, prev.headId, DepLib.DEPREL_P); continue outer; } } } String deprel = getDeprel(curr, child); setDependency(child.headId, curr.headId, deprel); } } private String getDeprel(TBNode parent, TBNode child) { String deprel; TBNode p = p_tree.getTerminalNodes().get(parent.headId); TBNode c = p_tree.getTerminalNodes().get(child.headId); if ((deprel = getTagDeprel(child, p)) != null) { TBNode tNode = getTagNode(parent, p, TBEnLib.TAG_SBJ); if (deprel.equals(TBEnLib.TAG_PRD) && tNode != null) { return DepLib.DEPREL_OPRD; } if (isFuncTag(deprel)) { return deprel; } } if ((deprel = getObjectDeprel(parent, child, c)) != null) { return deprel; } if (TBEnLib.isWordConjunction(child.pos)) { return DepLib.DEPREL_COORD; } if (TBEnLib.isPunctuation(child.pos)) { return DepLib.DEPREL_P; } if (child.isPos(TBEnLib.POS_PRN + "|" + TBEnLib.POS_META)) { return child.pos; } if ((parent.isPos(TBEnLib.POS_VP) || TBEnLib.isSentence(parent.pos)) && child.isPos(TBEnLib.POS_PP + "|" + TBEnLib.POS_ADVP + "|" + TBEnLib.POS_SBAR + "|" + TBEnLib.POS_RB)) { return DepLib.DEPREL_ADV; } if (parent.isPos(TBEnLib.POS_VP) && (child.isPos(TBEnLib.POS_PRT) || c.isPos(TBEnLib.POS_RP))) { return DepLib.DEPREL_PRT; } if (p.isPos(TBEnLib.POS_TO) && child.isPos(TBEnLib.POS_VP)) { return DepLib.DEPREL_IM; } if (p.isPos(TBEnLib.POS_VB) && c.isPos(TBEnLib.POS_TO)) // when VC is reversed { return DepLib.DEPREL_IM; } if (b_reverseVC && TBEnLib.isAux(c.form) && p.isPos("VB.*")) { return DepLib.DEPREL_AUX; } if (b_reverseVC && c.isPos(TBEnLib.POS_MD) && p.isPos("VB.*")) { return DepLib.DEPREL_MOD; } if (parent.isPos(TBEnLib.POS_VP + "|" + TBEnLib.POS_SQ + "|" + TBEnLib.POS_SINV) && child.isPos(TBEnLib.POS_VP) && p_tree.getTerminalNode(child.headId).isPos("VB.*")) { return DepLib.DEPREL_VC; } if (parent.isPos(TBEnLib.POS_SBAR) && p.isPos(TBEnLib.POS_IN + "|" + TBEnLib.POS_TO + "|" + TBEnLib.POS_DT)) { return DepLib.DEPREL_SUB; } if (parent.isPos(TBEnLib.POS_NP + "|" + TBEnLib.POS_NX + "|" + TBEnLib.POS_NAC + "|" + TBEnLib.POS_NML + "|" + TBEnLib.POS_WHNP)) { return DepLib.DEPREL_NMOD; } if (parent.isPos(TBEnLib.POS_ADJP + "|" + TBEnLib.POS_ADVP + "|" + TBEnLib.POS_WHADJP + "|" + TBEnLib.POS_WHADVP)) { return DepLib.DEPREL_AMOD; } if (parent.isPos(TBEnLib.POS_PP + "|" + TBEnLib.POS_WHPP)) { return DepLib.DEPREL_PMOD; } if (parent.isPos(TBEnLib.POS_QP)) { return DepLib.DEPREL_QMOD; } if (child.isPos(TBEnLib.POS_INTJ) || c.isPos(TBEnLib.POS_UH)) { return DepLib.DEPREL_INTJ; } if (child.isPos(TBEnLib.POS_EDITED)) { return DepLib.DEPREL_EDIT; } if (child.isPos(TBEnLib.POS_CIT)) { return DepLib.DEPREL_CIT; } if (child.isPos(TBEnLib.POS_ADVP) || c.isPos(TBEnLib.POS_RB)) { return DepLib.DEPREL_ADV; } if (TBEnLib.isNounLike(parent.pos)) { return DepLib.DEPREL_NMOD; } return DepLib.DEPREL_DEP; } private boolean isFuncTag(String deprel) { return b_funcTag || deprel.matches(TBEnLib.TAG_SBJ + "|" + TBEnLib.TAG_LGS); } private String getTagDeprel(TBNode child, TBNode p) { if (p != null && (TBEnLib.isBe(p.form) || TBEnLib.isBecome(p.form)) && child.isTag(TBEnLib.TAG_PRD)) { return DepLib.DEPREL_PRD; } if (child.isTag(TBEnLib.TAG_SBJ)) { return DepLib.DEPREL_SBJ; } if (child.isPos(TBEnLib.POS_PP) && child.containsTag(TBEnLib.TAG_LGS)) { return DepLib.DEPREL_LGS; } if (child.isTag(TBEnLib.TAG_DTV)) { return DepLib.DEPREL_DTV; } // if (child.isTag(TBEnLib.TAG_CLF)) return DepLib.DEPREL_CLF; if (child.isTag(TBEnLib.TAG_EXT)) { return DepLib.DEPREL_EXT; } if (child.isTag(TBEnLib.TAG_LOC)) { return DepLib.DEPREL_LOC; } if (child.isTag(TBEnLib.TAG_TMP)) { return DepLib.DEPREL_TMP; } if (child.isPos(TBEnLib.POS_PP) && child.isTag(TBEnLib.TAG_BNF)) { return DepLib.DEPREL_BNF; } if (child.isTag(TBEnLib.TAG_DIR)) { return DepLib.DEPREL_DIR; } if (child.isTag(TBEnLib.TAG_MNR)) { return DepLib.DEPREL_MNR; } if (child.isTag(TBEnLib.TAG_PRP)) { return DepLib.DEPREL_PRP; } if (child.isTag(TBEnLib.TAG_SEZ)) { return DepLib.DEPREL_SEZ; } if (child.isTag(TBEnLib.TAG_VOC)) { return DepLib.DEPREL_VOC; } if (child.isTag(TBEnLib.TAG_PRD)) { return DepLib.DEPREL_PRD; } if (child.isTag(TBEnLib.TAG_ADV)) { return DepLib.DEPREL_ADV; } return null; } private String getObjectDeprel(TBNode parent, TBNode child, TBNode c) { if (!parent.isPos(TBEnLib.POS_VP)) { return null; } String deprel = getObjectDeprelAux(child, c); if (deprel != null) { return deprel; } if (child.isPos(TBEnLib.POS_UCP)) { deprel = getObjectDeprelAux(child, c); if (deprel != null) { return deprel; } } return null; } private String getObjectDeprelAux(TBNode child, TBNode c) { if (child.isPos(TBEnLib.POS_NP) || child.isPos(TBEnLib.POS_SBAR) && !(c.form.toLowerCase().matches("as|because|for|since|with")) || child.isPos(TBEnLib.POS_S + "|" + TBEnLib.POS_SQ + "|" + TBEnLib.POS_SINV + "|" + TBEnLib.POS_SBARQ)) { TBNode tNode = getTagNode(child, c, TBEnLib.TAG_PRD); if (tNode != null) { return DepLib.DEPREL_OPRD; } return c.isPos(TBEnLib.POS_TO + "|" + TBEnLib.POS_VBG + "|" + TBEnLib.POS_VBN) ? DepLib.DEPREL_OPRD : DepLib.DEPREL_OBJ; } return null; } private TBNode getTagNode(TBNode root, TBNode c, String tag) { if (c.isTag(tag)) { return c; } TBNode parent = c.getParent(); while (parent != null && !parent.equals(root)) { if (parent.isTag(tag)) { return parent; } parent = parent.getParent(); } return null; } /** * Redirects empty categories' antecedents. */ private void remapEmptyCategory() { HashSet<String> sRNR = new HashSet<>(); for (int i = d_tree.size() - 1; i >= 0; i--) { DepNode ec = d_tree.get(i); // checks for empty categories if (!ec.form.startsWith(TBEnLib.EC_EXP) && !ec.form.startsWith(TBEnLib.EC_ICH) && // !ec.form.startsWith(TBEnLib.EC_PPA) && !ec.form.startsWith(TBEnLib.EC_RNR) && !ec.form.startsWith(TBEnLib.EC_TRACE)) { continue; } // checks if there is co-index String[] tmp = ec.form.split("-"); if (tmp.length <= 1 || !tmp[1].matches("\\d*")) { continue; } // finds its antecedent int coIndex = Integer.parseInt(tmp[1]); TBNode antecedent = p_tree.getAntecedent(coIndex); if (antecedent == null) { continue; } DepNode ante = d_tree.get(antecedent.headId + 1); // if (ante.isPos(TBLib.POS_NONE)) continue; if (ante.id == ec.headId) { continue; } if (ec.form.startsWith(TBEnLib.EC_EXP)) { ante.deprel = DepLib.DEPREL_EXTR; continue; } if (ec.form.startsWith(TBEnLib.EC_RNR)) { if (sRNR.contains(ec.form)) { continue; } sRNR.add(ec.form); } if (d_tree.isAncestor(ante.id, ec.headId)) { if (ec.form.startsWith(TBEnLib.EC_RNR)) { for (DepNode node : d_tree.getDependents(ante.id)) { if (node.id == ec.headId || d_tree.isAncestor(node.id, ec.headId)) { node.setHead(ante.headId, ante.deprel, 1); break; } } } else { DepNode head = d_tree.get(ec.headId); while (head.headId != ante.id) { head = d_tree.get(head.headId); } head.setHead(ante.headId, ante.deprel, 1); } } String deprel = ec.deprel; while (ec.hasHead && d_tree.get(ec.headId).isPos(TBEnLib.POS_NONE)) { ec = d_tree.get(ec.headId); } ante.setHead(ec.headId, deprel, 1); } } /** * Removes all empty categories from * <code>tree</code>. * * @return dependency tree without empty categories. */ private DepTree removeEmptyCategories() { HashMap<Integer, Integer> map = new HashMap<>(); for (int i = 0, j = 0; i < d_tree.size(); i++) { DepNode node = d_tree.get(i); map.put(i, j); if (isNodeInclude(node, false)) { j++; } } DepTree copy = new DepTree(); for (int i = 1; i < d_tree.size(); i++) { DepNode node = d_tree.get(i); if (isNodeInclude(node, true)) { node.id = map.get(node.id); node.headId = map.get(node.headId); if (node.srlInfo != null) { for (SRLHead head : node.srlInfo.heads) { head.headId = map.get(head.headId); } } copy.add(node); } } return copy; } private boolean isNodeInclude(DepNode node, boolean isChange) { if (!node.isPos(TBLib.POS_NONE)) { return true; } if (!b_ec) { return false; } if (node.form.matches("\\*PRO\\*.*|\\*|\\*-\\d")) { DepNode head = d_tree.get(node.headId); if (head.isPosx("VB.*|TO") && node.id < head.id) { node.lemma = "*PRO*"; return true; } } else if (node.form.startsWith("0")) { TBNode parent = p_tree.getTerminalNode(node.id - 1).getParent(); if (parent.isPos(TBEnLib.POS_WHNP)) { node.lemma = "*REL*"; return true; } } /* * if (node.form.startsWith("*T*")) { TBNode ante = * p_tree.getAntecedent(Integer.parseInt(node.form.substring(node.form.lastIndexOf("-")+1))); * // System.out.println(node.id+" "+ante.pbLoc+" * "+d_tree.toString()+"\n"); // try {System.in.read();} catch * (IOException e) {} * * if (ante != null && ante.isPos(TBEnLib.POS_WHNP)) { if * (ante.isPhrase()) ante = ante.getChildren().get(0); * * if (ante.isPos(TBEnLib.POS_NONE) && ante.isForm("0")) { * System.out.println(node.id+" "+d_tree+"\n"); try {System.in.read();} * catch (IOException e) {} return true; } } } */ return false; } private void lemmatizeEC(DepTree tree) { DepNode node; for (int i = 1; i < tree.size(); i++) { node = tree.get(i); if (node.isPos(TBEnLib.POS_NONE)) { node.form = node.lemma; } } } private void postProp(DepTree tree) { HashSet<String> set = new HashSet<>(); DepNode node; String sub; for (int i = 1; i < tree.size(); i++) { node = tree.get(i); if (node.srlInfo != null) { for (SRLHead head : node.srlInfo.heads) { if (head.label.equals("C-V")) { continue; } if (head.label.startsWith("C-")) { sub = head.label.substring(2); if (!set.contains(head.headId + ":" + sub)) { head.label = sub; set.add(sub); } } else if (!head.label.startsWith("R-")) { set.add(head.headId + ":" + head.label); } } } } } /** * Assigns the dependency head of the current node. */ private void setDependency(int currId, int headId, String deprel) { d_tree.setHead(currId + 1, headId + 1, deprel, 1); } /** * @return true if the current node already has its dependency head. */ private boolean hasHead(int currId) { return d_tree.get(currId + 1).hasHead; } // =========================== SRL conversion =========================== private void reconfigureAntecedents() { ArrayList<TBNode> terminalNodes = p_tree.getTerminalNodes(); TBNode node, ante; for (int i = 0; i < terminalNodes.size(); i++) { node = terminalNodes.get(i); if (!node.hasAntecedent()) { continue; } ante = node.antecedent; while (ante.isEmptyCategoryRec()) { if (ante.isPhrase()) { ante = p_tree.getTerminalNode(ante.pbLoc.terminalId); } if (ante.hasAntecedent()) { ante = ante.antecedent; } else { break; } } node.antecedent = ante; } } /** * Saves the original propbank locations. */ void mapPBLocToDep() { setPBLocInDepAux(p_tree.getRootNode()); } private void setPBLocInDepAux(TBNode tNode) { if (tNode.headId >= 0) { DepNode dNode = d_tree.get(tNode.headId + 1); if (dNode.pbLoc[1].terminalId < 0 && !containsPredicate(tNode, dNode)) { dNode.pbLoc[1].set("", tNode.pbLoc.terminalId, tNode.pbLoc.height); } } if (tNode.isPhrase()) { for (TBNode child : tNode.getChildren()) { setPBLocInDepAux(child); } } else { DepNode dNode = d_tree.get(tNode.terminalId + 1); dNode.pbLoc[0].set("", tNode.pbLoc.terminalId, tNode.pbLoc.height); } } private boolean containsPredicate(TBNode tNode, DepNode dNode) { BitSet set = tNode.getSubTerminalBitSet(); for (SRLHead head : dNode.srlInfo.heads) { if (set.get(head.headId - 1)) { return true; } } return false; } void relocatePROs(DepTree tree) { DepNode node, head, inte; int i, j, size = tree.size(); ArrayList<DepNode> delList = new ArrayList<>(); for (i = 1; i < size; i++) { node = tree.get(i); if (!node.form.startsWith("*PRO*")) { continue; } node.form = node.lemma = (node.antecedent == null) ? "*pro*" : "*PRO*"; head = tree.get(node.headId); if (head.isPosx("VB.*") && head.id == node.id + 1) { continue; } if (node.srlInfo != null) { for (SRLHead tmp : node.srlInfo.heads) { if (node.id < tmp.headId) { head = tree.get(tmp.headId); break; } } } if (head.isPos("TO")) { for (j = head.id + 1; j < size; j++) { inte = tree.get(j); if (inte.headId == head.id && inte.isPosx("VB.*")) { head = inte; break; } } } if (!head.isPosx("VB.*") || head.id < node.id) { delList.add(node); } else { node.headId = head.id; tree.add(head.id, node); tree.remove(i); i = head.id; } } tree.removeAll(delList); IntIntOpenHashMap map = new IntIntOpenHashMap(); for (i = 1; i < tree.size(); i++) { node = tree.get(i); map.put(node.id, i); } for (i = 1; i < tree.size(); i++) { node = tree.get(i); node.id = i; node.headId = map.get(node.headId); if (node.srlInfo != null) { for (SRLHead tmp : node.srlInfo.heads) { tmp.headId = map.get(tmp.headId); } } } } }