/** * Copyright (c) 2007, Regents of the University of Colorado All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. Redistributions in binary * form must reproduce the above copyright notice, this list of conditions and * the following disclaimer in the documentation and/or other materials provided * with the distribution. Neither the name of the University of Colorado at * Boulder nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package clear.treebank; import clear.dep.srl.SRLHead; import clear.propbank.PBLoc; import com.carrotsearch.hppc.IntOpenHashSet; import java.util.ArrayList; import java.util.BitSet; import java.util.HashSet; import java.util.StringTokenizer; /** * Treebank node. * * @author Jinho D. Choi <b>Last update:</b> 9/1/2010 */ public class TBNode { /** * Word-form of the node */ public String form; /** * Part-of-speech tag of the node */ public String pos; /** * Function tags of the node */ public HashSet<String> tags; /** * Co-index (e.g., NP-1) */ public int coIndex; /** * Gapping index (e.g., NP=1) */ public int gapIndex; /** * Terminal index (counting ECs), starting from 0 */ public int terminalId; /** * Token index (not counting ECs), starting from 0 */ public int tokenId; /** * Head index, derived by headrules */ public int headId; /** * Child index among its siblings */ public int childId; /** * If empty category, store its antecedent. */ public TBNode antecedent; /** * PropBank location of this node */ public PBLoc pbLoc; /** * Roleset ID if exist */ public String rolesetId; /** * Parent node */ protected TBNode nd_parent; /** * List of children nodes */ protected ArrayList<TBNode> ls_children; /** * PropBank heads */ protected ArrayList<SRLHead> pb_heads; /** * PropBank arguments (if this node is a predicate) */ // protected ArrayList<SRLArg> pb_args; /** * Initializes the node with its parent node and pos-tag. */ public TBNode(TBNode parent, String postag) { form = null; tags = null; coIndex = -1; gapIndex = -1; terminalId = -1; tokenId = -1; headId = -1; childId = -1; antecedent = null; pbLoc = null; rolesetId = null; nd_parent = parent; ls_children = null; pb_heads = null; // pb_args = null; init(postag); } /** * Initializes pos-tags, function-tags, co-index, and gap-index. */ private void init(String postag) { if (postag.matches("-([A-Z])+-")) { pos = postag; return; } StringTokenizer tok = new StringTokenizer(postag, "-=~", true); pos = tok.nextToken(); OUTER: while (tok.hasMoreTokens()) { String op = tok.nextToken(); switch (op) { case "=": if (tok.hasMoreTokens()) { gapIndex = Integer.parseInt(tok.nextToken()); } else { break OUTER; } break; case "-": if (tok.hasMoreTokens()) { String tag = tok.nextToken(); if (tag.matches("\\d*")) { coIndex = Integer.parseInt(tag); } else { if (tags == null) { tags = new HashSet<>(); } tags.add(tag); } } else { break OUTER; } break; } } } /** * Returns true if the word-form of this node is * <code>form</code>. If the node is a phrase, returns false. */ public boolean isForm(String regex) { return this.form != null && this.form.matches(regex); } /** * Returns true if the rule applies to this node. If * <code>rule</code> starts with '-', it compares the function tag; * otherwise, compares the pos-tag. */ public boolean isRule(String rule) { if (rule.charAt(0) == '-') { return isTag(rule.substring(1)); } else { return isPos(rule); } } /** * @return true if the pos-tag of this node is * <code>pos</code> in regular expression (e.g., NN.*|VB). */ public boolean isPos(String regex) { return pos.matches(regex); } /** * @return true is the function tag of this node is * <code>tag</code>. */ public boolean isTag(String tag) { return tags != null && tags.contains(tag); } /** * @return true if the node is an empty category. */ public boolean isEmptyCategory() { return pos.equals(TBLib.POS_NONE); } /** * @return true if the node contains only empty category, recursively. */ public boolean isEmptyCategoryRec() { return isEmptyCategoryRecAux(this); } /** * This method is called from {@link TBNode#isEmptyCategoryRec()}. */ private boolean isEmptyCategoryRecAux(TBNode curr) { if (!curr.isPhrase()) { return curr.isEmptyCategory(); } for (TBNode child : curr.getChildren()) { if (!isEmptyCategoryRecAux(child)) { return false; } } return true; } public boolean isToken() { return tokenId != -1; } /** * @return true if the node is a phrase. */ public boolean isPhrase() { return ls_children != null; } /** * @return true if this node is a phrase containing * <code>pos</code> as a pos-tag of its children. */ public boolean containsPos(String pos) { if (!isPhrase()) { return false; } for (TBNode child : ls_children) { if (child.isPos(pos)) { return true; } } return false; } /** * @return true if this node is a phrase containing * <code>tag</code> as a function-tag of its children. */ public boolean containsTag(String tag) { if (!isPhrase()) { return false; } for (TBNode child : ls_children) { if (child.isTag(tag)) { return true; } } return false; } /** * @param regex word-form * @return true if this node contains the word-form. */ public boolean containsForm(String regex) { return containsFormAux(this, regex); } private boolean containsFormAux(TBNode node, String regex) { if (!node.isPhrase()) { return node.isForm(regex); } for (TBNode child : node.getChildren()) { if (containsFormAux(child, regex)) { return true; } } return false; } /** * @return true if the node contains a gap-node. */ public boolean containsGap() { if (!isPhrase()) { return false; } for (TBNode child : ls_children) { if (child.gapIndex != -1) { return true; } } return false; } /** * Returns a child with a gap-index of * <code>index</code>. If there is no such child, returns null. */ public TBNode getGapNode(int index) { return getGapNodeAux(index, this); } /** * This method is called from {@link TBNode#getGapNode(int)}. */ private TBNode getGapNodeAux(int index, TBNode curr) { if (!curr.isPhrase()) { return null; } for (TBNode child : curr.getChildren()) { if (child.coIndex == index || child.gapIndex == index) { return curr; } TBNode node = getGapNodeAux(index, child); if (node != null) { return node; } } return null; } /** * @return the number of children whose pos-tag is * <code>pos</code>. */ public int countsPos(String pos) { if (!isPhrase()) { return 0; } int count = 0; for (TBNode child : ls_children) { if (child.isPos(pos)) { count++; } } return count; } /** * Returns the parent node. If there is none, returns null. */ public TBNode getParent() { return nd_parent; } /** * Returns the list of children nodes. If there is no child, returns null. */ public ArrayList<TBNode> getChildren() { return ls_children; } /** * Assigns * <code>form</code> to {@link TBNode#form}. If * <code>form</code> is a bracket-tag, convert it to its actual word-form * (e.g., "-LRB-" to "("). */ public void setForm(String form) { form = form.replaceAll("-LRB-", "("); form = form.replaceAll("-LSB-", "["); form = form.replaceAll("-LCB-", "{"); form = form.replaceAll("-RRB-", ")"); form = form.replaceAll("-RSB-", "]"); form = form.replaceAll("-RCB-", "}"); this.form = form; } /** * Sets the parent node to * <code>parent</code>. */ public void setParent(TBNode parent) { nd_parent = parent; } /** * Adds a child node. */ public void addChild(TBNode child) { if (ls_children == null) { ls_children = new ArrayList<>(); } child.childId = ls_children.size(); ls_children.add(child); } /** * Returns word-forms of the node's subtree. */ public String toWords() { return toWordsAux(this); } /** * This method is called from {@link TBNode#toWords()}. */ private String toWordsAux(TBNode curr) { if (curr.isPhrase()) { StringBuilder build = new StringBuilder(); for (TBNode child : curr.getChildren()) { build.append(toWordsAux(child)); build.append(" "); } return build.toString().trim(); } else { return curr.form; } } /** * @return pos-tags of the node's children. */ public String toPosTags() { StringBuilder build = new StringBuilder(); for (TBNode child : ls_children) { build.append(child.pos).append("-").append(child.tags); build.append(" "); } return build.toString(); } /** * @return pos-tags of the node's children. */ public String toPosWords() { StringBuilder build = new StringBuilder(); for (TBNode child : ls_children) { build.append("("); build.append(child.pos); build.append(" "); build.append(child.toWords()); build.append(")"); build.append(" "); } return build.toString(); } public ArrayList<TBNode> getSubTerminalNodes() { ArrayList<TBNode> list = new ArrayList<>(); getSubTerminalNodesAux(this, list); return list; } private void getSubTerminalNodesAux(TBNode node, ArrayList<TBNode> list) { if (node.isPhrase()) { for (TBNode child : node.getChildren()) { getSubTerminalNodesAux(child, list); } } else { list.add(node); } } public IntOpenHashSet getSubTermainlIDs() { IntOpenHashSet set = new IntOpenHashSet(); getSubTerminalIDsAux(this, set); return set; } private void getSubTerminalIDsAux(TBNode node, IntOpenHashSet set) { if (node.isPhrase()) { for (TBNode child : node.getChildren()) { getSubTerminalIDsAux(child, set); } } else { set.add(node.terminalId); } } /** * @return the bitset of terminal indices of the subtree of this node. */ public BitSet getSubTerminalBitSet() { BitSet set = new BitSet(); getSubTerminalBitSetAux(this, set); return set; } /** * This method is called from {@link TBTree#getSubTerminalBitSet()}. */ private void getSubTerminalBitSetAux(TBNode node, BitSet set) { if (node.getChildren() == null) { set.set(node.terminalId); } else { for (TBNode child : node.getChildren()) { getSubTerminalBitSetAux(child, set); } } } /** * Returns the bitset of token indices of the subtree of the current node. * Each index gets added by * <code>offset</code> (e.g., if * <code>offset</code> is 1, [0,1,2] becomes [1,2,3]). */ public BitSet getSubTokenBitSet(int offset) { BitSet set = new BitSet(); getSubTokenBitSetAux(this, set, offset); return set; } /** * This method is called from {@link TBNode#getSubTokenBitSet(int)}. */ private void getSubTokenBitSetAux(TBNode node, BitSet set, int offset) { if (node.getChildren() == null) { int tokenIndex = node.tokenId + offset; if (tokenIndex >= offset) { set.set(tokenIndex); } } else { for (TBNode child : node.getChildren()) { getSubTokenBitSetAux(child, set, offset); } } } /** * If this node is an empty category, return the coIndex of its antecedent. */ public int getEmptyCategoryCoIndex() { if (isEmptyCategory()) { int idx = form.lastIndexOf('-'); if (idx >= 0) { return Integer.parseInt(form.substring(idx + 1)); } } return -1; } public String getTags() { StringBuilder build = new StringBuilder(); build.append(pos); if (coIndex != -1) { build.append("-"); build.append(coIndex); } if (gapIndex != -1) { build.append("="); build.append(gapIndex); } if (tags != null) { for (String tag : tags) { build.append("-"); build.append(tag); } } return build.toString(); } public boolean addPBHead(SRLHead sHead) { if (pb_heads == null) { pb_heads = new ArrayList<>(); } for (SRLHead head : pb_heads) { if (head.equals(sHead)) { return false; } } return pb_heads.add(sHead); } public ArrayList<SRLHead> getPBHeads() { return pb_heads; } /* * public boolean addPBArg(SRLArg sArg) { if (pb_args == null) pb_args = new * ArrayList<SRLArg>(); * * for (SRLArg arg : pb_args) { if (arg.isLabel(sArg.label)) return false; } * * return pb_args.add(sArg); } */ public String getSentenceGroup() { return getSentenceGroupAux(this); } private String getSentenceGroupAux(TBNode node) { if (node.isPos("S.*")) { return node.pos; } if (node.getParent() == null) { return null; } return getSentenceGroupAux(node.getParent()); } public boolean isPrior(String pos) { if (nd_parent == null) { return false; } TBNode parent = getParent(); ArrayList<TBNode> siblings = parent.getChildren(); for (int i = childId - 1; i >= 0; i--) { if (siblings.get(i).isPos(pos)) { return true; } } return false; } public boolean isSucceededBy(String pos) { if (nd_parent == null) { return false; } TBNode parent = getParent(); ArrayList<TBNode> siblings = parent.getChildren(); if (childId - 1 >= 0 && siblings.get(childId - 1).isPos(pos)) { return true; } return false; } public boolean isFollowedBy(String pos) { if (nd_parent == null) { return false; } TBNode parent = getParent(); ArrayList<TBNode> siblings = parent.getChildren(); for (int i = childId + 1; i < siblings.size(); i++) { if (siblings.get(i).isPos(pos)) { return true; } } return false; } /** * @return complementizer terminal node. If there is no such node, return * null. */ public TBNode getComplementizer() { for (TBNode node : getSubTerminalNodes()) { if (node.isPos("W.*|-NONE-") || TBEnLib.isComplementizer(node.form)) { return node; } } return isPos("WH.*") ? this : null; } public TBNode getIncludedEmptyCategory(String regex) { for (TBNode node : getSubTerminalNodes()) { if (node.isForm(regex)) { return node; } } return null; } public boolean hasAntecedent() { return antecedent != null; } public TBNode getIncludedEmptyCategory() { for (TBNode node : getSubTerminalNodes()) { if (node.isEmptyCategory()) { return node; } } return null; } public int getMaxHeight() { TBNode curr = nd_parent; int height = 0; while (curr != null) { height++; curr = curr.getParent(); } return height; } }