/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util; import java.util.Iterator; import java.util.List; import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.util.IntPair; /** * A wrapper object that manages a tree object together with the respective * Token annotations for the leafs of the tree. This is needed for being able to * map the leaves of the tree to words in a CAS. * * Trees in TreeWithToken-object are always converted to trees with * CoreLabel-type labels. * * */ public class TreeWithTokens { private Tree tree; private List<Token> tokens; public TreeWithTokens(Tree tree, List<Token> tokens) { setTree(tree); setTokens(tokens); } public void setTree(Tree tree) { if (!(tree.label() instanceof CoreLabel)) { tree = tree.deepCopy(tree.treeFactory(), CoreLabel.factory()); } tree.indexLeaves(); this.tree = tree; } public Tree getTree() { return tree; } public void setTokens(List<Token> tokens) { this.tokens = tokens; } public List<Token> getTokens() { return tokens; } /** * Returns the span of the documentText that is covered by this * TreeWithTokens. * * @return an IntPair describing the span of the documentText that is * covered by this tree */ public IntPair getSpan() { return getSpan(getTree()); } /** * Returns the span of the documentText that is covered by a given subtree, * that has to be taken directly from the original tree. * <p> * NOTE: Possibly we could make this more general to also support general * trees that are contained in the original tree, but are not directly taken * from it (i.e. with different leaf-numbering). In order to do so, we would * have to make a Tregex-Matching of the given subtree in the original tree * to identify the positition of the given subtree. * <p> * This could be achieved by translating the subtree into a Tregex pattern * and then matching this pattern against the original tree. * * @param subtree * a subtree of this TreeWithTokens (it has to be a real * subtree(!), because index numbering of subtree has to fit to * the numbering of the original tree) * @return an IntPair describing the span of the documentText that is * covered by this tree */ public IntPair getSpan(Tree subtree) { // TODO check if subtree is a real subtree of tokenTree.getTree() int nodeIndexLeft = ((CoreLabel) getLeftmostLeaf(subtree).label()) .index(); int nodeIndexRight = ((CoreLabel) getRightmostLeaf(subtree).label()) .index(); int a = tokens.get(nodeIndexLeft - 1).getBegin(); int b = tokens.get(nodeIndexRight - 1).getEnd(); return new IntPair(a, b); } private Tree getLeftmostLeaf(Tree t) { if (t.isLeaf()) { return t; } else { return getLeftmostLeaf(t.firstChild()); } } private Tree getRightmostLeaf(Tree t) { if (t.isLeaf()) { return t; } else { return getRightmostLeaf(t.lastChild()); } } /** * Finds the best-fitting node in the tree for a given annotation. * * The best-fitting node for an annotation is the deepest node in the tree * that still completely contains the span of the given annotation. * * TODO Could be done more efficiently, I think. In a recursive method, for * example, recursion could be stopped as soon as overlap becomes -1 * * @param anno * the annotation to find a best fit for * * @return the node of the tree that is the best fit for <code>anno</code> */ public Tree getBestFit(Annotation anno) { Tree curBestFit = null; int curBestOverlap = Integer.MAX_VALUE; Iterator<Tree> treeIterator = getTree().iterator(); while (treeIterator.hasNext()) { Tree curTree = treeIterator.next(); IntPair span = getSpan(curTree); // calc overlap: if annotation not completely contained in span of // subtree, overlap will be -1, otherwise it will be >0 // Our goal is to find the node with minimal positive overlap int overlap = -1; int leftBorder = anno.getBegin() - span.getSource(); int rightBorder = span.getTarget() - anno.getEnd(); if (!(leftBorder < 0) && !(rightBorder < 0)) { overlap = leftBorder + rightBorder; } // determine whether node is better than the temporary best fit if ((overlap > -1) && overlap < curBestOverlap) { curBestFit = curTree; curBestOverlap = overlap; } } return curBestFit; } }