/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.fit.util.JCasUtil.selectFollowing;
import static org.apache.uima.fit.util.JCasUtil.selectPreceding;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.uima.cas.CASException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreeReader;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.util.IntPair;

/**
 * Utility class for the StanfordParser: converts between UIMA parse annotations and Stanford
 * {@link Tree} objects and offers a few helpers for working with such trees.
 */
public class TreeUtils
{
    /**
     * Recursively creates an edu.stanford.nlp.trees.Tree from a ROOT annotation. It also saves
     * the whitespaces before and after a token as <code>CoreAnnotation.BeforeAnnotation</code>
     * and <code>CoreAnnotation.AfterAnnotation</code> in the respective label of the current
     * node. The tree is built with a {@link LabeledScoredTreeFactory} producing
     * {@link CoreLabel} labels.
     *
     * @param root
     *            the ROOT annotation
     * @return an {@link Tree} object representing the syntax structure of the sentence
     */
    public static Tree createStanfordTree(Annotation root)
    {
        return createStanfordTree(root, new LabeledScoredTreeFactory(CoreLabel.factory()));
    }

    /**
     * Recursively creates a Stanford tree from the given annotation using the given tree
     * factory.
     * <p>
     * A {@link Constituent} with at least one child becomes an inner node labelled with its
     * constituent type. Any other annotation is treated as a {@link Token} and becomes a
     * preterminal node (labelled with the token's POS value) over a single leaf.
     * </p>
     *
     * @param root
     *            the annotation to convert (a constituent or a token).
     * @param tFact
     *            the factory used to create nodes and leaves. The labels of the leaves it
     *            creates are cast to {@link CoreLabel}.
     * @return the tree rooted at the node created for {@code root}.
     * @throws IllegalStateException
     *             if the JCas cannot be obtained from the CAS of {@code root}; the original
     *             {@link CASException} is attached as the cause.
     * @throws ClassCastException
     *             if {@code root} is neither a constituent with children nor a {@link Token}.
     */
    public static Tree createStanfordTree(Annotation root, TreeFactory tFact)
    {
        JCas aJCas;
        try {
            aJCas = root.getCAS().getJCas();
        }
        catch (CASException e) {
            // Keep the original exception as the cause so the failure remains diagnosable
            throw new IllegalStateException("Unable to get JCas from JCas wrapper", e);
        }

        // Before we can create a node, we must check if we have any children (we have to know
        // whether to create a node or a leaf)
        if (root instanceof Constituent && !isLeaf((Constituent) root)) {
            Constituent node = (Constituent) root;

            // Convert the child annotations first, then create the node with its children
            FSArray children = node.getChildren();
            int childCount = children.size();
            List<Tree> childNodes = new ArrayList<Tree>(childCount);
            for (int i = 0; i < childCount; i++) {
                childNodes.add(createStanfordTree(node.getChildren(i), tFact));
            }
            return tFact.newTreeNode(node.getConstituentType(), childNodes);
        }

        // Leaves are always Token annotations
        return createPreterminal(aJCas, (Token) root, tFact);
    }

    /**
     * Creates the preterminal (POS) node for a token together with its single leaf. The POS is
     * not stored in the leaf itself, therefore a node carrying the POS value of the token is
     * inserted above the leaf. The leaf label receives the token, its text, the surrounding
     * whitespace, the NER value and the lemma.
     */
    private static Tree createPreterminal(JCas aJCas, Token token, TreeFactory tFact)
    {
        String text = token.getCoveredText();

        // Create the leaf node for the tree
        Tree wordNode = tFact.newLeaf(text);
        CoreLabel label = (CoreLabel) wordNode.label();
        label.set(TokenKey.class, token);
        label.set(TextAnnotation.class, text);

        // Whitespace before/after the token is derived from the distance to the neighbouring
        // tokens; without a neighbour the respective value is the empty string
        String before = "";
        List<Token> preceding = selectPreceding(aJCas, Token.class, token, 1);
        if (preceding.size() > 0) {
            before = spaces(token.getBegin() - preceding.get(0).getEnd());
        }

        String after = "";
        List<Token> following = selectFollowing(aJCas, Token.class, token, 1);
        if (following.size() > 0) {
            after = spaces(following.get(0).getBegin() - token.getEnd());
        }

        label.set(CoreAnnotations.BeforeAnnotation.class, before);
        label.set(CoreAnnotations.AfterAnnotation.class, after);

        // NER annotation: value of the first named entity covered by the token, "O" if none
        List<NamedEntity> nes = selectCovered(NamedEntity.class, token);
        if (nes.size() > 0) {
            label.setNER(nes.get(0).getValue());
        }
        else {
            label.setNER("O");
        }

        // Lemma annotation: fall back to the token text if the token has no lemma value
        String lemma = token.getLemmaValue();
        if (lemma != null) {
            label.setLemma(lemma);
        }
        else {
            label.setLemma(token.getText());
        }

        // Create the POS node in the tree and attach the word node to it
        return tFact.newTreeNode(token.getPosValue(), Arrays.asList(wordNode));
    }

    /**
     * Returns a string consisting of {@code count} space characters, or the empty string if
     * {@code count} is zero or negative.
     */
    private static String spaces(int count)
    {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < count; i++) {
            sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * <p>
     * Recreates a Stanford Tree from the StanfordParser annotations and saves all
     * non-StanfordParser-Annotations within the scope of the sentence in the label of the best
     * fitting node.
     * </p>
     *
     * <p>
     * <strong>CAUTION: </strong><i>This method is intended for the use by CAS Multipliers, which
     * create new CASes from this tree. The annotation-spans in the source-CAS will be changed!!!!!!
     * You do NOT want to use the source CAS after this method has been called. The
     * createStanfordTree()-method does not change the CAS, so use this instead, if the annotations
     * do not have to be recovered or accessed in the tree.</i>
     * </p>
     *
     * <p>
     * TODO: This behavior could be changed by making COPIES of the annotations and changing the
     * copies instead of the originals. However, in order to be able to make copies, a dummy CAS
     * must be introduced to which the annotations can be copied. When they are recovered, they will
     * be copied to the new destination CAS anyway.
     * </p>
     *
     * @param root
     *            the ROOT annotation
     * @return an {@link Tree} object representing the syntax structure of the sentence
     * @throws CASException
     *             if the JCas cannot be accessed.
     */
    public static Tree createStanfordTreeWithAnnotations(Annotation root)
        throws CASException
    {
        JCas aJCas = root.getCAS().getJCas();

        // Create tree
        Tree tree = createStanfordTree(root);

        // Collect all non-parser related annotations and all tokens (the tokens are needed for
        // span calculations later on).
        // Using selectCovered instead of iterating, because subiterators did not work in all
        // cases
        List<Annotation> nonParserAnnotations = new ArrayList<Annotation>();
        List<Token> tokens = new ArrayList<Token>();
        for (Annotation curAnno : JCasUtil.selectCovered(aJCas, Annotation.class, root)) {
            if (curAnno instanceof Token) {
                tokens.add((Token) curAnno);
            }
            else if (!isParserAnnotation(curAnno)) {
                nonParserAnnotations.add(curAnno);
            }
        }

        // Create wrapper for tree and its tokens
        TreeWithTokens annoTree = new TreeWithTokens(tree, tokens);

        /*
         * Add annotations to the best-fitting nodes. The best-fitting node for an annotation is the
         * deepest node in the tree that still completely contains the annotation.
         */
        for (Annotation curAnno : nonParserAnnotations) {
            Tree bestFittingNode = annoTree.getBestFit(curAnno);
            if (bestFittingNode == null) {
                continue;
            }

            // Translate the annotation span to a value relative to the node span.
            // NOTE: this modifies the annotation in the source CAS (see CAUTION above).
            IntPair span = annoTree.getSpan(bestFittingNode);
            curAnno.setBegin(curAnno.getBegin() - span.getSource());
            curAnno.setEnd(curAnno.getEnd() - span.getSource());

            // Get the collection from the label of the best-fitting node in which we store UIMA
            // annotations or create it, if it does not exist
            CoreLabel label = (CoreLabel) bestFittingNode.label();
            Collection<Annotation> annotations = label.get(UIMAAnnotations.class);
            if (annotations == null) {
                annotations = new ArrayList<Annotation>();
            }

            // Add the annotation to the collection and write it back to the node label
            annotations.add(curAnno);
            label.set(UIMAAnnotations.class, annotations);
        }

        return tree;
    }

    /**
     * Returns whether the annotation is one of the non-token types that are excluded from being
     * attached to tree nodes (POS, constituent, dependency, Penn tree, lemma, document
     * meta data).
     */
    private static boolean isParserAnnotation(Annotation anno)
    {
        return anno instanceof POS || anno instanceof Constituent || anno instanceof Dependency
                || anno instanceof PennTree || anno instanceof Lemma
                || anno instanceof DocumentMetaData;
    }

    /**
     * Returns whether the constituent has no children, i.e. its children array is either
     * {@code null} or empty.
     */
    private static boolean isLeaf(Constituent constituent)
    {
        FSArray children = constituent.getChildren();
        return children == null || children.size() == 0;
    }

    /**
     * Returns the sentence from its tree representation.
     *
     * @param penn
     *            the Penn Treebank-style string representation of the sentence
     * @return the sentence
     */
    public static String pennString2Words(String penn)
    {
        return tree2Words(pennString2Tree(penn));
    }

    /**
     * Returns the sentence from its tree representation. Each leaf contributes its value followed
     * by its trailing whitespace (<code>CoreAnnotations.AfterAnnotation</code>), or by a single
     * space if no whitespace information is stored in the leaf label.
     *
     * @param t
     *            the tree representation of the sentence; the leaf labels must be
     *            {@link CoreLabel}s.
     * @return the sentence
     */
    public static String tree2Words(Tree t)
    {
        StringBuilder buffer = new StringBuilder();

        for (Tree leaf : t.getLeaves()) {
            CoreLabel label = (CoreLabel) leaf.label();
            String word = label.get(CoreAnnotations.ValueAnnotation.class);

            // TODO maybe double check preceding whitespaces, because transformations could have
            // resulted in the situation that the trailing whitespaces of our last token is not
            // the same as the preceding whitespaces of our current token.
            // BUT: This has also to be done in getTokenListFromTree(...)

            buffer.append(word).append(trailingWhitespace(label));
        }

        return buffer.toString();
    }

    /**
     * Returns a list of Token annotations from a Tree-object. The token offsets are computed
     * from the leaf values and their trailing whitespace, starting at offset 0. The created
     * tokens are returned but not added to the CAS indexes by this method.
     *
     * @param aJCas
     *            a JCas.
     * @param t
     *            a tree; the leaf labels must be {@link CoreLabel}s.
     * @return the tokens.
     */
    public static List<Token> getTokenListFromTree(JCas aJCas, Tree t)
    {
        List<Token> tokenList = new ArrayList<Token>();
        int index = 0;
        for (Tree leaf : t.getLeaves()) {
            CoreLabel label = (CoreLabel) leaf.label();
            String word = label.get(CoreAnnotations.ValueAnnotation.class);

            int end = index + word.length();
            tokenList.add(new Token(aJCas, index, end));

            // The next token starts after the trailing whitespace of this one
            index = end + trailingWhitespace(label).length();
        }
        return tokenList;
    }

    /**
     * Returns the trailing whitespace stored in the label
     * (<code>CoreAnnotations.AfterAnnotation</code>). If no whitespace information is available,
     * a single space is returned; this may happen for nodes inserted by TSurgeon operations.
     */
    private static String trailingWhitespace(CoreLabel label)
    {
        String after = label.get(CoreAnnotations.AfterAnnotation.class);
        return after != null ? after : " ";
    }

    /**
     * Reimplementation of the indexLeaves() method of stanford tree objects. This method reindexes
     * already indexed trees starting with index 1. The method expects trees with
     * <code>CoreLabel</code>-type labels.
     *
     * @see edu.stanford.nlp.trees.Tree#indexLeaves()
     * @param t
     *            a tree with CoreLabel-type labels.
     */
    public static void reIndexLeaves(Tree t)
    {
        reIndexLeaves(t, 1);
    }

    /**
     * Assigns consecutive indexes to the leaves below {@code t} in child order, starting with
     * {@code startIndex}.
     *
     * @return the index to be used for the next leaf.
     */
    private static int reIndexLeaves(Tree t, int startIndex)
    {
        if (t.isLeaf()) {
            ((CoreLabel) t.label()).setIndex(startIndex);
            return startIndex + 1;
        }

        int nextIndex = startIndex;
        for (Tree child : t.children()) {
            nextIndex = reIndexLeaves(child, nextIndex);
        }
        return nextIndex;
    }

    /**
     * Reads in a Penn Treebank-style String and returns a tree.
     *
     * @param pennString
     *            A Penn Treebank-style String as produced by the StanfordParser
     * @return a tree representation of the PennString (LabeledScoredTree)
     * @throws IllegalStateException
     *             if reading the tree fails with an {@link IOException}.
     */
    public static Tree pennString2Tree(String pennString)
    {
        TreeReader tr = null;
        try {
            tr = new PennTreeReader(new StringReader(pennString),
                    new LabeledScoredTreeFactory());
            return tr.readTree();
        }
        catch (IOException e) {
            throw new IllegalStateException(e);
        }
        finally {
            closeQuietly(tr);
        }
    }
}