/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.Tag;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.IntPair;
/**
* A StanfordAnnotator-object creates most of the annotations for the StanfordParser component.
* <p>
* The code has been moved away from the Parser component because it is also used by other
* components (e.g. Transformations)
*
*/
public class StanfordAnnotator
{
private static final String DEPPACKAGE = Dependency.class.getPackage().getName() + ".";
/**
* The separator that is used by Tsurgeon-Operations to separate additional tags from
* node-labels.
*/
private static final String TAG_SEPARATOR = "#";
private TreeWithTokens tokenTree = null;
private JCas jCas = null;
private MappingProvider posMappingProvider;
private MappingProvider constituentMappingProvider;
public TreeWithTokens getTokenTree()
{
return tokenTree;
}
public void setTokenTree(TreeWithTokens aTokenTree)
{
tokenTree = aTokenTree;
}
public JCas getJCas()
{
return jCas;
}
public void setJCas(JCas aJCas)
{
jCas = aJCas;
}
public void setPosMappingProvider(MappingProvider aPosMappingProvider)
{
posMappingProvider = aPosMappingProvider;
}
public void setConstituentMappingProvider(MappingProvider aConstituentMappingProvider)
{
constituentMappingProvider = aConstituentMappingProvider;
}
public StanfordAnnotator(TreeWithTokens aTokenTree)
throws CASException
{
setTokenTree(aTokenTree);
setJCas(aTokenTree.getTokens().get(0).getCAS().getJCas());
}
/**
* Creates linked constituent annotations, POS annotations and lemma-annotations.
* <p>
* Note: The annotations are directly written to the indexes of the CAS.
*
* @param aTreebankLanguagePack
* the language pack.
* @param aCreatePos
* whether to create POS annotations.
*/
public void createConstituentAnnotationFromTree(TreebankLanguagePack aTreebankLanguagePack,
boolean aCreatePos)
{
createConstituentAnnotationFromTree(aTreebankLanguagePack, tokenTree.getTree(), null,
aCreatePos);
}
/**
* Creates linked constituent annotations + POS annotations
*
* @param aTreebankLanguagePack
* the language pack.
* @param aNode
* the source tree
* @param aParentFS
* the parent annotation
* @param aCreatePos
* sets whether to create or not to create POS tags
* @return the child-structure (needed for recursive call only)
*/
private Annotation createConstituentAnnotationFromTree(
TreebankLanguagePack aTreebankLanguagePack, Tree aNode, Annotation aParentFS,
boolean aCreatePos)
{
String nodeLabelValue = aNode.value();
String syntacticFunction = null;
AbstractTreebankLanguagePack tlp = (AbstractTreebankLanguagePack) aTreebankLanguagePack;
int gfIdx = nodeLabelValue.indexOf(tlp.getGfCharacter());
if (gfIdx > 0) {
syntacticFunction = nodeLabelValue.substring(gfIdx + 1);
nodeLabelValue = nodeLabelValue.substring(0, gfIdx);
}
// calculate span for the current subtree
IntPair span = tokenTree.getSpan(aNode);
// Check if the node has been marked by a TSurgeon operation.
// If so, add a tag-annotation on the constituent
if (nodeLabelValue.contains(TAG_SEPARATOR) && !nodeLabelValue.equals(TAG_SEPARATOR)) {
int separatorIndex = nodeLabelValue.indexOf(TAG_SEPARATOR);
String tag = nodeLabelValue.substring(0, separatorIndex);
nodeLabelValue = nodeLabelValue.substring(separatorIndex + 1, nodeLabelValue.length());
createTagAnnotation(span.getSource(), span.getTarget(), tag);
}
// Check if node is a constituent node on sentence or phrase-level
if (aNode.isPhrasal()) {
// add annotation to annotation tree
Constituent constituent = createConstituentAnnotation(span.getSource(),
span.getTarget(), nodeLabelValue, syntacticFunction);
// link to parent
if (aParentFS != null) {
constituent.setParent(aParentFS);
}
// Do we have any children?
List<Annotation> childAnnotations = new ArrayList<Annotation>();
for (Tree child : aNode.getChildrenAsList()) {
Annotation childAnnotation = createConstituentAnnotationFromTree(
aTreebankLanguagePack, child, constituent, aCreatePos);
if (childAnnotation != null) {
childAnnotations.add(childAnnotation);
}
}
// Now that we know how many children we have, link annotation of
// current node with its children
FSArray children = new FSArray(jCas, childAnnotations.size());
int curChildNum = 0;
for (FeatureStructure child : childAnnotations) {
children.set(curChildNum, child);
curChildNum++;
}
constituent.setChildren(children);
// write annotation for current node to index
jCas.addFsToIndexes(constituent);
return constituent;
}
// If the node is a word-level constituent node (== POS):
// create parent link on token and (if not turned off) create POS tag
else if (aNode.isPreTerminal()) {
// create POS-annotation (annotation over the token)
POS pos = createPOSAnnotation(span.getSource(), span.getTarget(), nodeLabelValue);
// in any case: get the token that is covered by the POS
// TODO how about multi word prepositions etc. (e.g. "such as")
List<Token> coveredTokens = JCasUtil.selectCovered(jCas, Token.class, pos);
// the POS should only cover one token
assert coveredTokens.size() == 1;
Token token = coveredTokens.get(0);
// only add POS to index if we want POS-tagging
if (aCreatePos) {
jCas.addFsToIndexes(pos);
token.setPos(pos);
}
// link token to its parent constituent
if (aParentFS != null) {
token.setParent(aParentFS);
}
return token;
}
else {
throw new IllegalArgumentException("Node must be either phrasal nor pre-terminal");
}
}
/**
* Creates a tag-annotation over a constituent
*
* @param aBegin
* start-index of the constituent span
* @param aEnd
* end-index of the constituent span
* @param aTag
* the tag value
*/
public void createTagAnnotation(int aBegin, int aEnd, String aTag)
{
Tag newTag = new Tag(jCas, aBegin, aEnd);
newTag.setValue(aTag);
jCas.addFsToIndexes(newTag);
}
/**
* Creates a new Constituent annotation. Links to parent- and child-annotations are not yet
* created here.
*
* @param aBegin
* start-index of the constituent span
* @param aEnd
* end-index of the constituent span
* @param aConstituentType
* the constituent type
* @param aSyntacticFunction
* the syntactic function
* @return the annotation
*/
public Constituent createConstituentAnnotation(int aBegin, int aEnd, String aConstituentType,
String aSyntacticFunction)
{
// create the necessary objects and methods
Type constType = constituentMappingProvider.getTagType(aConstituentType);
Constituent constAnno = (Constituent) jCas.getCas().createAnnotation(constType, aBegin, aEnd);
constAnno.setConstituentType(aConstituentType);
constAnno.setSyntacticFunction(aSyntacticFunction);
return constAnno;
}
/**
* Creates a new Constituent annotation. Links to parent- and child-annotations are not yet
* created here.
*
* @param aBegin
* start-index of the constituent span
* @param aEnd
* end-index of the constituent span
* @param aPosType
* the constituent type
* @return the annotation
*/
public POS createPOSAnnotation(int aBegin, int aEnd, String aPosType)
{
// get mapping for DKPro-Typesystem
Type type = posMappingProvider.getTagType(aPosType);
// create instance of the desired type
POS anno = (POS) jCas.getCas().createAnnotation(type, aBegin, aEnd);
// save original (unmapped) postype in feature
anno.setPosValue(aPosType);
anno.setCoarseValue(anno.getClass().equals(POS.class) ? null
: anno.getType().getShortName().intern());
return anno;
}
public Dependency createDependencyAnnotation(GrammaticalRelation aDependencyType,
Token aGovernor, Token aDependent)
{
return createDependencyAnnotation(jCas, aDependencyType, aGovernor, aDependent);
}
/**
* Writes dependency annotations to the JCas
*
* @param jCas
* a CAS.
* @param aDependencyType
* the dependency type
* @param aGovernor
* the governing-word
* @param aDependent
* the dependent-word
* @return the newly created dependency annotation.
*/
public static Dependency createDependencyAnnotation(JCas jCas, GrammaticalRelation aDependencyType,
Token aGovernor, Token aDependent)
{
// create the necessary objects and methods
String dependencyTypeName = DEPPACKAGE + aDependencyType.getShortName().toUpperCase();
Type type = jCas.getTypeSystem().getType(dependencyTypeName);
if (type == null) {
// Fall back to generic type. If we used a mapping provider, we'd do that too.
type = JCasUtil.getType(jCas, Dependency.class);
// throw new IllegalStateException("Type [" + dependencyTypeName + "] mapped to tag ["
// + dependencyType + "] is not defined in type system");
}
Dependency dep = (Dependency) jCas.getCas().createFS(type);
dep.setDependencyType(aDependencyType.toString());
dep.setGovernor(aGovernor);
dep.setDependent(aDependent);
dep.setBegin(dep.getDependent().getBegin());
dep.setEnd(dep.getDependent().getEnd());
dep.addToIndexes();
return dep;
}
/**
* Creates annotation with Penn Treebank style representations of the syntax tree
*
* @param aBegin
* start offset.
* @param aEnd
* end offset.
*/
public void createPennTreeAnnotation(int aBegin, int aEnd)
{
Tree t = tokenTree.getTree();
// write Penn Treebank-style string to cas
PennTree pTree = new PennTree(jCas, aBegin, aEnd);
// create tree with simple labels and get penn string from it
t = t.deepCopy(t.treeFactory(), StringLabel.factory());
pTree.setPennTree(t.pennString());
pTree.addToIndexes();
}
/**
* Recovers annotations from a Stanford Tree-Object, which have been saved within the CoreLabel
* of the tree.
*<p>
* Note:
* Copying has to be done in batch, because we need to have ALL annotations that should be
* recovered together when copying them. The reason is that some annotations reference each
* other, which can cause problem if a referenced annotation has not yet been recovered.
*/
public void recoverAnnotationsFromNodes()
{
// create batch-copy list for recovered annotations
List<Annotation> annoList = new ArrayList<Annotation>();
Iterator<Tree> treeIterator = tokenTree.getTree().iterator();
CAS srcCAS = null;
while (treeIterator.hasNext()) {
Tree curTree = treeIterator.next();
// get the collection from the label of the best-fitting node in
// which we store UIMA annotations
Collection<Annotation> annotations = ((CoreLabel) curTree.label())
.get(UIMAAnnotations.class);
// do we have any annotations stored in the node?
if (annotations != null && annotations.size() > 0) {
// translate values which are now relative to the
// node-span back to absolute value (depending on the
// new offset of the node-span within the new CAS)
IntPair span = tokenTree.getSpan(curTree);
// iterate over all annotations
for (Annotation curAnno : annotations) {
srcCAS = srcCAS == null ? curAnno.getCAS() : srcCAS;
// TODO using the SPAN as new annotation index might not
// be correct in all cases - if not an EXACTLY MATCHING
// node had been found for the saved annotation, this will
// be wrong. Find a way to incorporate the anno-index here
curAnno.setBegin(span.getSource());
curAnno.setEnd(span.getTarget());
// add anno to batch-copy list
annoList.add(curAnno);
} // endfor iterate over annotations
} // endif check for annotations in node
} // endwhile iterate over subtrees
/*
* Now that we have gathered all annotations from the tree, batch-copy them to the new CAS
*/
// create CasRecoverer (=adapted version of the CasCopier)
CasCopier copier = new CasCopier(srcCAS, jCas.getCas());
// now batch-copy the annos
List<Annotation> copiedAnnos = copier.batchCopyAnnotations(annoList);
// add copied annos to indexes
for (Annotation cAnno : copiedAnnos) {
jCas.addFsToIndexes(cAnno);
}
}
}