/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.corenlp.internal;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.CorefChain.CorefMention;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.IntPair;
public class CoreNlp2DKPro
{
public static void convertPOSs(JCas aJCas, Annotation document,
MappingProvider mappingProvider, boolean internStrings)
{
for (CoreMap s : document.get(SentencesAnnotation.class)) {
for (CoreLabel t : s.get(TokensAnnotation.class)) {
Token token = t.get(TokenKey.class);
String tag = t.get(PartOfSpeechAnnotation.class);
Type tagType = mappingProvider.getTagType(tag);
POS anno = (POS) aJCas.getCas().createAnnotation(tagType, token.getBegin(),
token.getEnd());
anno.setPosValue(internStrings ? tag.intern() : tag);
anno.setCoarseValue(anno.getClass().equals(POS.class) ? null
: anno.getType().getShortName().intern());
anno.addToIndexes();
token.setPos(anno);
}
}
}
public static void convertNamedEntities(JCas aJCas, Annotation document,
MappingProvider mappingProvider, boolean internStrings)
{
for (CoreMap s : document.get(SentencesAnnotation.class)) {
for (CoreLabel t : s.get(TokensAnnotation.class)) {
Token token = t.get(TokenKey.class);
String tag = t.get(NamedEntityTagAnnotation.class);
// "O" is the hard-coded tag in CoreNLP to indicate no NER on this token
if ("O".equals(tag)) {
continue;
}
Type tagType = mappingProvider.getTagType(tag);
NamedEntity anno = (NamedEntity) aJCas.getCas().createAnnotation(tagType,
token.getBegin(), token.getEnd());
anno.setValue(internStrings ? tag.intern() : tag);
anno.addToIndexes();
}
}
}
public static void convertLemmas(JCas aJCas, Annotation document)
{
for (CoreMap s : document.get(SentencesAnnotation.class)) {
for (CoreLabel t : s.get(TokensAnnotation.class)) {
Token token = t.get(TokenKey.class);
String tag = t.get(LemmaAnnotation.class);
Lemma anno = new Lemma(aJCas, token.getBegin(), token.getEnd());
anno.setValue(tag);
anno.addToIndexes();
token.setLemma(anno);
}
}
}
public static void convertDependencies(JCas aJCas, Annotation document,
MappingProvider mappingProvider, boolean internStrings)
{
for (CoreMap s : document.get(SentencesAnnotation.class)) {
SemanticGraph graph = s.get(CollapsedDependenciesAnnotation.class);
//SemanticGraph graph = s.get(EnhancedDependenciesAnnotation.class);
// If there are no dependencies for this sentence, skip it. Might well mean we
// skip all sentences because normally either there are dependencies for all or for
// none.
if (graph == null) {
continue;
}
for (IndexedWord root : graph.getRoots()) {
Dependency dep = new ROOT(aJCas);
dep.setDependencyType("root");
dep.setDependent(root.get(TokenKey.class));
dep.setGovernor(root.get(TokenKey.class));
dep.setBegin(dep.getDependent().getBegin());
dep.setEnd(dep.getDependent().getEnd());
dep.setFlavor(DependencyFlavor.BASIC);
dep.addToIndexes();
}
for (SemanticGraphEdge edge : graph.edgeListSorted()) {
Token dependent = edge.getDependent().get(TokenKey.class);
Token governor = edge.getGovernor().get(TokenKey.class);
// For the type mapping, we use getShortName() instead, because the <specific>
// actually doesn't change the relation type
String labelUsedForMapping = edge.getRelation().getShortName();
// The nndepparser may produce labels in which the shortName contains a colon.
// These represent language-specific labels of the UD, cf:
// http://universaldependencies.github.io/docs/ext-dep-index.html
labelUsedForMapping = StringUtils.substringBefore(labelUsedForMapping, ":");
// Need to use toString() here to get "<shortname>_<specific>"
String actualLabel = edge.getRelation().toString();
Type depRel = mappingProvider.getTagType(labelUsedForMapping);
Dependency dep = (Dependency) aJCas.getCas().createFS(depRel);
dep.setDependencyType(internStrings ? actualLabel.intern() : actualLabel);
dep.setDependent(dependent);
dep.setGovernor(governor);
dep.setBegin(dep.getDependent().getBegin());
dep.setEnd(dep.getDependent().getEnd());
dep.setFlavor(edge.isExtra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC);
dep.addToIndexes();
}
}
}
public static void convertConstituents(JCas aJCas, Annotation aDocument,
MappingProvider aMappingProvider, boolean aInternStrings,
TreebankLanguagePack aTreebankLanguagePack)
{
for (CoreMap s : aDocument.get(SentencesAnnotation.class)) {
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
tree.setSpans();
List<CoreLabel> tokens = s.get(TokensAnnotation.class);
convertConstituentTreeNode(aJCas, aTreebankLanguagePack, tree, null, aInternStrings,
aMappingProvider, tokens);
}
}
private static org.apache.uima.jcas.tcas.Annotation convertConstituentTreeNode(JCas aJCas,
TreebankLanguagePack aTreebankLanguagePack, Tree aNode,
org.apache.uima.jcas.tcas.Annotation aParentFS, boolean internStrings,
MappingProvider constituentMappingProvider, List<CoreLabel> tokens)
{
// Get node label
String nodeLabelValue = aNode.value();
// Extract syntactic function from node label
String syntacticFunction = null;
AbstractTreebankLanguagePack tlp = (AbstractTreebankLanguagePack) aTreebankLanguagePack;
int gfIdx = nodeLabelValue.indexOf(tlp.getGfCharacter());
if (gfIdx > 0) {
syntacticFunction = nodeLabelValue.substring(gfIdx + 1);
nodeLabelValue = nodeLabelValue.substring(0, gfIdx);
}
// Check if node is a constituent node on sentence or phrase-level
if (aNode.isPhrasal()) {
Type constType = constituentMappingProvider.getTagType(nodeLabelValue);
IntPair span = aNode.getSpan();
int begin = tokens.get(span.getSource()).get(CharacterOffsetBeginAnnotation.class);
int end = tokens.get(span.getTarget()).get(CharacterOffsetEndAnnotation.class);
Constituent constituent = (Constituent) aJCas.getCas().createAnnotation(constType,
begin, end);
constituent.setConstituentType(internStrings ? nodeLabelValue.intern() :
nodeLabelValue);
constituent.setSyntacticFunction(internStrings && syntacticFunction != null ?
syntacticFunction.intern() : syntacticFunction);
constituent.setParent(aParentFS);
// Do we have any children?
List<org.apache.uima.jcas.tcas.Annotation> childAnnotations = new ArrayList<>();
for (Tree child : aNode.getChildrenAsList()) {
org.apache.uima.jcas.tcas.Annotation childAnnotation = convertConstituentTreeNode(
aJCas, aTreebankLanguagePack, child, constituent, internStrings,
constituentMappingProvider, tokens);
if (childAnnotation != null) {
childAnnotations.add(childAnnotation);
}
}
// Now that we know how many children we have, link annotation of
// current node with its children
constituent.setChildren(FSCollectionFactory.createFSArray(aJCas, childAnnotations));
constituent.addToIndexes();
return constituent;
}
// Create parent link on token
else if (aNode.isPreTerminal()) {
// link token to its parent constituent
List<Tree> children = aNode.getChildrenAsList();
assert children.size() == 1;
Tree terminal = children.get(0);
CoreLabel label = (CoreLabel) terminal.label();
Token token = label.get(TokenKey.class);
token.setParent(aParentFS);
return token;
}
else {
throw new IllegalArgumentException("Node must be either phrasal nor pre-terminal");
}
}
public static void convertPennTree(JCas aJCas, Annotation aDocument)
{
for (CoreMap s : aDocument.get(SentencesAnnotation.class)) {
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
int begin = s.get(CharacterOffsetBeginAnnotation.class);
int end = s.get(CharacterOffsetEndAnnotation.class);
// create tree with simple labels and get penn string from it
tree = tree.deepCopy(tree.treeFactory(), StringLabel.factory());
// write Penn Treebank-style string to cas
PennTree pTree = new PennTree(aJCas, begin, end);
pTree.setPennTree(tree.pennString());
pTree.addToIndexes();
}
}
public static void convertCorefChains(JCas aJCas, Annotation aDocument)
{
List<CoreMap> sentences = aDocument.get(SentencesAnnotation.class);
Map<Integer, CorefChain> chains = aDocument
.get(CorefCoreAnnotations.CorefChainAnnotation.class);
if (chains != null) {
for (CorefChain chain : chains.values()) {
CoreferenceLink last = null;
for (CorefMention mention : chain.getMentionsInTextualOrder()) {
CoreLabel beginLabel = sentences.get(mention.sentNum - 1)
.get(TokensAnnotation.class).get(mention.startIndex - 1);
CoreLabel endLabel = sentences.get(mention.sentNum - 1)
.get(TokensAnnotation.class).get(mention.endIndex - 2);
CoreferenceLink link = new CoreferenceLink(aJCas,
beginLabel.get(TokenKey.class).getBegin(),
endLabel.get(TokenKey.class).getEnd());
if (mention.mentionType != null) {
link.setReferenceType(mention.mentionType.toString());
}
if (last == null) {
// This is the first mention. Here we'll initialize the chain
CoreferenceChain corefChain = new CoreferenceChain(aJCas);
corefChain.setFirst(link);
corefChain.addToIndexes();
}
else {
// For the other mentions, we'll add them to the chain.
last.setNext(link);
}
last = link;
link.addToIndexes();
}
}
}
}
}