/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.corenlp.internal;

import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.fit.util.JCasUtil.selectFollowing;
import static org.apache.uima.fit.util.JCasUtil.selectPreceding;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.IndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.StemAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.CoreMap;

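/**
 * Converts DKPro Core annotations (sentences, tokens, POS tags, lemmas, stems, named entities,
 * constituents, and dependencies) from a {@link JCas} into a CoreNLP {@link Annotation}.
 * <p>
 * Minimal usage sketch (illustrative only; {@code jcas} is assumed to already contain
 * {@link Sentence} and {@link Token} annotations):
 *
 * <pre>{@code
 * DKPro2CoreNlp converter = new DKPro2CoreNlp();
 * converter.setPtb3Escaping(true);
 * Annotation document = converter.convert(jcas, new Annotation(jcas.getDocumentText()));
 * }</pre>
 */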
public class DKPro2CoreNlp
{
    private CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory();
    private boolean ptb3Escaping;
    private List<String> quoteBegin;
    private List<String> quoteEnd;
    private Charset encoding;
    private boolean readPos = true;

    public void setReadPos(boolean aReadPos)
    {
        readPos = aReadPos;
    }

    public boolean isReadPos()
    {
        return readPos;
    }

    public String getEncoding()
    {
        return encoding != null ? encoding.name() : null;
    }

    public void setEncoding(String aEncoding)
    {
        encoding = aEncoding != null ? Charset.forName(aEncoding) : null;
    }

    public boolean isPtb3Escaping()
    {
        return ptb3Escaping;
    }

    public void setPtb3Escaping(boolean aPtb3Escaping)
    {
        ptb3Escaping = aPtb3Escaping;
    }

    public List<String> getQuoteBegin()
    {
        return quoteBegin;
    }

    public void setQuoteBegin(List<String> aQuoteBegin)
    {
        quoteBegin = aQuoteBegin;
    }

    public List<String> getQuoteEnd()
    {
        return quoteEnd;
    }

    public void setQuoteEnd(List<String> aQuoteEnd)
    {
        quoteEnd = aQuoteEnd;
    }

    public Annotation convert(JCas aSource, Annotation aTarget)
    {
        // Document annotation
        aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText());

        // Sentences
        List<CoreMap> sentences = new ArrayList<>();
        for (Sentence s : select(aSource, Sentence.class)) {
            if (StringUtils.isBlank(s.getCoveredText())) {
                continue;
            }

            String sentenceText = s.getCoveredText();
            if (encoding != null && !"UTF-8".equals(encoding.name())) {
                sentenceText = new String(sentenceText.getBytes(StandardCharsets.UTF_8),
                        encoding);
            }

            Annotation sentence = new Annotation(sentenceText);
            sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
            sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
            sentence.set(SentenceIndexAnnotation.class, sentences.size());

            // Tokens
            Map<Token, IndexedWord> idxTokens = new HashMap<>();
            List<CoreLabel> tokens = new ArrayList<>();
            for (Token t : selectCovered(Token.class, s)) {
                String tokenText = t.getCoveredText();
                if (encoding != null && !"UTF-8".equals(encoding.name())) {
                    tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding);
                }

                CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(),
                        t.getEnd() - t.getBegin());
                // Add the token first so that tokens.size() yields the 1-based index required
                // by IndexAnnotation
                tokens.add(token);
                token.set(SentenceIndexAnnotation.class, sentences.size());
                token.set(IndexAnnotation.class, tokens.size());
                token.set(TokenKey.class, t);
                idxTokens.put(t, new IndexedWord(token));

                // POS tags
                if (readPos && t.getPos() != null) {
                    token.set(PartOfSpeechAnnotation.class, t.getPos().getPosValue());
                }

                // Lemma
                if (t.getLemma() != null) {
                    token.set(LemmaAnnotation.class, t.getLemma().getValue());
                }

                // Stem
                if (t.getStem() != null) {
                    token.set(StemAnnotation.class, t.getStem().getValue());
                }

                // Named entities
                // TODO: only token-based NEs are supported, not multi-token NEs.
                // Supporting multi-token NEs via selectCovering would be very slow. To support
                // them, another approach would need to be implemented, e.g. via indexCovering.
                List<NamedEntity> nes = selectCovered(NamedEntity.class, t);
                if (!nes.isEmpty()) {
                    token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue());
                }
                else {
                    token.set(NamedEntityTagAnnotation.class, "O");
                }
            }

            // Constituents
            for (ROOT r : selectCovered(ROOT.class, s)) {
                Tree tree = createStanfordTree(r, idxTokens);
                tree.indexSpans();
                sentence.set(TreeAnnotation.class, tree);
            }

            // Dependencies
            List<TypedDependency> dependencies = new ArrayList<>();
            for (Dependency d : selectCovered(Dependency.class, s)) {
                TypedDependency dep = new TypedDependency(
                        GrammaticalRelation.valueOf(d.getDependencyType()),
                        idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent()));
                if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) {
                    dep.setExtra();
                }
                dependencies.add(dep);
            }
            sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies));

            if (ptb3Escaping) {
                tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd);
            }

            sentence.set(TokensAnnotation.class, tokens);
            sentences.add(sentence);
        }
        aTarget.set(SentencesAnnotation.class, sentences);

        return aTarget;
    }

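    /**
     * Convenience overload that builds the tree with a {@link LabeledScoredTreeFactory}
     * producing {@link CoreLabel} labels.
     */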
    public static Tree createStanfordTree(ROOT root)
    {
        return createStanfordTree(root, new LabeledScoredTreeFactory(CoreLabel.factory()));
    }

    /**
     * Recursively creates an {@link edu.stanford.nlp.trees.Tree} from a ROOT annotation. It also
     * saves the whitespace before and after a token as
     * {@code CoreAnnotations.BeforeAnnotation} and {@code CoreAnnotations.AfterAnnotation} in the
     * respective label of the current node.
     *
     * @param root
     *            the ROOT annotation
     * @param aIdxTokens
     *            a mapping from DKPro Core tokens to Stanford {@link IndexedWord}s used for the
     *            leaf nodes; may be {@code null}
     * @return a {@link Tree} object representing the syntax structure of the sentence
     */
    public static Tree createStanfordTree(ROOT root, Map<Token, IndexedWord> aIdxTokens)
    {
        return createStanfordTree(root, new LabeledScoredTreeFactory(CoreLabel.factory()),
                aIdxTokens);
    }

    public static Tree createStanfordTree(org.apache.uima.jcas.tcas.Annotation root,
            TreeFactory tFact)
    {
        return createStanfordTree(root, tFact, null);
    }

    public static Tree createStanfordTree(org.apache.uima.jcas.tcas.Annotation root,
            TreeFactory tFact, Map<Token, IndexedWord> aIdxTokens)
    {
        JCas aJCas;
        try {
            aJCas = root.getCAS().getJCas();
        }
        catch (CASException e) {
            throw new IllegalStateException("Unable to get JCas from CAS", e);
        }

        // define the new (root) node
        Tree rootNode;

        // before we can create a node, we must check if we have any children (we have to know
        // whether to create a node or a leaf - not very dynamic)
        if (root instanceof Constituent && !isLeaf((Constituent) root)) {
            Constituent node = (Constituent) root;
            List<Tree> childNodes = new ArrayList<>();

            // get child nodes from child annotations
            FSArray children = node.getChildren();
            for (int i = 0; i < children.size(); i++) {
                childNodes.add(createStanfordTree(node.getChildren(i), tFact, aIdxTokens));
            }

            // now create the node with its children
            rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);
        }
        else {
            // Handle leaf annotations. Leaves are always Token annotations. We also have to
            // insert a preterminal node carrying the value of the POS annotation on the token,
            // because the POS is not directly stored within the tree.
            Token wordAnnotation = (Token) root;

            // create the leaf node for the tree
            Tree wordNode;
            if (aIdxTokens != null) {
                wordNode = tFact.newLeaf(aIdxTokens.get(wordAnnotation));
            }
            else {
                wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());
            }

            // create information about preceding and trailing whitespace in the leaf node
            StringBuilder preWhitespaces = new StringBuilder();
            StringBuilder trailWhitespaces = new StringBuilder();

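            // Note: the CAS only stores character offsets, not the original whitespace
            // characters, so the gap between adjacent tokens is reconstructed below as one
            // plain space per gap character. Tabs or line breaks in the original text are
            // therefore rendered as spaces here.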
            List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation,
                    1);
            List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation,
                    1);

            if (!precedingTokenList.isEmpty()) {
                Token precedingToken = precedingTokenList.get(0);
                int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
                for (int i = 0; i < precedingWhitespaces; i++) {
                    preWhitespaces.append(" ");
                }
            }

            if (!followingTokenList.isEmpty()) {
                Token followingToken = followingTokenList.get(0);
                int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
                for (int i = 0; i < trailingWhitespaces; i++) {
                    trailWhitespaces.append(" ");
                }
            }

            // write the whitespace information as CoreAnnotations.BeforeAnnotation and
            // CoreAnnotations.AfterAnnotation to the label of the leaf node
            ((CoreLabel) wordNode.label()).set(CoreAnnotations.BeforeAnnotation.class,
                    preWhitespaces.toString());
            ((CoreLabel) wordNode.label()).set(CoreAnnotations.AfterAnnotation.class,
                    trailWhitespaces.toString());

            // get the POS annotation
            POS pos = wordAnnotation.getPos();

            // create the preterminal POS node in the tree and attach the word node to it
            rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(wordNode));
        }

        return rootNode;
    }

    private static boolean isLeaf(Constituent constituent)
    {
        return (constituent.getChildren() == null || constituent.getChildren().size() == 0);
    }

    /**
     * Applies Stanford's {@link PTBEscapingProcessor} to a whole sentence and additionally maps
     * the given quote markers to the PTB opening and closing quotes ({@code ``} and {@code ''}).
     */
    @SuppressWarnings("unchecked")
    public static <T extends HasWord> List<T> applyPtbEscaping(List<T> words,
            Collection<String> quoteBegin, Collection<String> quoteEnd)
    {
        PTBEscapingProcessor<T, String, Word> escaper = new PTBEscapingProcessor<>();
        // Apply the escaper to the whole sentence, not to each token individually. The
        // escaper takes context into account, e.g. when transforming regular double
        // quotes into PTB opening and closing quotes (`` and '').
        words = (List<T>) escaper.apply(words);

        for (HasWord w : words) {
            if (quoteBegin != null && quoteBegin.contains(w.word())) {
                w.setWord("``");
            }
            else if (quoteEnd != null && quoteEnd.contains(w.word())) {
                w.setWord("''");
            }
        }

        return words;
    }
}