/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.stanfordnlp; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser.DependenciesMode; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.RootKey; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.StanfordAnnotator; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeUtils; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.trees.GrammaticalStructure; import edu.stanford.nlp.trees.LabeledScoredTreeFactory; import edu.stanford.nlp.trees.PennTreebankLanguagePack; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; import edu.stanford.nlp.trees.TreeFactory; import edu.stanford.nlp.trees.TreebankLanguagePack; import edu.stanford.nlp.trees.Trees; import edu.stanford.nlp.trees.TypedDependency; import edu.stanford.nlp.util.CoreMap; /** * Converts a constituency structure into a dependency structure. */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent"}, outputs = {"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) public class StanfordDependencyConverter extends JCasAnnotator_ImplBase { private static final Map<String, Class<? extends TreebankLanguagePack>> languagePacks; static { languagePacks = new HashMap<String, Class<? extends TreebankLanguagePack>>(); //languagePacks.put("ar", ArabicTreebankLanguagePack.class); languagePacks.put("en", PennTreebankLanguagePack.class); //languagePacks.put("es", SpanishTreebankLanguagePack.class); //languagePacks.put("fr", FrenchTreebankLanguagePack.class); //languagePacks.put("zh", ChineseTreebankLanguagePack.class); } /** * Sets the kind of dependencies being created. * * <p>Default: {@link DependenciesMode#COLLAPSED TREE} * @see DependenciesMode */ public static final String PARAM_MODE = "mode"; @ConfigurationParameter(name = PARAM_MODE, mandatory = false, defaultValue = "TREE") protected DependenciesMode mode; /** * Use this language instead of the document language to resolve the model and tag set mapping. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Create original dependencies. If this is disabled, universal dependencies are created. The * default is to create the original dependencies. */ public static final String PARAM_ORIGINAL_DEPENDENCIES = "originalDependencies"; @ConfigurationParameter(name = PARAM_ORIGINAL_DEPENDENCIES, mandatory = true, defaultValue = "true") protected boolean originalDependencies; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { String lang = language != null ? language : aJCas.getDocumentLanguage(); if (!languagePacks.containsKey(lang)) { throw new AnalysisEngineProcessException(new IllegalStateException( "Unsupported language [" + aJCas.getDocumentLanguage() + "]")); } TreebankLanguagePack lp; try { lp = languagePacks.get(aJCas.getDocumentLanguage()).newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new AnalysisEngineProcessException(e); } // For the moment we hard-code to generate the old non-universal dependencies. // Setting this through a parameter would be a problem if the model would be shared // between multiple AEs that use different settings for this parameter. lp.setGenerateOriginalDependencies(originalDependencies); List<CoreMap> sentences = new ArrayList<CoreMap>(); for (ROOT root : select(aJCas, ROOT.class)) { // Copy all relevant information from the tokens List<Token> tokens = selectCovered(Token.class, root); List<CoreLabel> coreTokens = new ArrayList<CoreLabel>(); for (Token token : tokens) { coreTokens.add(tokenToWord(token)); } // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace // it with PRN to avoid NPEs. TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) { @Override public Tree newTreeNode(String aParent, List<Tree> aChildren) { String parent = aParent; if ("PRN0".equals(parent)) { parent = "PRN"; } Tree node = super.newTreeNode(parent, aChildren); return node; } }; Tree tree = TreeUtils.createStanfordTree(root, tFact); Trees.convertToCoreLabels(tree); tree.indexSpans(); // Build the sentence CoreMap sentence = new CoreLabel(); sentence.set(TreeAnnotation.class, tree); sentence.set(TokensAnnotation.class, coreTokens); sentence.set(RootKey.class, root); sentences.add(sentence); doCreateDependencyTags(aJCas, lp, tree, tokens); } } protected void doCreateDependencyTags(JCas aJCas, TreebankLanguagePack aLP, Tree parseTree, List<Token> tokens) { GrammaticalStructure gs; try { gs = aLP.grammaticalStructureFactory(aLP.punctuationWordRejectFilter(), aLP.typedDependencyHeadFinder()).newGrammaticalStructure(parseTree); } catch (UnsupportedOperationException e) { // We already warned in the model provider if dependencies are not supported, so here // we just do nothing and skip the dependencies. return; } Collection<TypedDependency> dependencies = null; switch (mode) { case BASIC: dependencies = gs.typedDependencies(); // gs.typedDependencies(false); break; case NON_COLLAPSED: dependencies = gs.allTypedDependencies(); // gs.typedDependencies(true); break; case COLLAPSED_WITH_EXTRA: dependencies = gs.typedDependenciesCollapsed(true); break; case COLLAPSED: dependencies = gs.typedDependenciesCollapsed(false); break; case CC_PROPAGATED: dependencies = gs.typedDependenciesCCprocessed(true); break; case CC_PROPAGATED_NO_EXTRA: dependencies = gs.typedDependenciesCCprocessed(false); break; case ENHANCED: dependencies = gs.typedDependenciesEnhanced(); break; case ENHANCED_PLUS_PLUS: dependencies = gs.typedDependenciesEnhancedPlusPlus(); break; case TREE: dependencies = gs.typedDependenciesCollapsedTree(); break; default: throw new IllegalArgumentException("Unknown mode: [" + mode + "]"); } for (TypedDependency currTypedDep : dependencies) { int govIndex = currTypedDep.gov().index(); int depIndex = currTypedDep.dep().index(); Dependency dep; if (govIndex != 0) { Token govToken = tokens.get(govIndex - 1); Token depToken = tokens.get(depIndex - 1); dep = StanfordAnnotator.createDependencyAnnotation(aJCas, currTypedDep.reln(), govToken, depToken); } else { Token depToken = tokens.get(depIndex - 1); dep = new de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT(aJCas); dep.setDependencyType(currTypedDep.reln().toString()); dep.setGovernor(depToken); dep.setDependent(depToken); dep.setBegin(dep.getDependent().getBegin()); dep.setEnd(dep.getDependent().getEnd()); dep.addToIndexes(); } dep.setFlavor(currTypedDep.extra() ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC); } } protected CoreLabel tokenToWord(Token aToken) { CoreLabel t = CoreNlpUtils.tokenToWord(aToken); t.set(TokenKey.class, aToken); List<NamedEntity> nes = selectCovered(NamedEntity.class, aToken); if (nes.size() > 0) { t.setNER(nes.get(0).getValue()); } else { t.setNER("O"); } return t; } }