/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.corenlp;

import static org.apache.uima.util.Level.INFO;
import static org.apache.uima.util.Level.WARNING;

import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.Properties;

import org.apache.commons.lang.reflect.FieldUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp;
import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro;
import edu.stanford.nlp.parser.common.ParserGrammar;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.Lexicon;
import edu.stanford.nlp.parser.shiftreduce.BaseModel;
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.ParserAnnotator;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
import edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalStructureFactory;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalRelations;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructureFactory;

/**
 * Parser from CoreNLP.
 * <p>
 * Converts the CAS content into a CoreNLP {@link Annotation}, runs a {@link ParserAnnotator} on
 * it and, depending on the {@code PARAM_WRITE_*} parameters, transfers POS tags, constituents,
 * Penn trees and dependencies back into the CAS.
 */
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"},
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent",
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"})
public class CoreNlpParser
    extends JCasAnnotator_ImplBase
{
    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false")
    private boolean printTagSet;

    /**
     * Use this language instead of the document language to resolve the model and tag set mapping.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    private String language;

    /**
     * Variant of a model. Used to address a specific model if there are multiple models
     * for one language.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    private String variant;

    /**
     * Location from which the model is read.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    private String modelLocation;

    /**
     * The character encoding used by the model.
     */
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false)
    private String modelEncoding;

    /**
     * Location of the mapping file for dependency tags to UIMA types.
     */
    public static final String PARAM_DEPENDENCY_MAPPING_LOCATION =
            ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false)
    private String dependencyMappingLocation;

    /**
     * Location of the mapping file for constituent tags to UIMA types.
     */
    public static final String PARAM_CONSTITUENT_MAPPING_LOCATION =
            ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false)
    private String constituentMappingLocation;

    /**
     * Location of the mapping file for part-of-speech tags to UIMA types.
     */
    public static final String PARAM_POS_MAPPING_LOCATION =
            ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    private String posMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internStrings;

    /**
     * Maximum sentence length; handed to CoreNLP as the {@code parse.maxlen} property.
     *
     * Default: {@code 2147483647}
     */
    public static final String PARAM_MAX_SENTENCE_LENGTH =
            ComponentParameters.PARAM_MAX_SENTENCE_LENGTH;
    @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true,
            defaultValue = "2147483647")
    private int maxSentenceLength;

    /**
     * Number of threads; resolved through {@link ComponentParameters#computeNumThreads(int)}
     * during initialization and handed to CoreNLP as the {@code parse.nthreads} property.
     *
     * Default: {@link ComponentParameters#AUTO_NUM_THREADS}
     */
    public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS;
    @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true,
            defaultValue = ComponentParameters.AUTO_NUM_THREADS)
    private int numThreads;

    /**
     * Handed to CoreNLP as the {@code parse.maxtime} property.
     *
     * Default: {@code -1}
     */
    public static final String PARAM_MAX_TIME = "maxTime";
    @ConfigurationParameter(name = PARAM_MAX_TIME, mandatory = true, defaultValue = "-1")
    private int maxTime;

    /**
     * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-).
     *
     * @see PTBEscapingProcessor
     */
    public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping";
    @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true")
    private boolean ptb3Escaping;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * opening quotes and escaped accordingly before being sent to the parser.
     */
    public static final String PARAM_QUOTE_BEGIN = "quoteBegin";
    @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false)
    private List<String> quoteBegin;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * closing quotes and escaped accordingly before being sent to the parser.
     */
    public static final String PARAM_QUOTE_END = "quoteEnd";
    @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false)
    private List<String> quoteEnd;

    /**
     * Handed to CoreNLP as the {@code parse.extradependencies} property.
     *
     * Default: {@code NONE}
     */
    public static final String PARAM_EXTRA_DEPENDENCIES = "extraDependencies";
    @ConfigurationParameter(name = PARAM_EXTRA_DEPENDENCIES, mandatory = true,
            defaultValue="NONE")
    GrammaticalStructure.Extras extraDependencies;

    /**
     * Sets whether to create or not to create constituent tags. This is required for POS-tagging
     * and lemmatization.
     * <p>
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_CONSTITUENT =
            ComponentParameters.PARAM_WRITE_CONSTITUENT;
    @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true,
            defaultValue = "true")
    private boolean writeConstituent;

    /**
     * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation,
     * containing the whole parse tree in Penn Treebank style format.
     * <p>
     * Default: {@code false}
     */
    public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE;
    @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true,
            defaultValue = "false")
    private boolean writePennTree;

    /**
     * Sets whether to use or not to use existing POS tags.
     * <p>
     * Default: {@code true}
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean readPos;

    /**
     * Sets whether to create or not to create POS tags. The creation of constituent tags must be
     * turned on for this to work.
     * <p>
     * Default: {@code false}
     */
    public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS;
    @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false")
    private boolean writePos;

    /**
     * Sets whether to create or not to create dependency annotations.
     *
     * <p>Default: {@code true}
     */
    public static final String PARAM_WRITE_DEPENDENCY =
            ComponentParameters.PARAM_WRITE_DEPENDENCY;
    @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true,
            defaultValue = "true")
    private boolean writeDependency;

    /**
     * Handed to CoreNLP as the {@code parse.originalDependencies} property.
     *
     * <p>Default: {@code true}
     */
    public static final String PARAM_ORIGINAL_DEPENDENCIES = "originalDependencies";
    @ConfigurationParameter(name = PARAM_ORIGINAL_DEPENDENCIES, mandatory = true,
            defaultValue = "true")
    private boolean originalDependencies;

    /**
     * Handed to CoreNLP as the {@code parse.keepPunct} property.
     *
     * <p>Default: {@code false}
     */
    // CoreNlpParser PARAM_KEEP_PUNCTUATION has no effect #965
    public static final String PARAM_KEEP_PUNCTUATION = "keepPunctuation";
    @ConfigurationParameter(name = PARAM_KEEP_PUNCTUATION, mandatory = true,
            defaultValue = "false")
    private boolean keepPunctuation;

    private CasConfigurableProviderBase<ParserAnnotator> annotatorProvider;
    private MappingProvider dependencyMappingProvider;
    private MappingProvider constituentMappingProvider;
    private MappingProvider posMappingProvider;

    /**
     * Creates the model provider and the three tag mapping providers and resolves the effective
     * number of threads.
     *
     * @param aContext
     *            the UIMA context passed on to the super class.
     * @throws ResourceInitializationException
     *             if the super class initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        annotatorProvider = new CoreNlpParserModelProvider(this);

        constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider(
                constituentMappingLocation, language, annotatorProvider);

        dependencyMappingProvider = MappingProviderFactory.createDependencyMappingProvider(
                dependencyMappingLocation, language, annotatorProvider);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(
                posMappingLocation, language, annotatorProvider);

        numThreads = ComponentParameters.computeNumThreads(numThreads);
    }

    /**
     * Parses the document in the given CAS and writes the enabled annotation layers back.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if the parser cannot be obtained from the CoreNLP annotator via reflection.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();

        annotatorProvider.configure(cas);

        // Transfer from CAS to CoreNLP
        DKPro2CoreNlp converter = new DKPro2CoreNlp();
        converter.setPtb3Escaping(ptb3Escaping);
        converter.setQuoteBegin(quoteBegin);
        converter.setQuoteEnd(quoteEnd);
        converter.setEncoding(modelEncoding);
        converter.setReadPos(readPos);

        Annotation document = new Annotation((String) null);
        converter.convert(aJCas, document);

        // Actual processing
        ParserAnnotator annotator = annotatorProvider.getResource();
        annotator.annotate(document);

        // Get TreebankLanguagePack
        ParserGrammar parser;
        try {
            parser = readParser(annotator);
        }
        catch (IllegalAccessException e) {
            throw new AnalysisEngineProcessException(e);
        }
        TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();

        // Transfer back into the CAS
        if (writePos) {
            posMappingProvider.configure(cas);
            CoreNlp2DKPro.convertPOSs(aJCas, document, posMappingProvider, internStrings);
        }

        if (writeConstituent) {
            constituentMappingProvider.configure(cas);
            CoreNlp2DKPro.convertConstituents(aJCas, document, constituentMappingProvider,
                    internStrings, tlp);
        }

        if (writePennTree) {
            CoreNlp2DKPro.convertPennTree(aJCas, document);
        }

        if (writeDependency) {
            dependencyMappingProvider.configure(cas);
            CoreNlp2DKPro.convertDependencies(aJCas, document, dependencyMappingProvider,
                    internStrings);
        }
    }

    /**
     * Reads the {@code parser} field of the given annotator via reflection, because this code
     * has no other accessor for it.
     */
    private static ParserGrammar readParser(ParserAnnotator aAnnotator)
        throws IllegalAccessException
    {
        return (ParserGrammar) FieldUtils.readField(aAnnotator, "parser", true);
    }

    /**
     * Cuts the tag at the language pack's grammatical function character, provided that the
     * character occurs after the first position; otherwise the tag is returned unchanged.
     */
    private static String stripGrammaticalFunction(AbstractTreebankLanguagePack aLanguagePack,
            String aTag)
    {
        int gfIdx = aTag.indexOf(aLanguagePack.getGfCharacter());
        if (gfIdx > 0) {
            // TODO should collect syntactic functions in separate tagset
            // syntacticFunction = nodeLabelValue.substring(gfIdx + 1);
            return aTag.substring(0, gfIdx);
        }
        return aTag;
    }

    /**
     * Model provider that builds a configured {@link ParserAnnotator} from a model URL and
     * records the POS, constituent and dependency tagsets of the loaded model.
     */
    private class CoreNlpParserModelProvider
        extends ModelProviderBase<ParserAnnotator>
    {
        public CoreNlpParserModelProvider(Object aObject)
        {
            super(aObject, "stanfordnlp", "parser");
            // setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp");
            setDefault(LOCATION,
                    "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/parser-${language}-${variant}.properties");
        }

        /**
         * Creates the CoreNLP parser annotator for the model at the given URL and collects the
         * tagsets it uses.
         *
         * @param aUrl
         *            location of the parser model.
         * @return the configured annotator.
         * @throws IOException
         *             if the parser (or its model) cannot be accessed via reflection.
         * @throws IllegalStateException
         *             if the parser is neither a {@link LexicalizedParser} nor a
         *             {@link ShiftReduceParser}.
         */
        @SuppressWarnings("unchecked")
        @Override
        protected ParserAnnotator produceResource(URL aUrl)
            throws IOException
        {
            String modelFile = aUrl.toString();

            // Loading gzipped files from URL is broken in CoreNLP
            // https://github.com/stanfordnlp/CoreNLP/issues/94
            if (modelFile.startsWith("jar:") && modelFile.endsWith(".gz")) {
                modelFile = org.apache.commons.lang.StringUtils.substringAfter(modelFile, "!/");
            }

            Properties coreNlpProps = new Properties();
            // Need to set annotators because CoreNLP checks for the presence of the sentiment
            // annotator to decide the default value for "parse.binaryTrees"
            coreNlpProps.setProperty("annotators", "");
            coreNlpProps.setProperty("parse.model", modelFile);
            // coreNlpProps.setProperty("parse.flags", ...);
            coreNlpProps.setProperty("parse.maxlen", Integer.toString(maxSentenceLength));
            coreNlpProps.setProperty("parse.kbest", Integer.toString(3));
            // CoreNlpParser PARAM_KEEP_PUNCTUATION has no effect #965
            coreNlpProps.setProperty("parse.keepPunct", Boolean.toString(keepPunctuation));
            // coreNlpProps.setProperty("parse.treemap", ...);
            coreNlpProps.setProperty("parse.maxtime", Integer.toString(maxTime));
            coreNlpProps.setProperty("parse.buildgraphs", Boolean.toString(writeDependency));
            coreNlpProps.setProperty("parse.originalDependencies",
                    Boolean.toString(originalDependencies));
            coreNlpProps.setProperty("parse.nthreads", Integer.toString(numThreads));
            // coreNlpProps.setProperty("parse.binaryTrees", ...);
            // coreNlpProps.setProperty("parse.nosquash", ...);
            coreNlpProps.setProperty("parse.extradependencies", extraDependencies.toString());

            ParserAnnotator annotator = new ParserAnnotator("parse", coreNlpProps);

            getLogger().info(ParserAnnotator.signature("parse", coreNlpProps));

            ParserGrammar parser;
            try {
                parser = readParser(annotator);
            }
            catch (IllegalAccessException e) {
                throw new IOException(e);
            }

            Properties metadata = getResourceMetaData();

            AbstractTreebankLanguagePack lp = (AbstractTreebankLanguagePack) parser
                    .getTLPParams().treebankLanguagePack();

            // https://mailman.stanford.edu/pipermail/parser-user/2012-November/002117.html
            // The tagIndex does give all and only the set of POS tags used in the
            // current grammar. However, these are the split tags actually used by the
            // grammar. If you really want the user-visible non-split tags of the
            // original treebank, then you'd need to map them all through the
            // op.treebankLanguagePack().basicCategory(). -- C. Manning
            SingletonTagset posTags = new SingletonTagset(POS.class,
                    metadata.getProperty("pos.tagset"));
            if (parser instanceof LexicalizedParser) {
                LexicalizedParser lexParser = (LexicalizedParser) parser;
                for (String tag : lexParser.tagIndex) {
                    String t = stripGrammaticalFunction(lp, lp.basicCategory(tag));
                    posTags.add(lp.basicCategory(t));
                }
                posTags.remove(Lexicon.BOUNDARY_TAG);
                addTagset(posTags, writePos);
            }

            // https://mailman.stanford.edu/pipermail/parser-user/2012-November/002117.html
            // For constituent categories, there isn't an index of just them. The
            // stateIndex has both constituent categories and POS tags in it, so you'd
            // need to set difference out the tags from the tagIndex, and then it's as
            // above. -- C. Manning
            SingletonTagset constTags = new SingletonTagset(
                    Constituent.class, metadata.getProperty("constituent.tagset"));
            Iterable<String> states;
            if (parser instanceof LexicalizedParser) {
                states = ((LexicalizedParser) parser).stateIndex;
            }
            else if (parser instanceof ShiftReduceParser) {
                try {
                    BaseModel model = (BaseModel) FieldUtils.readField(parser, "model", true);
                    states = (Iterable<String>) FieldUtils.readField(model, "knownStates", true);
                    // states = ((ShiftReduceParser) pd).tagSet();
                }
                catch (IllegalAccessException e) {
                    throw new IOException(e);
                }
            }
            else {
                throw new IllegalStateException("Unknown parser type ["
                        + parser.getClass().getName() + "]");
            }

            for (String tag : states) {
                String t = lp.basicCategory(tag);
                // https://mailman.stanford.edu/pipermail/parser-user/2012-December/002156.html
                // The parser algorithm used is a binary parser, so what we do is
                // binarize trees by turning A -> B, C, D into A -> B, @A, @A -> C, D.
                // (That's roughly how it goes, although the exact details are somewhat
                // different.) When parsing, we parse to a binarized tree and then
                // unbinarize it before returning. That's the origin of the @ classes.
                // -- J. Bauer
                if (!t.startsWith("@")) {
                    t = stripGrammaticalFunction(lp, t);
                    if (t.length() > 0) {
                        constTags.add(t);
                    }
                }
            }
            constTags.remove(Lexicon.BOUNDARY_TAG);
            constTags.removeAll(posTags);
            if (writeConstituent) {
                addTagset(constTags);
            }

            // There is no way to determine the relations via the GrammaticalStructureFactory
            // API, so we do it manually here for the languages known to support this.
            GrammaticalStructureFactory gsf = null;
            try {
                gsf = lp.grammaticalStructureFactory(lp.punctuationWordRejectFilter(),
                        lp.typedDependencyHeadFinder());
            }
            catch (UnsupportedOperationException e) {
                getContext().getLogger().log(WARNING,
                        "Current model does not seem to support dependencies.");
            }

            // TODO: Consider whether r.getShortName() or r.toString() is the right one to use
            // here. Cf.
            // https://mailman.stanford.edu/pipermail/java-nlp-user/2016-January/007417.html
            // https://mailman.stanford.edu/pipermail/java-nlp-user/2013-December/004429.html
            if (gsf != null) {
                Class<?> gsfClass = gsf.getClass();
                if (EnglishGrammaticalStructureFactory.class.equals(gsfClass)) {
                    registerDependencyTagset("stanford341",
                            EnglishGrammaticalRelations.values());
                }
                else if (UniversalEnglishGrammaticalStructureFactory.class.equals(gsfClass)) {
                    registerDependencyTagset("universal",
                            UniversalEnglishGrammaticalRelations.values());
                }
                // The factory class (not the relations class) has to be compared here;
                // a factory instance can never be of type ChineseGrammaticalRelations.
                else if (ChineseGrammaticalStructureFactory.class.equals(gsfClass)) {
                    registerDependencyTagset("stanford", ChineseGrammaticalRelations.values());
                }
            }

            if (printTagSet) {
                getContext().getLogger().log(INFO, getTagset().toString());
            }

            return annotator;
        }

        /**
         * Registers a dependency tagset with the given name, built from the short names of the
         * given relations. Does nothing unless dependency writing is enabled.
         */
        private void registerDependencyTagset(String aTagsetName,
                Iterable<? extends GrammaticalRelation> aRelations)
        {
            if (!writeDependency) {
                return;
            }

            SingletonTagset depTags = new SingletonTagset(Dependency.class, aTagsetName);
            for (GrammaticalRelation r : aRelations) {
                depTags.add(r.getShortName());
            }
            addTagset(depTags, true);
        }
    }
}