/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.berkeleyparser; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.toText; import static org.apache.uima.util.Level.INFO; import java.io.IOException; import java.io.ObjectInputStream; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.zip.GZIPInputStream; import org.apache.commons.lang.mutable.MutableInt; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.OperationalProperties; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser; import edu.berkeley.nlp.PCFGLA.Grammar; import edu.berkeley.nlp.PCFGLA.Lexicon; import edu.berkeley.nlp.PCFGLA.ParserData; import edu.berkeley.nlp.PCFGLA.TreeAnnotations; import edu.berkeley.nlp.syntax.Tree; import edu.berkeley.nlp.util.Numberer; /** * Berkeley Parser annotator . Requires {@link Sentence}s to be annotated before. * * @see CoarseToFineMaxRuleParser */ @OperationalProperties(multipleDeploymentAllowed = false) @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree" }) public class BerkeleyParser extends JCasAnnotator_ImplBase { /** * Use this language instead of the language set in the CAS to locate the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Override the default variant used to locate the model. */ public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; /** * Load the model from this location instead of locating the model automatically. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; /** * Location of the mapping file for part-of-speech tags to UIMA types. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Location of the mapping file for constituent tags to UIMA types. */ public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) protected String constituentMappingLocation; /** * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid spaming * the heap with thousands of strings representing only a few different tags. * * Default: {@code true} */ public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") private boolean internTags; /** * Log the tag set(s) when a model is loaded. * * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; /** * Sets whether to use or not to use already existing POS tags from another annotator for the * parsing process. * <p> * Default: {@code false} */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; /** * Sets whether to create or not to create POS tags. The creation of constituent tags must be * turned on for this to work. * <p> * Default: {@code true} */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "false") private boolean writePos; /** * If this parameter is set to true, each sentence is annotated with a PennTree-Annotation, * containing the whole parse tree in Penn Treebank style format. * <p> * Default: {@code false} */ public static final String PARAM_WRITE_PENN_TREE = ComponentParameters.PARAM_WRITE_PENN_TREE; @ConfigurationParameter(name = PARAM_WRITE_PENN_TREE, mandatory = true, defaultValue = "false") private boolean writePennTree; /** * Compute Viterbi derivation instead of max-rule tree. * <p> * Default: {@code false} (max-rule) */ public static final String PARAM_VITERBI = "viterbi"; @ConfigurationParameter(name = PARAM_VITERBI, mandatory = true, defaultValue = "false") private boolean viterbi; /** * Output sub-categories (only for binarized Viterbi trees). * <p> * Default: {@code false} */ public static final String PARAM_SUBSTATES = "substates"; @ConfigurationParameter(name = PARAM_SUBSTATES, mandatory = true, defaultValue = "false") private boolean substates; /** * Output inside scores (only for binarized viterbi trees). * <p> * Default: {@code false} */ public static final String PARAM_SCORES = "scores"; @ConfigurationParameter(name = PARAM_SCORES, mandatory = true, defaultValue = "false") private boolean scores; /** * Set thresholds for accuracy. * <p> * Default: {@code false} (set thresholds for efficiency) */ public static final String PARAM_ACCURATE = "accurate"; @ConfigurationParameter(name = PARAM_ACCURATE, mandatory = true, defaultValue = "false") private boolean accurate; /** * Use variational rule score approximation instead of max-rule * <p> * Default: {@code false} */ public static final String PARAM_VARIATIONAL = "variational"; @ConfigurationParameter(name = PARAM_VARIATIONAL, mandatory = true, defaultValue = "false") private boolean variational; /** * Retain predicted function labels. Model must have been trained with function labels. * <p> * Default: {@code false} */ public static final String PARAM_KEEP_FUNCTION_LABELS = "keepFunctionLabels"; @ConfigurationParameter(name = PARAM_KEEP_FUNCTION_LABELS, mandatory = true, defaultValue = "false") private boolean keepFunctionLabels; /** * Output binarized trees. * <p> * Default: {@code false} */ public static final String PARAM_BINARIZE = "binarize"; @ConfigurationParameter(name = PARAM_BINARIZE, mandatory = true, defaultValue = "false") private boolean binarize; private CasConfigurableProviderBase<CoarseToFineMaxRuleParser> modelProvider; private MappingProvider posMappingProvider; private MappingProvider constituentMappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); modelProvider = new BerkeleyParserModelProvider(); posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, language, modelProvider); constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( constituentMappingLocation, language, modelProvider); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); modelProvider.configure(cas); posMappingProvider.configure(cas); constituentMappingProvider.configure(cas); for (Sentence sentence : select(aJCas, Sentence.class)) { List<Token> tokens = selectCovered(aJCas, Token.class, sentence); List<String> tokenText = toText(tokens); List<String> posTags = null; if (readPos) { posTags = new ArrayList<String>(tokens.size()); for (Token t : tokens) { posTags.add(t.getPos().getPosValue()); } } Tree<String> parseOutput = modelProvider.getResource().getBestConstrainedParse( tokenText, posTags, false); // Check if the sentence could be parsed or not if (parseOutput.getChildren().isEmpty()) { getLogger().warn("Unable to parse sentence: [" + sentence.getCoveredText() + "]"); continue; } if (!binarize) { parseOutput = TreeAnnotations.unAnnotateTree(parseOutput, keepFunctionLabels); } createConstituentAnnotationFromTree(aJCas, parseOutput, null, tokens, new MutableInt(0)); if (writePennTree) { PennTree pTree = new PennTree(aJCas, sentence.getBegin(), sentence.getEnd()); pTree.setPennTree(parseOutput.toString()); pTree.addToIndexes(); } } } /** * Creates linked constituent annotations + POS annotations * * @param aNode * the source tree * @return the child-structure (needed for recursive call only) */ private Annotation createConstituentAnnotationFromTree(JCas aJCas, Tree<String> aNode, Annotation aParentFS, List<Token> aTokens, MutableInt aIndex) { // If the node is a word-level constituent node (== POS): // create parent link on token and (if not turned off) create POS tag if (aNode.isPreTerminal()) { Token token = aTokens.get(aIndex.intValue()); // link token to its parent constituent if (aParentFS != null) { token.setParent(aParentFS); } // only add POS to index if we want POS-tagging if (writePos) { String typeName = aNode.getLabel(); Type posTag = posMappingProvider.getTagType(typeName); POS posAnno = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); posAnno.setPosValue(internTags ? typeName.intern() : typeName); posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null : posAnno.getType().getShortName().intern()); posAnno.addToIndexes(); token.setPos(posAnno); } aIndex.add(1); return token; } // Check if node is a constituent node on sentence or phrase-level else { String typeName = aNode.getLabel(); // create the necessary objects and methods Type constType = constituentMappingProvider.getTagType(typeName); Constituent constAnno = (Constituent) aJCas.getCas().createAnnotation(constType, 0, 0); constAnno.setConstituentType(typeName); // link to parent if (aParentFS != null) { constAnno.setParent(aParentFS); } // Do we have any children? List<Annotation> childAnnotations = new ArrayList<Annotation>(); for (Tree<String> child : aNode.getChildren()) { Annotation childAnnotation = createConstituentAnnotationFromTree(aJCas, child, constAnno, aTokens, aIndex); if (childAnnotation != null) { childAnnotations.add(childAnnotation); } } constAnno.setBegin(childAnnotations.get(0).getBegin()); constAnno.setEnd(childAnnotations.get(childAnnotations.size() - 1).getEnd()); // Now that we know how many children we have, link annotation of // current node with its children FSArray childArray = FSCollectionFactory.createFSArray(aJCas, childAnnotations); constAnno.setChildren(childArray); // write annotation for current node to index aJCas.addFsToIndexes(constAnno); return constAnno; } } private class BerkeleyParserModelProvider extends ModelProviderBase<CoarseToFineMaxRuleParser> { { setContextObject(BerkeleyParser.this); setDefault(ARTIFACT_ID, "${groupId}.berkeleyparser-model-parser-${language}-${variant}"); setDefault(LOCATION, "classpath:/${package}/lib/parser-${language}-${variant}.bin"); setDefaultVariantsLocation("${package}/lib/parser-default-variants.map"); setOverride(LOCATION, modelLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, variant); } @Override protected CoarseToFineMaxRuleParser produceResource(URL aUrl) throws IOException { ObjectInputStream is = null; try { is = new ObjectInputStream(new GZIPInputStream(aUrl.openStream())); ParserData pData = (ParserData) is.readObject(); Grammar grammar = pData.getGrammar(); Lexicon lexicon = pData.getLexicon(); Numberer.setNumberers(pData.getNumbs()); double threshold = 1.0; Properties metadata = getResourceMetaData(); SingletonTagset posTags = new SingletonTagset( POS.class, metadata.getProperty("pos.tagset")); SingletonTagset constTags = new SingletonTagset( Constituent.class, metadata.getProperty("constituent.tagset")); Numberer tagNumberer = (Numberer) pData.getNumbs().get("tags"); for (int i = 0; i < tagNumberer.size(); i++) { String tag = (String) tagNumberer.object(i); if (!binarize && tag.startsWith("@")) { continue; // Only show aux. binarization tags if it is enabled. } if (tag.endsWith("^g")) { constTags.add(tag.substring(0, tag.length() - 2)); } else if ("ROOT".equals(tag)) { constTags.add(tag); } else { posTags.add(tag); } } addTagset(posTags, writePos); addTagset(constTags); if (printTagSet) { getContext().getLogger().log(INFO, getTagset().toString()); } return new CoarseToFineMaxRuleParser(grammar, lexicon, threshold, -1, viterbi, substates, scores, accurate, variational, true, true); } catch (ClassNotFoundException e) { throw new IOException(e); } finally { closeQuietly(is); } } }; }