/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.mstparser;

import static java.util.Arrays.asList;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.exists;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.util.Level.INFO;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import mstparser.DependencyInstance;
import mstparser.DependencyParser;
import mstparser.DependencyPipe;
import mstparser.DependencyPipe2O;
import mstparser.ParserOptions;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;

/**
 * Dependency parsing using MSTParser.
 * <p>
 * Wrapper for the MSTParser (<b>high memory requirements</b>). More information about the parser
 * can be found on the
 * <a href="http://www.seas.upenn.edu/~strctlrn/MSTParser/MSTParser.html">MSTParser page</a> and on
 * the <a href="http://sourceforge.net/projects/mstparser/">SourceForge project page</a>.
 * </p>
 * <p>
 * The MSTParser models tend to be very large, e.g. the <a
 * href="http://nlp.stanford.edu/software/stanford-dependencies.shtml">Eisner</a> model is about 600
 * MB uncompressed. With this model, parsing a simple sentence with MSTParser requires about 3 GB
 * heap memory.
 * </p>
 * <p>
 * This component feeds MSTParser only with the FORM (token) and POS (part-of-speech) fields. LEMMA,
 * CPOS, and other columns from the CONLL 2006 format are not generated (cf.
 * {@link mstparser.DependencyInstance DependencyInstance}).
 * </p>
 * <p>
 * Every token that is passed to the parser must carry a POS annotation. Sentences that do not
 * cover any token are skipped.
 * </p>
 */
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" },
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class MstParser
    extends JCasConsumer_ImplBase
{
    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Load the model from this location instead of locating the model automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    /**
     * Load the dependency to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false)
    protected String dependencyMappingLocation;

    /**
     * Specifies the order/scope of features. 1 only has features over single edges
     * and 2 has features over pairs of adjacent edges in the tree. The model must have been
     * trained with the respective order set here.
     */
    public static final String PARAM_ORDER = "order";
    @ConfigurationParameter(name = PARAM_ORDER, mandatory = false)
    private Integer order;

    private ModelProviderBase<DependencyParser> modelProvider;
    private MappingProvider mappingProvider;

    /**
     * Initializes the component by setting up the model provider (which loads the MSTParser model
     * and produces a parser) and the provider for the dependency-label-to-UIMA-type mapping.
     *
     * @param context
     *            the UIMA context of this component
     * @throws ResourceInitializationException
     *             Cannot be initialized
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        // the modelProvider reads in the model and produces a parser
        modelProvider = new ModelProviderBase<DependencyParser>(this, "mstparser", "parser")
        {
            @Override
            protected DependencyParser produceResource(URL aUrl)
                throws IOException
            {
                Properties metadata = getResourceMetaData();

                // Configure parser
                ParserOptions options = createOptions(aUrl, metadata);
                DependencyPipe pipe = createPipe(options);
                DependencyParser dp = loadParser(aUrl, pipe, options);

                // Check if the model order corresponds to the order the component is
                // configured for
                boolean secondOrderModel = isSecondOrderModel(pipe);
                if (secondOrderModel != options.secondOrder) {
                    String model = secondOrderModel ? "second" : "first";
                    String component = options.secondOrder ? "second" : "first";
                    getLogger().warn("Model is " + model + " but component has been configured "
                            + "for " + component + " order. I am going to reload the model now "
                            + "with the correct order. To avoid loading the model twice, please "
                            + "configure the component for the correct order.");

                    // Reconfigure pipe and reload
                    options.secondOrder = secondOrderModel;
                    pipe = createPipe(options);
                    dp = loadParser(aUrl, pipe, options);
                }

                // Extract dependency tagset
                SingletonTagset depTags = new SingletonTagset(
                        Dependency.class, metadata.getProperty("dependency.tagset"));
                depTags.addAll(asList(pipe.types));
                // NOTE: the "<no-type>" entry is currently not removed from the tagset
                //depTags.remove("<no-type>");
                addTagset(depTags);

                // Extract POS tagset (from POS, not from CPOS!)
                SingletonTagset posTags = new SingletonTagset(
                        POS.class, metadata.getProperty("pos.tagset"));
                for (Object key : pipe.dataAlphabet.toArray()) {
                    if (key instanceof String) {
                        String sKey = (String) key;
                        // See mstparser.DependencyPipe.addLinearFeatures(...)
                        if (sKey.startsWith("POSPC=")) {
                            // Strip the "POSPC=" prefix (6 characters); the first two
                            // space-separated fragments are recorded as POS tags
                            String[] fragments = sKey.substring(6).split(" ", 3);
                            posTags.add(fragments[0]);
                            posTags.add(fragments[1]);
                        }
                    }
                }
                // NOTE: the "<root-POS>" entry is currently not removed from the tagset
                //posTags.remove("<root-POS>");
                addTagset(posTags);

                if (printTagSet) {
                    getContext().getLogger().log(INFO, getTagset().toString());
                }

                return dp;
            }
        };

        mappingProvider = MappingProviderFactory.createDependencyMappingProvider(
                dependencyMappingLocation, language, modelProvider);
    }

    /**
     * Processes the given text using the MSTParser. As the MSTParser expects an input file, a
     * temporary file is created. It is deleted again as soon as the parser has returned.
     * <p>
     * For every parsed token one annotation is added to the CAS: a {@link ROOT} if the parser
     * reports no head for the token (head index 0 or less), otherwise an annotation of the type
     * that the mapping provider returns for the dependency label.
     * </p>
     *
     * @param jcas
     *            The JCas containing the textual input
     * @throws AnalysisEngineProcessException
     *             No parse created
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        CAS cas = jcas.getCas();

        modelProvider.configure(cas);
        mappingProvider.configure(cas);
        DependencyParser dp = modelProvider.getResource();

        // If there are no sentences or tokens in the CAS, skip it.
        if (!exists(jcas, Sentence.class) || !exists(jcas, Token.class)) {
            return;
        }

        // Only sentences with tokens are sent to the parser. The very same list is used to map
        // the parses back, so that instance i always belongs to sentence i.
        List<List<Token>> sentences = selectNonEmptySentences(jcas);
        if (sentences.isEmpty()) {
            return;
        }

        // Run the parser
        // Currently the parser needs a file as input, it cannot yet work directly with the
        // CAS structure.
        // dp.getParses() is a method that we added to the MSTParser codebase, it returns a list
        // of parses. Originally this was dp.outputParses() and the method wrote the parses into
        // a file. The old method is still available.
        List<DependencyInstance> parsedInstances;
        String tempfile = null;
        try {
            tempfile = generateTempInputFile(sentences);
            dp.options.testfile = tempfile;
            parsedInstances = dp.getParses();
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
        finally {
            // The parses are held in memory now, so the input file is no longer needed. If the
            // deletion fails, the deleteOnExit() registration made on creation still applies.
            if (tempfile != null) {
                new File(tempfile).delete();
            }
        }

        for (int instanceIndex = 0; instanceIndex < parsedInstances.size(); instanceIndex++) {
            DependencyInstance instance = parsedInstances.get(instanceIndex);
            List<Token> tokens = sentences.get(instanceIndex);

            // iterate through tokens
            for (int formsIndex = 0; formsIndex < instance.forms.length; formsIndex++) {
                Token token = tokens.get(formsIndex);

                // get dependency relation and head information for token
                int head = instance.heads[formsIndex];
                String label = instance.deprels[formsIndex];

                // write dependency information as annotation to JCas
                Type depRel = mappingProvider.getTagType(label);
                Dependency dep;
                if (head > 0) {
                    // heads are 1-based, the token list is 0-based
                    dep = (Dependency) cas.createFS(depRel);
                    dep.setGovernor(tokens.get(head - 1));
                }
                else {
                    // a token without a head is a root and governs itself
                    dep = new ROOT(jcas);
                    dep.setGovernor(token);
                }
                dep.setDependencyType(label);
                dep.setFlavor(DependencyFlavor.BASIC);
                dep.setDependent(token);
                dep.setBegin(token.getBegin());
                dep.setEnd(token.getEnd());
                dep.addToIndexes();
            }
        }
    }

    /**
     * Collects the tokens of all sentences that cover at least one token. Sentences without
     * tokens cannot be expressed as a record of the parser input and are left out.
     *
     * @param aJCas
     *            The JCas containing the textual input
     * @return one token list per non-empty sentence, in the iteration order of the sentences.
     */
    private List<List<Token>> selectNonEmptySentences(JCas aJCas)
    {
        List<List<Token>> result = new ArrayList<List<Token>>();
        for (Sentence sentence : select(aJCas, Sentence.class)) {
            List<Token> tokens = selectCovered(aJCas, Token.class, sentence);
            if (!tokens.isEmpty()) {
                result.add(tokens);
            }
        }
        return result;
    }

    /**
     * Generates a temporary file in MST input format. This is needed as input to the MST parser.
     * Per sentence four tab-separated lines are written (forms, POS tags, dummy labels, dummy
     * heads), followed by an empty line.
     * <p>
     * The file is registered for deletion on JVM exit as a fallback; callers should delete it as
     * soon as it is no longer needed. If writing fails, the file is removed before the exception
     * is propagated.
     * </p>
     *
     * @param aSentences
     *            the tokens of the sentences to be parsed, one non-empty list per sentence. Every
     *            token must have a POS annotation.
     * @return The path to the created temporary file.
     * @throws IOException
     *             The temporary file could not be created or written
     */
    private String generateTempInputFile(List<List<Token>> aSentences)
        throws IOException
    {
        File tempfile = File.createTempFile("MSTinput", ".txt");
        tempfile.deleteOnExit();

        boolean written = false;
        BufferedWriter out = null;
        try {
            // Use a fixed encoding instead of the platform default.
            // NOTE(review): MSTParser is assumed to read its input as UTF-8 - confirm against
            // the MSTParser version bundled with this component.
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tempfile),
                    "UTF-8"));

            // write sentences to temporary file in MST input format
            for (List<Token> tokens : aSentences) {
                for (Token token : tokens) {
                    out.write(token.getCoveredText() + "\t");
                }
                out.write("\n");

                for (Token token : tokens) {
                    out.write(token.getPos().getPosValue() + "\t");
                }
                out.write("\n");

                // Dummy values for labels
                for (int k = 0; k < tokens.size(); k++) {
                    out.write("Dummy\t");
                }
                out.write("\n");

                // Dummy values for heads
                for (int i = 0; i < tokens.size(); i++) {
                    out.write("0\t");
                }
                out.write("\n\n");
            }

            // Close explicitly so that a failing flush is reported instead of being swallowed
            out.close();
            written = true;
        }
        finally {
            // No effect after a successful close; releases the file handle on failure
            IOUtils.closeQuietly(out);
            if (!written) {
                tempfile.delete();
            }
        }

        return tempfile.getPath();
    }

    /**
     * Checks if the data alphabet loaded into the pipe contains features that are only generated
     * when a second-order model has been trained.
     *
     * @param aPipe
     *            the parser pipeline.
     * @return if the pipeline uses a second-order model.
     */
    private boolean isSecondOrderModel(DependencyPipe aPipe)
    {
        for (Object key : aPipe.dataAlphabet.toArray()) {
            if (key instanceof String && ((String) key).startsWith("POS_TRIP=")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates the parser options for parsing (test mode, MST format) with the given model. The
     * order is taken from the {@link #PARAM_ORDER} parameter if set, otherwise from the
     * {@code mstparser.param.order} model metadata property, otherwise first order is used.
     *
     * @param aUrl
     *            the location of the model.
     * @param aMetadata
     *            the metadata of the model.
     * @return the parser options.
     */
    private ParserOptions createOptions(URL aUrl, Properties aMetadata)
    {
        // mst.ParserOptions needs a String as argument
        ParserOptions options = new ParserOptions(new String[] {});
        options.test = true;
        options.train = false;
        options.trainfile = "";
        options.eval = false;
        options.format = "MST";
        options.goldfile = "";
        options.testfile = "";
        options.modelName = aUrl.toString();

        if (order == null) {
            String modelOrder = aMetadata.getProperty("mstparser.param.order");
            if (StringUtils.isNotEmpty(modelOrder)) {
                getLogger().info(
                        "Using model order (mstparser.param.order): " + modelOrder);
                options.secondOrder = "2".equals(modelOrder.trim());
            }
            else {
                getLogger().info("Using default order: 1");
                options.secondOrder = false;
            }
        }
        else {
            getLogger().info("Using user-specified order: " + order);
            // any value other than 2 selects a first-order model
            options.secondOrder = order.intValue() == 2;
        }

        return options;
    }

    /**
     * Creates a parser for the given pipe and options and loads the model into it.
     *
     * @param aUrl
     *            the location of the (possibly compressed) model.
     * @param aPipe
     *            the parser pipeline.
     * @param aOptions
     *            the parser options.
     * @return the parser with the model loaded.
     * @throws IOException
     *             if the model cannot be read.
     */
    private DependencyParser loadParser(URL aUrl, DependencyPipe aPipe, ParserOptions aOptions)
        throws IOException
    {
        DependencyParser dp = new DependencyParser(aPipe, aOptions);
        InputStream raw = null;
        InputStream is = null;
        try {
            getLogger().info("Retrieving model");
            raw = aUrl.openStream();
            is = CompressionUtils.getInputStream(aUrl.getFile(), raw);
            dp.loadModel(is);
        }
        finally {
            closeQuietly(is);
            // Also covers the case that wrapping the raw stream failed
            closeQuietly(raw);
        }
        return dp;
    }

    /**
     * Creates the parser pipeline matching the order configured in the options.
     *
     * @param aOptions
     *            the parser options.
     * @return a second-order pipe if {@code aOptions.secondOrder} is set, otherwise a
     *         first-order pipe.
     * @throws IOException
     *             if the pipe cannot be created.
     */
    private DependencyPipe createPipe(ParserOptions aOptions)
        throws IOException
    {
        return aOptions.secondOrder ? new DependencyPipe2O(aOptions) : new DependencyPipe(
                aOptions);
    }
}