/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.maltparser; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.util.Level.INFO; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URL; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import java.util.jar.JarEntry; import java.util.jar.JarInputStream; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.AnalysisComponent; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; import org.apache.uima.util.Logger; import org.maltparser.MaltParserService; import org.maltparser.core.exception.MaltChainedException; import org.maltparser.core.options.OptionManager; import org.maltparser.core.symbol.SymbolTable; import org.maltparser.core.symbol.parse.ParseSymbolTable; import org.maltparser.core.syntaxgraph.DependencyStructure; import org.maltparser.core.syntaxgraph.edge.Edge; import org.maltparser.core.syntaxgraph.node.TokenNode; import org.maltparser.parser.SingleMalt; import org.springframework.beans.PropertyAccessor; import org.springframework.beans.PropertyAccessorFactory; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; /** * Dependency parsing using MaltPaser. * <p> * Required annotations: * </p> * <ul> * <li>Token</li> * <li>Sentence</li> * <li>POS</li> * </ul> * * Generated annotations: * <ul> * <li>Dependency (annotated over sentence-span)</li> * </ul> * * */ @TypeCapability( inputs={ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, outputs={ "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) public class MaltParser extends JCasAnnotator_ImplBase { private static final String UNUSED = "_"; /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Override the default variant used to locate the model. */ public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; /** * Load the model from this location instead of locating the model automatically. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; /** * Log the tag set(s) when a model is loaded. * * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; /** * Process anyway, even if the model relies on features that are not supported by this * component. * * Default: {@code false} */ public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") protected boolean ignoreMissingFeatures; // Not sure if we'll ever have to use different symbol tables // public static final String SYMBOL_TABLE = "symbolTableName"; // @ConfigurationParameter(name = SYMBOL_TABLE, mandatory = true, defaultValue = "DEPREL") private final String symbolTableName = "DEPREL"; private Logger logger; private SymbolTable symbolTable; private File workingDir; private CasConfigurableProviderBase<MaltParserService> modelProvider; private Set<String> features; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); logger = getContext().getLogger(); try { workingDir = File.createTempFile("maltparser", ".tmp"); workingDir.delete(); workingDir.mkdirs(); workingDir.deleteOnExit(); } catch (IOException e) { throw new ResourceInitializationException(e); } modelProvider = new ModelProviderBase<MaltParserService>(this, "maltparser", "parser") { private MaltParserService parser; { setDefault(VARIANT, "linear"); } @Override protected MaltParserService produceResource(URL aUrl) throws IOException { if (parser != null) { // Terminates the parser model try { parser.terminateParserModel(); parser = null; } catch (MaltChainedException e) { logger.log(Level.SEVERE, "MaltParser exception while terminating parser model: " + e.getMessage()); } } try { // Warn if the model uses features that we currently do not support features = getFeatures(aUrl); Set<String> unsupportedFeatures = new HashSet<String>(features); getLogger().info("Model uses these features: " + features); unsupportedFeatures.remove("FORM"); // we know covered text unsupportedFeatures.remove("LEMMA"); // we know lemma if lemmatizer ran before unsupportedFeatures.remove("POSTAG"); // we know POS tag if POS tagger ran before // CPOSTAG - only supported if we know a mapping from POSTAG to CPOSTAG (FIXME) // FEATS - not properly supported in DKPro Core yet! (FIXME) if (!unsupportedFeatures.isEmpty()) { String message = "Model these uses unsupported features: " + unsupportedFeatures; if (ignoreMissingFeatures) { getLogger().warn(message); } else { throw new IOException(message); } } // However, Maltparser is not happy at all if the model file does not have the right // name, so we are forced to create a temporary directory and place the file there. File modelFile = new File(workingDir, getRealName(aUrl)); if (!modelFile.exists()) { InputStream is = null; OutputStream os = null; try { is = aUrl.openStream(); os = new FileOutputStream(modelFile); IOUtils.copy(is, os); modelFile.deleteOnExit(); } finally { IOUtils.closeQuietly(is); IOUtils.closeQuietly(os); } } // Maltparser has a very odd way of finding out which command line options it supports. // By manually initializing the OptionManager before Maltparser tries it, we can work // around Maltparsers' own broken code. if (OptionManager.instance().getOptionContainerIndices().size() == 0) { OptionManager.instance().loadOptionDescriptionFile( MaltParserService.class.getResource("/appdata/options.xml")); OptionManager.instance().generateMaps(); } // Ok, now we can finally initialize the parser parser = new MaltParserService(); parser.initializeParserModel("-w " + workingDir + " -c " + modelFile.getName() + " -m parse"); // parser.initializeParserModel("-u " + modelUrl.toString() + " -m parse"); Properties metadata = getResourceMetaData(); PropertyAccessor paDirect = PropertyAccessorFactory.forDirectFieldAccess(parser); SingleMalt singleMalt = (SingleMalt) paDirect.getPropertyValue("singleMalt"); SingletonTagset posTags = new SingletonTagset( POS.class, metadata.getProperty("pos.tagset")); ParseSymbolTable posTagTable = (ParseSymbolTable) singleMalt.getSymbolTables() .getSymbolTable("POSTAG"); for (int i = 0; i < posTagTable.getValueCounter(); i++) { posTags.add(posTagTable.getSymbolCodeToString(i)); } posTags.remove("#null#"); // Technical symbol introduced in MaltParser 1.8 addTagset(posTags, false); SingletonTagset depTags = new SingletonTagset( Dependency.class, metadata.getProperty("dependency.tagset")); ParseSymbolTable depRelTable = (ParseSymbolTable) singleMalt.getSymbolTables() .getSymbolTable("DEPREL"); for (int i = 0; i < depRelTable.getValueCounter(); i++) { depTags.add(depRelTable.getSymbolCodeToString(i)); } depTags.remove("#null#"); // Technical symbol introduced in MaltParser 1.8 addTagset(depTags); if (printTagSet) { getContext().getLogger().log(INFO, getTagset().toString()); } return parser; } catch (MaltChainedException e) { logger.log(Level.SEVERE, "MaltParser exception while initializing parser model: " + e.getMessage()); throw new IOException(e); } } }; } /** * @see AnalysisComponent#collectionProcessComplete() */ @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { if (workingDir != null && workingDir.isDirectory()) { FileUtils.deleteQuietly(workingDir); } } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { modelProvider.configure(aJCas.getCas()); // Iterate over all sentences for (Sentence curSentence : select(aJCas, Sentence.class)) { // Generate list of tokens for current sentence List<Token> tokens = selectCovered(Token.class, curSentence); // Generate input format required by parser String[] parserInput = new String[tokens.size()]; for (int i = 0; i < parserInput.length; i++) { Token t = tokens.get(i); int id = i + 1; String form = t.getCoveredText(); String lemma = UNUSED; String cpostag = UNUSED; String postag = UNUSED; String feats = UNUSED; if (features.contains("LEMMA")) { if (t.getLemma() != null) { lemma = t.getLemma().getValue(); } else if (!ignoreMissingFeatures) { throw new IllegalStateException( "Model uses feature LEMMA but there is no lemma information in CAS"); } } // Actually, this cannot work, because we only know about the DKPro Core coarse // grained categories, which are most likely different from the coarse-grained // categories required by the model. We would need to include a mapping with the // model to recover the required coarse grained categories from the fine-grained // categories in POSTAG. if (features.contains("CPOSTAG")) { // if (t.getPos() != null) { // cpostag = t.getPos().getPosValue(); // } // else if (!ignoreMissingFeatures) { throw new IllegalStateException( "Model uses feature CPOSTAG but there is no part-of-speech information in CAS"); } } if (features.contains("POSTAG")) { if (t.getPos() != null) { postag = t.getPos().getPosValue(); } else if (!ignoreMissingFeatures) { throw new IllegalStateException( "Model uses feature POSTAG but there is no part-of-speech information in CAS"); } } if (features.contains("FEATS")) { if (t.getMorph() != null) { feats = t.getMorph().getValue(); } else if (!ignoreMissingFeatures) { throw new IllegalStateException( "Model uses feature FEATS but there is no morphology information in CAS"); } } // This only works for the English model. Other models have different input // formats. See http://www.maltparser.org/mco/mco.html parserInput[i] = String.format("%d\t%s\t%s\t%s\t%s\t%s", id, form, lemma, cpostag, postag, feats); } // Parse sentence DependencyStructure graph = null; try { // Parses the sentence graph = modelProvider.getResource().parse(parserInput); symbolTable = graph.getSymbolTables().getSymbolTable(symbolTableName); } catch (MaltChainedException e) { logger.log(Level.WARNING, "MaltParser exception while parsing sentence: " + e.getMessage(), e); // don't pass on exception - go on with next sentence continue; } /* * Generate annotations: NOTE: Index of token in tokenList corresponds to node in * DependencyGraph with NodeIndex+1 */ try { // iterate over all tokens in current sentence for (int i = 0; i < tokens.size(); i++) { // Start with Node 1 - we omit ROOT-dependencies, // because we don't have a ROOT-token. TokenNode curNode = graph.getTokenNode(i + 1); // iterate over all dependencies for current token for (Edge edge : curNode.getHeadEdges()) { int sourceIdx = edge.getSource().getIndex(); int targetIdx = edge.getTarget().getIndex(); // get corresponding token for node in DependencyGraph Token sourceToken = sourceIdx > 0 ? tokens.get(sourceIdx - 1) : null; Token targetToken = targetIdx > 0 ? tokens.get(targetIdx - 1) : null; // create dep-annotation for current edge if (sourceToken != null && targetToken != null) { Dependency dep = new Dependency(aJCas); dep.setDependencyType(edge.getLabelSymbol(symbolTable)); dep.setFlavor(DependencyFlavor.BASIC); dep.setGovernor(sourceToken); // TODO check if source=Governor dep.setDependent(targetToken); // TODO check if target=Dependent dep.setBegin(dep.getDependent().getBegin()); dep.setEnd(dep.getDependent().getEnd()); dep.addToIndexes(); } else if (targetToken != null && sourceToken == null) { Dependency dep = new ROOT(aJCas); // Trying to get the label triggers Exception dep.setDependencyType("ROOT"); dep.setFlavor(DependencyFlavor.BASIC); dep.setGovernor(targetToken); dep.setDependent(targetToken); dep.setBegin(dep.getDependent().getBegin()); dep.setEnd(dep.getDependent().getEnd()); dep.addToIndexes(); } else { throw new IllegalStateException("Source token must exist."); } } } } catch (MaltChainedException e) { logger.log(Level.WARNING, "MaltParser exception creating dependency annotations: " + e.getMessage(), e); // don't pass on exception - go on with next sentence continue; } } } private String getRealName(URL aUrl) throws IOException { JarEntry je = null; JarInputStream jis = null; try { jis = new JarInputStream(aUrl.openConnection().getInputStream()); while ((je = jis.getNextJarEntry()) != null) { String entryName = je.getName(); if (entryName.endsWith(".info")) { int indexUnderScore = entryName.lastIndexOf('_'); int indexSeparator = entryName.lastIndexOf(File.separator); if (indexSeparator == -1) { indexSeparator = entryName.lastIndexOf('/'); } if (indexSeparator == -1) { indexSeparator = entryName.lastIndexOf('\\'); } int indexDot = entryName.lastIndexOf('.'); if (indexUnderScore == -1 || indexDot == -1) { throw new IllegalStateException( "Could not find the configuration name and type from the URL '" + aUrl.toString() + "'. "); } return entryName.substring(indexSeparator+1, indexUnderScore) + ".mco"; } } throw new IllegalStateException( "Could not find the configuration name and type from the URL '" + aUrl.toString() + "'. "); } finally { IOUtils.closeQuietly(jis); } } private Set<String> getFeatures(URL aUrl) throws IOException { JarEntry je = null; JarInputStream jis = null; try { jis = new JarInputStream(aUrl.openConnection().getInputStream()); while ((je = jis.getNextJarEntry()) != null) { String entryName = je.getName(); if (entryName.endsWith(".info")) { Set<String> features = new HashSet<String>(); for (String line : IOUtils.readLines(jis, "UTF-8")) { if (line.contains("InputColumn(")) { int offset = line.indexOf("InputColumn("); while (offset >= 0) { int comma = line.indexOf(',', offset+1); features.add(line.substring(offset+12,comma).trim()); offset = line.indexOf("InputColumn(", comma); } } } return features; } } throw new IllegalStateException( "Could not find the configuration name and type from the URL '" + aUrl.toString() + "'. "); } finally { IOUtils.closeQuietly(jis); } } }