//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.language; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.io.FileUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ResourceInitializationException; import org.maltparser.concurrent.ConcurrentMaltParserModel; import org.maltparser.concurrent.ConcurrentMaltParserService; import org.maltparser.concurrent.graph.ConcurrentDependencyGraph; import org.maltparser.concurrent.graph.ConcurrentDependencyNode; import org.maltparser.core.exception.MaltChainedException; import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.language.Dependency; import uk.gov.dstl.baleen.types.language.Sentence; import uk.gov.dstl.baleen.types.language.WordLemma; import uk.gov.dstl.baleen.types.language.WordToken; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Uses a MaltParser to create a dependency grammar. * <p> * See http://www.maltparser.org/ for more details of the implementation. * <p> * The English language model of maltparser is trained on the Penn Treebank corpus, and as such it * is is not freely licensed. To avoid this this project contains an English model trained from the * English universal dependencies dataset (http://universaldependencies.org/docs/) where the * original data is licensed under https://creativecommons.org/licenses/by-sa/4.0/. As such the * training data is licensed under the same agreement. * <p> * The universal dependency model uses their own tags. This annotator converts between the UD and * standard tags. * <p> * The MaltParser appears to be fast, low memory use and stable. As all trained algorithms it will * function only as well as its training set. We found the original Penn Treebank to be * (subjectively) better than the Universal Dependency model. However if an algorithm requires only * dependency distance or an understanding of word linkage the universal dependency model functions * well enough. * <p> * The output of this annotator is Dependency annotations. * * @baleen.javadoc */ public class MaltParser extends BaleenAnnotator { private static final String INTJ = "INTJ"; private static final String PRON = "PRON"; private static final String PART = "PART"; private static final String PROPN = "PROPN"; private static final String ADP = "ADP"; private static final String X = "X"; private static final String NOUN = "NOUN"; private static final String ADV = "ADV"; private static final String DET = "DET"; private static final String NUM = "NUM"; private static final String VERB = "VERB"; private static final String CONJ = "CONJ"; private static final String ADJ = "ADJ"; private static final String SYM = "SYM"; private static final String PUNCT = "PUNCT"; private static final Map<String, String> PENN_TO_UNIVERSAL_TAGS = new HashMap<>(); /** * The model file, (.mco), to be loaded into the parser. * * If no file is provided, then the built in model (trained on Universal Dependency data) will be used. * * @baleen.config */ public static final String PARAM_FILE_NAME = "model"; @ConfigurationParameter(name = MaltParser.PARAM_FILE_NAME, defaultValue = "") private String modelFilename; /** * Convert to POS annotations to Universal Dependendency tags before input. * * This is required if the model is trained on a UD dataset. * * @baleen.config true */ public static final String PARAM_CONVERT_TO_UD = "udTags"; @ConfigurationParameter(name = MaltParser.PARAM_CONVERT_TO_UD, defaultValue = "true") private Boolean udTags; private ConcurrentMaltParserModel model; @Override public void doInitialize(final UimaContext aContext) throws ResourceInitializationException { super.doInitialize(aContext); File modelFile = null; if(!Strings.isNullOrEmpty(modelFilename)) modelFile = new File(modelFilename); if (modelFile == null || !modelFile.exists()) { // If the file doesn't exist then we will use try reading from the classpath. // Unfortunately Maltparser.doInitialise doesn't seem to like reading it from the Baleen shaded Jar // So we copy it our and delete it on exit getMonitor().info("Dependency model not provided or does not exist, using built in model"); InputStream is = getClass().getResourceAsStream("maltparser-universaldependencies-en.mco"); if (is != null) { try { modelFile = File.createTempFile("baleen", "maltpaser-model"); FileUtils.copyInputStreamToFile(is, modelFile); modelFile.deleteOnExit(); } catch (IOException e) { getMonitor().error("Unable to copy internal model {}", e); } } } try { model = ConcurrentMaltParserService.initializeParserModel(modelFile); } catch (final MaltChainedException | MalformedURLException e) { throw new ResourceInitializationException(e); } udTags = udTags == null ? true : udTags; } @Override protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException { for (final Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { final List<WordToken> wordTokens = JCasUtil.selectCovered(jCas, WordToken.class, sentence); final String[] tokens = new String[wordTokens.size()]; int i = 0; for (final WordToken wt : wordTokens) { final String pos = wt.getPartOfSpeech(); final String lemma = getLemma(wt); final String tag = udTags ? convertPennToUniversal(pos) : pos; tokens[i] = String.format("%d\t%s\t%s\t%s\t%s\t_", i + 1, wt.getCoveredText(), lemma, tag, pos); i++; } try { final ConcurrentDependencyGraph graph = model.parse(tokens); for (int j = 0; j < graph.nDependencyNodes(); j++) { final ConcurrentDependencyNode node = graph.getDependencyNode(j); if (node.hasHead()) { final Dependency dep = new Dependency(jCas); if (node.getHeadIndex() != 0) { dep.setGovernor(wordTokens.get(node.getHeadIndex() - 1)); final String label = node.getLabel(7); dep.setDependencyType(label); } else { dep.setGovernor(wordTokens.get(node.getIndex() - 1)); dep.setDependencyType("ROOT"); } dep.setDependent(wordTokens.get(node.getIndex() - 1)); dep.setBegin(dep.getDependent().getBegin()); dep.setEnd(dep.getDependent().getEnd()); addToJCasIndex(dep); } } } catch (final Exception e) { throw new AnalysisEngineProcessException(e); } } } /** * Gets the lemma. * * @param token * the token * @return the lemma */ private String getLemma(final WordToken token) { final FSArray array = token.getLemmas(); if (array == null || array.size() == 0) { return "_"; } else { return ((WordLemma) array.get(0)).getLemmaForm(); } } static { // See http://universaldependencies.github.io/docs/tagset-conversion/en-penn-uposf.html MaltParser.PENN_TO_UNIVERSAL_TAGS.put("#", SYM); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("$", SYM); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("\"", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put(",", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("-LRB-", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("-RRB-", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put(".", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put(":", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("AFX", ADJ); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("CC", CONJ); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("CD", NUM); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("DT", DET); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("EX", ADV); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("FW", X); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("HYPH", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("IN", ADP); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("JJ", ADJ); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("JJR", ADJ); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("JJS", ADJ); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("LS", PUNCT); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("MD", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NN", NOUN); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NNP", PROPN); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NNPS", PROPN); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NNS", NOUN); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("PDT", DET); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("POS", PART); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("PRP", PRON); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("PRP$", DET); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RB", ADV); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RBR", ADV); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RBS", ADV); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RP", PART); MaltParser.PENN_TO_UNIVERSAL_TAGS.put(SYM, SYM); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("TO", PART); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("UH", INTJ); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VB", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBD", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBG", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBN", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBP", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBZ", VERB); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WDT", DET); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WP", PRON); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WP$", DET); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WRB", ADV); MaltParser.PENN_TO_UNIVERSAL_TAGS.put("`", PUNCT); } /** * Convert penn to universal. * * @param tag * the tag * @return the string */ private String convertPennToUniversal(final String tag) { return MaltParser.PENN_TO_UNIVERSAL_TAGS.getOrDefault(tag, tag); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(WordToken.class, Sentence.class, WordLemma.class), ImmutableSet.of(Dependency.class)); } }