MaltParser.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.language;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.maltparser.concurrent.ConcurrentMaltParserModel;
import org.maltparser.concurrent.ConcurrentMaltParserService;
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph;
import org.maltparser.concurrent.graph.ConcurrentDependencyNode;
import org.maltparser.core.exception.MaltChainedException;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.Dependency;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordLemma;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Uses a MaltParser to create a dependency grammar.
 * <p>
 * See http://www.maltparser.org/ for more details of the implementation.
 * <p>
 * The English language model of maltparser is trained on the Penn Treebank corpus, and as such it
 * is not freely licensed. To avoid this, this project contains an English model trained from the
 * English universal dependencies dataset (http://universaldependencies.org/docs/) where the
 * original data is licensed under https://creativecommons.org/licenses/by-sa/4.0/. As such the
 * training data is licensed under the same agreement.
 * <p>
 * The universal dependency model uses their own tags. This annotator converts between the UD and
 * standard tags.
 * <p>
 * The MaltParser appears to be fast, low memory use and stable. As all trained algorithms it will
 * function only as well as its training set. We found the original Penn Treebank to be
 * (subjectively) better than the Universal Dependency model. However if an algorithm requires only
 * dependency distance or an understanding of word linkage the universal dependency model functions
 * well enough.
 * <p>
 * The output of this annotator is Dependency annotations.
 *
 * @baleen.javadoc
 */
public class MaltParser extends BaleenAnnotator {
	private static final String INTJ = "INTJ";
	private static final String PRON = "PRON";
	private static final String PART = "PART";
	private static final String PROPN = "PROPN";
	private static final String ADP = "ADP";
	private static final String X = "X";
	private static final String NOUN = "NOUN";
	private static final String ADV = "ADV";
	private static final String DET = "DET";
	private static final String NUM = "NUM";
	private static final String VERB = "VERB";
	private static final String CONJ = "CONJ";
	private static final String ADJ = "ADJ";
	private static final String SYM = "SYM";
	private static final String PUNCT = "PUNCT";

	/** Placeholder written into a column of the parser input when no value is available. */
	private static final String EMPTY_COLUMN = "_";

	/** Dependency type assigned to a token whose head is the artificial root node (head index 0). */
	private static final String ROOT = "ROOT";

	/** Classpath resource (relative to this class) holding the built in Universal Dependencies model. */
	private static final String BUILT_IN_MODEL = "maltparser-universaldependencies-en.mco";

	/**
	 * Column of the parsed node from which the dependency label is read.
	 * NOTE(review): presumably the DEPREL column of the CoNLL-X layout (0-based index 7) - confirm
	 * against the data format of the model in use.
	 */
	private static final int DEPENDENCY_LABEL_COLUMN = 7;

	/** Lookup from Penn Treebank part of speech tags to Universal Dependencies tags. */
	private static final Map<String, String> PENN_TO_UNIVERSAL_TAGS = new HashMap<>();

	static {
		// See http://universaldependencies.github.io/docs/tagset-conversion/en-penn-uposf.html
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("#", SYM);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("$", SYM);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("\"", PUNCT);
		// The Penn Treebank tags for closing and opening quotes are '' and `` respectively;
		// the single character variants are retained for backwards compatibility.
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("''", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put(",", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("-LRB-", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("-RRB-", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put(".", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put(":", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("AFX", ADJ);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("CC", CONJ);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("CD", NUM);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("DT", DET);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("EX", ADV);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("FW", X);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("HYPH", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("IN", ADP);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("JJ", ADJ);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("JJR", ADJ);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("JJS", ADJ);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("LS", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("MD", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NN", NOUN);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NNP", PROPN);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NNPS", PROPN);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("NNS", NOUN);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("PDT", DET);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("POS", PART);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("PRP", PRON);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("PRP$", DET);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RB", ADV);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RBR", ADV);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RBS", ADV);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("RP", PART);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put(SYM, SYM);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("TO", PART);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("UH", INTJ);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VB", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBD", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBG", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBN", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBP", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("VBZ", VERB);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WDT", DET);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WP", PRON);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WP$", DET);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("WRB", ADV);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("`", PUNCT);
		MaltParser.PENN_TO_UNIVERSAL_TAGS.put("``", PUNCT);
	}

	/**
	 * The model file, (.mco), to be loaded into the parser.
	 *
	 * If no file is provided, then the built in model (trained on Universal Dependency data) will be used.
	 *
	 * @baleen.config
	 */
	public static final String PARAM_FILE_NAME = "model";
	@ConfigurationParameter(name = MaltParser.PARAM_FILE_NAME, defaultValue = "")
	private String modelFilename;

	/**
	 * Convert the POS annotations to Universal Dependency tags before input.
	 *
	 * This is required if the model is trained on a UD dataset.
	 *
	 * @baleen.config true
	 */
	public static final String PARAM_CONVERT_TO_UD = "udTags";
	@ConfigurationParameter(name = MaltParser.PARAM_CONVERT_TO_UD, defaultValue = "true")
	private Boolean udTags;

	private ConcurrentMaltParserModel model;

	/**
	 * Loads the parser model, either from the configured file or, when no file is configured or
	 * the file does not exist, from the model bundled on the classpath.
	 *
	 * @param aContext
	 *            the UIMA context, passed on to the superclass
	 * @throws ResourceInitializationException
	 *             if the built in model cannot be extracted or the parser model cannot be
	 *             initialised
	 */
	@Override
	public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
		super.doInitialize(aContext);

		File modelFile = null;
		if (!Strings.isNullOrEmpty(modelFilename)) {
			modelFile = new File(modelFilename);
		}

		if (modelFile == null || !modelFile.exists()) {
			// If the file doesn't exist then we try reading the model from the classpath.
			getMonitor().info("Dependency model not provided or does not exist, using built in model");
			modelFile = extractBuiltInModel();
		}

		try {
			model = ConcurrentMaltParserService.initializeParserModel(modelFile);
		} catch (final MaltChainedException | MalformedURLException e) {
			throw new ResourceInitializationException(e);
		}

		// Guard against the parameter not having been injected, defaulting to UD conversion.
		udTags = udTags == null ? true : udTags;
	}

	/**
	 * Copies the built in model from the classpath to a temporary file.
	 * <p>
	 * Unfortunately MaltParser doesn't seem to like reading the model from the Baleen shaded Jar,
	 * so we copy it out and delete it on exit.
	 *
	 * @return the temporary file containing the built in model
	 * @throws ResourceInitializationException
	 *             if the model is not on the classpath or cannot be copied; failing here avoids
	 *             initialising the parser with a missing or partially written model
	 */
	private File extractBuiltInModel() throws ResourceInitializationException {
		try (InputStream is = getClass().getResourceAsStream(BUILT_IN_MODEL)) {
			if (is == null) {
				throw new IOException("Built in model " + BUILT_IN_MODEL + " not found on classpath");
			}

			final File tempFile = File.createTempFile("baleen", "maltpaser-model");
			// Register for deletion before copying so the file is cleaned up even if the copy fails.
			tempFile.deleteOnExit();
			FileUtils.copyInputStreamToFile(is, tempFile);
			return tempFile;
		} catch (final IOException e) {
			throw new ResourceInitializationException(e);
		}
	}

	/**
	 * Parses every sentence in the document and adds a Dependency annotation for each token that
	 * the parser attaches to a head.
	 *
	 * @param jCas
	 *            the document to process
	 * @throws AnalysisEngineProcessException
	 *             if parsing a sentence or creating its annotations fails
	 */
	@Override
	protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
		for (final Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
			final List<WordToken> wordTokens = JCasUtil.selectCovered(jCas, WordToken.class, sentence);
			if (wordTokens.isEmpty()) {
				// Nothing to parse, and no dependencies could be created anyway.
				continue;
			}

			try {
				final ConcurrentDependencyGraph graph = model.parse(toParserInput(wordTokens));
				addDependencies(jCas, wordTokens, graph);
			} catch (final Exception e) {
				throw new AnalysisEngineProcessException(e);
			}
		}
	}

	/**
	 * Builds the tab separated parser input, one line per token, with the columns: 1-based index,
	 * covered text, lemma, (optionally converted) tag, original part of speech and a trailing
	 * placeholder.
	 *
	 * @param wordTokens
	 *            the tokens of a single sentence, in order
	 * @return one input line per token
	 */
	private String[] toParserInput(final List<WordToken> wordTokens) {
		final String[] lines = new String[wordTokens.size()];

		int index = 0;
		for (final WordToken wt : wordTokens) {
			final String pos = wt.getPartOfSpeech();
			final String tag = udTags ? convertPennToUniversal(pos) : pos;
			// Plain concatenation rather than String.format("%d") so the index is always written
			// with ASCII digits, regardless of the default locale.
			lines[index] = (index + 1) + "\t" + wt.getCoveredText() + "\t" + getLemma(wt) + "\t" + tag + "\t" + pos
					+ "\t" + EMPTY_COLUMN;
			index++;
		}

		return lines;
	}

	/**
	 * Creates a Dependency annotation for every node of the graph that has a head and adds it to
	 * the index.
	 *
	 * @param jCas
	 *            the document the annotations belong to
	 * @param wordTokens
	 *            the tokens that were parsed; parser indices are 1-based into this list
	 * @param graph
	 *            the parse of the sentence
	 * @throws MaltChainedException
	 *             if the graph cannot be read
	 */
	private void addDependencies(final JCas jCas, final List<WordToken> wordTokens,
			final ConcurrentDependencyGraph graph) throws MaltChainedException {
		for (int j = 0; j < graph.nDependencyNodes(); j++) {
			final ConcurrentDependencyNode node = graph.getDependencyNode(j);
			if (!node.hasHead()) {
				continue;
			}

			final WordToken dependent = wordTokens.get(node.getIndex() - 1);
			final Dependency dep = new Dependency(jCas);
			if (node.getHeadIndex() != 0) {
				dep.setGovernor(wordTokens.get(node.getHeadIndex() - 1));
				dep.setDependencyType(node.getLabel(DEPENDENCY_LABEL_COLUMN));
			} else {
				// Head index 0 has no corresponding token, so the token governs itself as ROOT.
				dep.setGovernor(dependent);
				dep.setDependencyType(ROOT);
			}
			dep.setDependent(dependent);
			dep.setBegin(dependent.getBegin());
			dep.setEnd(dependent.getEnd());
			addToJCasIndex(dep);
		}
	}

	/**
	 * Gets the lemma form of the first lemma of the token.
	 *
	 * @param token
	 *            the token
	 * @return the lemma form, or "_" if the token has no lemmas or the first lemma has no form
	 */
	private String getLemma(final WordToken token) {
		final FSArray array = token.getLemmas();
		if (array == null || array.size() == 0) {
			return EMPTY_COLUMN;
		}

		final String lemmaForm = ((WordLemma) array.get(0)).getLemmaForm();
		// A missing form would otherwise be written as the literal "null" or as an empty column.
		return Strings.isNullOrEmpty(lemmaForm) ? EMPTY_COLUMN : lemmaForm;
	}

	/**
	 * Converts a Penn Treebank tag to its Universal Dependencies equivalent.
	 *
	 * @param tag
	 *            the Penn Treebank tag
	 * @return the mapped tag, or the tag itself if there is no mapping for it
	 */
	private String convertPennToUniversal(final String tag) {
		return MaltParser.PENN_TO_UNIVERSAL_TAGS.getOrDefault(tag, tag);
	}

	/**
	 * Declares the annotation types this annotator is constructed with (first set) and the
	 * annotation types it produces (second set).
	 *
	 * @return the action describing the types passed in and created
	 */
	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(ImmutableSet.of(WordToken.class, Sentence.class, WordLemma.class), ImmutableSet.of(Dependency.class));
	}
}