/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.matetools;
import static java.util.Arrays.asList;
import static org.apache.uima.util.Level.INFO;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.parser.MFO;
import is2.parser.Options;
import is2.parser.Parser;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;
/**
* DKPro Annotator for the MateToolsParser.
*
* <p>
* Please cite the following paper, if you use the parser: Bernd Bohnet. 2010. Top Accuracy and Fast
* Dependency Parsing is not a Contradiction. The 23rd International Conference on Computational
* Linguistics (COLING 2010), Beijing, China.
* </p>
*/
@TypeCapability(
inputs = {
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" },
outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class MateParser
extends JCasAnnotator_ImplBase
{
/**
* Use this language instead of the document language to resolve the model.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
protected String language;
/**
* Override the default variant used to locate the model.
*/
public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
@ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
protected String variant;
/**
* Load the model from this location instead of locating the model automatically.
*/
public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
@ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
protected String modelLocation;
/**
* Log the tag set(s) when a model is loaded.
*
* Default: {@code false}
*/
public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
@ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
protected boolean printTagSet;
/**
* Load the dependency to UIMA type mapping from this location instead of locating
* the mapping automatically.
*/
public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION;
@ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false)
protected String dependencyMappingLocation;
private CasConfigurableProviderBase<Parser> modelProvider;
private MappingProvider mappingProvider;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException
{
super.initialize(aContext);
modelProvider = new ModelProviderBase<Parser>(this, "matetools", "parser")
{
@Override
protected Parser produceResource(URL aUrl)
throws IOException
{
File modelFile = ResourceUtils.getUrlAsFile(aUrl, true);
String[] args = { "-model", modelFile.getPath() };
Options option = new Options(args);
Parser parser = new Parser(option); // create a parser
Properties metadata = getResourceMetaData();
HashMap<String, HashMap<String, Integer>> featureSet = MFO.getFeatureSet();
SingletonTagset posTags = new SingletonTagset(
POS.class, metadata.getProperty("pos.tagset"));
HashMap<String, Integer> posTagFeatures = featureSet.get("POS");
posTags.addAll(posTagFeatures.keySet());
posTags.removeAll(asList("<None>", "<root-POS>"));
addTagset(posTags);
SingletonTagset depTags = new SingletonTagset(
Dependency.class, metadata.getProperty("dependency.tagset"));
HashMap<String, Integer> depTagFeatures = featureSet.get("REL");
depTags.addAll(depTagFeatures.keySet());
depTags.removeAll(asList("<None>", "<no-type>", "<root-type>"));
addTagset(depTags);
if (printTagSet) {
getContext().getLogger().log(INFO, getTagset().toString());
}
return parser;
}
};
mappingProvider = MappingProviderFactory.createDependencyMappingProvider(
dependencyMappingLocation, language, modelProvider);
}
@Override
public void process(JCas jcas)
throws AnalysisEngineProcessException
{
CAS cas = jcas.getCas();
modelProvider.configure(cas);
mappingProvider.configure(cas);
for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence);
List<String> forms = new LinkedList<String>();
forms.add(CONLLReader09.ROOT);
forms.addAll(JCasUtil.toText(tokens));
List<String> lemmas = new LinkedList<String>();
List<String> posTags = new LinkedList<String>();
lemmas.add(CONLLReader09.ROOT_LEMMA);
posTags.add(CONLLReader09.ROOT_POS);
for (Token token : tokens) {
if (token.getLemma() != null) {
lemmas.add(token.getLemma().getValue());
}
else {
lemmas.add("_");
}
posTags.add(token.getPos().getPosValue());
}
SentenceData09 sd = new SentenceData09();
sd.init(forms.toArray(new String[forms.size()]));
sd.setLemmas(lemmas.toArray(new String[lemmas.size()]));
sd.setPPos(posTags.toArray(new String[posTags.size()]));
SentenceData09 parsed = modelProvider.getResource().apply(sd);
for (int i = 0; i < parsed.labels.length; i++) {
if (parsed.pheads[i] != 0) {
Token sourceToken = tokens.get(parsed.pheads[i] - 1);
Token targetToken = tokens.get(i);
Type depRel = mappingProvider.getTagType(parsed.plabels[i]);
Dependency dep = (Dependency) cas.createFS(depRel);
dep.setGovernor(sourceToken);
dep.setDependent(targetToken);
dep.setDependencyType(parsed.plabels[i]);
dep.setFlavor(DependencyFlavor.BASIC);
dep.setBegin(dep.getDependent().getBegin());
dep.setEnd(dep.getDependent().getEnd());
dep.addToIndexes();
}
else {
Token rootToken = tokens.get(i);
Dependency dep = new ROOT(jcas);
dep.setGovernor(rootToken);
dep.setDependent(rootToken);
dep.setDependencyType(parsed.plabels[i]);
dep.setFlavor(DependencyFlavor.BASIC);
dep.setBegin(dep.getDependent().getBegin());
dep.setEnd(dep.getDependent().getEnd());
dep.addToIndexes();
}
}
}
}
}