/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.lingpipe;
import static java.util.Arrays.asList;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.fit.util.JCasUtil.toText;
import static org.apache.uima.util.Level.INFO;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.List;
import java.util.Locale;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.tag.Tagging;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * LingPipe part-of-speech tagger.
 * <p>
 * For every {@link Sentence} in the CAS, the covered {@link Token}s are tagged using a LingPipe
 * {@link HmmDecoder}. For each tagged token a {@link POS} annotation (of the type chosen by the
 * part-of-speech mapping) is created, added to the indexes and set on the token.
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" })
public class LingPipePosTagger
    extends JCasAnnotator_ImplBase
{
    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Load the model from this location instead of locating the model automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false")
    protected boolean printTagSet;

    /**
     * Lingpipe models tend to be trained on lower-case tags, but our POS mappings use uppercase.
     * When enabled, tags are converted to upper case both when the tagset of a loaded model is
     * recorded and when tags are assigned to tokens.
     *
     * Default: {@code true}
     */
    public static final String PARAM_UPPERCASE_TAGS = "uppercaseTags";
    @ConfigurationParameter(name = PARAM_UPPERCASE_TAGS, mandatory = true, defaultValue="true")
    protected boolean uppercaseTags;

    private CasConfigurableProviderBase<HmmDecoder> modelProvider;
    private MappingProvider mappingProvider;

    /**
     * Sets up the model provider, which deserializes the LingPipe HMM and records its tagset,
     * and the mapping provider used to resolve tags to UIMA types.
     *
     * @param aContext
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the super class fails to initialize.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        modelProvider = new ModelProviderBase<HmmDecoder>(this, "lingpipe", "tagger") {
            @Override
            protected HmmDecoder produceResource(InputStream aStream)
                throws Exception
            {
                // NOTE(review): this uses Java-native deserialization; models should only be
                // loaded from trusted locations.
                // NOTE(review): the stream is not closed here; presumably whoever opened
                // aStream is responsible for closing it - confirm.
                ObjectInputStream ois = new ObjectInputStream(aStream);
                HiddenMarkovModel hmm = (HiddenMarkovModel) ois.readObject();

                // Record every state symbol of the HMM as a tag of the model's tagset, using
                // the same case normalization that is applied to the tags during processing.
                SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData()
                        .getProperty("pos.tagset"));
                int tagCount = hmm.stateSymbolTable().numSymbols();
                for (int n = 0; n < tagCount; n++) {
                    tags.add(LingPipePosTagger.this.applyTagCase(
                            hmm.stateSymbolTable().idToSymbol(n)));
                }
                addTagset(tags);

                if (printTagSet) {
                    getContext().getLogger().log(INFO, getTagset().toString());
                }

                return new HmmDecoder(hmm);
            }
        };

        mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                language, modelProvider);
    }

    /**
     * Tags the tokens of each sentence and attaches a part-of-speech annotation to every token.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();

        modelProvider.configure(cas);
        mappingProvider.configure(cas);

        // Fetch the decoder once per CAS instead of once per sentence.
        HmmDecoder decoder = modelProvider.getResource();

        for (Sentence sentence : select(aJCas, Sentence.class)) {
            List<Token> tokens = selectCovered(aJCas, Token.class, sentence);
            if (tokens.isEmpty()) {
                // Nothing to tag in this sentence, so do not bother the decoder.
                continue;
            }

            // toText already yields a list of the covered texts; no array round trip needed.
            Tagging<String> tagging = decoder.tag(toText(tokens));

            for (int n = 0; n < tagging.size(); n++) {
                Token t = tokens.get(n);
                String tag = applyTagCase(tagging.tag(n));

                Type posTag = mappingProvider.getTagType(tag);
                POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd());
                posAnno.setPosValue(internTags ? tag.intern() : tag);
                // The coarse value is only set when the mapping selected a sub-type of POS.
                posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
                        : posAnno.getType().getShortName().intern());
                posAnno.addToIndexes();
                t.setPos(posAnno);
            }
        }
    }

    /**
     * Applies the configured case normalization to a tag coming from the LingPipe model.
     *
     * @param aTag
     *            a tag as provided by the model.
     * @return the tag in upper case if {@link #PARAM_UPPERCASE_TAGS} is enabled, otherwise the
     *         tag unchanged.
     */
    private String applyTagCase(String aTag)
    {
        return uppercaseTags ? aTag.toUpperCase(Locale.US) : aTag;
    }
}