/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.clearnlp; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.toText; import static org.apache.uima.util.Level.INFO; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.util.List; import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import com.clearnlp.classification.model.StringModel; import com.clearnlp.component.AbstractComponent; import com.clearnlp.component.morph.EnglishMPAnalyzer; import com.clearnlp.component.pos.AbstractPOSTagger; import com.clearnlp.component.pos.DefaultPOSTagger; import com.clearnlp.component.pos.EnglishPOSTagger; import com.clearnlp.dependency.DEPTree; import com.clearnlp.nlp.NLPGetter; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableStreamProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * Part-of-Speech annotator using Clear NLP. Requires {@link Sentence}s to be annotated before. * */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) public class ClearNlpPosTagger extends JCasAnnotator_ImplBase { /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Override the default variant used to locate the dictionary. */ public static final String PARAM_DICT_VARIANT = "dictVariant"; @ConfigurationParameter(name = PARAM_DICT_VARIANT, mandatory = false) protected String dictVariant; /** * Load the dictionary from this location instead of locating the dictionary automatically. */ public static final String PARAM_DICT_LOCATION = "dictLocation"; @ConfigurationParameter(name = PARAM_DICT_LOCATION, mandatory = false) protected String dictLocation; /** * Override the default variant used to locate the pos-tagging model. */ public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String posVariant; /** * Load the model from this location instead of locating the pos-tagging model automatically. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String posModelLocation; /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the * mapping automatically. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid spaming * the heap with thousands of strings representing only a few different tags. */ public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") private boolean internTags; /** * Log the tag set(s) when a model is loaded. */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; private CasConfigurableProviderBase<InputStream> dictModelProvider; private CasConfigurableProviderBase<AbstractPOSTagger> posTaggingModelProvider; private MappingProvider posMappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); dictModelProvider = new CasConfigurableStreamProviderBase<InputStream>() { { setContextObject(ClearNlpPosTagger.this); setDefault(ARTIFACT_ID, "${groupId}.clearnlp-model-dictionary-${language}-${variant}"); setDefault(LOCATION, "classpath:/${package}/lib/dictionary-${language}-${variant}.properties"); setDefaultVariantsLocation("${package}/lib/dictionary-default-variants.map"); setDefault(VARIANT, "default"); setOverride(LOCATION, dictLocation); setOverride(LANGUAGE, language); setOverride(VARIANT, dictVariant); } @Override protected InputStream produceResource(InputStream aStream) throws Exception { ByteArrayOutputStream os = new ByteArrayOutputStream(); IOUtils.copy(aStream, os); byte[] array = os.toByteArray(); InputStream is = new ByteArrayInputStream(array); return is; } }; posTaggingModelProvider = new ModelProviderBase<AbstractPOSTagger>(this, "clearnlp", "tagger") { { setDefault(VARIANT, "ontonotes"); } @Override protected AbstractPOSTagger produceResource(InputStream aStream) throws Exception { BufferedInputStream bis = null; ObjectInputStream ois = null; GZIPInputStream gis = null; try { gis = new GZIPInputStream(aStream); bis = new BufferedInputStream(gis); ois = new ObjectInputStream(bis); String language = getAggregatedProperties().getProperty(LANGUAGE); AbstractPOSTagger tagger; if(language.equals("en")){ tagger = new DkproPosTagger(ois); }else{ tagger = new DefaultPOSTagger(ois); } SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() .getProperty(("pos.tagset"))); for (StringModel model : tagger.getModels()) { tags.addAll(asList(model.getLabels())); } addTagset(tags, true); if (printTagSet) { getContext().getLogger().log(INFO, getTagset().toString()); } return tagger; } catch (Exception e) { throw new IOException(e); } finally { closeQuietly(ois); closeQuietly(bis); closeQuietly(gis); } } }; posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, language, posTaggingModelProvider); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); dictModelProvider.configure(cas); posTaggingModelProvider.configure(cas); posMappingProvider.configure(cas); for (Sentence sentence : select(aJCas, Sentence.class)) { List<Token> tokens = selectCovered(aJCas, Token.class, sentence); List<String> tokenTexts = asList(toText(tokens).toArray(new String[tokens.size()])); DEPTree tree = NLPGetter.toDEPTree(tokenTexts); AbstractComponent tagger = posTaggingModelProvider.getResource(); tagger.process(tree); String[] posTags = tree.getPOSTags(); int i = 0; for (Token t : tokens) { String tag = internTags ? posTags[i + 1].intern() : posTags[i + 1]; Type posTag = posMappingProvider.getTagType(tag); POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); posAnno.setPosValue(tag); posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null : posAnno.getType().getShortName().intern()); posAnno.addToIndexes(); t.setPos(posAnno); i++; } } } private class DkproPosTagger extends EnglishPOSTagger{ public DkproPosTagger(ObjectInputStream in) { super(in); } @Override protected void initMorphologicalAnalyzer() { mp_analyzer = new EnglishMPAnalyzer(dictModelProvider.getResource()); } } }