/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.dictionaryannotator;
import static org.apache.uima.fit.util.CasUtil.getType;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
* Takes a plain text file with phrases as input and annotates the phrases in the CAS file. The
* annotation type defaults to {@link NGram}, but can be changed.
*
* The component requires that {@link Token}s and {@link Sentence}s are annotated in the CAS.
*
* The format of the phrase file is one phrase per line, tokens are separated by space:
*
* <pre>
* this is a phrase
* another phrase
* </pre>
*
*/
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" })
public class DictionaryAnnotator
    extends JCasAnnotator_ImplBase
{
    /**
     * The file must contain one phrase per line - phrases will be split at " "
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true)
    private String phraseFile;

    /**
     * The character encoding used by the model.
     */
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue="UTF-8")
    private String modelEncoding;

    /**
     * The annotation to create on matching phrases. If nothing is specified, this defaults to
     * {@link NGram}.
     */
    public static final String PARAM_ANNOTATION_TYPE = "annotationType";
    @ConfigurationParameter(name = PARAM_ANNOTATION_TYPE, mandatory = false)
    private String annotationType;

    /**
     * Set this feature on the created annotations.
     */
    public static final String PARAM_VALUE_FEATURE = "valueFeature";
    @ConfigurationParameter(name = PARAM_VALUE_FEATURE, mandatory = false, defaultValue = "value")
    private String valueFeature;

    /**
     * The value to set the feature configured in {@link #PARAM_VALUE_FEATURE} to.
     */
    public static final String PARAM_VALUE = "value";
    @ConfigurationParameter(name = PARAM_VALUE, mandatory = false)
    private String value;

    /** Phrases loaded from the phrase file during {@link #initialize(UimaContext)}. */
    private PhraseTree phrases;

    /**
     * Resolves the phrase file, reads it with the configured encoding and stores every non-blank
     * line, split at single spaces, as a phrase. Falls back to {@link NGram} as annotation type
     * if none was configured.
     *
     * @param aContext
     *            the UIMA context, also used to resolve the phrase file location.
     * @throws ResourceInitializationException
     *             if the phrase file cannot be resolved or read.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        if (annotationType == null) {
            annotationType = NGram.class.getName();
        }

        phrases = new PhraseTree();

        InputStream is = null;
        try {
            URL phraseFileUrl = ResourceUtils.resolveLocation(phraseFile, aContext);
            is = phraseFileUrl.openStream();
            for (String inputLine : IOUtils.readLines(is, modelEncoding)) {
                // A blank line carries no phrase; splitting it would register an empty
                // (or zero-length) phrase, so skip it.
                if (inputLine.trim().isEmpty()) {
                    continue;
                }
                String[] phraseSplit = inputLine.split(" ");
                phrases.addPhrase(phraseSplit);
            }
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
        finally {
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * For every sentence, looks up the longest phrase starting at each token position and creates
     * an annotation of the configured type spanning the matched tokens. If both a value feature
     * and a value are configured, the feature is set on each created annotation.
     *
     * @param jcas
     *            the CAS to annotate; must contain {@link Sentence} and {@link Token} annotations.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method.
     * @throws IllegalArgumentException
     *             if the configured value feature is not declared on the annotation type.
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        Type type = getType(jcas.getCas(), annotationType);

        // The feature is only looked up (and later set) when both its name and a value are given.
        Feature f = null;
        if ((valueFeature != null) && (value != null)) {
            f = type.getFeatureByBaseName(valueFeature);
            if (f == null) {
                throw new IllegalArgumentException("Undeclared feature [" + valueFeature
                        + "] in type [" + annotationType + "]");
            }
        }

        for (Sentence currSentence : select(jcas, Sentence.class)) {
            List<Token> tokens = new ArrayList<Token>(selectCovered(Token.class, currSentence));

            // Extract the token texts once per sentence instead of once per start position.
            String[] tokenTexts = new String[tokens.size()];
            for (int i = 0; i < tokenTexts.length; i++) {
                tokenTexts[i] = tokens.get(i).getCoveredText();
            }

            for (int i = 0; i < tokenTexts.length; i++) {
                // Window from token i up to AND including the last token of the sentence.
                // The array is sized exactly to the window so that no trailing null entries
                // are handed to the phrase lookup.
                String[] sentenceToEnd = new String[tokenTexts.length - i];
                System.arraycopy(tokenTexts, i, sentenceToEnd, 0, sentenceToEnd.length);

                // A null result is treated as "no phrase starts at this token".
                String[] longestMatch = phrases.getLongestMatch(sentenceToEnd);

                if (longestMatch != null) {
                    Token beginToken = tokens.get(i);
                    Token endToken = tokens.get(i + longestMatch.length - 1);

                    AnnotationFS newFound = jcas.getCas().createAnnotation(type,
                            beginToken.getBegin(), endToken.getEnd());

                    if (f != null) {
                        newFound.setFeatureValueFromString(f, value);
                    }

                    jcas.getCas().addFsToIndexes(newFound);
                }
            }
        }
    }
}