//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.language;
import java.util.Optional;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import com.google.common.collect.ImmutableSet;
import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.SharedWordNetResource;
import uk.gov.dstl.baleen.resources.utils.WordNetUtils;
import uk.gov.dstl.baleen.types.language.WordLemma;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Add the lemma form of the word to each WordToken (only if the WordToken has no lemmas already).
* <p>
* Uses WordNet, hence coverage will only be as good as the WordNet dictionary.
*
* @baleen.javadoc
*/
public class WordNetLemmatizer extends BaleenAnnotator {

  /**
   * Connection to Wordnet
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedWordNetResource
   */
  public static final String KEY_WORDNET = "wordnet";

  @ExternalResource(key = KEY_WORDNET)
  private SharedWordNetResource wordnet;

  /**
   * Walks over every WordToken in the document and, for each token that does not yet carry any
   * lemmas, looks the covered text up in WordNet and attaches the result as a single WordLemma.
   * <p>
   * Tokens are left untouched when they already have lemmas, when their part of speech cannot be
   * converted to a WordNet POS, or when WordNet has no entry for the word.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    for (final WordToken token : JCasUtil.select(jCas, WordToken.class)) {
      if (hasLemmas(token)) {
        // Never overwrite lemmas that are already present on the token
        continue;
      }

      final String coveredText = token.getCoveredText();
      final POS partOfSpeech = WordNetUtils.toPos(token.getPartOfSpeech());
      if (partOfSpeech == null) {
        // No usable WordNet part of speech, so there is nothing to look up
        continue;
      }

      final Optional<IndexWord> match = wordnet.lookupWord(partOfSpeech, coveredText);
      if (!match.isPresent()) {
        continue;
      }

      attachLemma(jCas, token, match.get());
    }
  }

  /**
   * Reports whether the token already holds at least one lemma.
   *
   * @param token the token to inspect
   * @return true if the token's lemma array is non-null and non-empty
   */
  private static boolean hasLemmas(WordToken token) {
    final FSArray existing = token.getLemmas();
    return existing != null && existing.size() > 0;
  }

  /**
   * Gives the token a fresh one-element lemma array whose only entry is a WordLemma carrying the
   * lemma of the supplied WordNet index word.
   *
   * @param jCas the document in which the new feature structures are created
   * @param token the token receiving the lemma
   * @param indexWord the WordNet entry supplying the lemma form
   */
  private static void attachLemma(JCas jCas, WordToken token, IndexWord indexWord) {
    token.setLemmas(new FSArray(jCas, 1));

    final WordLemma lemma = new WordLemma(jCas);
    lemma.setLemmaForm(indexWord.getLemma());

    token.setLemmas(0, lemma);
  }

  /**
   * Describes this annotator in terms of the annotation types involved: WordToken, which
   * doProcess reads, and WordLemma, which doProcess creates.
   *
   * @return the action built from those two type sets
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(WordToken.class),
        ImmutableSet.of(WordLemma.class));
  }
}