// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.language;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.resources.SharedOpenNLPModel;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
* Annotate linguistic features using the OpenNLP libraries
*
* <p>
* The document content is passed through the OpenNLP Tokenizer, Sentence Detector, Part of Speech
* Tagger, and Chunker (in that order). The appropriate annotations and properties are added to the
* CAS, and associations between the relevant annotations (e.g. WordTokens associated with the
* Sentence) are made.
* </p>
*
* @baleen.javadoc
*/
public class OpenNLP extends BaleenTextAwareAnnotator {
/**
* OpenNLP Resource (Tokens)
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
*/
public static final String KEY_TOKEN = "tokens";
@ExternalResource(key = KEY_TOKEN)
SharedOpenNLPModel tokensModel;
/**
* OpenNLP Resource (Sentences)
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
*/
public static final String KEY_SENTENCES = "sentences";
@ExternalResource(key = KEY_SENTENCES)
SharedOpenNLPModel sentencesModel;
/**
* OpenNLP Resource (Part of Speech Tags)
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
*/
public static final String KEY_POS = "posTags";
@ExternalResource(key = KEY_POS)
SharedOpenNLPModel posModel;
/**
* OpenNLP Resource (Phrase Chunks)
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
*/
public static final String KEY_CHUNKS = "phraseChunks";
@ExternalResource(key = KEY_CHUNKS)
SharedOpenNLPModel chunkModel;
private SentenceDetectorME sentenceDetector;
private TokenizerME wordTokenizer;
private POSTaggerME posTagger;
private ChunkerME phraseChunker;
private final Set<String> prepositions = new HashSet<String>(
Arrays.asList("about", "above", "across", "against", "amid", "around", "at", "atop",
"behind", "below", "beneath", "beside", "between", "beyond", "by", "for", "from",
"down", "in", "including", "inside", "into", "mid", "near", "of", "off", "on", "onto",
"opposite", "out",
"outside", "over", "round", "through", "throughout", "to", "under", "underneath", "with",
"within", "without"));
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
try {
tokensModel.loadModel(TokenizerModel.class, getClass().getResourceAsStream("en_token.bin"));
sentencesModel.loadModel(SentenceModel.class, getClass().getResourceAsStream("en_sent.bin"));
posModel.loadModel(POSModel.class, getClass().getResourceAsStream("en_pos_maxent.bin"));
chunkModel.loadModel(ChunkerModel.class, getClass().getResourceAsStream("en_chunker.bin"));
} catch (BaleenException be) {
getMonitor().error("Unable to load OpenNLP Language Models", be);
throw new ResourceInitializationException(be);
}
try {
sentenceDetector = new SentenceDetectorME((SentenceModel) sentencesModel.getModel());
wordTokenizer = new TokenizerME((TokenizerModel) tokensModel.getModel());
posTagger = new POSTaggerME((POSModel) posModel.getModel());
phraseChunker = new ChunkerME((ChunkerModel) chunkModel.getModel());
} catch (Exception e) {
getMonitor().error("Unable to create OpenNLP taggers", e);
throw new ResourceInitializationException(e);
}
}
@Override
protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
List<Sentence> sentences = createBaseSentences(block);
for (Sentence sentence : sentences) {
List<WordToken> wordTokens = addSentenceWordTokensWithPosTags(sentence, block);
addSentencePhraseChunk(wordTokens, block);
}
}
@Override
public void doDestroy() {
tokensModel = null;
sentencesModel = null;
posModel = null;
chunkModel = null;
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(WordToken.class, PhraseChunk.class, Sentence.class));
}
/**
* Use the OpenNLP Sentence Detector to detect sentences and add them to the JCas index
*/
private List<Sentence> createBaseSentences(TextBlock block)
throws AnalysisEngineProcessException {
List<Sentence> sentences = new ArrayList<>();
try {
String text = block.getCoveredText();
Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);
for (Span sentSpan : sentenceSpans) {
Sentence sent =
block.newAnnotation(Sentence.class, sentSpan.getStart(), sentSpan.getEnd());
addToJCasIndex(sent);
sentences.add(sent);
}
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
return sentences;
}
/**
* Use the OpenNLP Word Tokenizer and POS Tagger to produce word tokens for each sentence and add
* them to the JCas index
*/
private List<WordToken> addSentenceWordTokensWithPosTags(Sentence sentIn,
TextBlock block)
throws AnalysisEngineProcessException {
List<WordToken> wordTokens = new ArrayList<>();
try {
String sentValue = sentIn.getCoveredText();
if (isUpperCase(sentValue)) {
// The sentence model was trained on mixed-case text, and assumes upper-case words are
// proper nouns.
// If the sentence is entirely upper-case, then make it lower case to improve accuracy.
sentValue = sentValue.toLowerCase();
}
Span[] tokens = wordTokenizer.tokenizePos(sentValue);
String[] words = new String[tokens.length];
for (int a = 0; a < tokens.length; a++) {
words[a] = tokens[a].getCoveredText(sentValue).toString();
}
String[] posTags = posTagger.tag(words);
for (int a = 0; a < tokens.length; a++) {
Span wordSpan = tokens[a];
WordToken wordToken = new WordToken(block.getJCas());
// No need to use the block offset, as we are offseting relative to sentence
wordToken.setBegin(sentIn.getBegin() + wordSpan.getStart());
wordToken.setEnd(sentIn.getBegin() + wordSpan.getEnd());
wordToken.setSentenceOrder(a);
wordToken.setPartOfSpeech(posTags[a]);
addToJCasIndex(wordToken);
wordTokens.add(wordToken);
}
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
return wordTokens;
}
/**
* Add phrase chunks and POS tags to a sentence
*/
private void addSentencePhraseChunk(List<WordToken> tokenList, TextBlock block) {
List<PhraseChunk> sentPhraseChunks = new ArrayList<PhraseChunk>();
String[] tokens = new String[tokenList.size()];
String[] posTags = new String[tokenList.size()];
int ix = 0;
for (WordToken token : tokenList) {
tokens[ix] = token.getCoveredText();
posTags[ix] = token.getPartOfSpeech();
ix++;
}
Span[] result = phraseChunker.chunkAsSpans(tokens, posTags);
for (Span element : result) {
PhraseChunk chunk = new PhraseChunk(block.getJCas());
chunk.setBegin( tokenList.get(element.getStart()).getBegin() );
chunk.setEnd( tokenList.get(element.getEnd() - 1).getEnd() );
chunk.setChunkType(element.getType());
chunk = addPhraseWordsAndHead(chunk, block);
addToJCasIndex(chunk);
sentPhraseChunks.add(chunk);
}
}
/**
* Add constituent words and the head word to a PhraseChunk
*/
private PhraseChunk addPhraseWordsAndHead(PhraseChunk chunk, TextBlock block) {
List<WordToken> constituentWords = new ArrayList<WordToken>();
for (WordToken word : JCasUtil.selectCovered(block.getJCas(), WordToken.class, chunk)) {
constituentWords.add(word);
}
chunk.setConstituentWords(FSCollectionFactory.createFSArray(block.getJCas(), constituentWords));
int headWordId = constituentWords.size() - 1;
// Run through prior words, check for propositional - if so skip, if not break
for (int a = constituentWords.size() - 2; a > 1; a--) {
WordToken wtA = constituentWords.get(a);
// If a POS tag or word value is prepositional, end increment head word index
if ("IN".equals(wtA.getPartOfSpeech()) || ",".equals(wtA.getPartOfSpeech())
|| prepositions.contains(wtA.getCoveredText())) {
headWordId = a - 1;
} else {
headWordId = a;
break;
}
}
chunk.setHeadWord(constituentWords.get(headWordId));
return chunk;
}
private static boolean isUpperCase(String s) {
for (char c : s.toCharArray()) {
if (Character.isLetter(c) && Character.isLowerCase(c)) {
return false;
}
}
return true;
}
}