// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.language;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.resources.SharedOpenNLPModel;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * Annotate linguistic features using the OpenNLP libraries
 *
 * <p>
 * The document content is passed through the OpenNLP Sentence Detector, Tokenizer, Part of Speech
 * Tagger, and Chunker (in that order). The appropriate annotations and properties are added to the
 * CAS, and associations between the relevant annotations (e.g. WordTokens associated with the
 * Sentence) are made.
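 * </p>
 *
 * <p>
 * For illustration, a minimal pipeline configuration using this annotator might look as follows
 * (a sketch, assuming Baleen's standard YAML pipeline format, in which annotator class names are
 * resolved relative to uk.gov.dstl.baleen.annotators):
 * </p>
 *
 * <pre>
 * annotators:
 *   - language.OpenNLP
 * </pre>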
 *
 * @baleen.javadoc
 */
public class OpenNLP extends BaleenTextAwareAnnotator {

  /**
   * OpenNLP Resource (Tokens)
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
   */
  public static final String KEY_TOKEN = "tokens";

  @ExternalResource(key = KEY_TOKEN)
  SharedOpenNLPModel tokensModel;

  /**
   * OpenNLP Resource (Sentences)
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
   */
  public static final String KEY_SENTENCES = "sentences";

  @ExternalResource(key = KEY_SENTENCES)
  SharedOpenNLPModel sentencesModel;

  /**
   * OpenNLP Resource (Part of Speech Tags)
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
   */
  public static final String KEY_POS = "posTags";

  @ExternalResource(key = KEY_POS)
  SharedOpenNLPModel posModel;

  /**
   * OpenNLP Resource (Phrase Chunks)
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
   */
  public static final String KEY_CHUNKS = "phraseChunks";

  @ExternalResource(key = KEY_CHUNKS)
  SharedOpenNLPModel chunkModel;

  private SentenceDetectorME sentenceDetector;
  private TokenizerME wordTokenizer;
  private POSTaggerME posTagger;
  private ChunkerME phraseChunker;

  private final Set<String> prepositions =
      new HashSet<>(
          Arrays.asList(
              "about", "above", "across", "against", "amid", "around", "at", "atop", "behind",
              "below", "beneath", "beside", "between", "beyond", "by", "for", "from", "down",
              "in", "including", "inside", "into", "mid", "near", "of", "off", "on", "onto",
              "opposite", "out", "outside", "over", "round", "through", "throughout", "to",
              "under", "underneath", "with", "within", "without"));

  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    try {
      tokensModel.loadModel(TokenizerModel.class, getClass().getResourceAsStream("en_token.bin"));
      sentencesModel.loadModel(SentenceModel.class, getClass().getResourceAsStream("en_sent.bin"));
      posModel.loadModel(POSModel.class, getClass().getResourceAsStream("en_pos_maxent.bin"));
      chunkModel.loadModel(ChunkerModel.class, getClass().getResourceAsStream("en_chunker.bin"));
    } catch (BaleenException be) {
      getMonitor().error("Unable to load OpenNLP Language Models", be);
      throw new ResourceInitializationException(be);
    }

    try {
      sentenceDetector = new SentenceDetectorME((SentenceModel) sentencesModel.getModel());
      wordTokenizer = new TokenizerME((TokenizerModel) tokensModel.getModel());
      posTagger = new POSTaggerME((POSModel) posModel.getModel());
      phraseChunker = new ChunkerME((ChunkerModel) chunkModel.getModel());
    } catch (Exception e) {
      getMonitor().error("Unable to create OpenNLP taggers", e);
      throw new ResourceInitializationException(e);
    }
  }

  @Override
  protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    List<Sentence> sentences = createBaseSentences(block);

    for (Sentence sentence : sentences) {
      List<WordToken> wordTokens = addSentenceWordTokensWithPosTags(sentence, block);
      addSentencePhraseChunk(wordTokens, block);
    }
  }

  @Override
  public void doDestroy() {
    tokensModel = null;
    sentencesModel = null;
    posModel = null;
    chunkModel = null;
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        Collections.emptySet(),
        ImmutableSet.of(WordToken.class, PhraseChunk.class, Sentence.class));
  }
  /**
   * Use the OpenNLP Sentence Detector to detect sentences and add them to the JCas index
   */
  private List<Sentence> createBaseSentences(TextBlock block)
      throws AnalysisEngineProcessException {
    List<Sentence> sentences = new ArrayList<>();

    try {
      String text = block.getCoveredText();
      Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);
      for (Span sentSpan : sentenceSpans) {
        Sentence sent =
            block.newAnnotation(Sentence.class, sentSpan.getStart(), sentSpan.getEnd());
        addToJCasIndex(sent);
        sentences.add(sent);
      }
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }

    return sentences;
  }

  /**
   * Use the OpenNLP Word Tokenizer and POS Tagger to produce word tokens for each sentence and
   * add them to the JCas index
   */
  private List<WordToken> addSentenceWordTokensWithPosTags(Sentence sentIn, TextBlock block)
      throws AnalysisEngineProcessException {
    List<WordToken> wordTokens = new ArrayList<>();

    try {
      String sentValue = sentIn.getCoveredText();
      if (isUpperCase(sentValue)) {
        // The OpenNLP models were trained on mixed-case text, and tend to assume upper-case
        // words are proper nouns. If the sentence is entirely upper-case, make it lower case
        // to improve accuracy.
        sentValue = sentValue.toLowerCase();
      }

      Span[] tokens = wordTokenizer.tokenizePos(sentValue);
      String[] words = new String[tokens.length];
      for (int a = 0; a < tokens.length; a++) {
        words[a] = tokens[a].getCoveredText(sentValue).toString();
      }

      String[] posTags = posTagger.tag(words);

      for (int a = 0; a < tokens.length; a++) {
        Span wordSpan = tokens[a];

        WordToken wordToken = new WordToken(block.getJCas());
        // No need to use the block offset, as we are offsetting relative to the sentence
        wordToken.setBegin(sentIn.getBegin() + wordSpan.getStart());
        wordToken.setEnd(sentIn.getBegin() + wordSpan.getEnd());
        wordToken.setSentenceOrder(a);
        wordToken.setPartOfSpeech(posTags[a]);

        addToJCasIndex(wordToken);
        wordTokens.add(wordToken);
      }
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }

    return wordTokens;
  }

  /**
   * Add phrase chunks to a sentence, using the word tokens and POS tags produced earlier
   */
  private void addSentencePhraseChunk(List<WordToken> tokenList, TextBlock block) {
    List<PhraseChunk> sentPhraseChunks = new ArrayList<>();

    String[] tokens = new String[tokenList.size()];
    String[] posTags = new String[tokenList.size()];

    int ix = 0;
    for (WordToken token : tokenList) {
      tokens[ix] = token.getCoveredText();
      posTags[ix] = token.getPartOfSpeech();
      ix++;
    }

    Span[] result = phraseChunker.chunkAsSpans(tokens, posTags);
    for (Span element : result) {
      PhraseChunk chunk = new PhraseChunk(block.getJCas());
      // Chunk spans are expressed in token indices, so map them back to document offsets
      chunk.setBegin(tokenList.get(element.getStart()).getBegin());
      chunk.setEnd(tokenList.get(element.getEnd() - 1).getEnd());
      chunk.setChunkType(element.getType());
      chunk = addPhraseWordsAndHead(chunk, block);

      addToJCasIndex(chunk);
      sentPhraseChunks.add(chunk);
    }
  }

  /**
   * Add constituent words and the head word to a PhraseChunk
   */
  private PhraseChunk addPhraseWordsAndHead(PhraseChunk chunk, TextBlock block) {
    List<WordToken> constituentWords = new ArrayList<>();
    for (WordToken word : JCasUtil.selectCovered(block.getJCas(), WordToken.class, chunk)) {
      constituentWords.add(word);
    }
    chunk.setConstituentWords(
        FSCollectionFactory.createFSArray(block.getJCas(), constituentWords));

    int headWordId = constituentWords.size() - 1;

    // Work backwards through the preceding words, checking for prepositional words - if
    // prepositional, skip past it; if not, stop there
    for (int a = constituentWords.size() - 2; a > 1; a--) {
      WordToken wtA = constituentWords.get(a);

      // If the POS tag or word value is prepositional, move the head word candidate back one
      if ("IN".equals(wtA.getPartOfSpeech())
          || ",".equals(wtA.getPartOfSpeech())
          || prepositions.contains(wtA.getCoveredText())) {
        headWordId = a - 1;
      } else {
        headWordId = a;
        break;
      }
    }
    chunk.setHeadWord(constituentWords.get(headWordId));

    return chunk;
  }

  /** Returns true if the string contains no lower-case letters */
  private static boolean isUpperCase(String s) {
    for (char c : s.toCharArray()) {
      if (Character.isLetter(c) && Character.isLowerCase(c)) {
        return false;
      }
    }
    return true;
  }
}
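
/*
 * Usage sketch (illustrative addition, not part of the original Baleen source). Annotators like
 * this can also be driven directly through uimaFIT, binding a SharedOpenNLPModel against each of
 * the resource keys declared above - this mirrors how Baleen's unit tests exercise annotators.
 * The factory methods shown are the uimaFIT 2.x ones; exact overloads may differ in other
 * versions.
 *
 *   import org.apache.uima.analysis_engine.AnalysisEngine;
 *   import org.apache.uima.fit.factory.AnalysisEngineFactory;
 *   import org.apache.uima.fit.factory.ExternalResourceFactory;
 *   import org.apache.uima.fit.factory.JCasFactory;
 *   import org.apache.uima.jcas.JCas;
 *   import org.apache.uima.resource.ExternalResourceDescription;
 *
 *   // One shared model resource per key
 *   ExternalResourceDescription tokens =
 *       ExternalResourceFactory.createExternalResourceDescription(SharedOpenNLPModel.class);
 *   ExternalResourceDescription sentences =
 *       ExternalResourceFactory.createExternalResourceDescription(SharedOpenNLPModel.class);
 *   ExternalResourceDescription posTags =
 *       ExternalResourceFactory.createExternalResourceDescription(SharedOpenNLPModel.class);
 *   ExternalResourceDescription phraseChunks =
 *       ExternalResourceFactory.createExternalResourceDescription(SharedOpenNLPModel.class);
 *
 *   // Bind each resource description to the corresponding @ExternalResource key
 *   AnalysisEngine ae = AnalysisEngineFactory.createEngine(
 *       OpenNLP.class,
 *       OpenNLP.KEY_TOKEN, tokens,
 *       OpenNLP.KEY_SENTENCES, sentences,
 *       OpenNLP.KEY_POS, posTags,
 *       OpenNLP.KEY_CHUNKS, phraseChunks);
 *
 *   JCas jCas = JCasFactory.createJCas();
 *   jCas.setDocumentText("The quick brown fox jumps over the lazy dog.");
 *   ae.process(jCas);
 */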