//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.language;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import opennlp.tools.parser.AbstractBottomUpParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.util.Span;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.resources.SharedOpenNLPModel;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Perform grammatical parsing with OpenNLP parser.
* <p>
* The document content is passed through the OpenNLP parser in order to create a parse tree.
* <p>
* It is assumed that first the document has been passed through the OpenNLP pipeline (or similar)
* so that sentences, POS, etc are extracted into the jCas.
* <p>
* <b>Be aware</b> that this annotator will REMOVE any existing PhraseChunks and replace them with its output.
*
*
* @baleen.javadoc
*/
public class OpenNLPParser extends BaleenAnnotator {

  /**
   * Penn Treebank phrase-level constituent labels that are captured as {@link PhraseChunk}
   * annotations. Parse nodes with any other type are not annotated themselves, but their
   * children are still visited.
   *
   * <p>Note: the correct Penn Treebank tag for wh-adverb phrases is "WHADVP" (previously
   * mistyped as "WHAVP", which meant those constituents were never chunked).
   */
  private static final Set<String> PHRASE_TYPES =
      ImmutableSet.of(
          "ADJP", "ADVP", "FRAG", "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP",
          "RRC", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X");

  /**
   * OpenNLP Resource (chunker) - use en-parser-chunking.bin
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedOpenNLPModel
   */
  public static final String PARAM_TOKEN = "parserChunking";

  @ExternalResource(key = OpenNLPParser.PARAM_TOKEN)
  private SharedOpenNLPModel parserChunkingModel;

  /** The OpenNLP parser built from the chunking model; created in {@link #doInitialize}. */
  private Parser parser;

  @Override
  public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
    try {
      // NOTE(review): the classpath resource name uses underscores while the Javadoc above
      // refers to "en-parser-chunking.bin" (hyphens) - confirm the packaged resource matches.
      parserChunkingModel.loadModel(
          ParserModel.class, getClass().getResourceAsStream("en_parser_chunking.bin"));
    } catch (final BaleenException be) {
      getMonitor().error("Unable to load OpenNLP Language Models", be);
      throw new ResourceInitializationException(be);
    }

    try {
      parser = ParserFactory.create((ParserModel) parserChunkingModel.getModel());
    } catch (final Exception e) {
      getMonitor().error("Unable to create OpenNLP parser", e);
      throw new ResourceInitializationException(e);
    }
  }

  @Override
  public void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    // For each sentence in the JCas, we recreate the spans from our WordTokens.
    final Map<Sentence, Collection<WordToken>> sentences =
        JCasUtil.indexCovered(jCas, Sentence.class, WordToken.class);

    // Sentences with no covered tokens cannot be parsed and are skipped.
    sentences.entrySet().stream()
        .filter(e -> !e.getValue().isEmpty())
        .forEach(
            e -> {
              final Sentence sentence = e.getKey();
              final Collection<WordToken> tokens = e.getValue();

              final Parse parsed = parseSentence(sentence, tokens);
              updatePhraseChunks(jCas, sentence, parsed);
            });
  }

  /**
   * Replaces all existing phrase chunks covered by the sentence with chunks derived from the
   * parse tree.
   *
   * @param jCas the jCas to annotate
   * @param sentence the sentence whose chunks are replaced
   * @param parsed the parse tree for the sentence
   */
  private void updatePhraseChunks(final JCas jCas, final Sentence sentence, final Parse parsed) {
    // We remove all the existing PhraseChunks as they are going to be replaced with the
    // parsed version.
    // TODO: Should we create a new ConstituentPhraseChunk type in Uima?
    removeFromJCasIndex(JCasUtil.selectCovered(jCas, PhraseChunk.class, sentence));
    addParsedAsAnnotations(jCas, sentence.getBegin(), parsed);
  }

  /**
   * Recursively walks the parse tree, adding a {@link PhraseChunk} annotation for every node
   * whose type is in {@link #PHRASE_TYPES}.
   *
   * @param jCas the jCas to annotate
   * @param offset document offset of the sentence start (parse spans are sentence-relative)
   * @param parsed the parse (sub)tree to convert
   */
  private void addParsedAsAnnotations(final JCas jCas, final int offset, final Parse parsed) {
    final String type = parsed.getType();

    // Ignore non phrase types, but still descend into their children
    if (OpenNLPParser.PHRASE_TYPES.contains(type)) {
      final Span span = parsed.getSpan();

      final PhraseChunk phraseChunk = new PhraseChunk(jCas);
      phraseChunk.setBegin(offset + span.getStart());
      phraseChunk.setEnd(offset + span.getEnd());
      phraseChunk.setChunkType(type);

      addToJCasIndex(phraseChunk);
    }

    Arrays.stream(parsed.getChildren()).forEach(p -> addParsedAsAnnotations(jCas, offset, p));
  }

  /**
   * Parses a single sentence, seeding the parser with the existing word tokens.
   *
   * @param sentence the sentence to parse
   * @param tokens the word tokens covered by the sentence
   * @return the parse tree for the sentence
   */
  private Parse parseSentence(final Sentence sentence, final Collection<WordToken> tokens) {
    final String text = sentence.getCoveredText();

    final Parse parse = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 1, 0);

    // Add in the POS tokens, with spans made relative to the sentence start
    int index = 0;
    for (final WordToken token : tokens) {
      final Span span =
          new Span(token.getBegin() - sentence.getBegin(), token.getEnd() - sentence.getBegin());
      parse.insert(new Parse(text, span, AbstractBottomUpParser.TOK_NODE, 0, index));
      index++;
    }

    // Parse the sentence
    return parser.parse(parse);
  }

  @Override
  public void doDestroy() {
    // Release both the shared model and the parser built from it
    parserChunkingModel = null;
    parser = null;
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(WordToken.class, Sentence.class), ImmutableSet.of(PhraseChunk.class));
  }
}