//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.language;
import static org.junit.Assert.assertEquals;
import java.util.List;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.resource.ExternalResourceDescription;
import org.junit.After;
import org.junit.Test;
import uk.gov.dstl.baleen.annotators.language.OpenNLP;
import uk.gov.dstl.baleen.annotators.testing.AnnotatorTestBase;
import uk.gov.dstl.baleen.resources.SharedOpenNLPModel;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.types.language.WordToken;
/**
 * Tests for the {@link OpenNLP} annotator.
 *
 * <p>Each test runs the annotator over a short three-sentence document and checks the
 * {@link Sentence}, {@link WordToken} (including part-of-speech tag) and {@link PhraseChunk}
 * annotations it leaves in the JCas.
 */
public class OpenNLPTest extends AnnotatorTestBase {
  /** Document text shared by all tests: three sentences, each terminated by a full stop. */
  private static final String DOCUMENT =
      "This is some text. It has three sentences. The first sentence has four words.";

  // Resource keys under which the shared OpenNLP models are bound to the annotator.
  private static final String TOKENS = "tokens";
  private static final String SENTENCES = "sentences";
  private static final String POS_TAGS = "posTags";
  private static final String PHRASE_CHUNKS = "phraseChunks";

  /** Engine under test; created before and destroyed after every test method. */
  AnalysisEngine ae;

  /**
   * Builds a fresh {@link OpenNLP} analysis engine, binding one {@link SharedOpenNLPModel}
   * external resource to each of the four resource keys the engine is configured with.
   *
   * @throws UIMAException if the base set-up or the engine creation fails
   */
  @Override
  public void beforeTest() throws UIMAException {
    super.beforeTest();

    ExternalResourceDescription tokensDesc = modelResource(TOKENS);
    ExternalResourceDescription sentencesDesc = modelResource(SENTENCES);
    ExternalResourceDescription posDesc = modelResource(POS_TAGS);
    ExternalResourceDescription chunksDesc = modelResource(PHRASE_CHUNKS);

    AnalysisEngineDescription desc =
        AnalysisEngineFactory.createEngineDescription(
            OpenNLP.class,
            TOKENS, tokensDesc,
            SENTENCES, sentencesDesc,
            POS_TAGS, posDesc,
            PHRASE_CHUNKS, chunksDesc);

    ae = AnalysisEngineFactory.createEngine(desc);
  }

  /**
   * Destroys the engine created in {@link #beforeTest()} so that engines do not accumulate
   * across test methods.
   */
  @After
  public void destroyEngine() {
    // JUnit runs @After methods even when set-up threw, so the engine may never have been built.
    if (ae != null) {
      ae.destroy();
      ae = null;
    }
  }

  /**
   * Runs the annotator over the whole document (no {@link Text} annotations present) and checks
   * the annotations produced for the first sentence.
   */
  @Test
  public void test() throws Exception {
    jCas.setDocumentText(DOCUMENT);

    SimplePipeline.runPipeline(jCas, ae);

    assertEquals(3, JCasUtil.select(jCas, Sentence.class).size()); // 3 sentences

    Sentence s1 = JCasUtil.selectByIndex(jCas, Sentence.class, 0);

    List<WordToken> tokens = JCasUtil.selectCovered(jCas, WordToken.class, s1);
    assertEquals(5, tokens.size()); // 5 tokens in the first sentence
    // 4th token of the first sentence is tagged as a singular noun
    assertEquals("NN", tokens.get(3).getPartOfSpeech());

    List<PhraseChunk> phrases = JCasUtil.selectCovered(jCas, PhraseChunk.class, s1);
    assertEquals(3, phrases.size()); // 3 chunks in the first sentence
    // 3rd chunk of the first sentence is "some text"
    assertEquals("some text", phrases.get(2).getCoveredText());
  }

  /**
   * Runs the annotator after adding {@link Text} annotations over the second and third sentences
   * only, and checks that just those two sentences are annotated.
   */
  @Test
  public void testWiithText() throws Exception {
    jCas.setDocumentText(DOCUMENT);

    // Offsets of the second and third sentences (19 and 43 for DOCUMENT).
    int secondSentenceStart = DOCUMENT.indexOf("It has");
    int thirdSentenceStart = DOCUMENT.indexOf("The first");

    // The first sentence is deliberately left without a Text annotation.
    Text t1 = new Text(jCas, secondSentenceStart, thirdSentenceStart);
    t1.addToIndexes();
    Text t2 = new Text(jCas, thirdSentenceStart, DOCUMENT.length());
    t2.addToIndexes();

    SimplePipeline.runPipeline(jCas, ae);

    assertEquals(2, JCasUtil.select(jCas, Sentence.class).size()); // 2 sentences

    // Note: because of the Text annotations, the first Sentence annotation is the second
    // sentence of the document.
    Sentence s1 = JCasUtil.selectByIndex(jCas, Sentence.class, 0);

    List<WordToken> tokens = JCasUtil.selectCovered(jCas, WordToken.class, s1);
    assertEquals(5, tokens.size()); // 5 tokens in the first annotated sentence
    // 4th token of the first annotated sentence is tagged as a plural noun
    assertEquals("NNS", tokens.get(3).getPartOfSpeech());

    List<PhraseChunk> phrases = JCasUtil.selectCovered(jCas, PhraseChunk.class, s1);
    assertEquals(3, phrases.size()); // 3 chunks in the first annotated sentence
    // 3rd chunk of the first annotated sentence is "three sentences"
    assertEquals("three sentences", phrases.get(2).getCoveredText());
  }

  /** Creates an external resource description binding a {@link SharedOpenNLPModel} to {@code key}. */
  private static ExternalResourceDescription modelResource(String key) {
    return ExternalResourceFactory.createExternalResourceDescription(key, SharedOpenNLPModel.class);
  }
}