//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.resource.ExternalResourceDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.junit.Test;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import uk.gov.dstl.baleen.annotators.misc.RakeKeywords;
import uk.gov.dstl.baleen.annotators.testing.AnnotatorTestBase;
import uk.gov.dstl.baleen.resources.SharedStopwordResource;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.types.metadata.Metadata;
public class RakeKeywordsTest extends AnnotatorTestBase {
private static String STOPWORDS = "stopwords";
@Test
public void testNoBuzzwords() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, false);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for contructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(9, keywords.size());
assertTrue(keywords.contains("minimal generating sets"));
assertTrue(keywords.contains("linear diophantine equations"));
assertTrue(keywords.contains("minimal supporting set"));
assertTrue(keywords.contains("minimal set"));
assertTrue(keywords.contains("linear constraints"));
assertTrue(keywords.contains("natural numbers"));
assertTrue(keywords.contains("strict inequations"));
assertTrue(keywords.contains("nonstrict inequations"));
assertTrue(keywords.contains("upper bounds"));
ae.destroy();
}
@Test
public void testBuzzwords() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, true);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for contructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(9, keywords.size());
assertTrue(keywords.contains("minimal generating sets"));
assertTrue(keywords.contains("linear diophantine equations"));
assertTrue(keywords.contains("minimal supporting set"));
assertTrue(keywords.contains("minimal set"));
assertTrue(keywords.contains("linear constraints"));
assertTrue(keywords.contains("natural numbers"));
assertTrue(keywords.contains("strict inequations"));
assertTrue(keywords.contains("nonstrict inequations"));
assertTrue(keywords.contains("upper bounds"));
assertEquals(9, JCasUtil.select(jCas, Buzzword.class).size());
assertEquals("linear constraints", JCasUtil.selectByIndex(jCas, Buzzword.class, 0).getCoveredText());
assertEquals("keyword", JCasUtil.selectByIndex(jCas, Buzzword.class, 0).getTags(0));
assertEquals("natural numbers", JCasUtil.selectByIndex(jCas, Buzzword.class, 1).getCoveredText());
assertEquals("linear Diophantine equations", JCasUtil.selectByIndex(jCas, Buzzword.class, 2).getCoveredText());
assertEquals("strict inequations", JCasUtil.selectByIndex(jCas, Buzzword.class, 3).getCoveredText());
assertEquals("nonstrict inequations", JCasUtil.selectByIndex(jCas, Buzzword.class, 4).getCoveredText());
assertEquals("Upper bounds", JCasUtil.selectByIndex(jCas, Buzzword.class, 5).getCoveredText());
assertEquals("minimal set", JCasUtil.selectByIndex(jCas, Buzzword.class, 6).getCoveredText());
assertEquals("minimal generating sets", JCasUtil.selectByIndex(jCas, Buzzword.class, 7).getCoveredText());
assertEquals("minimal supporting set", JCasUtil.selectByIndex(jCas, Buzzword.class, 8).getCoveredText());
ae.destroy();
}
@Test
public void testMaxNumber() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 3, RakeKeywords.PARAM_ADD_BUZZWORDS, false);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for contructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(3, keywords.size());
assertTrue(keywords.contains("minimal generating sets"));
assertTrue(keywords.contains("linear diophantine equations"));
assertTrue(keywords.contains("minimal supporting set"));
ae.destroy();
}
@Test
public void testCharacters() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, false);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Thursday 28th January - Test Report\n\n\tMichelle was seen meeting Katie at the Diner (Mary's Diner, on Main Street), at approximately 6:00pm. Michelle was later seen to be leaving the Diner, carrying a black folder of unknown contents. Katie is a known sympathiser, and it is hypothesised that she passed training materials to Michelle. When questioned later, Michelle stated: \"I know nothing of any training materials/folder!\".");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
for(String keyword : keywords){
assertEquals("", keyword.replaceAll("[a-z0-9 ]", ""));
}
ae.destroy();
}
@Test
public void testFoxStoplist() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, false, RakeKeywords.PARAM_STOPLIST, SharedStopwordResource.StopwordList.FOX);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for contructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(8, keywords.size());
assertTrue(keywords.contains("minimal generating sets"));
assertTrue(keywords.contains("linear diophantine equations"));
assertTrue(keywords.contains("minimal supporting set"));
assertTrue(keywords.contains("minimal set"));
assertTrue(keywords.contains("linear constraints"));
assertTrue(keywords.contains("strict inequations"));
assertTrue(keywords.contains("nonstrict inequations"));
assertTrue(keywords.contains("upper bounds"));
ae.destroy();
}
@Test
public void testCustomStoplist() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, false, RakeKeywords.PARAM_STOPLIST, getClass().getResource("exampleStoplist.txt").getPath());
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Bill and Ben went off to the shops in London town.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(1, keywords.size());
assertTrue(keywords.contains("london town"));
ae.destroy();
}
@Test
public void testStemmer() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, true, RakeKeywords.PARAM_STEMMING, SnowballStemmer.ALGORITHM.ENGLISH);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for contructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(11, keywords.size());
assertTrue(keywords.contains("minimal generating sets"));
assertTrue(keywords.contains("linear diophantine equations"));
assertTrue(keywords.contains("minimal supporting set"));
assertTrue(keywords.contains("minimal set"));
assertTrue(keywords.contains("linear constraints"));
assertTrue(keywords.contains("natural numbers"));
assertTrue(keywords.contains("strict inequations"));
assertTrue(keywords.contains("nonstrict inequations"));
assertTrue(keywords.contains("upper bounds"));
assertTrue(keywords.contains("considered types"));
assertTrue(keywords.contains("mixed types"));
assertEquals(11, JCasUtil.select(jCas, Buzzword.class).size());
assertEquals("linear constraints", JCasUtil.selectByIndex(jCas, Buzzword.class, 0).getCoveredText());
assertEquals("keyword", JCasUtil.selectByIndex(jCas, Buzzword.class, 0).getTags(0));
assertEquals("natural numbers", JCasUtil.selectByIndex(jCas, Buzzword.class, 1).getCoveredText());
assertEquals("linear Diophantine equations", JCasUtil.selectByIndex(jCas, Buzzword.class, 2).getCoveredText());
assertEquals("strict inequations", JCasUtil.selectByIndex(jCas, Buzzword.class, 3).getCoveredText());
assertEquals("nonstrict inequations", JCasUtil.selectByIndex(jCas, Buzzword.class, 4).getCoveredText());
assertEquals("Upper bounds", JCasUtil.selectByIndex(jCas, Buzzword.class, 5).getCoveredText());
assertEquals("minimal set", JCasUtil.selectByIndex(jCas, Buzzword.class, 6).getCoveredText());
assertEquals("minimal generating sets", JCasUtil.selectByIndex(jCas, Buzzword.class, 7).getCoveredText());
assertEquals("minimal supporting set", JCasUtil.selectByIndex(jCas, Buzzword.class, 8).getCoveredText());
assertEquals("considered types", JCasUtil.selectByIndex(jCas, Buzzword.class, 9).getCoveredText());
assertEquals("mixed types", JCasUtil.selectByIndex(jCas, Buzzword.class, 10).getCoveredText());
ae.destroy();
}
@Test
public void testBadStemmer() throws ResourceInitializationException, AnalysisEngineProcessException{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, false, RakeKeywords.PARAM_STEMMING, "NotARealStemmer");
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText("Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for contructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.");
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
List<String> keywords = Arrays.asList(md.getValue().split(";"));
assertEquals(9, keywords.size());
assertTrue(keywords.contains("minimal generating sets"));
assertTrue(keywords.contains("linear diophantine equations"));
assertTrue(keywords.contains("minimal supporting set"));
assertTrue(keywords.contains("minimal set"));
assertTrue(keywords.contains("linear constraints"));
assertTrue(keywords.contains("natural numbers"));
assertTrue(keywords.contains("strict inequations"));
assertTrue(keywords.contains("nonstrict inequations"));
assertTrue(keywords.contains("upper bounds"));
ae.destroy();
}
@Test
public void testLongDocument() throws Exception{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, true);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText(new String(Files.readAllBytes(Paths.get(getClass().getResource("turing.txt").toURI()))));
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
assertNotNull(md.getValue());
ae.destroy();
}
@Test
public void testLongDocumentWithText() throws Exception{
ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);
AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(RakeKeywords.class, STOPWORDS, erd, RakeKeywords.PARAM_MAX_KEYWORDS, 12, RakeKeywords.PARAM_ADD_BUZZWORDS, true);
AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
jCas.setDocumentText(new String(Files.readAllBytes(Paths.get(getClass().getResource("turing.txt").toURI()))));
ae.process(jCas);
String fullDocKeywords = JCasUtil.selectByIndex(jCas, Metadata.class, 0).getValue();
jCas.reset();
jCas.setDocumentText(new String(Files.readAllBytes(Paths.get(getClass().getResource("turing.txt").toURI()))));
new Text(jCas, 0, 250).addToIndexes();
ae.process(jCas);
assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());
Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
assertEquals("keywords", md.getKey());
assertNotNull(md.getValue());
assertNotEquals(fullDocKeywords, md.getValue());
ae.destroy();
}
}