//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.resource.ExternalResourceDescription;
import org.junit.Test;

import uk.gov.dstl.baleen.annotators.misc.CommonKeywords;
import uk.gov.dstl.baleen.annotators.testing.AbstractAnnotatorTest;
import uk.gov.dstl.baleen.resources.SharedStopwordResource;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.types.metadata.Metadata;

/**
 * Tests for the {@link CommonKeywords} annotator, run against the {@code turing.txt}
 * test resource with a {@link SharedStopwordResource} supplied as an external resource.
 */
public class CommonKeywordsTest extends AbstractAnnotatorTest{
	/** Key under which the stopword resource is passed to {@code processJCas}. */
	private static final String STOPWORDS = "stopwords";

	/** Name of the classpath resource (relative to this class) used as the test document. */
	private static final String TEST_DOCUMENT = "turing.txt";

	private final ExternalResourceDescription erd = ExternalResourceFactory.createExternalResourceDescription(STOPWORDS, SharedStopwordResource.class);

	public CommonKeywordsTest(){
		super(CommonKeywords.class);
	}

	/**
	 * Processes the whole test document and checks the keywords written to the single
	 * "keywords" Metadata annotation, as well as the Buzzword annotations created.
	 */
	@Test
	public void testProcess() throws Exception{
		jCas.setDocumentText(readTestDocument());
		processJCas(STOPWORDS, erd);

		List<String> keywords = readKeywordsFromMetadata();

		//Question and Digital get the same score, so we end up with 6 keywords not 5
		assertEquals(6, keywords.size());
		assertTrue(keywords.contains("machine"));
		assertTrue(keywords.contains("computer"));
		assertTrue(keywords.contains("digital computers"));
		assertTrue(keywords.contains("state"));
		assertTrue(keywords.contains("question"));
		assertTrue(keywords.contains("digital"));

		Set<String> buzzwords = collectBuzzwordValues();
		assertTrue(buzzwords.contains("machines"));
		assertTrue(buzzwords.contains("computing"));
		assertTrue(buzzwords.contains("questioning"));
	}

	/**
	 * Processes the test document with a single Text annotation present and checks that
	 * the keywords found for the whole document in {@link #testProcess()} are not reported.
	 */
	@Test
	public void testProcessWithText() throws Exception{
		jCas.setDocumentText(readTestDocument());

		// The only text we are going to process is "The Imitation Game"
		// NOTE(review): offsets 54-74 depend on the exact content of turing.txt - confirm if the resource changes
		new Text(jCas, 54, 74).addToIndexes();

		processJCas(STOPWORDS, erd);

		List<String> keywords = readKeywordsFromMetadata();

		//With only the Text annotation in play we expect 3 keywords, none of which are the whole-document ones
		assertEquals(3, keywords.size());
		assertTrue(keywords.contains("imitation"));
		assertFalse(keywords.contains("machine"));
		assertFalse(keywords.contains("computer"));
		assertFalse(keywords.contains("digital computers"));
		assertFalse(keywords.contains("state"));
		assertFalse(keywords.contains("question"));
		assertFalse(keywords.contains("digital"));

		Set<String> buzzwords = collectBuzzwordValues();
		assertFalse(buzzwords.contains("machines"));
		assertFalse(buzzwords.contains("computing"));
		assertFalse(buzzwords.contains("questioning"));
	}

	/**
	 * Reads the test document from the classpath.
	 *
	 * The bytes are decoded explicitly as UTF-8 so that the document text (and therefore
	 * the character offsets used by the tests) does not depend on the platform default charset.
	 *
	 * @return the content of the test document
	 * @throws Exception if the resource cannot be located or read
	 */
	private String readTestDocument() throws Exception{
		URL resource = getClass().getResource(TEST_DOCUMENT);
		assertNotNull("Test resource not found: " + TEST_DOCUMENT, resource);

		return new String(Files.readAllBytes(Paths.get(resource.toURI())), StandardCharsets.UTF_8);
	}

	/**
	 * Asserts that exactly one Metadata annotation exists with the key "keywords",
	 * and returns its value split on ';'.
	 *
	 * @return the individual keywords held in the Metadata value
	 */
	private List<String> readKeywordsFromMetadata(){
		assertEquals(1, JCasUtil.select(jCas, Metadata.class).size());

		Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
		assertEquals("keywords", md.getKey());

		return Arrays.asList(md.getValue().split(";"));
	}

	/**
	 * Asserts that at least one Buzzword annotation exists and that the first tag of every
	 * Buzzword is "keyword", then returns the distinct Buzzword values.
	 *
	 * @return the set of values of all Buzzword annotations in the JCas
	 */
	private Set<String> collectBuzzwordValues(){
		assertFalse(JCasUtil.select(jCas, Buzzword.class).isEmpty());

		Set<String> buzzwords = new HashSet<>();
		for(Buzzword bw : JCasUtil.select(jCas, Buzzword.class)){
			assertEquals("keyword", bw.getTags(0));
			buzzwords.add(bw.getValue());
		}

		return buzzwords;
	}
}