//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.coreference; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import java.util.ArrayList; import java.util.Collection; import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.ExternalResourceFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.resource.ExternalResourceDescription; import org.apache.uima.resource.ResourceInitializationException; import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; import uk.gov.dstl.baleen.annotators.coreference.SieveCoreference; import uk.gov.dstl.baleen.annotators.language.MaltParser; import uk.gov.dstl.baleen.annotators.language.OpenNLP; import uk.gov.dstl.baleen.annotators.language.OpenNLPParser; import uk.gov.dstl.baleen.annotators.language.WordNetLemmatizer; import uk.gov.dstl.baleen.annotators.testing.AnnotatorTestBase; import uk.gov.dstl.baleen.resources.SharedGenderMultiplicityResource; import uk.gov.dstl.baleen.resources.SharedOpenNLPModel; import uk.gov.dstl.baleen.resources.SharedStopwordResource; import uk.gov.dstl.baleen.resources.SharedWordNetResource; import uk.gov.dstl.baleen.types.common.Organisation; import uk.gov.dstl.baleen.types.common.Person; import uk.gov.dstl.baleen.types.language.WordToken; import uk.gov.dstl.baleen.types.semantic.Location; import uk.gov.dstl.baleen.types.semantic.ReferenceTarget; import uk.gov.dstl.baleen.uima.BaleenAnnotator; public class SieveCoreferenceTest extends AnnotatorTestBase { private static ExternalResourceDescription stopwordsDesc; private static ExternalResourceDescription gMDesc; private static AnalysisEngine[] analysisEngines; @BeforeClass public static void before() throws ResourceInitializationException{ analysisEngines = createAnalysisEngines(); } protected static AnalysisEngine[] createAnalysisEngines() throws ResourceInitializationException { ExternalResourceDescription parserChunkingDesc = ExternalResourceFactory .createExternalResourceDescription("parserChunking", SharedOpenNLPModel.class); ExternalResourceDescription wordnetDesc = ExternalResourceFactory.createExternalResourceDescription("wordnet", SharedWordNetResource.class); ExternalResourceDescription tokensDesc = ExternalResourceFactory.createExternalResourceDescription("tokens", SharedOpenNLPModel.class); ExternalResourceDescription sentencesDesc = ExternalResourceFactory .createExternalResourceDescription("sentences", SharedOpenNLPModel.class); ExternalResourceDescription posDesc = ExternalResourceFactory.createExternalResourceDescription("posTags", SharedOpenNLPModel.class); ExternalResourceDescription chunksDesc = ExternalResourceFactory .createExternalResourceDescription("phraseChunks", SharedOpenNLPModel.class); stopwordsDesc = ExternalResourceFactory .createExternalResourceDescription(SieveCoreference.KEY_STOPWORDS, SharedStopwordResource.class); gMDesc = ExternalResourceFactory .createExternalResourceDescription(SieveCoreference.KEY_GENDER_MULTIPLICITY, SharedGenderMultiplicityResource.class); return asArray( createAnalysisEngine(OpenNLP.class, "tokens", tokensDesc, "sentences", sentencesDesc, "posTags", posDesc, "phraseChunks", chunksDesc), createAnalysisEngine(WordNetLemmatizer.class, "wordnet", wordnetDesc), createAnalysisEngine(OpenNLPParser.class, "parserChunking", parserChunkingDesc), createAnalysisEngine(MaltParser.class)); } protected static AnalysisEngine[] asArray(AnalysisEngine... args) { return args; } protected static AnalysisEngine createAnalysisEngine(Class<? extends BaleenAnnotator> annotatorClass, Object... args) throws ResourceInitializationException { return AnalysisEngineFactory.createEngine(annotatorClass, args); } public void processJCas() throws ResourceInitializationException, AnalysisEngineProcessException { SimplePipeline.runPipeline(jCas, analysisEngines); } public void processJCasWithSieve(int sieve) throws AnalysisEngineProcessException, ResourceInitializationException{ AnalysisEngine ae = createAnalysisEngine(SieveCoreference.class, SieveCoreference.KEY_GENDER_MULTIPLICITY, gMDesc, SieveCoreference.KEY_STOPWORDS, stopwordsDesc, "pass", sieve, "pronomial", true); ae.process(jCas); ae.destroy(); } @Test public void test() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "Chris Smith went to London and he saw Big Ben. Chris saw his sister there."; jCas.setDocumentText(text); Person chrisSmith = new Person(jCas); chrisSmith.setBegin(text.indexOf("Chris Smith")); chrisSmith.setEnd(chrisSmith.getBegin() + "Chris Smith".length()); chrisSmith.setValue("Chris Smith"); chrisSmith.addToIndexes(); Person chris = new Person(jCas); chris.setBegin(text.indexOf("Chris", chrisSmith.getEnd())); chris.setEnd(chris.getBegin() + "Chris".length()); chris.setValue("Chris"); chris.addToIndexes(); Location london = new Location(jCas); london.setBegin(text.indexOf("London")); london.setEnd(london.getBegin() + "London".length()); london.setValue("London"); london.addToIndexes(); Location bigBen = new Location(jCas); bigBen.setBegin(text.indexOf("Big Ben")); bigBen.setEnd(bigBen.getBegin() + "Big Ben".length()); bigBen.setValue("Big Ben"); bigBen.addToIndexes(); processJCas(); processJCasWithSieve(-1); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); List<Person> people = new ArrayList<>(JCasUtil.select(jCas, Person.class)); List<WordToken> words = new ArrayList<>(JCasUtil.select(jCas, WordToken.class)); long referenceId = people.get(0).getReferent().getInternalId(); assertEquals("Chris Smith", people.get(0).getValue()); assertEquals("Chris", people.get(1).getValue()); assertEquals(referenceId, people.get(1).getReferent().getInternalId()); // Check all the he and his connect to Chris boolean allMatch = words.stream() .filter(p -> p.getCoveredText().equalsIgnoreCase("his") || p.getCoveredText().equalsIgnoreCase("he")) .allMatch(p -> p.getReferent().getInternalId() == referenceId); assertTrue(allMatch); // We should have London or Big Ben to there - hence this should be 2, but something is off // at the moment... assertEquals(1, targets.size()); } @Test public void testExtractReferenceTargets() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "Chris went to London and he saw Big Ben there."; // there - london jCas.setDocumentText(text); Person chris = new Person(jCas); chris.setBegin(text.indexOf("Chris")); chris.setEnd(chris.getBegin() + "Chris".length()); chris.addToIndexes(); ReferenceTarget target = new ReferenceTarget(jCas); target.addToIndexes(); Location london = new Location(jCas); london.setBegin(text.indexOf("London")); london.setEnd(london.getBegin() + "London".length()); london.setReferent(target); london.addToIndexes(); Location there = new Location(jCas); there.setBegin(text.indexOf("there")); there.setEnd(there.getBegin() + "there".length()); there.setReferent(target); there.addToIndexes(); processJCas(); processJCasWithSieve(0); // We should have a reference target and it should be different to the previous, as its been recreated. Collection<ReferenceTarget> targets = JCasUtil.select(jCas, ReferenceTarget.class); assertEquals(1, targets.size()); assertTrue(targets.iterator().next().getInternalId() != target.getInternalId()); } @Test public void testExactStringMatch() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "Chris went to London and in London he saw Big Ben."; // london - london jCas.setDocumentText(text); Person chris = new Person(jCas); chris.setBegin(text.indexOf("Chris")); chris.setEnd(chris.getBegin() + "Chris".length()); chris.addToIndexes(); Location london = new Location(jCas); london.setBegin(text.indexOf("London")); london.setEnd(london.getBegin() + "London".length()); london.addToIndexes(); Location london2 = new Location(jCas); london2.setBegin(text.indexOf("London", london.getEnd())); london2.setEnd(london2.getBegin() + "London".length()); london2.addToIndexes(); processJCas(); processJCasWithSieve(1); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); List<Location> location = new ArrayList<>(JCasUtil.select(jCas, Location.class)); assertEquals(1, targets.size()); assertSame(targets.get(0), location.get(0).getReferent()); assertSame(targets.get(0), location.get(1).getReferent()); } @Test public void testRelaxedStringMatch() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The University of Warwick is near Coventry and that was the University at which Chris studied."; // university of warwick - university jCas.setDocumentText(text); Person chris = new Person(jCas); chris.setBegin(text.indexOf("Chris")); chris.setEnd(chris.getBegin() + "Chris".length()); chris.addToIndexes(); Organisation uow = new Organisation(jCas); uow.setBegin(text.indexOf("University of Warwick")); uow.setEnd(uow.getBegin() + "University of Warwick".length()); uow.addToIndexes(); Organisation u = new Organisation(jCas); u.setBegin(text.indexOf("University", uow.getEnd())); u.setEnd(u.getBegin() + "University".length()); u.addToIndexes(); processJCas(); processJCasWithSieve(2); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); List<Organisation> location = new ArrayList<>(JCasUtil.select(jCas, Organisation.class)); assertEquals(1, targets.size()); assertSame(targets.get(0), location.get(0).getReferent()); assertSame(targets.get(0), location.get(1).getReferent()); } @Test public void testInSentencePronoun() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "He said he has not been in touch with her."; jCas.setDocumentText(text); processJCas(); processJCasWithSieve(3); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testPreciseConstructApositive() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The prime minister, David Cameron explained on Tuesday."; // david camera - prime minister jCas.setDocumentText(text); Person p = new Person(jCas); p.setBegin(text.indexOf("David Cameron")); p.setEnd(p.getBegin() + "David Cameron".length()); p.addToIndexes(); processJCas(); processJCasWithSieve(4); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testPreciseConstructPredicate() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "David Cameron is the prime minister."; // david camera - prime minister jCas.setDocumentText(text); Person p = new Person(jCas); p.setBegin(text.indexOf("David Cameron")); p.setEnd(p.getBegin() + "David Cameron".length()); p.addToIndexes(); processJCas(); processJCasWithSieve(4); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } // NOT IMPLEMENTED @Test @Ignore public void testPreciseConstructRole() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "President Obama visited today."; // president - obama jCas.setDocumentText(text); processJCas(); processJCasWithSieve(4); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testPreciseConstructRelativePronoun() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The police want to catch a man who ran away."; // man - who jCas.setDocumentText(text); processJCas(); processJCasWithSieve(4); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testPreciseConstructAcronym() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The British Broadcasting Corporation or the BBC if you prefer shows television programmes."; // British Broadcasting Corporation - BBC jCas.setDocumentText(text); // We need these in otherwise we just get one long setence from the mention detector Organisation beeb = new Organisation(jCas); beeb.setBegin(text.indexOf("British Broadcasting Corporation")); beeb.setEnd(beeb.getBegin() + "British Broadcasting Corporation".length()); beeb.addToIndexes(); Organisation bbc = new Organisation(jCas); bbc.setBegin(text.indexOf("BBC")); bbc.setEnd(bbc.getBegin() + "BBC".length()); bbc.addToIndexes(); processJCas(); processJCasWithSieve(4); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); List<Organisation> orgs = new ArrayList<Organisation>(JCasUtil.select(jCas, Organisation.class)); assertEquals(2, orgs.size()); assertNotNull(orgs.get(0).getReferent()); assertEquals(orgs.get(0).getReferent().getInternalId(), orgs.get(1).getReferent().getInternalId()); } @Test public void testStrictHeadMatch() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The Florida Supreme Court sat today, and the Florida Court made a decision."; jCas.setDocumentText(text); Organisation fsc = new Organisation(jCas); fsc.setBegin(text.indexOf("Florida Supreme Court")); fsc.setEnd(fsc.getBegin() + "Florida Supreme Court".length()); fsc.addToIndexes(); Organisation fc = new Organisation(jCas); fc.setBegin(text.indexOf("Florida Court")); fc.setEnd(fc.getBegin() + "Florida Court".length()); fc.addToIndexes(); processJCas(); processJCasWithSieve(5); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testProperHeadMatchSameNumbers() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The 200 people visited and then the people left."; jCas.setDocumentText(text); processJCas(); processJCasWithSieve(8); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testProperHeadMatchDifferentNumbers() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "The 200 people visited and 100 people left."; jCas.setDocumentText(text); processJCas(); processJCasWithSieve(8); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(0, targets.size()); } @Test public void testProperHeadMatchSameLocation() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "We visited the south of Amercia and travelled to the deep south of America."; jCas.setDocumentText(text); processJCas(); processJCasWithSieve(8); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testProperHeadMatchDifferentLocations() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "We visited the south of Amercia and went to the north of America."; jCas.setDocumentText(text); processJCas(); processJCasWithSieve(8); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(0, targets.size()); } @Test public void testRelaxedHeadMatch() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "Circuit Judge N. Sanders has been seen talking to James when the Judge said ok."; jCas.setDocumentText(text); Person fsc = new Person(jCas); fsc.setBegin(text.indexOf("Circuit Judge N. Sanders")); fsc.setEnd(fsc.getBegin() + "Circuit Judge N. Sanders".length()); fsc.addToIndexes(); Person fc = new Person(jCas); fc.setBegin(text.indexOf("Judge", fsc.getEnd())); fc.setEnd(fc.getBegin() + "Judge".length()); fc.addToIndexes(); Person j = new Person(jCas); j.setBegin(text.indexOf("James")); j.setEnd(j.getBegin() + "James".length()); j.addToIndexes(); processJCas(); processJCasWithSieve(9); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(1, targets.size()); } @Test public void testPronounResolutionSingleSentence() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "John went to see Lucy and he ate with her."; jCas.setDocumentText(text); Person john = new Person(jCas); john.setBegin(text.indexOf("John")); john.setEnd(john.getBegin() + "John".length()); john.addToIndexes(); Person lucy = new Person(jCas); lucy.setBegin(text.indexOf("Lucy")); lucy.setEnd(lucy.getBegin() + "Lucy".length()); lucy.addToIndexes(); processJCas(); processJCasWithSieve(10); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(2, targets.size()); // TODO: Need to test what that its he which is matched } @Test public void testPronounResolutionTwoSentence() throws AnalysisEngineProcessException, ResourceInitializationException { String text = "John went to see Lucy at the weekend. That was the first time that he saw her there."; jCas.setDocumentText(text); Person john = new Person(jCas); john.setBegin(text.indexOf("John")); john.setEnd(john.getBegin() + "John".length()); john.addToIndexes(); Person lucy = new Person(jCas); lucy.setBegin(text.indexOf("Lucy")); lucy.setEnd(lucy.getBegin() + "Lucy".length()); lucy.addToIndexes(); processJCas(); processJCasWithSieve(10); List<ReferenceTarget> targets = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); assertEquals(2, targets.size()); // TODO: Need to test what that its he which is matched } }