package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import junit.framework.TestCase;

import java.util.List;
import java.util.Properties;

/**
 * Unit test for the mentions annotator.
 *
 * <p>Each test runs a shared {@code tokenize, ssplit, pos, lemma, ner} pipeline over a piece of
 * text, applies a fresh {@link EntityMentionsAnnotator} to the result, and compares the
 * {@code toShorterString()} form of every produced mention against a hard-coded expectation.
 *
 * @author Angel Chang
 */
public class EntityMentionsAnnotatorITest extends TestCase {

  /** Pipeline shared by all tests in this class; built lazily by the first {@link #setUp()}. */
  static AnnotationPipeline pipeline = null;

  /** Name under which the mentions annotator is constructed. */
  protected static final String ENTITY_MENTIONS_ANNOTATOR_NAME = "entitymentions";

  /**
   * Builds the shared pipeline on first use. Later invocations find it already set and do nothing.
   */
  @Override
  public void setUp() throws Exception {
    synchronized(EntityMentionsAnnotatorITest.class) {
      if (pipeline == null) {
        Properties props = new Properties();
        // TODO: remove need for ner and just have the mentions annotator
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        pipeline = new StanfordCoreNLP(props);
      }
    }
  }

  /**
   * @return a new, empty {@link Properties} object used to configure the mentions annotator
   */
  protected static Properties getDefaultProperties() {
    return new Properties();
  }

  /**
   * @return a mentions annotator configured with {@link #getDefaultProperties()}
   */
  protected EntityMentionsAnnotator getMentionsAnnotator() {
    return new EntityMentionsAnnotator(ENTITY_MENTIONS_ANNOTATOR_NAME, getDefaultProperties());
  }

  /**
   * @param props properties handed to the annotator's constructor
   * @return a mentions annotator configured with {@code props}
   */
  protected static EntityMentionsAnnotator getMentionsAnnotator(Properties props) {
    return new EntityMentionsAnnotator(ENTITY_MENTIONS_ANNOTATOR_NAME, props);
  }

  /**
   * Wraps {@code text} in an {@link Annotation} and runs the shared pipeline over it.
   *
   * @param text raw text to annotate
   * @return the annotated document
   */
  protected static Annotation createDocument(String text) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    return annotation;
  }

  /**
   * Asserts that {@code mentions} matches {@code expectedMentions}, comparing the
   * {@code toShorterString()} form of each mention position by position and then the counts.
   *
   * <p>If {@code expectedMentions} is {@code null}, every actual mention is printed to standard
   * output and the test then fails.
   *
   * @param prefix label prepended to every assertion message
   * @param expectedMentions expected string form of each mention, in order
   * @param mentions mentions read from the annotated document
   */
  protected static void compareMentions(String prefix, String[] expectedMentions, List<CoreMap> mentions) {
    // Report a missing annotation as a readable failure rather than a NullPointerException.
    assertNotNull(prefix + ": no mentions found on document", mentions);
    if (expectedMentions == null) {
      for (int i = 0; i < mentions.size(); i++) {
        String actual = mentions.get(i).toShorterString();
        System.out.println(prefix + ": Got mention." + i + " " + actual);
      }
      fail(prefix + ": No expected mentions provided");
    }
    // Compare the overlapping part first so that the first differing mention is reported
    // before any difference in the number of mentions.
    int minMatchable = Math.min(expectedMentions.length, mentions.size());
    for (int i = 0; i < minMatchable; i++) {
      String expected = expectedMentions[i];
      String actual = mentions.get(i).toShorterString();
      assertEquals(prefix + ".mention." + i, expected, actual);
    }
    // Argument order is (message, expected, actual).
    assertEquals(prefix + ".length", expectedMentions.length, mentions.size());
  }

  // Actual tests

  /** Adjacent ORGANIZATION and PERSON tokens yield two separate mentions. */
  public void testBasicMentions() {
    Annotation doc = createDocument("I was at Stanford University Albert Peacock");
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    // Overwrite the NER tag on "Stanford" and "University" (token indices 3 and 4).
    tokens.get(3).setNER("ORGANIZATION");
    tokens.get(4).setNER("ORGANIZATION");

    EntityMentionsAnnotator annotator = getMentionsAnnotator();
    annotator.annotate(doc);

    List<CoreMap> mentions = doc.get(CoreAnnotations.MentionsAnnotation.class);
    String[] expectedMentions = {
        "[Text=Stanford University CharacterOffsetBegin=9 CharacterOffsetEnd=28 Tokens=[Stanford-4, University-5] TokenBegin=3 TokenEnd=5 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=0]",
        "[Text=Albert Peacock CharacterOffsetBegin=29 CharacterOffsetEnd=43 Tokens=[Albert-6, Peacock-7] TokenBegin=5 TokenEnd=7 NamedEntityTag=PERSON EntityType=PERSON SentenceIndex=0]"
    };
    compareMentions("testBasicMentions", expectedMentions, mentions);
  }

  /** Two adjacent but different dates are expected as separate DATE mentions. */
  public void testDates() {
    Annotation doc = createDocument("July 3rd July 4th are two different dates");

    EntityMentionsAnnotator annotator = getMentionsAnnotator();
    annotator.annotate(doc);

    List<CoreMap> mentions = doc.get(CoreAnnotations.MentionsAnnotation.class);
    String[] expectedMentions = {
        "[Text=July 3rd CharacterOffsetBegin=0 CharacterOffsetEnd=8 Tokens=[July-1, 3rd-2] TokenBegin=0 TokenEnd=2 NamedEntityTag=DATE NormalizedNamedEntityTag=XXXX-07-03 EntityType=DATE SentenceIndex=0 Timex=<TIMEX3 tid=\"t1\" type=\"DATE\" value=\"XXXX-07-03\">July 3rd</TIMEX3>]",
        "[Text=July 4th CharacterOffsetBegin=9 CharacterOffsetEnd=17 Tokens=[July-3, 4th-4] TokenBegin=2 TokenEnd=4 NamedEntityTag=DATE NormalizedNamedEntityTag=XXXX-07-04 EntityType=DATE SentenceIndex=0 Timex=<TIMEX3 tid=\"t2\" type=\"DATE\" value=\"XXXX-07-04\">July 4th</TIMEX3>]",
        "[Text=two CharacterOffsetBegin=22 CharacterOffsetEnd=25 Tokens=[two-6] TokenBegin=5 TokenEnd=6 NamedEntityTag=NUMBER NormalizedNamedEntityTag=2.0 EntityType=NUMBER SentenceIndex=0]"
    };
    compareMentions("testDates", expectedMentions, mentions);
  }

  /** Two adjacent mentions of the same date; the expectation records them merged into one. */
  public void testDates2() {
    Annotation doc = createDocument("July 3rd July 3rd are two mentions of the same date");

    EntityMentionsAnnotator annotator = getMentionsAnnotator();
    annotator.annotate(doc);

    List<CoreMap> mentions = doc.get(CoreAnnotations.MentionsAnnotation.class);
    // TODO: Fixme - separate out the two mentions of July 3rd!!!
    String[] expectedMentions = {
        "[Text=July 3rd July 3rd CharacterOffsetBegin=0 CharacterOffsetEnd=17 Tokens=[July-1, 3rd-2, July-3, 3rd-4] TokenBegin=0 TokenEnd=4 NamedEntityTag=DATE NormalizedNamedEntityTag=XXXX-07-03 EntityType=DATE SentenceIndex=0 Timex=<TIMEX3 tid=\"t1\" type=\"DATE\" value=\"XXXX-07-03\">July 3rd July 3rd</TIMEX3>]",
        "[Text=two CharacterOffsetBegin=22 CharacterOffsetEnd=25 Tokens=[two-6] TokenBegin=5 TokenEnd=6 NamedEntityTag=NUMBER NormalizedNamedEntityTag=2.0 EntityType=NUMBER SentenceIndex=0]"
    };
    compareMentions("testDates2", expectedMentions, mentions);
  }

  /** Consecutive number words are expected as one NUMBER mention each. */
  public void testNumbers() {
    Annotation doc = createDocument("one two three four five");

    EntityMentionsAnnotator annotator = getMentionsAnnotator();
    annotator.annotate(doc);

    List<CoreMap> mentions = doc.get(CoreAnnotations.MentionsAnnotation.class);
    String[] expectedMentions = {
        "[Text=one CharacterOffsetBegin=0 CharacterOffsetEnd=3 Tokens=[one-1] TokenBegin=0 TokenEnd=1 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0 EntityType=NUMBER SentenceIndex=0]",
        "[Text=two CharacterOffsetBegin=4 CharacterOffsetEnd=7 Tokens=[two-2] TokenBegin=1 TokenEnd=2 NamedEntityTag=NUMBER NormalizedNamedEntityTag=2.0 EntityType=NUMBER SentenceIndex=0]",
        "[Text=three CharacterOffsetBegin=8 CharacterOffsetEnd=13 Tokens=[three-3] TokenBegin=2 TokenEnd=3 NamedEntityTag=NUMBER NormalizedNamedEntityTag=3.0 EntityType=NUMBER SentenceIndex=0]",
        "[Text=four CharacterOffsetBegin=14 CharacterOffsetEnd=18 Tokens=[four-4] TokenBegin=3 TokenEnd=4 NamedEntityTag=NUMBER NormalizedNamedEntityTag=4.0 EntityType=NUMBER SentenceIndex=0]",
        "[Text=five CharacterOffsetBegin=19 CharacterOffsetEnd=23 Tokens=[five-5] TokenBegin=4 TokenEnd=5 NamedEntityTag=NUMBER NormalizedNamedEntityTag=5.0 EntityType=NUMBER SentenceIndex=0]"
    };
    compareMentions("testNumbers", expectedMentions, mentions);
  }

  /** Three-sentence news snippet mixing people, places, organizations, dates, numbers and money. */
  public void testNewsText() {
    Annotation doc = createDocument("Duke of Cambridge, Prince William, unveiled a new China Center in the University of Oxford Monday.\n" +
        "Covering an area nearly 5,500 square meters, the new Dickson Poon University of Oxford China Center in St Hugh's College cost about 21 million pounds.\n" +
        "Dickson Poon, a philanthropist from Hong Kong, China, is the one of the major donors of the center, who contributed 10 million British pounds (16.14 million U.S. dollars).");

    EntityMentionsAnnotator annotator = getMentionsAnnotator();
    annotator.annotate(doc);

    List<CoreMap> mentions = doc.get(CoreAnnotations.MentionsAnnotation.class);
    // TODO: "Duke of Cambridge" should be one mention.
    // TODO: Not sure if should get "Prince William" rather than just "William", but going with the flow.
    // TODO: "nearly 5,500 square meters"? "10 million British pounds", "16.14 million U.S. dollars"
    // TODO: "China Center" should definitely be an organization!
    String[] expectedMentions = {
        "[Text=Duke CharacterOffsetBegin=0 CharacterOffsetEnd=4 Tokens=[Duke-1] TokenBegin=0 TokenEnd=1 NamedEntityTag=PERSON EntityType=PERSON SentenceIndex=0]",
        "[Text=Cambridge CharacterOffsetBegin=8 CharacterOffsetEnd=17 Tokens=[Cambridge-3] TokenBegin=2 TokenEnd=3 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=0]",
        "[Text=William CharacterOffsetBegin=26 CharacterOffsetEnd=33 Tokens=[William-6] TokenBegin=5 TokenEnd=6 NamedEntityTag=PERSON EntityType=PERSON SentenceIndex=0]",
        "[Text=China Center CharacterOffsetBegin=50 CharacterOffsetEnd=62 Tokens=[China-11, Center-12] TokenBegin=10 TokenEnd=12 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=0]",
        "[Text=University of Oxford CharacterOffsetBegin=70 CharacterOffsetEnd=90 Tokens=[University-15, of-16, Oxford-17] TokenBegin=14 TokenEnd=17 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=0]",
        "[Text=Monday CharacterOffsetBegin=91 CharacterOffsetEnd=97 Tokens=[Monday-18] TokenBegin=17 TokenEnd=18 NamedEntityTag=DATE NormalizedNamedEntityTag=XXXX-WXX-1 EntityType=DATE SentenceIndex=0 Timex=<TIMEX3 tid=\"t1\" type=\"DATE\" value=\"XXXX-WXX-1\">Monday</TIMEX3>]",
        "[Text=5,500 CharacterOffsetBegin=123 CharacterOffsetEnd=128 Tokens=[5,500-5] TokenBegin=23 TokenEnd=24 NamedEntityTag=NUMBER NormalizedNamedEntityTag=~5500.0 EntityType=NUMBER SentenceIndex=1]",
        "[Text=Dickson Poon University of Oxford China Center CharacterOffsetBegin=152 CharacterOffsetEnd=198 Tokens=[Dickson-11, Poon-12, University-13, of-14, Oxford-15, China-16, Center-17] TokenBegin=29 TokenEnd=36 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=1]",
        "[Text=St Hugh's College CharacterOffsetBegin=202 CharacterOffsetEnd=219 Tokens=[St-19, Hugh-20, 's-21, College-22] TokenBegin=37 TokenEnd=41 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=1]",
        "[Text=21 million pounds CharacterOffsetBegin=231 CharacterOffsetEnd=248 Tokens=[21-25, million-26, pounds-27] TokenBegin=43 TokenEnd=46 NamedEntityTag=MONEY NormalizedNamedEntityTag=~£2.1E7 EntityType=MONEY SentenceIndex=1]",
        "[Text=Dickson Poon CharacterOffsetBegin=250 CharacterOffsetEnd=262 Tokens=[Dickson-1, Poon-2] TokenBegin=47 TokenEnd=49 NamedEntityTag=PERSON EntityType=PERSON SentenceIndex=2]",
        "[Text=Hong Kong CharacterOffsetBegin=286 CharacterOffsetEnd=295 Tokens=[Hong-7, Kong-8] TokenBegin=53 TokenEnd=55 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2]",
        "[Text=China CharacterOffsetBegin=297 CharacterOffsetEnd=302 Tokens=[China-10] TokenBegin=56 TokenEnd=57 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2]",
        "[Text=one CharacterOffsetBegin=311 CharacterOffsetEnd=314 Tokens=[one-14] TokenBegin=60 TokenEnd=61 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0 EntityType=NUMBER SentenceIndex=2]",
        "[Text=10 million CharacterOffsetBegin=366 CharacterOffsetEnd=376 Tokens=[10-25, million-26] TokenBegin=71 TokenEnd=73 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0E7 EntityType=NUMBER SentenceIndex=2]",
        "[Text=British CharacterOffsetBegin=377 CharacterOffsetEnd=384 Tokens=[British-27] TokenBegin=73 TokenEnd=74 NamedEntityTag=MISC EntityType=MISC SentenceIndex=2]",
        "[Text=16.14 million CharacterOffsetBegin=393 CharacterOffsetEnd=406 Tokens=[16.14-30, million-31] TokenBegin=76 TokenEnd=78 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.614E7 EntityType=NUMBER SentenceIndex=2]",
        "[Text=U.S. CharacterOffsetBegin=407 CharacterOffsetEnd=411 Tokens=[U.S.-32] TokenBegin=78 TokenEnd=79 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2]"
    };
    compareMentions("testNewsText", expectedMentions, mentions);
  }
}