package edu.stanford.nlp.pipeline; import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations; import edu.stanford.nlp.ie.machinereading.structure.RelationMention; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.trees.TreeCoreAnnotations; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.StringUtils; import org.junit.Assert; import junit.framework.TestCase; import java.io.*; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; public class StanfordCoreNLPITest extends TestCase { public void testRequires() throws Exception { Properties props = new Properties(); try { // Put the lemma before pos and you have a problem props.setProperty("annotators", "tokenize,ssplit,lemma,pos,ner,parse"); new StanfordCoreNLP(props); throw new RuntimeException("Should have thrown an exception"); } catch (IllegalArgumentException e) { // yay } // This should be okay: parse can take the place of pos props.setProperty("annotators", "tokenize,ssplit,parse,lemma,ner"); new StanfordCoreNLP(props); } public void testRequiresForCoref() throws Exception { Properties props = new Properties(); try { // Put the lemma before pos and you have a problem props.setProperty("annotators", "tokenize,ssplit,lemma,pos,ner,coref"); new StanfordCoreNLP(props); throw new RuntimeException("Should have thrown an exception"); } catch (IllegalArgumentException e) { // yay } // This should be okay: parse can take the place of pos props.setProperty("annotators", "tokenize,ssplit,parse,lemma,ner"); new StanfordCoreNLP(props); } public void test() throws Exception { // create a properties that enables all the annotators Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse"); // run an annotation through the pipeline String text = "Dan Ramage is working for\nMicrosoft. He's in Seattle! \n"; Annotation document = new Annotation(text); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); // check that tokens are present List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(tokens); Assert.assertEquals(12, tokens.size()); // check that sentences are present List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); Assert.assertNotNull(sentences); Assert.assertEquals(2, sentences.size()); // check that pos, lemma and ner and parses are present for (CoreMap sentence : sentences) { List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(sentenceTokens); for (CoreLabel token : sentenceTokens) { Assert.assertNotNull(token.get(CoreAnnotations.PartOfSpeechAnnotation.class)); Assert.assertNotNull(token.get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertNotNull(token.get(CoreAnnotations.NamedEntityTagAnnotation.class)); } // check for parse tree Assert.assertNotNull(sentence.get(TreeCoreAnnotations.TreeAnnotation.class)); // check that dependency graph Labels have word() SemanticGraph deps = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); for (IndexedWord vertex : deps.vertexSet()) { Assert.assertNotNull(vertex.word()); Assert.assertEquals(vertex.word(), vertex.value()); } } // test pretty print StringWriter stringWriter = new StringWriter(); pipeline.prettyPrint(document, new PrintWriter(stringWriter)); String result = stringWriter.getBuffer().toString(); Assert.assertTrue("Tokens are wrong in " + result, StringUtils.find(result, "\\[Text=Dan .*PartOfSpeech=NNP Lemma=Dan NamedEntityTag=PERSON\\]")); Assert.assertTrue("Parses are wrong in " + result, result.contains("(NP (PRP He))")); Assert.assertTrue("Parses are wrong in " + result, result.contains("(VP (VBZ 's)")); Assert.assertTrue("Sentence header is wrong in " + result, result.contains("Sentence #1 (7 tokens)")); Assert.assertTrue("Dependencies are wrong in " + result, result.contains("nsubj(working-4, Ramage-2)")); // test XML ByteArrayOutputStream os = new ByteArrayOutputStream(); pipeline.xmlPrint(document, os); result = new String(os.toByteArray(), "UTF-8"); Assert.assertTrue("XML header is wrong in " + result, result.startsWith("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")); Assert.assertTrue("XML root is wrong in " + result, result.contains("<?xml-stylesheet href=\"CoreNLP-to-HTML.xsl\" type=\"text/xsl\"?>")); Assert.assertTrue("XML word info is wrong in " + result, StringUtils.find(result, "<token id=\"2\">\\s*" + "<word>Ramage</word>\\s*" + "<lemma>Ramage</lemma>\\s*" + "<CharacterOffsetBegin>4</CharacterOffsetBegin>\\s*" + "<CharacterOffsetEnd>10</CharacterOffsetEnd>\\s*" + "<POS>NNP</POS>\\s*" + "<NER>PERSON</NER>")); Assert.assertTrue("XML dependencies are wrong in " + result, StringUtils.find(result, "<dep type=\"compound\">\\s*<governor idx=\"2\">" + "Ramage</governor>\\s*<dependent idx=\"1\">Dan</dependent>\\s*</dep>")); } private static void checkNer(String message, String[][][] expected, CoreMap coremap, String coremapOutput) { List<CoreMap> sentences = coremap.get(CoreAnnotations.SentencesAnnotation.class); assertEquals(message + ": number of sentences for\n" + coremapOutput, expected.length, sentences.size()); for (int i = 0; i < expected.length; i++) { CoreMap sentence = sentences.get(i); List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); assertEquals(message + ": number of tokens for sentence " + (i + 1) + "\n" + coremapOutput, expected[i].length, tokens.size()); for (int j = 0; j < expected[i].length; j++) { String text = expected[i][j][0]; String ner = expected[i][j][1]; String debug = "sentence " + (i + 1) + ", token " + (j + 1); assertEquals(message + ": text mismatch for " + debug + "\n" + coremapOutput, text, tokens.get(j).word()); assertEquals(message + ": ner mismatch for " + debug + "(" + tokens.get(j).word() + ")\n" + coremapOutput, ner, tokens.get(j).ner()); } } } public void testRegexNer() throws Exception { // Check the regexner is integrated with the StanfordCoreNLP Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner"); props.setProperty("regexner.ignorecase", "true"); // Maybe ignorecase should be on by default... String text = "Barack Obama is the 44th President of the United States. He is the first African American president."; Annotation document = new Annotation(text); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); StringWriter stringWriter = new StringWriter(); pipeline.prettyPrint(document, new PrintWriter(stringWriter)); String result = stringWriter.getBuffer().toString(); // Check the named entity types String[][][] expected = { { {"Barack", "PERSON"}, {"Obama", "PERSON"}, {"is", "O"}, {"the", "O"}, {"44th", "ORDINAL"}, {"President", "TITLE"}, {"of", "O"}, {"the", "O"}, {"United", "COUNTRY"}, {"States", "COUNTRY"}, {".", "O"}, }, { {"He", "O"}, {"is", "O"}, {"the", "O"}, {"first", "ORDINAL"}, {"African", "NATIONALITY"}, {"American", "NATIONALITY"}, {"president", "TITLE"}, {".", "O"}, }, }; checkNer("testRegexNer", expected, document, result); } public void testRelationExtractor() throws Exception { // Check the regexner is integrated with the StanfordCoreNLP Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,relation"); //props.setProperty("sup.relation.model", "/home/sonalg/javanlp/tmp/roth_relation_model_pipeline.ser"); String text = "Barack Obama, a Yale professor, is president."; Annotation document = new Annotation(text); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0); List<RelationMention> rel = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class); assertEquals(rel.get(0).getType(), "Work_For"); // StringWriter stringWriter = new StringWriter(); // pipeline.prettyPrint(document, new PrintWriter(stringWriter)); // String result = stringWriter.getBuffer().toString(); // System.out.println(result); } /* This test no longer supported. Do not mess with AnnotatorPool outside of StanfordCoreNLP */ /* public void testAnnotatorPool() throws Exception { AnnotatorPool pool = new AnnotatorPool(); pool.register("tokenize", new Factory<Annotator>() { private static final long serialVersionUID = 1L; public Annotator create() { return new Tokenize(); } }); // make sure only one Tokenize is created even with multiple pipelines Properties properties = this.newProperties("annotators=tokenize"); new StanfordCoreNLP(pool, properties); new StanfordCoreNLP(pool, properties); Assert.assertEquals(1, Tokenize.N); } private static class Tokenize implements Annotator { public static int N = 0; public Tokenize() { ++N; } public void annotate(Annotation annotation) { } } private Properties newProperties(String desc) { Properties properties = new Properties(); for (String nameValue: desc.split("\\s*,\\s*")) { String[] nameValueArray = nameValue.split("\\s*=\\s*"); if (nameValueArray.length != 2) { throw new IllegalArgumentException("invalid name=value string: " + nameValue); } properties.setProperty(nameValueArray[0], nameValueArray[1]); } return properties; } */ public void testSerialization() throws Exception { // Test that an annotation can be serialized and deserialized StanfordCoreNLP pipeline = new StanfordCoreNLP(); Annotation document = new Annotation("Stanford University is located in California. It is a great university."); pipeline.annotate(document); CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0); SemanticGraph g = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); processSerialization(g); processSerialization(sentence.get(TreeCoreAnnotations.TreeAnnotation.class)); processSerialization(sentence.get(CoreAnnotations.TokensAnnotation.class)); processSerialization(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)); processSerialization(sentence); Object processed = processSerialization(document); assertTrue(processed instanceof Annotation); Annotation newDocument = (Annotation) processed; assertEquals(document.get(CoreAnnotations.SentencesAnnotation.class).size(), newDocument.get(CoreAnnotations.SentencesAnnotation.class).size()); for (int i = 0; i < document.get(CoreAnnotations.SentencesAnnotation.class).size(); ++i) { CoreMap oldSentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0); CoreMap newSentence = newDocument.get(CoreAnnotations.SentencesAnnotation.class).get(0); assertEquals(oldSentence.get(TreeCoreAnnotations.TreeAnnotation.class), newSentence.get(TreeCoreAnnotations.TreeAnnotation.class)); assertEquals(oldSentence.get(CoreAnnotations.TokensAnnotation.class), newSentence.get(CoreAnnotations.TokensAnnotation.class)); } assertTrue(document.equals(newDocument)); } private static Object processSerialization(Object input) throws Exception { ByteArrayOutputStream bout = new ByteArrayOutputStream(); ObjectOutputStream oout = new ObjectOutputStream(bout); oout.writeObject(input); oout.flush(); oout.close(); ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); ObjectInputStream oin = new ObjectInputStream(bin); return oin.readObject(); } public void testSentenceNewlines() { // create a properties that enables all the annotators Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos"); props.setProperty("ssplit.isOneSentence", "true"); // run an annotation through the pipeline String text = "At least a few female committee members are from Scandinavia. \n"; Annotation document = new Annotation(text); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); // check that tokens are present List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(tokens); Assert.assertEquals("Wrong number of tokens: " + tokens, 11, tokens.size()); // check that sentences are present List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); Assert.assertNotNull(sentences); Assert.assertEquals("Wrong number of sentences", 1, sentences.size()); } public void testSentenceNewlinesTwo() { // create a properties that enables all the annotators Properties props = new Properties(); props.setProperty("annotators", "tokenize"); // run an annotation through the pipeline String text = "At least a few female committee members\nare from Scandinavia.\n"; Annotation document = new Annotation(text); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); // check that tokens are present List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(tokens); Assert.assertEquals("Wrong number of tokens: " + tokens, 11, tokens.size()); } public void testSentenceNewlinesThree() { // create a properties that enables all the annotators Properties props = new Properties(); props.setProperty("annotators", "tokenize,ssplit,pos"); // run an annotation through the pipeline String text = "At least a few female committee members\nare from Scandinavia.\n"; Annotation document = new Annotation(text); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); // check that tokens are present List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(tokens); Assert.assertEquals("Wrong number of tokens: " + tokens, 11, tokens.size()); // check that sentences are present List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); Assert.assertNotNull(sentences); Assert.assertEquals("Wrong number of sentences", 1, sentences.size()); CoreMap firstSentence = sentences.get(0); List<CoreLabel> sentTokens = firstSentence.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(sentTokens); Assert.assertEquals("Wrong number of sentTokens: " + sentTokens, 11, sentTokens.size()); } private static void checkSUTimeAnnotation(String message, StanfordCoreNLP pipeline, String text, int nExpectedSentences, int nExpectedTokens, Map<Integer,String> expectedNormalizedNER) { Annotation doc = new Annotation(text); pipeline.annotate(doc); // check that sentences and tokens are present List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); Assert.assertNotNull(sentences); Assert.assertEquals(message + ": number of sentences", nExpectedSentences, sentences.size()); List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class); Assert.assertNotNull(tokens); Assert.assertEquals(message + ": number of tokens", nExpectedTokens, tokens.size()); for (Map.Entry<Integer,String> kv : expectedNormalizedNER.entrySet()) { Assert.assertEquals(message + ": token " + kv.getKey(), kv.getValue(), tokens.get(kv.getKey()).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class)); } } public void testSUTimeProperty() { // Test that SUTime properties are passed through String text = "The date is 2001-10-02. There is a meeting tomorrow."; int nExpectedSentences = 2; int nExpectedTokens = 11; // CoreNLP without properties StanfordCoreNLP pipeline1 = new StanfordCoreNLP(); Map<Integer,String> expectedValues1 = new HashMap<>(); expectedValues1.put(3, "2001-10-02"); expectedValues1.put(9, "OFFSET P1D"); checkSUTimeAnnotation("Default properties", pipeline1, text, nExpectedSentences, nExpectedTokens, expectedValues1); // CoreNLP with properties Properties props = new Properties(); props.setProperty("sutime.searchForDocDate", "true"); StanfordCoreNLP pipeline2 = new StanfordCoreNLP(props); Map<Integer,String> expectedValues2 = new HashMap<>(); expectedValues2.put(3, "2001-10-02"); expectedValues2.put(9, "2001-10-03"); checkSUTimeAnnotation("With searchForDocDate", pipeline2, text, nExpectedSentences, nExpectedTokens, expectedValues2); } }