package edu.stanford.nlp.pipeline;
import java.util.*;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import junit.framework.Assert;
import junit.framework.TestCase;
/**
 * Tests sentence splitting ({@code ssplit}) running after {@code tokenize} in a
 * {@link StanfordCoreNLP} pipeline, under several tokenizer and splitter settings.
 *
 * @author Adam Vogel
 */
public class WordsToSentencesAnnotatorTest extends TestCase {

  /** Text with a blank line between two fragments; shared by the default and tokenizeNLs tests. */
  private static final String BLANK_LINE_TEXT =
      "This is one sentence\n\nThis is not another with default ssplit settings.";

  /** Text with one single newline and one double newline; shared by the "two" and "always" tests. */
  private static final String MIXED_NEWLINE_TEXT =
      "This is \none sentence\n\nThis is not another.";

  /**
   * Checks that texts containing abbreviations with periods ("Dr.", "Mr.", "Ph.D.", "Ms.")
   * and a URL are each split into exactly three sentences.
   */
  public void testAnnotator() {
    String text = "I saw Dr. Spock yesterday, he was speaking with Mr. McCoy. They were walking down Mullholand Dr. talking about www.google.com. Dr. Spock returns!";
    runSentence(text, 3);

    // This would fail for "Yahoo! Research", since we don't yet know to chunk "Yahoo!"
    text = "I visited Google Research. Dr. Spock, Ph.D., was working there and said it's an awful place! What a waste of Ms. Pacman's last remaining life.";
    runSentence(text, 3);
  }

  /**
   * Runs an English {@code tokenize,ssplit} pipeline over {@code text} and asserts that it
   * produces exactly {@code num_sentences} sentences.
   *
   * @param text raw text to annotate
   * @param num_sentences expected number of sentences
   * @return always {@code true}; a mismatch is reported through an assertion failure
   */
  public static boolean runSentence(String text, int num_sentences) {
    Annotation doc = annotate(text,
        "annotators", "tokenize,ssplit",
        "tokenize.language", "en");
    assertEquals(num_sentences, sentencesOf(doc).size());
    return true;
  }

  /**
   * With whitespace tokenization and {@code ssplit.eolonly}, expects one sentence per line
   * of the input (the input below has four newline-terminated lines).
   */
  public void testSentenceSplitting() {
    String text = "Date :\n01/02/2012\nContent :\nSome words are here .\n";
    Annotation document1 = annotate(text,
        "annotators", "tokenize, ssplit",
        "ssplit.eolonly", "true",
        "tokenize.whitespace", "true");
    assertEquals(4, sentencesOf(document1).size());
  }

  /**
   * Asking the tokenizer for newline tokens ({@code tokenizeNLs}) must not by itself
   * introduce sentence breaks: the text is still expected to be a single sentence.
   */
  public void testTokenizeNLsDoesntChangeSsplitResults() {
    Annotation document1 = annotate(BLANK_LINE_TEXT,
        "annotators", "tokenize, ssplit",
        "tokenize.options", "tokenizeNLs");
    assertEquals(1, sentencesOf(document1).size());

    // The expected count is 2 higher than the 13 asserted for the same text under default
    // settings, and the text contains 2 newlines, so the document-level token list here
    // presumably DOES include the newline tokens.
    // NOTE(review): the previous comment claimed the opposite; confirm against the annotator.
    assertEquals(15, tokensOf(document1).size());
  }

  /**
   * With default settings, a blank line is not treated as a sentence break and no
   * newline tokens are expected in the token list.
   */
  public void testDefaultNewlineIsSentenceBreakSettings() {
    Annotation document1 = annotate(BLANK_LINE_TEXT,
        "annotators", "tokenize, ssplit");
    assertEquals(1, sentencesOf(document1).size());

    // make sure that there are the correct # of tokens
    // (does NOT contain NL tokens)
    assertEquals(13, tokensOf(document1).size());
  }

  /**
   * With {@code ssplit.newlineIsSentenceBreak=two}, only the double newline is expected
   * to end a sentence, giving two sentences.
   */
  public void testTwoNewlineIsSentenceBreakSettings() {
    Annotation document1 = annotate(MIXED_NEWLINE_TEXT,
        "annotators", "tokenize, ssplit",
        "ssplit.newlineIsSentenceBreak", "two");
    assertEquals(2, sentencesOf(document1).size());

    // make sure that there are the correct # of tokens (does contain NL tokens;
    // the text has 3 newlines)
    assertEquals(12, tokensOf(document1).size());
  }

  /**
   * With {@code ssplit.newlineIsSentenceBreak=always}, the single newline is expected
   * to end a sentence as well, giving three sentences.
   */
  public void testAlwaysNewlineIsSentenceBreakSettings() {
    Annotation document1 = annotate(MIXED_NEWLINE_TEXT,
        "annotators", "tokenize, ssplit",
        "ssplit.newlineIsSentenceBreak", "always");
    assertEquals(3, sentencesOf(document1).size());

    // make sure that there are the correct # of tokens (does contain NL tokens;
    // the text has 3 newlines)
    assertEquals(12, tokensOf(document1).size());
  }

  /**
   * Builds a pipeline from the given properties and annotates {@code text} with it.
   *
   * @param text raw text to annotate
   * @param keysAndValues alternating property keys and values
   * @return the annotated document
   * @throws IllegalArgumentException if a key is given without a value
   */
  private static Annotation annotate(String text, String... keysAndValues) {
    if (keysAndValues.length % 2 != 0) {
      throw new IllegalArgumentException(
          "Expected alternating keys and values but got " + keysAndValues.length + " strings");
    }
    Properties props = new Properties();
    for (int i = 0; i < keysAndValues.length; i += 2) {
      props.setProperty(keysAndValues[i], keysAndValues[i + 1]);
    }
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);
    return doc;
  }

  /** Returns the document's sentence list, failing the test if the annotation is absent. */
  private static List<CoreMap> sentencesOf(Annotation doc) {
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    assertNotNull("document has no SentencesAnnotation", sentences);
    return sentences;
  }

  /** Returns the document's token list, failing the test if the annotation is absent. */
  private static List<CoreLabel> tokensOf(Annotation doc) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    assertNotNull("document has no TokensAnnotation", tokens);
    return tokens;
  }
}