package edu.stanford.nlp.pipeline;

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;

import junit.framework.TestCase;

/** @author Adam Vogel */
public class WordsToSentencesAnnotatorTest extends TestCase {

  public void testAnnotator() {
    String text = "I saw Dr. Spock yesterday, he was speaking with Mr. McCoy. They were walking down Mullholand Dr. talking about www.google.com. Dr. Spock returns!";
    runSentence(text, 3);

    // This would fail for "Yahoo! Research", since we don't yet know to chunk "Yahoo!"
    text = "I visited Google Research. Dr. Spock, Ph.D., was working there and said it's an awful place! What a waste of Ms. Pacman's last remaining life.";
    runSentence(text, 3);
  }

  /** Runs tokenize+ssplit over the text and checks the number of sentences found. */
  private static void runSentence(String text, int numSentences) {
    Annotation doc = new Annotation(text);
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit");
    props.setProperty("tokenize.language", "en");
    //Annotator annotator = new TokenizerAnnotator("en");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(doc);

    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    assertNotNull(sentences);
    assertEquals(numSentences, sentences.size());
    /*
    for (CoreMap s : sentences) {
      String position = s.get(SentencePositionAnnotation.class); // what's wrong here?
      System.out.print("position: ");
      System.out.println(position);
      //throw new RuntimeException(position);
    }
    */
  }

  public void testSentenceSplitting() {
    String text = "Date :\n01/02/2012\nContent :\nSome words are here .\n";
    // System.out.println(text);
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    // with eolonly, sentences end at (and only at) newlines
    props.setProperty("ssplit.eolonly", "true");
    props.setProperty("tokenize.whitespace", "true");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document1 = new Annotation(text);
    pipeline.annotate(document1);
    List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    // System.out.println("* Num of sentences in text = " + sentences.size());
    // System.out.println("Sentences is " + sentences);
    assertEquals(4, sentences.size());
  }

  public void testTokenizeNLsDoesntChangeSsplitResults() {
    String text = "This is one sentence\n\nThis is not another with default ssplit settings.";
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    props.setProperty("tokenize.options", "tokenizeNLs");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document1 = new Annotation(text);
    pipeline.annotate(document1);
    List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(1, sentences.size());

    // make sure that there are the correct # of tokens
    // (DOES contain NL tokens here: 13 words/punctuation plus the 2 newlines kept by tokenizeNLs)
    List<CoreLabel> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(15, tokens.size());
  }

  public void testDefaultNewlineIsSentenceBreakSettings() {
    String text = "This is one sentence\n\nThis is not another with default ssplit settings.";
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document1 = new Annotation(text);
    pipeline.annotate(document1);
    List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(1, sentences.size());

    // make sure that there are the correct # of tokens
    // (does NOT contain NL tokens)
    List<CoreLabel> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(13, tokens.size());
  }

  public void testTwoNewlineIsSentenceBreakSettings() {
    String text = "This is \none sentence\n\nThis is not another.";
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    props.setProperty("ssplit.newlineIsSentenceBreak", "two");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document1 = new Annotation(text);
    pipeline.annotate(document1);
    List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(2, sentences.size());

    // make sure that there are the correct # of tokens (does contain NL tokens)
    List<CoreLabel> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(12, tokens.size());
  }

  public void testAlwaysNewlineIsSentenceBreakSettings() {
    String text = "This is \none sentence\n\nThis is not another.";
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    props.setProperty("ssplit.newlineIsSentenceBreak", "always");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document1 = new Annotation(text);
    pipeline.annotate(document1);
    List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(3, sentences.size());

    // make sure that there are the correct # of tokens (does contain NL tokens)
    List<CoreLabel> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(12, tokens.size());
  }
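  /*
   * Illustrative sketch, not part of the original test suite: it assumes the
   * ssplit annotator's "ssplit.isOneSentence" property (documented as treating
   * the whole document as one sentence) behaves as described; the method name
   * testIsOneSentenceSketch is ours. With the option on, the same newline-heavy
   * text used above should come back as exactly one sentence covering every
   * document token.
   */
  public void testIsOneSentenceSketch() {
    String text = "This is \none sentence\n\nThis is not another.";
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    props.setProperty("ssplit.isOneSentence", "true");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document1 = new Annotation(text);
    pipeline.annotate(document1);
    List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(1, sentences.size());

    // the single sentence should span all tokens in the document
    List<CoreLabel> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals(tokens.size(),
        sentences.get(0).get(CoreAnnotations.TokensAnnotation.class).size());
  }

}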