TrueCaseAnnotatorITest.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.pipeline;

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import junit.framework.TestCase;
import org.junit.Assert;

/**
 * @author Christopher Manning
 */
public class TrueCaseAnnotatorITest extends TestCase {

  private static final boolean VERBOSE = false;

  private static void runSentence(StanfordCoreNLP pipeline, String sentence, String[] ans) {
    Annotation document = new Annotation(sentence);
    pipeline.annotate(document);

    // check that tokens are present
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    Assert.assertNotNull(tokens);
    Assert.assertEquals("Wrong number of tokens: " + tokens + " vs. " + ans.length, ans.length, tokens.size());

    // check that sentences are present
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    Assert.assertNotNull(sentences);
    Assert.assertEquals("Wrong number of sentences", 1, sentences.size());

    for (CoreMap sent: document.get(CoreAnnotations.SentencesAnnotation.class)){
      List<? extends CoreLabel> words = sent.get(CoreAnnotations.TokensAnnotation.class);
      for(int i = 0; i < words.size(); i ++) {
        String w = words.get(i).word();
        String tcw = words.get(i).get(CoreAnnotations.TrueCaseTextAnnotation.class);
        if (VERBOSE) {
          if (!w.equals(tcw)) {
            System.err.print("\"" + w + "\" true cased to \"" + tcw + "\" in context:");
            for (int j = Math.max(0, i - 2); j < Math.min(words.size(), i + 2); j++) {
              System.err.print(" " + words.get(j).word());
            }
            System.err.println();
          }
        }
        assertEquals("Error in truecasing", ans[i], tcw);
      }
    }
  }


  public void testTrueCaseAnnotator() {
    // run an annotation through the pipeline
    String text1 = "HEATHER BROWN WAS LEAD WOMAN AT DUKE UNIVERSITY.";
    String text2 = "heather brown was lead woman at duke university.";
    String text3 = "Heather Brown was lead woman at Duke University.";
    String[] ans1 = { "Heather", "Brown", "was", "lead", "woman", "at", "Duke", "University", "." };

    String text4 = "\"GOOD MORNING AMERICA FROM MCVEY!\"";
    String text5 = "\"good morning america from mcvey!\"";
    String text6 = "\"Good Morning America From McVey!\"";
    String[] ans4 = { "``", "Good", "Morning", "America", "from", "McVey", "!", "''" };

    Properties props = PropertiesUtils.asProperties("annotators", "tokenize, ssplit, truecase");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    runSentence(pipeline, text1, ans1);
    runSentence(pipeline, text2, ans1);
    runSentence(pipeline, text3, ans1);

    runSentence(pipeline, text4, ans4);
    runSentence(pipeline, text5, ans4);
    runSentence(pipeline, text6, ans4);
  }

}