WordToSentenceProcessorTest.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import junit.framework.TestCase;


public class WordToSentenceProcessorTest extends TestCase {

  private static final Annotator ptb = new TokenizerAnnotator(false, "en");
  private static final Annotator ptbNL = new TokenizerAnnotator(false, "en", "invertible,ptb3Escaping=true,tokenizeNLs=true");
  private static final Annotator wsNL = new TokenizerAnnotator(false,
          PropertiesUtils.asProperties("tokenize.whitespace", "true", "invertible", "true", "tokenizeNLs", "true"));

  private static final WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>();
  private static final WordToSentenceProcessor<CoreLabel> wtsNull =
          new WordToSentenceProcessor<>(true); // treat input as one sentence
  private static final WordToSentenceProcessor<CoreLabel> cwts = new WordToSentenceProcessor<>(
          "[.。]|[!?！？]+", WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE, false);


  private static void checkResult(WordToSentenceProcessor<CoreLabel> wts,
                                 String testSentence, String ... gold) {
    checkResult(wts, ptb, testSentence, gold);
  }

  private static void checkResult(WordToSentenceProcessor<CoreLabel> wts,
                                  Annotator tokenizer,
                                 String testSentence, String ... gold) {
    Annotation annotation = new Annotation(testSentence);
    ptbNL.annotate(annotation);
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    List<List<CoreLabel>> sentences = wts.process(tokens);

    assertEquals("Output number of sentences didn't match:\n" +
            Arrays.toString(gold) + " vs. \n" + sentences + '\n',
            gold.length, sentences.size());

    Annotation[] goldAnnotations = new Annotation[gold.length];
    for (int i = 0; i < gold.length; ++i) {
      goldAnnotations[i] = new Annotation(gold[i]);
      tokenizer.annotate(goldAnnotations[i]);
      List<CoreLabel> goldTokens =
        goldAnnotations[i].get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> testTokens = sentences.get(i);
      int goldTokensSize = goldTokens.size();
      assertEquals("Sentence lengths didn't match:\n" +
              goldTokens + " vs. \n" + testTokens + '\n',
              goldTokensSize, testTokens.size());
      for (int j = 0; j < goldTokensSize; ++j) {
        assertEquals(goldTokens.get(j).word(), testTokens.get(j).word());
      }
    }
  }

  public void testNoSplitting() {
    checkResult(wts, "This should only be one sentence.",
                "This should only be one sentence.");
  }

  public void testTwoSentences() {
    checkResult(wts, "This should be two sentences.  There is a split.",
                "This should be two sentences.",
                "There is a split.");
    checkResult(wts, "This should be two sentences!  There is a split.",
                "This should be two sentences!",
                "There is a split.");
    checkResult(wts, "This should be two sentences?  There is a split.",
                "This should be two sentences?",
                "There is a split.");
    checkResult(wts, "This should be two sentences!!!?!!  There is a split.",
                "This should be two sentences!!!?!!",
                "There is a split.");
  }

  public void testEdgeCases() {
    checkResult(wts, "This should be two sentences.  Second one incomplete",
                "This should be two sentences.",
                "Second one incomplete");
    checkResult(wts, "One incomplete sentence",
                "One incomplete sentence");
    checkResult(wts, "(Break after a parenthesis.)  (Or after \"quoted stuff!\")",
                "(Break after a parenthesis.)",
                "(Or after \"quoted stuff!\")");
    checkResult(wts, "  ");
    checkResult(wts, "This should be\n one sentence.",
                "This should be one sentence.");
    checkResult(wts, "'') Funny stuff joined on.",
                "'') Funny stuff joined on.");

  }

  public void testMr() {
    checkResult(wts, "Mr. White got a loaf of bread",
                "Mr. White got a loaf of bread");
  }

  public void testNullSplitter() {
    checkResult(wtsNull, "This should be one sentence.  There is no split.",
            "This should be one sentence.  There is no split.");
  }

  public void testParagraphStrategies() {
    final WordToSentenceProcessor<CoreLabel> wtsNever =
            new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER);
    final WordToSentenceProcessor<CoreLabel> wtsAlways =
            new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.ALWAYS);
    final WordToSentenceProcessor<CoreLabel> wtsTwo =
            new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE);

    String input1 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.";
    String input2 = "Depending on the options,\nthis could be all sorts of things,\n as I like chocolate. And cookies.";
    checkResult(wtsNever, input1,
            "Depending on the options,\nthis could be all sorts of things,\n\nas I like chocolate.",
            "And cookies.");
    checkResult(wtsAlways, input1,
            "Depending on the options,",
            "this could be all sorts of things,",
            "as I like chocolate.",
            "And cookies.");
    checkResult(wtsTwo, input1,
            "Depending on the options, this could be all sorts of things,",
            "as I like chocolate.",
            "And cookies.");
    checkResult(wtsNever, input2,
            "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.",
            "And cookies.");
    checkResult(wtsAlways, input2,
            "Depending on the options,",
            "this could be all sorts of things,",
            "as I like chocolate.",
            "And cookies.");
    checkResult(wtsTwo, input2,
            "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.",
            "And cookies.");
  }

  public void testXmlElements() {
    final WordToSentenceProcessor<CoreLabel> wtsXml =
            new WordToSentenceProcessor<>(null, null,null,
                        Generics.newHashSet(Arrays.asList("p", "chapter")),
                        WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER, null, null);

    String input1 = "<chapter>Chapter 1</chapter><p>This is text. So is this.</p> <p>One without end</p><p>Another</p><p>And another</p>";
    checkResult(wtsXml, input1,
            "Chapter 1",
            "This is text.",
            "So is this.",
            "One without end",
            "Another",
            "And another");
  }

  public void testRegion() {
    final WordToSentenceProcessor<CoreLabel> wtsRegion =
            new WordToSentenceProcessor<>(WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX,
                    WordToSentenceProcessor.DEFAULT_BOUNDARY_FOLLOWERS_REGEX,
                    WordToSentenceProcessor.DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD,
                    Generics.newHashSet(Collections.singletonList("p")),
                    "chapter|preface", WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER, null, null, false, false);
    String input1 = "<title>Chris rules!</title><preface><p>Para one</p><p>Para two</p></preface>" +
            "<chapter><p>Text we like. Two sentences \n\n in it.</p></chapter><coda>Some more text here</coda>";
    checkResult(wtsRegion, input1,
            "Para one",
            "Para two",
            "Text we like.",
            "Two sentences in it.");

  }

  public void testBlankLines() {
    final WordToSentenceProcessor<CoreLabel> wtsLines =
            new WordToSentenceProcessor<>(Generics.newHashSet(WordToSentenceProcessor.DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD));
    String input1 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.";
    checkResult(wtsLines, input1,
            "Depending on the options,",
            "this could be all sorts of things,",
            "",
            "as I like chocolate. And cookies.");
    String input2 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.\n";
    checkResult(wtsLines, input2,
            "Depending on the options,",
            "this could be all sorts of things,",
            "",
            "as I like chocolate. And cookies.");
    String input3 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.\n\n";
    checkResult(wtsLines, input3,
            "Depending on the options,",
            "this could be all sorts of things,",
            "",
            "as I like chocolate. And cookies.",
            "");
  }

  public void testExclamationPoint() {
    Annotation annotation = new Annotation("Foo!!");
    ptb.annotate(annotation);
    List list = annotation.get(CoreAnnotations.TokensAnnotation.class);
    assertEquals("Wrong double bang", "[Foo, !!]", list.toString());
  }

  public void testChinese() {
    checkResult(cwts, wsNL,"巴拉特 说 ： 「 我们 未 再 获得 任何 结果 。 」 ＜ 金融时报 ？ ＞ 《 金融时报 》 周三",
            "巴拉特 说 ： 「 我们 未 再 获得 任何 结果 。 」",
             "＜ 金融时报 ？ ＞",
              "《 金融时报 》 周三");

  }
}