package edu.stanford.nlp.process; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.Annotator; import edu.stanford.nlp.pipeline.TokenizerAnnotator; import java.util.Arrays; import java.util.Collections; import java.util.List; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.PropertiesUtils; import junit.framework.TestCase; public class WordToSentenceProcessorTest extends TestCase { private static final Annotator ptb = new TokenizerAnnotator(false, "en"); private static final Annotator ptbNL = new TokenizerAnnotator(false, "en", "invertible,ptb3Escaping=true,tokenizeNLs=true"); private static final Annotator wsNL = new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.whitespace", "true", "invertible", "true", "tokenizeNLs", "true")); private static final WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(); private static final WordToSentenceProcessor<CoreLabel> wtsNull = new WordToSentenceProcessor<>(true); // treat input as one sentence private static final WordToSentenceProcessor<CoreLabel> cwts = new WordToSentenceProcessor<>( "[.。]|[!?!?]+", WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE, false); private static void checkResult(WordToSentenceProcessor<CoreLabel> wts, String testSentence, String ... gold) { checkResult(wts, ptb, testSentence, gold); } private static void checkResult(WordToSentenceProcessor<CoreLabel> wts, Annotator tokenizer, String testSentence, String ... gold) { Annotation annotation = new Annotation(testSentence); ptbNL.annotate(annotation); List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class); List<List<CoreLabel>> sentences = wts.process(tokens); assertEquals("Output number of sentences didn't match:\n" + Arrays.toString(gold) + " vs. \n" + sentences + '\n', gold.length, sentences.size()); Annotation[] goldAnnotations = new Annotation[gold.length]; for (int i = 0; i < gold.length; ++i) { goldAnnotations[i] = new Annotation(gold[i]); tokenizer.annotate(goldAnnotations[i]); List<CoreLabel> goldTokens = goldAnnotations[i].get(CoreAnnotations.TokensAnnotation.class); List<CoreLabel> testTokens = sentences.get(i); int goldTokensSize = goldTokens.size(); assertEquals("Sentence lengths didn't match:\n" + goldTokens + " vs. \n" + testTokens + '\n', goldTokensSize, testTokens.size()); for (int j = 0; j < goldTokensSize; ++j) { assertEquals(goldTokens.get(j).word(), testTokens.get(j).word()); } } } public void testNoSplitting() { checkResult(wts, "This should only be one sentence.", "This should only be one sentence."); } public void testTwoSentences() { checkResult(wts, "This should be two sentences. There is a split.", "This should be two sentences.", "There is a split."); checkResult(wts, "This should be two sentences! There is a split.", "This should be two sentences!", "There is a split."); checkResult(wts, "This should be two sentences? There is a split.", "This should be two sentences?", "There is a split."); checkResult(wts, "This should be two sentences!!!?!! There is a split.", "This should be two sentences!!!?!!", "There is a split."); } public void testEdgeCases() { checkResult(wts, "This should be two sentences. Second one incomplete", "This should be two sentences.", "Second one incomplete"); checkResult(wts, "One incomplete sentence", "One incomplete sentence"); checkResult(wts, "(Break after a parenthesis.) (Or after \"quoted stuff!\")", "(Break after a parenthesis.)", "(Or after \"quoted stuff!\")"); checkResult(wts, " "); checkResult(wts, "This should be\n one sentence.", "This should be one sentence."); checkResult(wts, "'') Funny stuff joined on.", "'') Funny stuff joined on."); } public void testMr() { checkResult(wts, "Mr. White got a loaf of bread", "Mr. White got a loaf of bread"); } public void testNullSplitter() { checkResult(wtsNull, "This should be one sentence. There is no split.", "This should be one sentence. There is no split."); } public void testParagraphStrategies() { final WordToSentenceProcessor<CoreLabel> wtsNever = new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER); final WordToSentenceProcessor<CoreLabel> wtsAlways = new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.ALWAYS); final WordToSentenceProcessor<CoreLabel> wtsTwo = new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE); String input1 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies."; String input2 = "Depending on the options,\nthis could be all sorts of things,\n as I like chocolate. And cookies."; checkResult(wtsNever, input1, "Depending on the options,\nthis could be all sorts of things,\n\nas I like chocolate.", "And cookies."); checkResult(wtsAlways, input1, "Depending on the options,", "this could be all sorts of things,", "as I like chocolate.", "And cookies."); checkResult(wtsTwo, input1, "Depending on the options, this could be all sorts of things,", "as I like chocolate.", "And cookies."); checkResult(wtsNever, input2, "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.", "And cookies."); checkResult(wtsAlways, input2, "Depending on the options,", "this could be all sorts of things,", "as I like chocolate.", "And cookies."); checkResult(wtsTwo, input2, "Depending on the options,\nthis could be all sorts of things,\nas I like chocolate.", "And cookies."); } public void testXmlElements() { final WordToSentenceProcessor<CoreLabel> wtsXml = new WordToSentenceProcessor<>(null, null,null, Generics.newHashSet(Arrays.asList("p", "chapter")), WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER, null, null); String input1 = "<chapter>Chapter 1</chapter><p>This is text. So is this.</p> <p>One without end</p><p>Another</p><p>And another</p>"; checkResult(wtsXml, input1, "Chapter 1", "This is text.", "So is this.", "One without end", "Another", "And another"); } public void testRegion() { final WordToSentenceProcessor<CoreLabel> wtsRegion = new WordToSentenceProcessor<>(WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX, WordToSentenceProcessor.DEFAULT_BOUNDARY_FOLLOWERS_REGEX, WordToSentenceProcessor.DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD, Generics.newHashSet(Collections.singletonList("p")), "chapter|preface", WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER, null, null, false, false); String input1 = "<title>Chris rules!</title><preface><p>Para one</p><p>Para two</p></preface>" + "<chapter><p>Text we like. Two sentences \n\n in it.</p></chapter><coda>Some more text here</coda>"; checkResult(wtsRegion, input1, "Para one", "Para two", "Text we like.", "Two sentences in it."); } public void testBlankLines() { final WordToSentenceProcessor<CoreLabel> wtsLines = new WordToSentenceProcessor<>(Generics.newHashSet(WordToSentenceProcessor.DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD)); String input1 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies."; checkResult(wtsLines, input1, "Depending on the options,", "this could be all sorts of things,", "", "as I like chocolate. And cookies."); String input2 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.\n"; checkResult(wtsLines, input2, "Depending on the options,", "this could be all sorts of things,", "", "as I like chocolate. And cookies."); String input3 = "Depending on the options,\nthis could be all sorts of things,\n\n as I like chocolate. And cookies.\n\n"; checkResult(wtsLines, input3, "Depending on the options,", "this could be all sorts of things,", "", "as I like chocolate. And cookies.", ""); } public void testExclamationPoint() { Annotation annotation = new Annotation("Foo!!"); ptb.annotate(annotation); List list = annotation.get(CoreAnnotations.TokensAnnotation.class); assertEquals("Wrong double bang", "[Foo, !!]", list.toString()); } public void testChinese() { checkResult(cwts, wsNL,"巴拉特 说 : 「 我们 未 再 获得 任何 结果 。 」 < 金融时报 ? > 《 金融时报 》 周三", "巴拉特 说 : 「 我们 未 再 获得 任何 结果 。 」", "< 金融时报 ? >", "《 金融时报 》 周三"); } }