/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.tokenizers; import org.junit.Before; import org.junit.Test; import org.languagetool.TestTools; import org.languagetool.language.English; public class EnglishSRXSentenceTokenizerTest { // accept \n as paragraph: private final SentenceTokenizer stokenizer = new SRXSentenceTokenizer(new English()); // accept only \n\n as paragraph: private final SentenceTokenizer stokenizer2 = new SRXSentenceTokenizer(new English()); @Before public void setUp() { stokenizer.setSingleLineBreaksMarksParagraph(true); stokenizer2.setSingleLineBreaksMarksParagraph(false); } // NOTE: sentences here need to end with a space character so they // have correct whitespace when appended: @Test public void testTokenize() { // incomplete sentences, need to work for on-thy-fly checking of texts: testSplit("Here's a"); testSplit("Here's a sentence. ", "And here's one that's not comp"); testSplit("This is a sentence. "); testSplit("This is a sentence. ", "And this is another one."); testSplit("This is a sentence.", "Isn't it?", "Yes, it is."); testSplit("This is e.g. Mr. Smith, who talks slowly...", "But this is another sentence."); testSplit("Chanel no. 5 is blah."); testSplit("Mrs. Jones gave Peter $4.5, to buy Chanel No 5.", "He never came back."); testSplit("On p. 6 there's nothing. ", "Another sentence."); testSplit("Leave me alone!, he yelled. ", "Another sentence."); testSplit("\"Leave me alone!\", he yelled."); testSplit("'Leave me alone!', he yelled. ", "Another sentence."); testSplit("'Leave me alone!,' he yelled. ", "Another sentence."); testSplit("This works on the phrase level, i.e. not on the word level."); testSplit("Let's meet at 5 p.m. in the main street."); testSplit("James comes from the U.K. where he worked as a programmer."); testSplit("Don't split strings like U.S.A. please."); testSplit("Don't split strings like U. S. A. either."); testSplit("Don't split... ", "Well you know. ", "Here comes more text."); testSplit("Don't split... well you know. ", "Here comes more text."); testSplit("The \".\" should not be a delimiter in quotes."); testSplit("\"Here he comes!\" she said."); testSplit("\"Here he comes!\", she said."); testSplit("\"Here he comes.\" ", "But this is another sentence."); testSplit("\"Here he comes!\". ", "That's what he said."); testSplit("The sentence ends here. ", "(Another sentence.)"); testSplit("The sentence (...) ends here."); testSplit("The sentence [...] ends here."); testSplit("The sentence ends here (...). ", "Another sentence."); // previously known failed but not now :) testSplit("He won't. ", "Really."); testSplit("He will not. ", "Really."); testSplit("He won't go. ", "Really."); testSplit("He won't say no.", "Not really."); testSplit("He won't say No.", "Not really."); testSplit("He won't say no. 5 is better. ", "Not really."); testSplit("He won't say No. 5 is better. ", "Not really."); testSplit("They met at 5 p.m. on Thursday."); testSplit("They met at 5 p.m. ", "It was Thursday."); testSplit("This is it: a test."); testSplit("12) Make sure that the lamp is on. ", "12) Make sure that the lamp is on. "); testSplit("He also offers a conversion table (see Cohen, 1988, p. 123). "); // one/two returns = paragraph = new sentence: TestTools.testSplit(new String[] { "He won't\n\n", "Really." }, stokenizer2); TestTools.testSplit(new String[] { "He won't\n", "Really." }, stokenizer); TestTools.testSplit(new String[] { "He won't\n\n", "Really." }, stokenizer2); TestTools.testSplit(new String[] { "He won't\nReally." }, stokenizer2); // Missing space after sentence end: testSplit("James is from the Ireland!", "He lives in Spain now."); // From the abbreviation list: testSplit("Jones Bros. have built a successful company."); // parentheses: testSplit("It (really!) works."); testSplit("It [really!] works."); testSplit("It works (really!). ", "No doubt."); testSplit("It works [really!]. ", "No doubt."); testSplit("It really(!) works well."); testSplit("It really[!] works well."); testSplit("This is a sentence.\u0002 ", "And this is another one."); // footnotes in LibOO/OOo look like this } private void testSplit(String... sentences) { TestTools.testSplit(sentences, stokenizer); } }