package edu.stanford.nlp.process;
import junit.framework.TestCase;
import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
/**
 * Tests {@link WhitespaceTokenizer}: input text is split on whitespace only,
 * with newlines either dropped or emitted as their own tokens depending on
 * the {@code eolIsSignificant} flag passed to the factory.
 */
public class WhitespaceTokenizerTest extends TestCase {

  // Fixture data is immutable across tests, so it is declared static:
  // JUnit instantiates this class once per test method, and instance fields
  // would be rebuilt each time for no benefit.

  /** Raw input strings fed to the tokenizer. */
  public static final String[] TEST =
      { "This is a test . \n This is a second line .",
        "A \n B \n \n C",
        "A. B" };

  /** Expected tokens when newlines are NOT significant (they are discarded). */
  public static final String[][] RESULTS_NO_EOL =
      { {"This", "is", "a", "test", ".",
         "This", "is", "a", "second", "line", "."},
        {"A", "B", "C"},
        {"A.", "B"} };

  /** Expected tokens when newlines ARE significant (each "\n" is a token). */
  public static final String[][] RESULTS_EOL =
      { {"This", "is", "a", "test", ".", "\n",
         "This", "is", "a", "second", "line", "."},
        {"A", "\n", "B", "\n", "\n", "C"},
        {"A.", "B"} };

  /**
   * Tokenizes each test string with a tokenizer built by {@code factory} and
   * asserts that the produced token words exactly match the expected results,
   * both in count and in order.
   *
   * @param factory        produces a fresh tokenizer per input string
   * @param testStrings    inputs, parallel to {@code resultsStrings}
   * @param resultsStrings expected token words for each input
   */
  public void runTest(TokenizerFactory<? extends HasWord> factory,
                      String[] testStrings, String[][] resultsStrings) {
    for (int i = 0; i < testStrings.length; ++i) {
      Tokenizer<? extends HasWord> tokenizer =
          factory.getTokenizer(new StringReader(testStrings[i]));
      List<? extends HasWord> tokens = tokenizer.tokenize();
      // Check the size first so a length mismatch fails clearly instead of
      // via an index-out-of-bounds in the element loop.
      assertEquals(resultsStrings[i].length, tokens.size());
      for (int j = 0; j < resultsStrings[i].length; ++j) {
        assertEquals(resultsStrings[i][j], tokens.get(j).word());
      }
    }
  }

  /** Exercises the Word-token factory with EOLs both ignored and kept. */
  public void testWordTokenizer() {
    runTest(WhitespaceTokenizer.factory(false), TEST, RESULTS_NO_EOL);
    runTest(WhitespaceTokenizer.factory(true), TEST, RESULTS_EOL);
  }

  /** Same coverage as {@link #testWordTokenizer()} but producing CoreLabels. */
  public void testCLTokenizer() {
    LexedTokenFactory<CoreLabel> factory = new CoreLabelTokenFactory();
    runTest(new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>
                (factory, false),
            TEST, RESULTS_NO_EOL);
    runTest(new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>
                (factory, true),
            TEST, RESULTS_EOL);
  }
}