/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.nlp.tokenizer;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.junit.Assert.*;
/**
*
* @author Haifeng
*/
public class SimpleTokenizerTest {
public SimpleTokenizerTest() {
}
@BeforeClass
public static void setUpClass() throws Exception {
}
@AfterClass
public static void tearDownClass() throws Exception {
}
@Before
public void setUp() {
}
@After
public void tearDown() {
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenize() {
System.out.println("tokenize");
String text = "Good muffins cost $3.88\nin New York. Please buy "
+ "me\ntwo of them.\n\nYou cannot eat them. I gonna eat them. "
+ "Thanks. Of course, I won't. ";
String[] expResult = {"Good", "muffins", "cost", "$", "3.88", "in",
"New", "York.", "Please", "buy", "me", "two", "of", "them", ".",
"You", "can", "not", "eat", "them.", "I", "gon", "na", "eat",
"them.", "Thanks.", "Of", "course", ",", "I", "will", "not", "."};
SimpleTokenizer instance = new SimpleTokenizer(true);
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testSplitContraction() {
System.out.println("tokenize contraction");
String text = "Here are some examples of contractions: 'tis, "
+ "'twas, ain't, aren't, Can't, could've, couldn't, didn't, doesn't, "
+ "don't, hasn't, he'd, he'll, he's, how'd, how'll, how's, i'd, i'll, i'm, "
+ "i've, isn't, it's, might've, mightn't, must've, mustn't, Shan't, "
+ "she'd, she'll, she's, should've, shouldn't, that'll, that's, "
+ "there's, they'd, they'll, they're, they've, wasn't, we'd, we'll, "
+ "we're, weren't, what'd, what's, when'd, when'll, when's, "
+ "where'd, where'll, where's, who'd, who'll, who's, why'd, why'll, "
+ "why's, Won't, would've, wouldn't, you'd, you'll, you're, you've";
String[] expResult = {"Here", "are", "some", "examples", "of",
"contractions", ":", "'t", "is", ",", "'t", "was", ",", "am",
"not", ",", "are", "not", ",", "Can", "not", ",", "could", "'ve",
",", "could", "not", ",", "did", "not", ",", "does", "not", ",", "do",
"not", ",", "has", "not", ",", "he", "'d", ",", "he", "'ll", ",",
"he", "'s", ",", "how", "'d", ",", "how", "'ll", ",", "how", "'s",
",", "i", "'d", ",", "i", "'ll", ",", "i", "'m", ",", "i", "'ve", ",",
"is", "not", ",", "it", "'s", ",", "might", "'ve", ",", "might",
"not", ",", "must", "'ve", ",", "must", "not", ",", "Shall",
"not", ",", "she", "'d", ",", "she", "'ll", ",", "she", "'s", ",",
"should", "'ve", ",", "should", "not", ",", "that", "'ll", ",",
"that", "'s", ",", "there", "'s", ",", "they", "'d", ",", "they", "'ll",
",", "they", "'re", ",", "they", "'ve", ",", "was", "not", ",", "we",
"'d", ",", "we", "'ll", ",", "we", "'re", ",", "were", "not", ",",
"what", "'d", ",", "what", "'s", ",", "when",
"'d", ",", "when", "'ll", ",", "when", "'s", ",", "where", "'d", ",",
"where", "'ll", ",", "where", "'s", ",", "who", "'d", ",", "who",
"'ll", ",", "who", "'s", ",", "why", "'d", ",", "why", "'ll", ",",
"why", "'s", ",", "Will", "not", ",", "would", "'ve", ",", "would",
"not", ",", "you", "'d", ",", "you", "'ll", ",", "you", "'re", ",",
"you", "'ve"};
SimpleTokenizer instance = new SimpleTokenizer(true);
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testSplitAbbreviation() {
System.out.println("tokenize abbreviation");
String text = "Here are some examples of abbreviations: A.B., abbr., "
+ "acad., A.D., alt., A.M., B.C., etc.";
String[] expResult = {"Here", "are", "some", "examples", "of",
"abbreviations", ":", "A.B.", ",", "abbr.", ",", "acad.",
",", "A.D.", ",", "alt.", ",", "A.M.", ",", "B.C.", ",", "etc.", "."};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenizeTis() {
System.out.println("tokenize tis");
String text = "'tis, 'tisn't, and 'twas were common in early modern English texts.";
String[] expResult = {"'t", "is", ",", "'t", "is", "not", ",", "and",
"'t", "was", "were", "common", "in", "early", "modern", "English",
"texts", "."};
SimpleTokenizer instance = new SimpleTokenizer(true);
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenizeHyphen() {
System.out.println("tokenize hyphen");
String text = "On a noncash basis for the quarter, the bank reported a "
+ "loss of $7.3 billion because of a $10.4 billion write-down "
+ "in the value of its credit card unit, attributed to federal "
+ "regulations that limit debit fees and other charges.";
String[] expResult = {"On", "a", "noncash", "basis", "for", "the",
"quarter", ",", "the", "bank", "reported", "a", "loss", "of", "$",
"7.3", "billion", "because", "of", "a", "$", "10.4", "billion",
"write-down", "in", "the", "value", "of", "its", "credit", "card",
"unit", ",", "attributed", "to", "federal", "regulations", "that",
"limit", "debit", "fees", "and", "other", "charges", "."};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenizeSingleQuote() {
System.out.println("tokenize single quote");
String text = "String literals can be enclosed in matching single "
+ "quotes ('). But it's also appearing in contractions such as can't.";
String[] expResult = {"String", "literals", "can", "be", "enclosed", "in",
"matching", "single", "quotes", "(", "'", ")", ".", "But", "it", "'s", "also",
"appearing", "in", "contractions", "such", "as", "can", "not", "."};
SimpleTokenizer instance = new SimpleTokenizer(true);
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenizeRomanNumeral() {
System.out.println("tokenize roman numeral");
String text = "S.. or S: means \"twice\" (as in \"twice a third\").";
String[] expResult = {"S..", "or", "S", ":", "means", "\"", "twice",
"\"", "(", "as", "in", "\"", "twice", "a", "third", "\"", ")", "."};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenizeMixedAlphanumWords() {
System.out.println("tokenize words with mixed numbers, letters, and punctuation");
String text = "3M, L-3, BB&T, AutoZone, O'Reilly, Harley-Davidson, CH2M, A-Mark, "
+ "Quad/Graphics, Bloomin' Brands, B/E Aerospace, J.Crew, E*Trade.";
// Note: would be very hard to get "Bloomin'" and "E*Trade" correct
String[] expResult = {"3M", ",", "L-3", ",", "BB&T", ",", "AutoZone", ",", "O'Reilly",
",", "Harley-Davidson", ",", "CH2M", ",", "A-Mark", ",", "Quad/Graphics", ",", "Bloomin",
"'", "Brands", ",", "B/E", "Aerospace", ",", "J.Crew", ",", "E", "*", "Trade", "."};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class SimpleTokenizer.
*/
@Test
public void testTokenizeDiacritizedWords() {
System.out.println("tokenize words with diacritized chars (both composite and combining)");
String text = "The naïve résumé of Raúl Ibáñez; re\u0301sume\u0301.";
String[] expResult = {"The", "naïve", "résumé", "of", "Raúl", "Ibáñez", ";", "re\u0301sume\u0301", "."};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class TreebankWordTokenizer.
*/
@Test
public void testTokenizeNonLatinChars() {
System.out.println("tokenize words containing non-Latin chars");
// See https://en.wikipedia.org/wiki/Zero-width_non-joiner
String text = "میخواهم עֲוֹנֹת Auflage";
String[] expResult = {"میخواهم", "עֲוֹנֹת", "Auflage"};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class TreebankWordTokenizer.
*/
@Test
public void testTokenizeVariousSpaces() {
System.out.println("tokenize words separated by various kinds of space");
// No-break space and em-space
String text = "the\u00A0cat\u2003the_cat";
String[] expResult = {"the", "cat", "the_cat"};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
/**
* Test of split method, of class TreebankWordTokenizer.
*/
@Test
public void testTokenizeToC() {
System.out.println("tokenize 1.2 Interpretation.....................................................................................................................3");
// No-break space and em-space
String text = "1.2 Interpretation.....................................................................................................................3";
String[] expResult = {"1.2", "Interpretation", ".....................................................................................................................", "3"};
SimpleTokenizer instance = new SimpleTokenizer();
String[] result = instance.split(text);
assertEquals(expResult.length, result.length);
for (int i = 0; i < result.length; i++) {
assertEquals(expResult[i], result[i]);
}
}
}