// SimpleTokenizerTest.java example
//
// Explorer
// smile-master
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.nlp.tokenizer;

import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.junit.Assert.*;

/**
 * Unit tests for the {@code split} method of {@link SimpleTokenizer}.
 * <p>
 * Every test feeds a fixed input text to a tokenizer and compares the
 * returned token array with a hand-written expected array using
 * {@code assertArrayEquals}, which checks both the length and each element
 * and reports the first differing index on failure.
 * <p>
 * NOTE(review): some tests construct the tokenizer with {@code true}; judging
 * by their expected tokens this flag turns on contraction splitting - confirm
 * against {@code SimpleTokenizer}.
 *
 * @author Haifeng
 */
public class SimpleTokenizerTest {

    public SimpleTokenizerTest() {
    }

    /** No class-level fixture is required. */
    @BeforeClass
    public static void setUpClass() throws Exception {
    }

    /** No class-level cleanup is required. */
    @AfterClass
    public static void tearDownClass() throws Exception {
    }

    /** No per-test fixture is required. */
    @Before
    public void setUp() {
    }

    /** No per-test cleanup is required. */
    @After
    public void tearDown() {
    }

    /**
     * Splits a multi-line, multi-sentence text containing a currency amount
     * and the forms "cannot", "gonna" and "won't".
     */
    @Test
    public void testTokenize() {
        System.out.println("tokenize");
        String text = "Good muffins cost $3.88\nin New York.  Please buy "
                + "me\ntwo of them.\n\nYou cannot eat them. I gonna eat them. "
                + "Thanks. Of course, I won't. ";

        String[] expected = {"Good", "muffins", "cost", "$", "3.88", "in",
            "New", "York.", "Please", "buy", "me", "two", "of", "them", ".",
            "You", "can", "not", "eat", "them.", "I", "gon", "na", "eat",
            "them.", "Thanks.", "Of", "course", ",", "I", "will", "not", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer(true);
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a comma-separated list of English contractions and checks how
     * each one is broken into tokens.
     */
    @Test
    public void testSplitContraction() {
        System.out.println("tokenize contraction");
        String text = "Here are some examples of contractions: 'tis, "
                + "'twas, ain't, aren't, Can't, could've, couldn't, didn't, doesn't, "
                + "don't, hasn't, he'd, he'll, he's, how'd, how'll, how's, i'd, i'll, i'm, "
                + "i've, isn't, it's, might've, mightn't, must've, mustn't, Shan't, "
                + "she'd, she'll, she's, should've, shouldn't, that'll, that's, "
                + "there's, they'd, they'll, they're, they've, wasn't, we'd, we'll, "
                + "we're, weren't, what'd, what's, when'd, when'll, when's, "
                + "where'd, where'll, where's, who'd, who'll, who's, why'd, why'll, "
                + "why's, Won't, would've, wouldn't, you'd, you'll, you're, you've";

        String[] expected = {"Here", "are", "some", "examples", "of",
            "contractions", ":", "'t", "is", ",", "'t", "was", ",", "am",
            "not", ",", "are", "not", ",", "Can", "not", ",", "could", "'ve",
            ",", "could", "not", ",", "did", "not", ",", "does", "not", ",", "do",
            "not", ",", "has", "not", ",", "he", "'d", ",", "he", "'ll", ",",
            "he", "'s", ",", "how", "'d", ",", "how", "'ll", ",", "how", "'s",
            ",", "i", "'d", ",", "i", "'ll", ",", "i", "'m", ",", "i", "'ve", ",",
            "is", "not", ",", "it", "'s", ",", "might", "'ve", ",", "might",
            "not", ",", "must", "'ve", ",", "must", "not", ",", "Shall",
            "not", ",", "she", "'d", ",", "she", "'ll", ",", "she", "'s", ",",
            "should", "'ve", ",", "should", "not", ",", "that", "'ll", ",",
            "that", "'s", ",", "there", "'s", ",", "they", "'d", ",", "they", "'ll",
            ",", "they", "'re", ",", "they", "'ve", ",", "was", "not", ",", "we",
            "'d", ",", "we", "'ll", ",", "we", "'re", ",", "were", "not", ",",
            "what", "'d", ",", "what", "'s", ",", "when",
            "'d", ",", "when", "'ll", ",", "when", "'s", ",", "where", "'d", ",",
            "where", "'ll", ",", "where", "'s", ",", "who", "'d", ",", "who",
            "'ll", ",", "who", "'s", ",", "why", "'d", ",", "why", "'ll", ",",
            "why", "'s", ",", "Will", "not", ",", "would", "'ve", ",", "would",
            "not", ",", "you", "'d", ",", "you", "'ll", ",", "you", "'re", ",",
            "you", "'ve"};

        SimpleTokenizer tokenizer = new SimpleTokenizer(true);
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a list of dotted abbreviations and expects each abbreviation to
     * keep its periods, with only the sentence-final period split off.
     */
    @Test
    public void testSplitAbbreviation() {
        System.out.println("tokenize abbreviation");
        String text = "Here are some examples of abbreviations: A.B., abbr., "
                + "acad., A.D., alt., A.M., B.C., etc.";

        String[] expected = {"Here", "are", "some", "examples", "of",
            "abbreviations", ":", "A.B.", ",", "abbr.", ",", "acad.",
            ",", "A.D.", ",", "alt.", ",", "A.M.", ",", "B.C.", ",", "etc.", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a sentence containing the archaic forms 'tis, 'tisn't and 'twas.
     */
    @Test
    public void testTokenizeTis() {
        System.out.println("tokenize tis");
        String text = "'tis, 'tisn't, and 'twas were common in early modern English texts.";
        String[] expected = {"'t", "is", ",", "'t", "is", "not", ",", "and",
            "'t", "was", "were", "common", "in", "early", "modern", "English",
            "texts", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer(true);
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a sentence containing the hyphenated word "write-down" and
     * expects it to remain a single token.
     */
    @Test
    public void testTokenizeHyphen() {
        System.out.println("tokenize hyphen");
        String text = "On a noncash basis for the quarter, the bank reported a "
                + "loss of $7.3 billion because of a $10.4 billion write-down "
                + "in the value of its credit card unit, attributed to federal "
                + "regulations that limit debit fees and other charges.";

        String[] expected = {"On", "a", "noncash", "basis", "for", "the",
            "quarter", ",", "the", "bank", "reported", "a", "loss", "of", "$",
            "7.3", "billion", "because", "of", "a", "$", "10.4", "billion",
            "write-down", "in", "the", "value", "of", "its", "credit", "card",
            "unit", ",", "attributed", "to", "federal", "regulations", "that",
            "limit", "debit", "fees", "and", "other", "charges", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits text in which a single quote appears both as a stand-alone
     * character inside parentheses and inside the contractions "it's" and
     * "can't".
     */
    @Test
    public void testTokenizeSingleQuote() {
        System.out.println("tokenize single quote");
        String text = "String literals can be enclosed in matching single "
                + "quotes ('). But it's also appearing in contractions such as can't.";

        String[] expected = {"String", "literals", "can", "be", "enclosed", "in",
            "matching", "single", "quotes", "(", "'", ")", ".", "But", "it", "'s", "also",
            "appearing", "in", "contractions", "such", "as", "can", "not", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer(true);
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a sentence containing "S.." and "S:" together with double
     * quotes and parentheses; "S.." is expected to stay one token while
     * "S:" is split into "S" and ":".
     */
    @Test
    public void testTokenizeRomanNumeral() {
        System.out.println("tokenize roman numeral");
        String text = "S.. or S: means \"twice\" (as in \"twice a third\").";
        String[] expected = {"S..", "or", "S", ":", "means", "\"", "twice",
            "\"", "(", "as", "in", "\"", "twice", "a", "third", "\"", ")", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a list of names that mix letters, digits and punctuation
     * (hyphen, ampersand, apostrophe, slash, period, asterisk).
     */
    @Test
    public void testTokenizeMixedAlphanumWords() {
        System.out.println("tokenize words with mixed numbers, letters, and punctuation");
        String text = "3M, L-3, BB&T, AutoZone, O'Reilly, Harley-Davidson, CH2M, A-Mark, "
                + "Quad/Graphics, Bloomin' Brands, B/E Aerospace, J.Crew, E*Trade.";

        // Note: would be very hard to get "Bloomin'" and "E*Trade" correct
        String[] expected = {"3M", ",", "L-3", ",", "BB&T", ",", "AutoZone", ",", "O'Reilly",
                ",", "Harley-Davidson", ",", "CH2M", ",", "A-Mark", ",", "Quad/Graphics", ",", "Bloomin",
                "'", "Brands", ",", "B/E", "Aerospace", ",", "J.Crew", ",", "E", "*", "Trade", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits words with diacritics, written both as precomposed characters
     * and as a base letter followed by the combining acute accent U+0301.
     */
    @Test
    public void testTokenizeDiacritizedWords() {
        System.out.println("tokenize words with diacritized chars (both composite and combining)");
        String text = "The naïve résumé of Raúl Ibáñez; re\u0301sume\u0301.";
        String[] expected = {"The", "naïve", "résumé", "of", "Raúl", "Ibáñez", ";", "re\u0301sume\u0301", "."};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits space-separated words in non-Latin scripts (plus one Latin
     * word) that contain a zero-width non-joiner; each word is expected to
     * come back as one unchanged token.
     */
    @Test
    public void testTokenizeNonLatinChars() {
        System.out.println("tokenize words containing non-Latin chars");
        // See https://en.wikipedia.org/wiki/Zero-width_non-joiner
        String text = "می‌خواهم   עֲו‌ֹנֹת   Auf‌lage";
        String[] expected = {"می‌خواهم", "עֲו‌ֹנֹת", "Auf‌lage"};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits words separated by a no-break space (U+00A0) and an em space
     * (U+2003); the underscore in "the_cat" is expected not to split the word.
     */
    @Test
    public void testTokenizeVariousSpaces() {
        System.out.println("tokenize words separated by various kinds of space");
        // No-break space and em-space
        String text = "the\u00A0cat\u2003the_cat";
        String[] expected = {"the", "cat", "the_cat"};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }

    /**
     * Splits a table-of-contents style line in which a long run of leader
     * dots sits directly between a heading and its page number; the dot run
     * is expected to come back as a single token.
     */
    @Test
    public void testTokenizeToC() {
        System.out.println("tokenize 1.2 Interpretation.....................................................................................................................3");
        // Section number, heading, leader dots and page number with no spaces around the dots
        String text = "1.2 Interpretation.....................................................................................................................3";
        String[] expected = {"1.2", "Interpretation", ".....................................................................................................................", "3"};

        SimpleTokenizer tokenizer = new SimpleTokenizer();
        String[] tokens = tokenizer.split(text);

        assertArrayEquals(expected, tokens);
    }
}