/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2015 Aaron Madlon-Kay

 Home page: http://www.omegat.org/
 Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import static org.junit.Assert.assertEquals;

import org.apache.commons.lang.StringUtils;
import org.junit.Test;

import org.omegat.tokenizer.ITokenizer.StemmingMode;
import org.omegat.util.Token;
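
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

// JDK imports used by the illustrative sketch tests below.
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Tests for the {@link ITokenizer} implementations, comparing verbatim and
 * word tokenization in the various {@link StemmingMode}s against known-good
 * token lists.
 */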
public class TokenizerTest {
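    /**
     * Basic sanity checks for the English tokenizer: verbatim tokenization
     * must round-trip the original string, and each {@link StemmingMode} must
     * produce its known-good word list. Note that the stemming modes yield
     * both the stem and the surface form (e.g. "jump" and "jumped").
     */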
    @Test
    public void testEnglish() {
        ITokenizer tok = new LuceneEnglishTokenizer();
        String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" dog.";
        assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
                "jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "dog", "." },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "the", "quick", "brown", "x0", "jump", "jumped", "over", "1",
                "lazi", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "quick", "brown", "jump", "jumped", "over", "lazi", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }

    /**
     * LuceneJapaneseTokenizer includes two customizations that warrant testing:
     * <ol>
     * <li>Special removal of tags (e.g. &lt;x0/&gt;) when stemming
     * <li>Re-joining of tags when doing verbatim or non-stemming tokenization
     * </ol>
     */
    @Test
    public void testJapanese() {
        ITokenizer tok = new LuceneJapaneseTokenizer();
        String orig = "\u6211\u3005\u306E\u3059\u3079\u3066\u306F\u540C\u3058\uFF11\u500B\u306E\u60D1"
                + "\u661F\uFF08\u82F1\uFF1A\u300Ca planet\u300D\uFF09\u306B\u4F4F\u307F\u3001\u6211"
                + "\u3005\u306E\u3059\u3079\u3066\u306F\u305D\u306E\u751F\u7269\u570F\u306E1.5\u90E8"
                + "\u3067\u3042\u308B<x0/>\u3002";
        assertVerbatim(new String[] { "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u540C\u3058",
                "\uFF11", "\u500B", "\u306E", "\u60D1\u661F", "\uFF08", "\u82F1", "\uFF1A", "\u300C", "a",
                " ", "planet", "\u300D", "\uFF09", "\u306B", "\u4F4F\u307F", "\u3001", "\u6211\u3005", "\u306E",
                "\u3059\u3079\u3066", "\u306F", "\u305D\u306E", "\u751F\u7269", "\u570F", "\u306E", "1", ".", "5",
                "\u90E8", "\u3067", "\u3042\u308B", "<x0/>", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u540C\u3058",
                "\u500B", "\u306E", "\u60D1\u661F", "\uFF08", "\u82F1", "\uFF1A", "\u300C", "a", "planet",
                "\u300D", "\uFF09", "\u306B", "\u4F4F\u307F", "\u3001", "\u6211\u3005", "\u306E",
                "\u3059\u3079\u3066", "\u306F", "\u305D\u306E", "\u751F\u7269", "\u570F", "\u306E", ".",
                "\u90E8", "\u3067", "\u3042\u308B", "\u3002" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u540C\u3058",
                "1", "\uFF11", "\u500B", "\u306E", "\u60D1\u661F", "\u82F1", "a", "planet", "\u306B", "\u4F4F\u3080",
                "\u4F4F\u307F", "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u305D\u306E",
                "\u751F\u7269", "\u570F", "\u306E", "1", "5", "\u90E8", "\u3060", "\u3067", "\u3042\u308B" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "\u6211\u3005", "\u3059\u3079\u3066", "\u540C\u3058", "\u500B",
                "\u60D1\u661F", "\u82F1", "a", "planet", "\u4F4F\u3080", "\u4F4F\u307F", "\u6211\u3005",
                "\u3059\u3079\u3066", "\u751F\u7269", "\u570F", "\u90E8" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));

        // Check that TagJoiningFilter keeps well-formed tags as single tokens
        // (malformed pseudo-tags like "<foo bar 123" are left split).
        orig = "<x0/>\u3042</x0>\u300C<x1/>\u300D<x2/>\u3002<foo bar 123";
        assertVerbatim(new String[] { "<x0/>", "\u3042", "</x0>", "\u300C", "<x1/>", "\u300D", "<x2/>", "\u3002",
                "<", "foo", " ", "bar", " ", "123" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        // Check that tags are removed entirely when stemming.
        assertResult(new String[] { "\u3042", "foo", "bar" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
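
    /**
     * Illustrative sketch of the tag-recognition idea exercised above: an
     * OmegaT-style tag such as &lt;x0/&gt; or &lt;/x0&gt; can be matched as a
     * single unit, which is what lets the tokenizer drop tags when stemming
     * and re-join their fragments for verbatim output. The pattern below is
     * an assumed stand-in, not the tokenizer's actual definition.
     */
    @Test
    public void testTagPatternSketch() {
        // Assumed illustrative pattern only.
        Pattern tag = Pattern.compile("</?[a-z]+\\d+/?>");
        assertTrue(tag.matcher("<x0/>").matches());
        assertTrue(tag.matcher("</x0>").matches());
        // Malformed pseudo-tags like the "<foo bar 123" above should not match.
        assertFalse(tag.matcher("<foo bar 123").matches());
    }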

    /**
     * Turkish warrants special testing because of the letter \u0130 (LATIN
     * CAPITAL LETTER I WITH DOT ABOVE): the result (both content and length)
     * of performing <code>"\u0130".toLowerCase()</code> depends on the
     * default locale, and in the past improper lowercasing during
     * tokenization led to out-of-bounds exceptions.
     * <p>
     * Text from https://tr.wikipedia.org/wiki/T%C3%BCrk%C3%A7e
     */
    @Test
    public void testTurkish() {
        ITokenizer tok = new LuceneTurkishTokenizer();
        String orig = "\u201C\u0130stanbul a\u011Fz\u0131\u201D, T\u00FCrkiye T\u00FCrk\u00E7esi"
                + "yaz\u0131 dilinin kayna\u011F\u0131 olarak kabul edilir; yaz\u0131 dili bu"
                + "a\u011F\u0131z temelinde olu\u015Fmu\u015Ftur.";
        assertVerbatim(new String[] { "\u201C", "\u0130stanbul", " ", "a\u011Fz\u0131", "\u201D",
                ",", " ", "T\u00FCrkiye", " ", "T\u00FCrk\u00E7esiyaz\u0131", " ", "dilinin", " ",
                "kayna\u011F\u0131", " ", "olarak", " ", "kabul", " ", "edilir", ";", " ", "yaz\u0131",
                " ", "dili", " ", "bua\u011F\u0131z", " ", "temelinde", " ", "olu\u015Fmu\u015Ftur", "." },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u0130stanbul", "a\u011Fz\u0131", "T\u00FCrkiye",
                "T\u00FCrk\u00E7esiyaz\u0131", "dilinin", "kayna\u011F\u0131", "olarak",
                "kabul", "edilir", "yaz\u0131", "dili", "bua\u011F\u0131z", "temelinde",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "istanbul", "a\u011Fz\u0131", "t\u00FCrki", "T\u00FCrkiye",
                "t\u00FCrk\u00E7esiyaz", "T\u00FCrk\u00E7esiyaz\u0131", "dil", "dilinin",
                "kaynak", "kayna\u011F\u0131", "olarak", "kabul", "edilir", "yaz", "yaz\u0131",
                "dil", "dili", "buak", "bua\u011F\u0131z", "temel", "temelinde", "olu\u015F",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "istanbul", "a\u011Fz\u0131", "t\u00FCrki", "T\u00FCrkiye",
                "t\u00FCrk\u00E7esiyaz", "T\u00FCrk\u00E7esiyaz\u0131", "dil", "dilinin",
                "kaynak", "kayna\u011F\u0131", "kabul", "edilir", "yaz", "yaz\u0131",
                "dil", "dili", "buak", "bua\u011F\u0131z", "temel", "temelinde", "olu\u015F",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
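
    /**
     * Sketch of the locale-sensitive lowercasing described above: lowercasing
     * \u0130 changes both the content and the length of the string depending
     * on the locale, which is the behavior that historically caused
     * out-of-bounds errors.
     */
    @Test
    public void testDottedCapitalILowercasingSketch() {
        // Turkish locale: U+0130 lowercases to a plain "i" (length 1).
        assertEquals("i", "\u0130".toLowerCase(Locale.forLanguageTag("tr")));
        // Root locale: U+0130 lowercases to "i" plus U+0307 COMBINING DOT
        // ABOVE (length 2).
        assertEquals("i\u0307", "\u0130".toLowerCase(Locale.ROOT));
    }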

    /**
     * Chinese text gives BreakIterator very few boundaries to break on, so
     * LuceneSmartChineseTokenizer tokenizes by code point when doing verbatim
     * tokenization.
     * <p>
     * Text from https://zh.wikipedia.org/wiki/%E6%B1%89%E8%AF%AD
     */
    @Test
    public void testChinese() {
        ITokenizer tok = new LuceneSmartChineseTokenizer();
        String orig = "\u6F22\u8A9E\u7684\u6587\u5B57\u7CFB\u7D71\u2014\u2014\u6F22\u5B57\u662F"
                + "\u4E00\u7A2E\u610F\u97F3\u8A9E\u8A00\uFF0C\u8868\u610F\u7684\u540C\u6642\u4E5F"
                + "\u5177\u4E00\u5B9A\u7684\u8868\u97F3\u529F\u80FD\u3002";
        assertVerbatim(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587", "\u5B57", "\u7CFB",
                "\u7D71", "\u2014", "\u2014", "\u6F22", "\u5B57", "\u662F", "\u4E00", "\u7A2E",
                "\u610F", "\u97F3", "\u8A9E", "\u8A00", "\uFF0C", "\u8868", "\u610F", "\u7684",
                "\u540C", "\u6642", "\u4E5F", "\u5177", "\u4E00", "\u5B9A", "\u7684", "\u8868",
                "\u97F3", "\u529F", "\u80FD", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587\u5B57", "\u7CFB",
                "\u7D71", ",", ",", "\u6F22", "\u5B57", "\u662F", "\u4E00", "\u7A2E",
                "\u610F", "\u97F3", "\u8A9E", "\u8A00", ",", "\u8868\u610F", "\u7684",
                "\u540C", "\u6642", "\u4E5F", "\u5177", "\u4E00\u5B9A", "\u7684", "\u8868\u97F3",
                "\u529F\u80FD", "," },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587\u5B57", "\u7CFB",
                "\u7D71", ",", "\u2014", ",", "\u2014", "\u6F22", "\u5B57", "\u662F", "\u4E00",
                "\u7A2E", "\u610F", "\u97F3", "\u8A9E", "\u8A00", ",", "\uFF0C", "\u8868\u610F",
                "\u7684", "\u540C", "\u6642", "\u4E5F", "\u5177", "\u4E00\u5B9A", "\u7684",
                "\u8868\u97F3", "\u529F\u80FD", ",", "\u3002" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587\u5B57", "\u7CFB",
                "\u7D71", "\u6F22", "\u5B57", "\u662F", "\u4E00", "\u7A2E", "\u610F", "\u97F3",
                "\u8A9E", "\u8A00", "\u8868\u610F", "\u7684", "\u540C", "\u6642", "\u4E5F",
                "\u5177", "\u4E00\u5B9A", "\u7684", "\u8868\u97F3", "\u529F\u80FD" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
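
    /**
     * Sketch of the per-code-point splitting used for the verbatim case
     * above, using only the JDK: iterating by code point rather than by char
     * keeps supplementary characters (surrogate pairs) intact.
     */
    @Test
    public void testCodePointSplitSketch() {
        String text = "\u6F22\u8A9E"; // first two characters of the text above
        List<String> codePoints = new ArrayList<String>();
        for (int i = 0; i < text.length();) {
            int cp = text.codePointAt(i);
            codePoints.add(new String(Character.toChars(cp)));
            i += Character.charCount(cp);
        }
        assertEquals(Arrays.asList("\u6F22", "\u8A9E"), codePoints);
    }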

    /**
     * The behavior of the Lucene GermanAnalyzer was better for our purposes
     * in Lucene 3.0, so we implement a custom analyzer that recreates that
     * behavior.
     *
     * @see <a href=
     *      "https://groups.yahoo.com/neo/groups/OmegaT/conversations/messages/28395">
     *      User group discussion</a>
     */
    @Test
    public void testGerman() {
        ITokenizer tok = new LuceneGermanTokenizer();
        // GLOSSARY mode yields both the analyzed form (umlaut folded and
        // suffix stemmed: "prasentier") and the original surface form.
        assertResult(new String[] { "prasentier", "pr\u00e4sentierte" },
                tok.tokenizeWordsToStrings("pr\u00e4sentierte", StemmingMode.GLOSSARY));
        assertResult(new String[] { "prasentier", "pr\u00e4sentieren" },
                tok.tokenizeWordsToStrings("pr\u00e4sentieren", StemmingMode.GLOSSARY));
    }

    /**
     * The DefaultTokenizer has a completely different implementation from the
     * Lucene-based tokenizers (the latter were originally an external plugin,
     * for licensing reasons): it is based on Java's BreakIterator. It warrants
     * testing so that it doesn't get overlooked when changes are made to the
     * other tokenizers.
     */
    @Test
    public void testDefault() {
        ITokenizer tok = new DefaultTokenizer();
        String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" \u0130stanbul. "
                + "\u65E5\u672C\u8A9E\u3042\u3044\u3046\u3048\u304A\u3002";
        assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
                "jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "\u0130stanbul", ".",
                " ", "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        // DefaultTokenizer does no stemming, so all three stemming modes
        // produce the same word list.
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
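
    /**
     * Sketch of the BreakIterator mechanism that the DefaultTokenizer builds
     * on: the JDK's word instance yields both word and non-word spans, so
     * word-only tokenization has to filter out the latter.
     */
    @Test
    public void testBreakIteratorSketch() {
        BreakIterator it = BreakIterator.getWordInstance(Locale.ENGLISH);
        String text = "The quick fox.";
        it.setText(text);
        List<String> words = new ArrayList<String>();
        int start = it.first();
        for (int end = it.next(); end != BreakIterator.DONE; start = end, end = it.next()) {
            String span = text.substring(start, end);
            // Keep only spans that start with a letter or digit.
            if (Character.isLetterOrDigit(span.codePointAt(0))) {
                words.add(span);
            }
        }
        assertEquals(Arrays.asList("The", "quick", "fox"), words);
    }

    /**
     * Asserts that the String[] and {@link Token}[] forms of verbatim
     * tokenization both reproduce the expected tokens, concatenate to the
     * same string, and map back onto the original input.
     */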
    private void assertVerbatim(String[] expected, String[] test, Token[] testTok, String origString) {
        assertResult(expected, test);
        assertEquals(StringUtils.join(expected), StringUtils.join(test));
        assertEquals(expected.length, testTok.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], testTok[i].getTextFromString(origString));
        }
    }
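
    /**
     * Asserts that the actual tokens exactly match the expected tokens in
     * content and order.
     */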
    private void assertResult(String[] expected, String[] test) {
        // Uncomment to dump the actual tokens, e.g. when updating the
        // expected values above:
        // for (String s : test) {
        //     System.out.print('"');
        //     System.out.print(s.replace("\"", "\\\""));
        //     System.out.print("\", ");
        // }
        // System.out.print('\n');
        assertEquals(expected.length, test.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], test[i]);
        }
    }
}