/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2015 Aaron Madlon-Kay

 Home page: http://www.omegat.org/
 Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import static org.junit.Assert.assertEquals;

import org.apache.commons.lang.StringUtils;
import org.junit.Test;

import org.omegat.tokenizer.ITokenizer.StemmingMode;
import org.omegat.util.Token;
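
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

// JDK imports used by the illustrative sketch tests below.
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Tests for the {@link ITokenizer} implementations, comparing verbatim and
 * word tokenization in the various {@link StemmingMode}s against known-good
 * token lists.
 */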
public class TokenizerTest {
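    /**
     * Basic sanity checks for the English tokenizer: verbatim tokenization
     * must round-trip the original string, and each {@link StemmingMode} must
     * produce its known-good word list. Note that the stemming modes yield
     * both the stem and the surface form (e.g. "jump" and "jumped").
     */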
    @Test
    public void testEnglish() {
        ITokenizer tok = new LuceneEnglishTokenizer();
        String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" dog.";
        assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
                "jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "dog", "." },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "the", "quick", "brown", "x0", "jump", "jumped", "over", "1",
                "lazi", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "quick", "brown", "jump", "jumped", "over", "lazi", "lazy", "dog" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }

    /**
     * LuceneJapaneseTokenizer includes two customizations that warrant testing:
     * <ol>
     * <li>Special removal of tags (e.g. &lt;x0/&gt;) when stemming
     * <li>Re-joining of tags when doing verbatim or non-stemming tokenization
     * </ol>
     */
    @Test
    public void testJapanese() {
        ITokenizer tok = new LuceneJapaneseTokenizer();
        String orig = "\u6211\u3005\u306E\u3059\u3079\u3066\u306F\u540C\u3058\uFF11\u500B\u306E\u60D1"
                + "\u661F\uFF08\u82F1\uFF1A\u300Ca planet\u300D\uFF09\u306B\u4F4F\u307F\u3001\u6211"
                + "\u3005\u306E\u3059\u3079\u3066\u306F\u305D\u306E\u751F\u7269\u570F\u306E1.5\u90E8"
                + "\u3067\u3042\u308B<x0/>\u3002";
        assertVerbatim(new String[] { "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u540C\u3058",
                "\uFF11", "\u500B", "\u306E", "\u60D1\u661F", "\uFF08", "\u82F1", "\uFF1A", "\u300C", "a",
                " ", "planet", "\u300D", "\uFF09", "\u306B", "\u4F4F\u307F", "\u3001", "\u6211\u3005", "\u306E",
                "\u3059\u3079\u3066", "\u306F", "\u305D\u306E", "\u751F\u7269", "\u570F", "\u306E", "1", ".", "5",
                "\u90E8", "\u3067", "\u3042\u308B", "<x0/>", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u540C\u3058",
                "\u500B", "\u306E", "\u60D1\u661F", "\uFF08", "\u82F1", "\uFF1A", "\u300C", "a", "planet",
                "\u300D", "\uFF09", "\u306B", "\u4F4F\u307F", "\u3001", "\u6211\u3005", "\u306E",
                "\u3059\u3079\u3066", "\u306F", "\u305D\u306E", "\u751F\u7269", "\u570F", "\u306E", ".",
                "\u90E8", "\u3067", "\u3042\u308B", "\u3002" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u540C\u3058",
                "1", "\uFF11", "\u500B", "\u306E", "\u60D1\u661F", "\u82F1", "a", "planet", "\u306B", "\u4F4F\u3080",
                "\u4F4F\u307F", "\u6211\u3005", "\u306E", "\u3059\u3079\u3066", "\u306F", "\u305D\u306E",
                "\u751F\u7269", "\u570F", "\u306E", "1", "5", "\u90E8", "\u3060", "\u3067", "\u3042\u308B" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "\u6211\u3005", "\u3059\u3079\u3066", "\u540C\u3058", "\u500B",
                "\u60D1\u661F", "\u82F1", "a", "planet", "\u4F4F\u3080", "\u4F4F\u307F", "\u6211\u3005",
                "\u3059\u3079\u3066", "\u751F\u7269", "\u570F", "\u90E8" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));

        // Check that TagJoiningFilter keeps well-formed tags as single tokens
        // (malformed pseudo-tags like "<foo bar 123" are left split).
        orig = "<x0/>\u3042</x0>\u300C<x1/>\u300D<x2/>\u3002<foo bar 123";
        assertVerbatim(new String[] { "<x0/>", "\u3042", "</x0>", "\u300C", "<x1/>", "\u300D", "<x2/>", "\u3002",
                "<", "foo", " ", "bar", " ", "123" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        // Check that tags are removed entirely when stemming.
        assertResult(new String[] { "\u3042", "foo", "bar" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
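
    /**
     * Illustrative sketch of the tag-recognition idea exercised above: an
     * OmegaT-style tag such as &lt;x0/&gt; or &lt;/x0&gt; can be matched as a
     * single unit, which is what lets the tokenizer drop tags when stemming
     * and re-join their fragments for verbatim output. The pattern below is
     * an assumed stand-in, not the tokenizer's actual definition.
     */
    @Test
    public void testTagPatternSketch() {
        // Assumed illustrative pattern only.
        Pattern tag = Pattern.compile("</?[a-z]+\\d+/?>");
        assertTrue(tag.matcher("<x0/>").matches());
        assertTrue(tag.matcher("</x0>").matches());
        // Malformed pseudo-tags like the "<foo bar 123" above should not match.
        assertFalse(tag.matcher("<foo bar 123").matches());
    }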

    /**
     * Turkish warrants special testing because of the letter \u0130 (LATIN
     * CAPITAL LETTER I WITH DOT ABOVE): the result (both content and length)
     * of performing <code>"\u0130".toLowerCase()</code> depends on the
     * default locale, and in the past improper lowercasing during
     * tokenization led to out-of-bounds exceptions.
     * <p>
     * Text from https://tr.wikipedia.org/wiki/T%C3%BCrk%C3%A7e
     */
    @Test
    public void testTurkish() {
        ITokenizer tok = new LuceneTurkishTokenizer();
        String orig = "\u201C\u0130stanbul a\u011Fz\u0131\u201D, T\u00FCrkiye T\u00FCrk\u00E7esi"
                + "yaz\u0131 dilinin kayna\u011F\u0131 olarak kabul edilir; yaz\u0131 dili bu"
                + "a\u011F\u0131z temelinde olu\u015Fmu\u015Ftur.";
        assertVerbatim(new String[] { "\u201C", "\u0130stanbul", " ", "a\u011Fz\u0131", "\u201D",
                ",", " ", "T\u00FCrkiye", " ", "T\u00FCrk\u00E7esiyaz\u0131", " ", "dilinin", " ",
                "kayna\u011F\u0131", " ", "olarak", " ", "kabul", " ", "edilir", ";", " ", "yaz\u0131",
                " ", "dili", " ", "bua\u011F\u0131z", " ", "temelinde", " ", "olu\u015Fmu\u015Ftur", "." },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u0130stanbul", "a\u011Fz\u0131", "T\u00FCrkiye",
                "T\u00FCrk\u00E7esiyaz\u0131", "dilinin", "kayna\u011F\u0131", "olarak",
                "kabul", "edilir", "yaz\u0131", "dili", "bua\u011F\u0131z", "temelinde",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "istanbul", "a\u011Fz\u0131", "t\u00FCrki", "T\u00FCrkiye",
                "t\u00FCrk\u00E7esiyaz", "T\u00FCrk\u00E7esiyaz\u0131", "dil", "dilinin",
                "kaynak", "kayna\u011F\u0131", "olarak", "kabul", "edilir", "yaz", "yaz\u0131",
                "dil", "dili", "buak", "bua\u011F\u0131z", "temel", "temelinde", "olu\u015F",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "istanbul", "a\u011Fz\u0131", "t\u00FCrki", "T\u00FCrkiye",
                "t\u00FCrk\u00E7esiyaz", "T\u00FCrk\u00E7esiyaz\u0131", "dil", "dilinin",
                "kaynak", "kayna\u011F\u0131", "kabul", "edilir", "yaz", "yaz\u0131",
                "dil", "dili", "buak", "bua\u011F\u0131z", "temel", "temelinde", "olu\u015F",
                "olu\u015Fmu\u015Ftur" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
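
    /**
     * Sketch of the locale-sensitive lowercasing described above: lowercasing
     * \u0130 changes both the content and the length of the string depending
     * on the locale, which is the behavior that historically caused
     * out-of-bounds errors.
     */
    @Test
    public void testDottedCapitalILowercasingSketch() {
        // Turkish locale: U+0130 lowercases to a plain "i" (length 1).
        assertEquals("i", "\u0130".toLowerCase(Locale.forLanguageTag("tr")));
        // Root locale: U+0130 lowercases to "i" plus U+0307 COMBINING DOT
        // ABOVE (length 2).
        assertEquals("i\u0307", "\u0130".toLowerCase(Locale.ROOT));
    }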

    /**
     * Chinese text gives BreakIterator very few boundaries to break on, so
     * LuceneSmartChineseTokenizer tokenizes by code point when doing verbatim
     * tokenization.
     * <p>
     * Text from https://zh.wikipedia.org/wiki/%E6%B1%89%E8%AF%AD
     */
    @Test
    public void testChinese() {
        ITokenizer tok = new LuceneSmartChineseTokenizer();
        String orig = "\u6F22\u8A9E\u7684\u6587\u5B57\u7CFB\u7D71\u2014\u2014\u6F22\u5B57\u662F"
                + "\u4E00\u7A2E\u610F\u97F3\u8A9E\u8A00\uFF0C\u8868\u610F\u7684\u540C\u6642\u4E5F"
                + "\u5177\u4E00\u5B9A\u7684\u8868\u97F3\u529F\u80FD\u3002";
        assertVerbatim(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587", "\u5B57", "\u7CFB",
                "\u7D71", "\u2014", "\u2014", "\u6F22", "\u5B57", "\u662F", "\u4E00", "\u7A2E",
                "\u610F", "\u97F3", "\u8A9E", "\u8A00", "\uFF0C", "\u8868", "\u610F", "\u7684",
                "\u540C", "\u6642", "\u4E5F", "\u5177", "\u4E00", "\u5B9A", "\u7684", "\u8868",
                "\u97F3", "\u529F", "\u80FD", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        assertResult(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587\u5B57", "\u7CFB",
                "\u7D71", ",", ",", "\u6F22", "\u5B57", "\u662F", "\u4E00", "\u7A2E",
                "\u610F", "\u97F3", "\u8A9E", "\u8A00", ",", "\u8868\u610F", "\u7684",
                "\u540C", "\u6642", "\u4E5F", "\u5177", "\u4E00\u5B9A", "\u7684", "\u8868\u97F3",
                "\u529F\u80FD", "," },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587\u5B57", "\u7CFB",
                "\u7D71", ",", "\u2014", ",", "\u2014", "\u6F22", "\u5B57", "\u662F", "\u4E00",
                "\u7A2E", "\u610F", "\u97F3", "\u8A9E", "\u8A00", ",", "\uFF0C", "\u8868\u610F",
                "\u7684", "\u540C", "\u6642", "\u4E5F", "\u5177", "\u4E00\u5B9A", "\u7684",
                "\u8868\u97F3", "\u529F\u80FD", ",", "\u3002" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "\u6F22", "\u8A9E", "\u7684", "\u6587\u5B57", "\u7CFB",
                "\u7D71", "\u6F22", "\u5B57", "\u662F", "\u4E00", "\u7A2E", "\u610F", "\u97F3",
                "\u8A9E", "\u8A00", "\u8868\u610F", "\u7684", "\u540C", "\u6642", "\u4E5F",
                "\u5177", "\u4E00\u5B9A", "\u7684", "\u8868\u97F3", "\u529F\u80FD" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
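
    /**
     * Sketch of the per-code-point splitting used for the verbatim case
     * above, using only the JDK: iterating by code point rather than by char
     * keeps supplementary characters (surrogate pairs) intact.
     */
    @Test
    public void testCodePointSplitSketch() {
        String text = "\u6F22\u8A9E"; // first two characters of the text above
        List<String> codePoints = new ArrayList<String>();
        for (int i = 0; i < text.length();) {
            int cp = text.codePointAt(i);
            codePoints.add(new String(Character.toChars(cp)));
            i += Character.charCount(cp);
        }
        assertEquals(Arrays.asList("\u6F22", "\u8A9E"), codePoints);
    }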

    /**
     * The behavior of the Lucene GermanAnalyzer was better for our purposes
     * in Lucene 3.0, so we implement a custom analyzer that recreates that
     * behavior.
     *
     * @see <a href=
     *      "https://groups.yahoo.com/neo/groups/OmegaT/conversations/messages/28395">
     *      User group discussion</a>
     */
    @Test
    public void testGerman() {
        ITokenizer tok = new LuceneGermanTokenizer();
        // GLOSSARY mode yields both the analyzed form (umlaut folded and
        // suffix stemmed: "prasentier") and the original surface form.
        assertResult(new String[] { "prasentier", "pr\u00e4sentierte" },
                tok.tokenizeWordsToStrings("pr\u00e4sentierte", StemmingMode.GLOSSARY));
        assertResult(new String[] { "prasentier", "pr\u00e4sentieren" },
                tok.tokenizeWordsToStrings("pr\u00e4sentieren", StemmingMode.GLOSSARY));
    }

    /**
     * The DefaultTokenizer has a completely different implementation from the
     * Lucene-based tokenizers (the latter were originally an external plugin,
     * for licensing reasons): it is based on Java's BreakIterator. It warrants
     * testing so that it doesn't get overlooked when changes are made to the
     * other tokenizers.
     */
    @Test
    public void testDefault() {
        ITokenizer tok = new DefaultTokenizer();
        String orig = "The quick, brown <x0/> jumped over 1 \"lazy\" \u0130stanbul. "
                + "\u65E5\u672C\u8A9E\u3042\u3044\u3046\u3048\u304A\u3002";
        assertVerbatim(new String[] { "The", " ", "quick", ",", " ", "brown", " ", "<x0/>", " ",
                "jumped", " ", "over", " ", "1", " ", "\"", "lazy", "\"", " ", "\u0130stanbul", ".",
                " ", "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A", "\u3002" },
                tok.tokenizeVerbatimToStrings(orig),
                tok.tokenizeVerbatim(orig),
                orig);
        // DefaultTokenizer does no stemming, so all three stemming modes
        // produce the same word list.
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.NONE));
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.GLOSSARY));
        assertResult(new String[] { "The", "quick", "brown", "jumped", "over", "lazy", "\u0130stanbul",
                "\u65E5\u672C\u8A9E", "\u3042\u3044\u3046\u3048\u304A" },
                tok.tokenizeWordsToStrings(orig, StemmingMode.MATCHING));
    }
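
    /**
     * Sketch of the BreakIterator mechanism that the DefaultTokenizer builds
     * on: the JDK's word instance yields both word and non-word spans, so
     * word-only tokenization has to filter out the latter.
     */
    @Test
    public void testBreakIteratorSketch() {
        BreakIterator it = BreakIterator.getWordInstance(Locale.ENGLISH);
        String text = "The quick fox.";
        it.setText(text);
        List<String> words = new ArrayList<String>();
        int start = it.first();
        for (int end = it.next(); end != BreakIterator.DONE; start = end, end = it.next()) {
            String span = text.substring(start, end);
            // Keep only spans that start with a letter or digit.
            if (Character.isLetterOrDigit(span.codePointAt(0))) {
                words.add(span);
            }
        }
        assertEquals(Arrays.asList("The", "quick", "fox"), words);
    }

    /**
     * Asserts that the String[] and {@link Token}[] forms of verbatim
     * tokenization both reproduce the expected tokens, concatenate to the
     * same string, and map back onto the original input.
     */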
    private void assertVerbatim(String[] expected, String[] test, Token[] testTok, String origString) {
        assertResult(expected, test);
        assertEquals(StringUtils.join(expected), StringUtils.join(test));
        assertEquals(expected.length, testTok.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], testTok[i].getTextFromString(origString));
        }
    }
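
    /**
     * Asserts that the actual tokens exactly match the expected tokens in
     * content and order.
     */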
    private void assertResult(String[] expected, String[] test) {
        // Uncomment to dump the actual tokens, e.g. when updating the
        // expected values above:
        // for (String s : test) {
        //     System.out.print('"');
        //     System.out.print(s.replace("\"", "\\\""));
        //     System.out.print("\", ");
        // }
        // System.out.print('\n');
        assertEquals(expected.length, test.length);
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], test[i]);
        }
    }
}