/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.linguistic;
import java.io.IOException;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.lucene.ChineseTokenizerAdapter;
import org.junit.Test;
/**
* Test cases for {@link ChineseTokenizerAdapter}.
*/
public class ChineseTokenizerTest extends TokenizerTestBase
{
@Override
protected ITokenizer createTokenStream() throws IOException
{
return new DefaultTokenizerFactory()
.getTokenizer(LanguageCode.CHINESE_SIMPLIFIED);
}
@Test
public void testTermTokens()
{
String test = "东亚货币贬值";
TokenImage [] tokens =
{
term("东亚"), term("货币"), term("贬值"),
};
assertEqualTokens(test, tokens);
}
@Test
public void testChineseEnglishTermTokens()
{
String test = "test 东亚货币贬值 English";
TokenImage [] tokens =
{
term("test"), term("东亚"), term("货币"), term("贬值"), term("english")
};
assertEqualTokens(test, tokens);
}
@Test
public void testJunkTokens()
{
final String [] junkTokens = new String []
{
",", ".", "<", ">", "?", "/", "\\", "|", "-", "_", "+", "=", "*", "&", "^",
"%", "#", "@", "!", "~", "`", ";", ":", "'", "\"", "(", ")", "$", "·", "‘",
"’", "…", "`", "’", "“", "”", "‘", "—"
};
TokenImage [] tokens =
{
punctuation(","),
};
for (String junkToken : junkTokens)
{
assertEqualTokens(junkToken, tokens);
}
}
@Test
public void testPunctuationTokens()
{
String test = "东亚货币贬值。周小燕老师,您辛苦了!";
TokenImage [] tokens =
{
term("东亚"), term("货币"), term("贬值"), punctuation(","), term("周"), term("小"),
term("燕"), term("老师"), punctuation(","), term("您"), term("辛苦"), term("了"),
punctuation(","),
};
assertEqualTokens(test, tokens);
}
@Test
public void testNumericTokens()
{
String test = "湖南11个部门";
TokenImage [] tokens =
{
term("湖南"), numeric("11"), term("个"), term("部门"),
};
assertEqualTokens(test, tokens);
}
}