/**
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. A copy of the
 * License is distributed with this work in the LICENSE.md file. You may
 * also obtain a copy of the License from
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.ipadic;

import com.atilika.kuromoji.CommonCornerCasesTest;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import static com.atilika.kuromoji.TestUtils.*;
import static org.junit.Assert.*;

public class TokenizerTest {

    private static Tokenizer tokenizer;

    @BeforeClass
    public static void setUpBeforeClass() throws Exception {
        tokenizer = new Tokenizer();
    }

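    // A simple sentence should segment into the expected surface forms, in order.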
    @Test
    public void testSimpleSegmentation() {
        String input = "スペースステーションに行きます。うたがわしい。";
        String[] surfaces = {"スペース", "ステーション", "に", "行き", "ます", "。", "うたがわしい", "。"};
        List<Token> tokens = tokenizer.tokenize(input);
        assertEquals(surfaces.length, tokens.size());
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(surfaces[i], tokens.get(i).getSurface());
        }
    }

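    // Every token should carry a katakana reading; the full stop reads as itself.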
    @Test
    public void testSimpleReadings() {
        List<Token> tokens = tokenizer.tokenize("寿司が食べたいです。");
        assertEquals(6, tokens.size());
        assertEquals("スシ", tokens.get(0).getReading());
        assertEquals("ガ", tokens.get(1).getReading());
        assertEquals("タベ", tokens.get(2).getReading());
        assertEquals("タイ", tokens.get(3).getReading());
        assertEquals("デス", tokens.get(4).getReading());
        assertEquals("。", tokens.get(5).getReading());
    }

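    // 郵税 (postage) is a single dictionary entry with the reading ユウゼイ.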
    @Test
    public void testSimpleReading() {
        List<Token> tokens = tokenizer.tokenize("郵税");
        assertEquals("ユウゼイ", tokens.get(0).getReading());
    }

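    // The conjugated surface 食べ should resolve to its dictionary base form 食べる.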
    @Test
    public void testSimpleBaseFormKnownWord() {
        List<Token> tokens = tokenizer.tokenize("お寿司が食べたい。");
        assertEquals(6, tokens.size());
        assertEquals("食べ", tokens.get(3).getSurface());
        assertEquals("食べる", tokens.get(3).getBaseForm());
    }

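    // Words missing from the dictionary are flagged as unknown and get "*" as their base form.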
    @Test
    public void testSimpleBaseFormUnknownWord() {
        List<Token> tokens = tokenizer.tokenize("アティリカ株式会社");
        assertEquals(2, tokens.size());
        assertFalse(tokens.get(0).isKnown());
        assertEquals("*", tokens.get(0).getBaseForm());
        assertTrue(tokens.get(1).isKnown());
        assertEquals("株式会社", tokens.get(1).getBaseForm());
    }

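    // Corner case: the adjective やぼったい must come out as a single token.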
    @Test
    public void testYabottaiCornerCase() {
        List<Token> tokens = tokenizer.tokenize("やぼったい");
        assertEquals(1, tokens.size());
        assertEquals("やぼったい", tokens.get(0).getSurface());
    }

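    // Corner case: the contracted form 突き通しゃ should also stay a single token.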
    @Test
    public void testTsukitoshaCornerCase() {
        List<Token> tokens = tokenizer.tokenize("突き通しゃ");
        assertEquals(1, tokens.size());
        assertEquals("突き通しゃ", tokens.get(0).getSurface());
    }

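    // Exercises the IPADIC-specific Token accessors: pronunciation, conjugation form and type,
    // and the four part-of-speech levels.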
    @Test
    public void testIpadicTokenAPIs() throws Exception {
        List<Token> tokens = tokenizer.tokenize("お寿司が食べたい！");
        String[] pronunciations = {"オ", "スシ", "ガ", "タベ", "タイ", "！"};
        assertEquals(pronunciations.length, tokens.size());
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(pronunciations[i], tokens.get(i).getPronunciation());
        }
        String[] conjugationForms = {"*", "*", "*", "連用形", "基本形", "*"};
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(conjugationForms[i], tokens.get(i).getConjugationForm());
        }
        String[] conjugationTypes = {"*", "*", "*", "一段", "特殊・タイ", "*"};
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(conjugationTypes[i], tokens.get(i).getConjugationType());
        }
        String[] posLevel1 = {"接頭詞", "名詞", "助詞", "動詞", "助動詞", "記号"};
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(posLevel1[i], tokens.get(i).getPartOfSpeechLevel1());
        }
        String[] posLevel2 = {"名詞接続", "一般", "格助詞", "自立", "*", "一般"};
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(posLevel2[i], tokens.get(i).getPartOfSpeechLevel2());
        }
        String[] posLevel3 = {"*", "*", "一般", "*", "*", "*"};
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(posLevel3[i], tokens.get(i).getPartOfSpeechLevel3());
        }
        String[] posLevel4 = {"*", "*", "*", "*", "*", "*"};
        for (int i = 0; i < tokens.size(); i++) {
            assertEquals(posLevel4[i], tokens.get(i).getPartOfSpeechLevel4());
        }
    }

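    // In search mode, kanjiPenalty and otherPenalty take a length threshold and a penalty that
    // discourages tokens longer than the threshold. Pushing the non-kanji threshold up to
    // Integer.MAX_VALUE means the long katakana compound is never penalized and stays whole,
    // while the default search-mode settings split it into its parts.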
    @Test
    public void testCustomPenalties() {
        String input = "シニアソフトウェアエンジニアを探しています";

        Tokenizer customTokenizer = new Tokenizer.Builder()
            .mode(Tokenizer.Mode.SEARCH)
            .kanjiPenalty(3, 10000)
            .otherPenalty(Integer.MAX_VALUE, 0)
            .build();
        String[] expected1 = {"シニアソフトウェアエンジニア", "を", "探し", "て", "い", "ます"};
        assertTokenSurfacesEquals(Arrays.asList(expected1), customTokenizer.tokenize(input));

        Tokenizer searchTokenizer = new Tokenizer.Builder()
            .mode(Tokenizer.Mode.SEARCH)
            .build();
        String[] expected2 = {"シニア", "ソフトウェア", "エンジニア", "を", "探し", "て", "い", "ます"};
        assertTokenSurfacesEquals(Arrays.asList(expected2), searchTokenizer.tokenize(input));
    }

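    // The middle dot (中黒) is not a token boundary by default; isSplitOnNakaguro(true) makes it one.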
    @Test
    public void testNakaguroSplit() {
        Tokenizer defaultTokenizer = new Tokenizer();
        Tokenizer nakaguroSplittingTokenizer = new Tokenizer.Builder()
            .isSplitOnNakaguro(true)
            .build();

        String input = "ラレ・プールカリムの音楽が好き。";

        assertTokenSurfacesEquals(Arrays.asList("ラレ・プールカリム", "の", "音楽", "が", "好き", "。"),
            defaultTokenizer.tokenize(input));
        assertTokenSurfacesEquals(Arrays.asList("ラレ", "・", "プールカリム", "の", "音楽", "が", "好き", "。"),
            nakaguroSplittingTokenizer.tokenize(input));
    }

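    // getAllFeatures() yields the raw IPADIC feature string: POS levels 1-4, conjugation type
    // and form, base form, reading, and pronunciation.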
    @Test
    public void testAllFeatures() {
        Tokenizer tokenizer = new Tokenizer();
        String input = "寿司が食べたいです。";

        List<Token> tokens = tokenizer.tokenize(input);
        assertEquals("寿司\t名詞,一般,*,*,*,*,寿司,スシ,スシ", toString(tokens.get(0)));
        assertEquals("が\t助詞,格助詞,一般,*,*,*,が,ガ,ガ", toString(tokens.get(1)));
        assertEquals("食べ\t動詞,自立,*,*,一段,連用形,食べる,タベ,タベ", toString(tokens.get(2)));
        assertEquals("たい\t助動詞,*,*,*,特殊・タイ,基本形,たい,タイ,タイ", toString(tokens.get(3)));
        assertEquals("です\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス", toString(tokens.get(4)));
    }

    private String toString(Token token) {
        return token.getSurface() + "\t" + token.getAllFeatures();
    }

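    // Regression test for a crash in the compacted trie; note that the input is the
    // full-width characters ＼ and ｍ, not their ASCII counterparts.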
    @Test
    public void testCompactedTrieCrash() {
        String input = "＼ｍ";
        Tokenizer tokenizer = new Tokenizer();

        assertTokenSurfacesEquals(Arrays.asList("＼", "ｍ"), tokenizer.tokenize(input));
    }

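    // Tokens produced from a user dictionary entry should expose the same number of
    // features as tokens from the built-in dictionary.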
    @Test
    public void testFeatureLengths() throws IOException {
        String userDictionary = "gsf,gsf,ジーエスーエフ,カスタム名詞\n";

        Tokenizer tokenizer = new Tokenizer.Builder()
            .userDictionary(new ByteArrayInputStream(userDictionary.getBytes(StandardCharsets.UTF_8)))
            .build();

        assertEqualTokenFeatureLengths("ahgsfdajhgsfdこの丘はアクロポリスと呼ばれている。", tokenizer);
    }

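    // Tokenizes the bundled Botchan text and compares every token's features against a
    // stored reference dump.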
    @Test
    public void testNewBocchan() throws IOException {
        assertTokenizedStreamEquals(getClass().getResourceAsStream("/bocchan-ipadic-features.txt"),
            getClass().getResourceAsStream("/bocchan.txt"), tokenizer);
    }

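    // Delegates to the shared punctuation corner-case suite.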
    @Test
    public void testPunctuation() {
        CommonCornerCasesTest.testPunctuation(new Tokenizer());
    }
}