/*-* * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md) * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. A copy of the * License is distributed with this work in the LICENSE.md file. You may * also obtain a copy of the License from * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.atilika.kuromoji.compile; import com.atilika.kuromoji.dict.CharacterDefinitions; import com.atilika.kuromoji.dict.UnknownDictionary; import com.atilika.kuromoji.io.IntegerArrayIO; import com.atilika.kuromoji.io.StringArrayIO; import org.junit.BeforeClass; import org.junit.Test; import java.io.*; import java.util.Map; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; public class UnknownDictionaryCompilerTest { private static UnknownDictionary unknownDictionary; private static CharacterDefinitions characterDefinitions; private static int[][] costs; private static int[][] references; private static String[][] features; @BeforeClass public static void setUp() throws IOException { File charDef = File.createTempFile("kuromoji-chardef-", ".bin"); charDef.deleteOnExit(); CharacterDefinitionsCompiler charDefCompiler = new CharacterDefinitionsCompiler(new BufferedOutputStream(new FileOutputStream(charDef))); charDefCompiler.readCharacterDefinition(new BufferedInputStream( CharacterDefinitionsCompilerTest.class.getClassLoader().getResourceAsStream("char.def")), "euc-jp"); charDefCompiler.compile(); Map<String, Integer> categoryMap = charDefCompiler.makeCharacterCategoryMap(); File unkDef = File.createTempFile("kuromoji-unkdef-", ".bin"); unkDef.deleteOnExit(); UnknownDictionaryCompiler unkDefCompiler = new UnknownDictionaryCompiler(categoryMap, new FileOutputStream(unkDef)); unkDefCompiler.readUnknownDefinition(new BufferedInputStream( UnknownDictionaryCompilerTest.class.getClassLoader().getResourceAsStream("unk.def")), "euc-jp"); unkDefCompiler.compile(); InputStream charDefInput = new BufferedInputStream(new FileInputStream(charDef)); int[][] definitions = IntegerArrayIO.readSparseArray2D(charDefInput); int[][] mappings = IntegerArrayIO.readSparseArray2D(charDefInput); String[] symbols = StringArrayIO.readArray(charDefInput); characterDefinitions = new CharacterDefinitions(definitions, mappings, symbols); InputStream unkDefInput = new BufferedInputStream(new FileInputStream(unkDef)); costs = IntegerArrayIO.readArray2D(unkDefInput); references = IntegerArrayIO.readArray2D(unkDefInput); features = StringArrayIO.readArray2D(unkDefInput); unknownDictionary = new UnknownDictionary(characterDefinitions, references, costs, features); } @Test public void testCostsAndFeatures() { int[] categories = characterDefinitions.lookupCategories('一'); // KANJI & KANJINUMERIC assertEquals(2, categories.length); assertArrayEquals(new int[] {5, 6}, categories); // KANJI entries assertArrayEquals(new int[] {2, 3, 4, 5, 6, 7}, unknownDictionary.lookupWordIds(categories[0])); // KANJI feature variety assertArrayEquals(new String[] {"名詞", "一般", "*", "*", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(2)); assertArrayEquals(new String[] {"名詞", "サ変接続", "*", "*", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(3)); assertArrayEquals(new String[] {"名詞", "固有名詞", "地域", "一般", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(4)); assertArrayEquals(new String[] {"名詞", "固有名詞", "組織", "*", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(5)); assertArrayEquals(new String[] {"名詞", "固有名詞", "人名", "一般", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(6)); assertArrayEquals(new String[] {"名詞", "固有名詞", "人名", "一般", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(6)); // KANJINUMERIC entry assertArrayEquals(new int[] {29}, unknownDictionary.lookupWordIds(categories[1])); // KANJINUMERIC costs assertEquals(1295, unknownDictionary.getLeftId(29)); assertEquals(1295, unknownDictionary.getRightId(29)); assertEquals(27473, unknownDictionary.getWordCost(29)); // KANJINUMERIC features assertArrayEquals(new String[] {"名詞", "数", "*", "*", "*", "*", "*"}, unknownDictionary.getAllFeaturesArray(29)); } }