/** * Copyright © 2010-2012 Atilika Inc. All rights reserved. * * Atilika Inc. licenses this file to you under the Apache License, Version * 2.0 (the "License"); you may not use this file except in compliance with * the License. A copy of the License is distributed with this work in the * LICENSE.txt file. You may also obtain a copy of the License from * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package jp.ac.waseda.info.kake.moca.tools; import java.io.File; import java.io.IOException; import java.util.Map.Entry; import org.atilika.kuromoji.dict.ConnectionCosts; import org.atilika.kuromoji.dict.TokenInfoDictionary; import org.atilika.kuromoji.dict.UnknownDictionary; import org.atilika.kuromoji.trie.DoubleArrayTrie; import org.atilika.kuromoji.util.ConnectionCostsBuilder; import org.atilika.kuromoji.util.DictionaryBuilder; import org.atilika.kuromoji.util.DoubleArrayTrieBuilder; import org.atilika.kuromoji.util.UnknownDictionaryBuilder; /** * MoCA用辞書を生成します。 */ public class MocaDictionaryBuilder { public void build(DictionaryBuilder.DictionaryFormat format, String inputDirname, String outputDirname) throws IOException { build(format, inputDirname, outputDirname, "euc-jp", false); } public void build(DictionaryBuilder.DictionaryFormat format, String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException { System.out.println("building tokeninfo dict..."); MocaTokenInfoDictionaryBuilder tokenInfoBuilder = new MocaTokenInfoDictionaryBuilder( format, encoding, normalizeEntry); TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder .build(inputDirname); System.out.print(" building double array trie..."); DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder .entrySet()); trie.write(outputDirname); System.out.println(" done"); System.out.print(" processing target map..."); for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) { int tokenInfoId = entry.getKey(); String surfaceform = entry.getValue(); int doubleArrayId = trie.lookup(surfaceform); assert doubleArrayId > 0; tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId); } tokenInfoDictionary.write(outputDirname); trie = null; tokenInfoBuilder = null; System.out.println(" done"); System.out.println("done"); System.out.print("building unknown word dict..."); UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder( encoding); UnknownDictionary unkDictionary = unkBuilder.build(inputDirname); unkDictionary.write(outputDirname); System.out.println("done"); System.out.print("building connection costs..."); ConnectionCosts connectionCosts = ConnectionCostsBuilder .build(inputDirname + File.separator + "matrix.def"); connectionCosts.write(outputDirname); System.out.println("done"); } /** * MoCA用辞書を生成します。 * * @param args * [0]: 辞書データの形式(ipadic または unidic) * [1]: CSV形式辞書データのディレクトリ * [2]: 出力先ディレクトリ * @throws IOException * @throws ClassNotFoundException */ public static void main(String[] args) throws IOException, ClassNotFoundException { DictionaryBuilder.DictionaryFormat format; if (args[0].equalsIgnoreCase("ipadic")) { format = DictionaryBuilder.DictionaryFormat.IPADIC; } else if (args[0].equalsIgnoreCase("unidic")) { format = DictionaryBuilder.DictionaryFormat.UNIDIC; } else { System.err.println("Illegal format " + args[0] + " using unidic instead"); format = DictionaryBuilder.DictionaryFormat.IPADIC; } String inputDirname = args[1]; String outputDirname = args[2]; MocaDictionaryBuilder builder = new MocaDictionaryBuilder(); System.out.println("MoCA dictionary builder"); System.out.println(); System.out.println("dictionary format: " + format); System.out.println("input directory: " + inputDirname); System.out.println("output directory: " + outputDirname); System.out.println(); builder.build(format, inputDirname, outputDirname); } }