/** * Copyright © 2010-2012 Atilika Inc. All rights reserved. * * Atilika Inc. licenses this file to you under the Apache License, Version * 2.0 (the "License"); you may not use this file except in compliance with * the License. A copy of the License is distributed with this work in the * LICENSE.txt file. You may also obtain a copy of the License from * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package org.atilika.kuromoji.util; import java.io.File; import java.io.IOException; import java.util.Map.Entry; import org.atilika.kuromoji.dict.ConnectionCosts; import org.atilika.kuromoji.dict.TokenInfoDictionary; import org.atilika.kuromoji.dict.UnknownDictionary; import org.atilika.kuromoji.trie.DoubleArrayTrie; /** * @author Masaru Hasegawa * @author Christian Moen */ public class DictionaryBuilder { public enum DictionaryFormat { IPADIC, UNIDIC }; public DictionaryBuilder() { } public void build(DictionaryFormat format, String inputDirname, String outputDirname, String encoding, boolean normalizeEntry) throws IOException { System.out.println("building tokeninfo dict..."); TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry); TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname); System.out.print(" building double array trie..."); DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet()); trie.write(outputDirname); System.out.println(" done"); System.out.print(" processing target map..."); for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) { int tokenInfoId = entry.getKey(); String surfaceform = entry.getValue(); int doubleArrayId = trie.lookup(surfaceform); assert doubleArrayId > 0; tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId); } tokenInfoDictionary.write(outputDirname); trie = null; tokenInfoBuilder = null; System.out.println(" done"); System.out.println("done"); System.out.print("building unknown word dict..."); UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding); UnknownDictionary unkDictionary = unkBuilder.build(inputDirname); unkDictionary.write(outputDirname); System.out.println("done"); System.out.print("building connection costs..."); ConnectionCosts connectionCosts = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def"); connectionCosts.write(outputDirname); System.out.println("done"); } public static void main(String[] args) throws IOException, ClassNotFoundException { DictionaryFormat format; if (args[0].equalsIgnoreCase("ipadic")) { format = DictionaryFormat.IPADIC; } else if (args[0].equalsIgnoreCase("unidic")) { format = DictionaryFormat.UNIDIC; } else { System.err.println("Illegal format " + args[0] + " using unidic instead"); format = DictionaryFormat.IPADIC; } String inputDirname = args[1]; String outputDirname = args[2]; String inputEncoding = args[3]; boolean normalizeEntries = Boolean.parseBoolean(args[4]); DictionaryBuilder builder = new DictionaryBuilder(); System.out.println("dictionary builder"); System.out.println(""); System.out.println("dictionary format: " + format); System.out.println("input directory: " + inputDirname); System.out.println("output directory: " + outputDirname); System.out.println("input encoding: " + inputEncoding); System.out.println("normalize entries: " + normalizeEntries); System.out.println(""); builder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries); } }