/**
* Copyright © 2010-2012 Atilika Inc. All rights reserved.
*
* Atilika Inc. licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance with
* the License. A copy of the License is distributed with this work in the
* LICENSE.txt file. You may also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package jp.ac.waseda.info.kake.moca.tools;
import java.io.File;
import java.io.IOException;
import java.util.Map.Entry;
import org.atilika.kuromoji.dict.ConnectionCosts;
import org.atilika.kuromoji.dict.TokenInfoDictionary;
import org.atilika.kuromoji.dict.UnknownDictionary;
import org.atilika.kuromoji.trie.DoubleArrayTrie;
import org.atilika.kuromoji.util.ConnectionCostsBuilder;
import org.atilika.kuromoji.util.DictionaryBuilder;
import org.atilika.kuromoji.util.DoubleArrayTrieBuilder;
import org.atilika.kuromoji.util.UnknownDictionaryBuilder;
/**
 * Generates the dictionary for MoCA.
 *
 * <p>A build runs three stages in order, each writing its result to the
 * output directory: the token-info dictionary together with its double-array
 * trie, the unknown-word dictionary, and the connection costs.
 */
public class MocaDictionaryBuilder {

    /** Encoding used for the input data when the caller does not supply one. */
    private static final String DEFAULT_ENCODING = "euc-jp";

    /** Name of the connection-cost definition file inside the input directory. */
    private static final String CONNECTION_COSTS_FILENAME = "matrix.def";

    /**
     * Builds the dictionary using the {@code euc-jp} encoding and without
     * entry normalization.
     *
     * @param format        format of the dictionary data
     * @param inputDirname  directory holding the dictionary source data
     * @param outputDirname directory the generated files are written to
     * @throws IOException if one of the build or write steps fails with an I/O error
     */
    public void build(DictionaryBuilder.DictionaryFormat format,
            String inputDirname, String outputDirname) throws IOException {
        build(format, inputDirname, outputDirname, DEFAULT_ENCODING, false);
    }

    /**
     * Builds the dictionary and reports progress on standard output.
     *
     * @param format         format of the dictionary data
     * @param inputDirname   directory holding the dictionary source data
     * @param outputDirname  directory the generated files are written to
     * @param encoding       encoding name handed to the token-info and
     *                       unknown-word builders
     * @param normalizeEntry flag handed to the token-info builder
     * @throws IOException if one of the build or write steps fails with an I/O error
     */
    public void build(DictionaryBuilder.DictionaryFormat format,
            String inputDirname, String outputDirname, String encoding,
            boolean normalizeEntry) throws IOException {
        // Each stage lives in its own method so that its intermediate
        // structures become unreachable before the next stage starts.
        buildTokenInfoDictionary(format, inputDirname, outputDirname,
                encoding, normalizeEntry);
        buildUnknownDictionary(inputDirname, outputDirname, encoding);
        buildConnectionCosts(inputDirname, outputDirname);
    }

    /** Builds and writes the token-info dictionary and its double-array trie. */
    private void buildTokenInfoDictionary(
            DictionaryBuilder.DictionaryFormat format, String inputDirname,
            String outputDirname, String encoding, boolean normalizeEntry)
            throws IOException {
        System.out.println("building tokeninfo dict...");
        MocaTokenInfoDictionaryBuilder tokenInfoBuilder =
                new MocaTokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
        TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);

        System.out.print(" building double array trie...");
        DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet());
        trie.write(outputDirname);
        System.out.println(" done");

        System.out.print(" processing target map...");
        // Register, for every surface form, the id the trie assigned to it
        // against the id of the corresponding token-info entry.
        for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
            int tokenInfoId = entry.getKey();
            String surfaceform = entry.getValue();
            int doubleArrayId = trie.lookup(surfaceform);
            // Only checked when the JVM runs with assertions enabled (-ea).
            assert doubleArrayId > 0;
            tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
        }
        tokenInfoDictionary.write(outputDirname);
        System.out.println(" done");
        System.out.println("done");
    }

    /** Builds and writes the unknown-word dictionary. */
    private void buildUnknownDictionary(String inputDirname,
            String outputDirname, String encoding) throws IOException {
        System.out.print("building unknown word dict...");
        UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
        UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
        unkDictionary.write(outputDirname);
        System.out.println("done");
    }

    /** Builds the connection costs from {@code matrix.def} and writes them. */
    private void buildConnectionCosts(String inputDirname, String outputDirname)
            throws IOException {
        System.out.print("building connection costs...");
        ConnectionCosts connectionCosts = ConnectionCostsBuilder.build(
                inputDirname + File.separator + CONNECTION_COSTS_FILENAME);
        connectionCosts.write(outputDirname);
        System.out.println("done");
    }

    /**
     * Generates the dictionary for MoCA.
     *
     * @param args
     *            [0]: format of the dictionary data (ipadic or unidic,
     *            case-insensitive; any other value falls back to ipadic
     *            with a warning on standard error)
     *            [1]: directory of the CSV-format dictionary data
     *            [2]: output directory
     * @throws IOException if building or writing the dictionary fails
     * @throws ClassNotFoundException declared by the signature; nothing in
     *             this method visibly throws it
     * @throws IllegalArgumentException if fewer than three arguments are given
     */
    public static void main(String[] args) throws IOException,
            ClassNotFoundException {
        // Fail with a usage hint instead of a bare index-out-of-bounds error.
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: MocaDictionaryBuilder <ipadic|unidic>"
                    + " <input directory> <output directory>");
        }

        DictionaryBuilder.DictionaryFormat format;
        if (args[0].equalsIgnoreCase("ipadic")) {
            format = DictionaryBuilder.DictionaryFormat.IPADIC;
        } else if (args[0].equalsIgnoreCase("unidic")) {
            format = DictionaryBuilder.DictionaryFormat.UNIDIC;
        } else {
            // The message names the format that is actually selected below.
            System.err.println("Illegal format " + args[0]
                    + " using ipadic instead");
            format = DictionaryBuilder.DictionaryFormat.IPADIC;
        }

        String inputDirname = args[1];
        String outputDirname = args[2];

        MocaDictionaryBuilder builder = new MocaDictionaryBuilder();
        System.out.println("MoCA dictionary builder");
        System.out.println();
        System.out.println("dictionary format: " + format);
        System.out.println("input directory: " + inputDirname);
        System.out.println("output directory: " + outputDirname);
        System.out.println();
        builder.build(format, inputDirname, outputDirname);
    }
}