/**
* Copyright © 2010-2012 Atilika Inc. All rights reserved.
*
* Atilika Inc. licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance with
* the License. A copy of the License is distributed with this work in the
* LICENSE.txt file. You may also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package jp.ac.waseda.info.kake.moca.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;

import org.atilika.kuromoji.dict.TokenInfoDictionary;
import org.atilika.kuromoji.util.CSVUtil;
import org.atilika.kuromoji.util.DictionaryBuilder.DictionaryFormat;

import jp.ac.waseda.info.kake.string.StringSizeConverter;

/**
 * Builds a {@link TokenInfoDictionary} from dictionary CSV files, recording for
 * each assigned word id the surface string the entry was registered under (the
 * surface form itself and, when it differs, the reading).
 *
 * TODO: try building the dictionary with UniDic.
 *
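 * A minimal usage sketch (the CSV directory path is an assumed placeholder):
 *
 * <pre>
 * MocaTokenInfoDictionaryBuilder builder =
 *         new MocaTokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "euc-jp", false);
 * TokenInfoDictionary dictionary = builder.build("/path/to/csv-dir");
 * for (Map.Entry&lt;Integer, String&gt; e : builder.entrySet()) {
 *     System.out.println(e.getKey() + "\t" + e.getValue());
 * }
 * </pre>
 *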
* @author Sho
*
*/
public class MocaTokenInfoDictionaryBuilder {
    /**
     * Internal word id, incrementally assigned as entries are read and added.
     * It also serves as the byte offset of the entry in the dictionary file.
     */
private int offset = 0;
    private TreeMap<Integer, String> dictionaryEntries; // word id -> surface form (or reading) the entry was added under
private String encoding = "euc-jp";
private boolean normalizeEntries = false;
    private DictionaryFormat format = DictionaryFormat.IPADIC;

    public MocaTokenInfoDictionaryBuilder() {
        this.dictionaryEntries = new TreeMap<Integer, String>();
    }

    public MocaTokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
        this.format = format;
        this.encoding = encoding;
        this.dictionaryEntries = new TreeMap<Integer, String>();
        this.normalizeEntries = normalizeEntries;
    }

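    /**
     * Builds the dictionary from every *.csv file found directly in the given
     * directory.
     */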
public TokenInfoDictionary build(String dirname) throws IOException {
FilenameFilter filter = new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.endsWith(".csv");
}
};
        ArrayList<File> csvFiles = new ArrayList<File>();
        File[] files = new File(dirname).listFiles(filter);
        if (files == null) {
            throw new IOException("Not a directory, or it cannot be read: " + dirname);
        }
        for (File file : files) {
            csvFiles.add(file);
        }
return buildDictionary(csvFiles);
}
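
    /**
     * Builds the dictionary from the given CSV files. Each entry is stored under
     * its surface form and, when the reading differs, a second time under its
     * reading; with normalizeEntries enabled, an NFKC-normalized copy is added
     * as well.
     */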
public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024);
for (File file : csvFiles) {
FileInputStream inputStream = new FileInputStream(file);
InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
BufferedReader reader = new BufferedReader(streamReader);
String line = null;
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
if (entry.length < 13) {
System.out.println("Entry in CSV is not valid: " + line);
continue;
}
String[] formatEntry = formatEntry(entry);
int next = dictionary.put(formatEntry);
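                // put() returns the offset of the next entry; an unchanged offset
                // means this line could not be stored, so it is skipped.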
if (next == offset) {
System.out.println("Failed to process line: " + line);
continue;
}
dictionaryEntries.put(offset, StringSizeConverter.getFullString(entry[0]));
offset = next;
                // TONIXY: also register the entry under its reading, so that it can
                // be looked up by reading as well.
                // TODO: should the dictionary be slimmed down by also skipping
                // hiragana readings? Enabling the commented-out check below seemed
                // to leave some words out of the dictionary.
                if (!(formatEntry[0].equals(formatEntry[11])
                        /* || formatEntry[0].equals(KanaConverter.getHiragana(formatEntry[11])) */)) {
next = dictionary.put(formatEntry);
if (next == offset) {
System.out.println("Failed to process line: " + line);
continue;
}
                    dictionaryEntries.put(offset, StringSizeConverter.getFullString(formatEntry[11]));
                    offset = next; // TONIXY: entry added under its reading
}
// NFKC normalize dictionary entry
if (normalizeEntries) {
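                    // Entries whose surface form is already in NFKC form need no
                    // normalized copy.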
if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))) {
continue;
}
String[] normalizedEntry = new String[entry.length];
for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
}
formatEntry = formatEntry(normalizedEntry);
                    next = dictionary.put(formatEntry);
                    if (next == offset) {
                        System.out.println("Failed to process line: " + line);
                        continue;
                    }
                    dictionaryEntries.put(offset, StringSizeConverter.getFullString(normalizedEntry[0]));
                    offset = next;
                    // TONIXY: as above, also register the normalized entry under its
                    // reading. TODO: same dictionary-size question as above.
                    if (!(formatEntry[0].equals(formatEntry[11])
                            /* || formatEntry[0].equals(KanaConverter.getHiragana(formatEntry[11])) */)) {
next = dictionary.put(formatEntry);
dictionaryEntries.put(offset, StringSizeConverter.getFullString(formatEntry[11]));
                        offset = next; // TONIXY: entry added under its reading
}
}
            }
            reader.close();
}
return dictionary;
}

    /*
     * IPADIC features (TONIXY: the surface form is appended at index [13]):
     *
     *   0   - surface
     *   1   - left cost
     *   2   - right cost
     *   3   - word cost
     *   4-9 - part-of-speech fields
     *   10  - base form
     *   11  - reading
     *   12  - pronunciation
     *
     * UniDic features:
     *
     *   0   - surface
     *   1   - left cost
     *   2   - right cost
     *   3   - word cost
     *   4-9 - part-of-speech fields
     *   10  - base form reading
     *   11  - base form
     *   12  - surface form
     *   13  - surface reading
     */
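    // A made-up IPADIC-style example (the entry and costs are invented, not taken
    // from any dictionary) and the features2 array formatEntry produces for it:
    //
    //   寿司,1285,1285,5000,名詞,一般,*,*,*,*,寿司,スシ,スシ
    //   -> { "寿司", "1285", "1285", "5000", "名詞", "一般", "*", "*", "*", "*",
    //        "寿司", "スシ", "スシ", "寿司" }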
public String[] formatEntry(String[] features) {
String[] features2 = new String[14];
if (this.format == DictionaryFormat.IPADIC) {
for (int i = 0; i < features.length; i++)
features2[i] = features[i];
} else {
features2[0] = features[0];
features2[1] = features[1];
features2[2] = features[2];
features2[3] = features[3];
features2[4] = features[4];
features2[5] = features[5];
features2[6] = features[6];
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
features2[10] = features[11];
            // If the surface reading is missing, fall back to the surface form for
            // both the reading and the pronunciation. This happens with punctuation
            // in UniDic, and there are possibly other cases as well.
if (features[13].length() == 0) {
features2[11] = features[0];
features2[12] = features[0];
} else {
features2[11] = features[13];
features2[12] = features[13];
}
}
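        // TONIXY: the original surface form is appended at index [13].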
features2[13] = features[0];
return features2;
}
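
    /**
     * Returns the collected (word id, surface string) pairs in word id order.
     */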
public Set<Entry<Integer, String>> entrySet() {
return dictionaryEntries.entrySet();
}
}