/** * Copyright © 2010-2012 Atilika Inc. All rights reserved. * * Atilika Inc. licenses this file to you under the Apache License, Version * 2.0 (the "License"); you may not use this file except in compliance with * the License. A copy of the License is distributed with this work in the * LICENSE.txt file. You may also obtain a copy of the License from * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package org.atilika.kuromoji.dict; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; import org.atilika.kuromoji.util.CSVUtil; /** * @author Masaru Hasegawa * @author Christian Moen */ public class UserDictionary implements Dictionary { private TreeMap<String, int[]> entries = new TreeMap<String, int[]>(); private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>(); private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000; public static final int WORD_COST = -100000; public static final int LEFT_ID = 5; public static final int RIGHT_ID = 5; public UserDictionary() { } /** * Lookup words in text * ユーザ辞書に登録された単語の位置を全て探し出し、 {wordId, position, length} の形の2次元配列で返す * * @param text * @return array of {wordId, position, length} */ public int[][] lookup(String text) { TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...] for (String keyword : entries.descendingKeySet()) { int offset = 0; int position = text.indexOf(keyword, offset); while (offset < text.length() && position >= 0) { if(!result.containsKey(position)){ result.put(position, entries.get(keyword)); } offset += position + keyword.length(); position = text.indexOf(keyword, offset); } } return toIndexArray(result); } /** * Convert Map of index and wordIdAndLength to array of {wordId, index, length} * * @param input * @return array of {wordId, index, length} */ private int[][] toIndexArray(Map<Integer, int[]> input) { ArrayList<int[]> result = new ArrayList<int[]>(); for (int i : input.keySet()) { int[] wordIdAndLength = input.get(i); int wordId = wordIdAndLength[0]; // convert length to index int current = i; for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset int[] token = { wordId + j - 1, current, wordIdAndLength[j] }; result.add(token); current += wordIdAndLength[j]; } } return result.toArray(new int[result.size()][]); } @Override public int getLeftId(int wordId) { return LEFT_ID; } @Override public int getRightId(int wordId) { return RIGHT_ID; } /** * ユーザ辞書のWORD_COST = -100000を返す */ @Override public int getWordCost(int wordId) { return WORD_COST; } @Override public String getReading(int wordId) { return getFeature(wordId, 0); } @Override public String getBaseForm(int wordId) { return null; // NOTE: Currently unsupported TODO ユーザ辞書のgetBaseFormはnullでいい? } @Override public String getPartOfSpeech(int wordId) { return getFeature(wordId, 1); } @Override public String getAllFeatures(int wordId) { return getFeature(wordId); } @Override public String[] getAllFeaturesArray(int wordId) { String allFeatures = featureEntries.get(wordId); if(allFeatures == null) { return null; } return allFeatures.split(INTERNAL_SEPARATOR); } @Override public String getFeature(int wordId, int... fields) { String[] allFeatures = getAllFeaturesArray(wordId); if (allFeatures == null) { return null; } StringBuilder sb = new StringBuilder(); if (fields.length == 0) { // All features for (String feature : allFeatures) { sb.append(CSVUtil.quoteEscape(feature)).append(","); } } else if (fields.length == 1) { // One feature doesn't need to escape value sb.append(allFeatures[fields[0]]).append(","); } else { for (int field : fields){ sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(","); } } return sb.deleteCharAt(sb.length() - 1).toString(); } public static UserDictionary read(String filename) throws IOException { return read(new FileInputStream(filename)); } // TONIXY 下の引数型変更に伴う追加。InputStreamのままでも使えるように。 public static UserDictionary read(InputStream is) throws IOException { return read(new InputStreamReader(is)); } // TONIXY ユーザ辞書読込の自由度を上げるため、引数をReader型に変更。 public static UserDictionary read(Reader is) throws IOException { UserDictionary dictionary = new UserDictionary(); BufferedReader reader = new BufferedReader(is); String line = null; int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET; while ((line = reader.readLine()) != null) { // Remove comments TONIXY 登録できる文字の自由度を上げるため、Kuromojiユーザ辞書コメントに非対応にした。 // line = line.replaceAll("#.*$", ""); // TODO Kuromojiユーザ辞書コメントに対応させる。全角半角非依存にし、必ず全角登録する仕様にすれば問題ない // Skip empty lines or comment lines if (line.trim().length() == 0) { continue; } String[] values = CSVUtil.parse(line); String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] readings = values[2].replaceAll(" *", " ").split(" "); String pos = values[3]; if (segmentation.length != readings.length) { // FIXME: Should probably deal with this differently. Exception? System.out.println("This entry is not properly formatted : " + line); } int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length.... wordIdAndLength[0] = wordId; for (int i = 0; i < segmentation.length; i++) { wordIdAndLength[i + 1] = segmentation[i].length(); dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos); wordId++; } dictionary.entries.put(values[0], wordIdAndLength); } reader.close(); return dictionary; } }