/** * Copyright © 2010-2012 Atilika Inc. All rights reserved. * * Atilika Inc. licenses this file to you under the Apache License, Version * 2.0 (the "License"); you may not use this file except in compliance with * the License. A copy of the License is distributed with this work in the * LICENSE.txt file. You may also obtain a copy of the License from * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package org.atilika.kuromoji.viterbi; import java.util.LinkedList; import java.util.List; import org.atilika.kuromoji.Tokenizer.Mode; import org.atilika.kuromoji.dict.CharacterDefinition; import org.atilika.kuromoji.dict.CharacterDefinition.CharacterClass; import org.atilika.kuromoji.dict.ConnectionCosts; import org.atilika.kuromoji.dict.Dictionary; import org.atilika.kuromoji.dict.TokenInfoDictionary; import org.atilika.kuromoji.dict.UnknownDictionary; import org.atilika.kuromoji.dict.UserDictionary; import org.atilika.kuromoji.trie.DoubleArrayTrie; import org.atilika.kuromoji.viterbi.ViterbiNode.Type; import jp.ac.waseda.info.kake.string.StringSizeConverter; /** * TONIXY バグ修正 * TONIXY サブクラスを作りやすいよう、可視性など大幅に変更 * * @author Masaru Hasegawa * @author Christian Moen */ public class Viterbi { // TONIXY trueにするとNode情報を一覧できる private static final boolean printsNodes = false; // TONIXY サブクラスからの参照を可能にするため、privateをprotectedに変更 protected final DoubleArrayTrie trie; protected final TokenInfoDictionary dictionary; protected final UnknownDictionary unkDictionary; protected final ConnectionCosts costs; // TONIXY ユーザ辞書変更を可能にしたため、finalを除去。サブクラスからの参照を可能にするため、privateをprotectedに変更 protected UserDictionary userDictionary; // TONIXY サブクラスからの参照を可能にするため、privateをprotectedに変更 protected final CharacterDefinition characterDefinition; // TONIXY ユーザ辞書変更を可能にしたため、finalを除去 private boolean useUserDictionary; // TONIXY サブクラスからの参照を可能にするため、privateをprotectedに変更 protected final boolean searchMode; protected final boolean extendedMode; protected static final int DEFAULT_COST = 10000000; protected static final int SEARCH_MODE_KANJI_LENGTH_DEFAULT = 2; protected static final int SEARCH_MODE_OTHER_LENGTH_DEFAULT = 7; protected static final int SEARCH_MODE_KANJI_PENALTY_DEFAULT = 3000; protected static final int SEARCH_MODE_OTHER_PENALTY_DEFAULT = 1700; protected final int searchModeKanjiPenalty; protected final int searchModeOtherPenalty; protected final int searchModeOtherLength; protected final int searchModeKanjiLength; protected static final String BOS = "BOS"; protected static final String EOS = "EOS"; /** * TONIXY 未知語分解モードのON/OFF。未知語の部分文字列をLatticeに追加するかどうか。trueに設定することを推奨 */ protected final boolean unknownFixMode; /** * TONIXY 全角化のON/OFF。trueに設定することを推奨 */ protected final boolean convertsSize; // TONIXY サブクラスからの参照を可能にするため、getterを作成 protected UserDictionary getUserDictionary() { return userDictionary; } protected boolean usesUserDictionary() { return useUserDictionary; } protected boolean isUnknownFixMode() { return unknownFixMode; } /** * TONIXY 非推奨。未知語分解モードがOFFになる。未知語分解モードの搭載後、従来通り動くためのコンストラクタ * * @param trie * @param dictionary * @param unkDictionary * @param costs * @param userDictionary * @param mode * @param searchModeKanjiPenalty * @param searchModeOtherPenalty * @param searchModeKanjiLength * @param searchModeOtherLength */ /* public Viterbi(DoubleArrayTrie trie, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary, ConnectionCosts costs, UserDictionary userDictionary, Mode mode, int searchModeKanjiPenalty, int searchModeOtherPenalty, int searchModeKanjiLength, int searchModeOtherLength){ this(trie, dictionary, unkDictionary, costs, userDictionary, mode, searchModeKanjiPenalty, searchModeOtherPenalty, searchModeKanjiLength, searchModeOtherLength, false, false); } */ /** * Constructor * TONIXY 非推奨。未知語分解モードがOFFになる。未知語分解モードの搭載後、従来通り動くためのコンストラクタ * * @param trie * @param targetMap * @param dictionary * @param unkDictionary * @param costs * @param userDictionary */ /* public Viterbi(DoubleArrayTrie trie, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary, ConnectionCosts costs, UserDictionary userDictionary, Mode mode) { this(trie, dictionary, unkDictionary, costs, userDictionary, mode, false, false); } */ /** * Constructor * TONIXY 未知語分解モードの搭載のため、引数追加 * * @param trie * @param dictionary * @param unkDictionary * @param costs * @param userDictionary * @param mode * @param searchModeKanjiPenalty * @param searchModeOtherPenalty * @param searchModeKanjiLength * @param searchModeOtherLength * @param unknownFixMode * @param convertsSize */ public Viterbi(DoubleArrayTrie trie, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary, ConnectionCosts costs, UserDictionary userDictionary, Mode mode, int searchModeKanjiPenalty, int searchModeOtherPenalty, int searchModeKanjiLength, int searchModeOtherLength, boolean unknownFixMode, boolean convertsSize) { this.trie = trie; this.dictionary = dictionary; this.unkDictionary = unkDictionary; this.costs = costs; this.userDictionary = userDictionary; this.searchModeKanjiPenalty = searchModeKanjiPenalty; this.searchModeOtherPenalty = searchModeOtherPenalty; this.searchModeKanjiLength = searchModeKanjiLength; this.searchModeOtherLength = searchModeOtherLength; this.unknownFixMode = unknownFixMode; this.convertsSize = convertsSize; if (userDictionary == null) { this.useUserDictionary = false; } else { this.useUserDictionary = true; } switch (mode) { case SEARCH: searchMode = true; extendedMode = false; break; case EXTENDED: searchMode = true; extendedMode = true; break; default: searchMode = false; extendedMode = false; break; } this.characterDefinition = unkDictionary.getCharacterDefinition(); } /** * Constructor * TONIXY 未知語分解モードの搭載のため、引数追加 * * @param trie * @param targetMap * @param dictionary * @param unkDictionary * @param costs * @param userDictionary * @param unknownFixMode * @param convertsSize */ public Viterbi(DoubleArrayTrie trie, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary, ConnectionCosts costs, UserDictionary userDictionary, Mode mode, boolean unknownFixMode, boolean convertsSize) { this(trie, dictionary, unkDictionary, costs, userDictionary, mode, SEARCH_MODE_KANJI_PENALTY_DEFAULT, SEARCH_MODE_OTHER_PENALTY_DEFAULT, SEARCH_MODE_KANJI_LENGTH_DEFAULT, SEARCH_MODE_OTHER_LENGTH_DEFAULT, unknownFixMode, convertsSize); } /** * Find best path from input lattice. * @param lattice the result of build method * @return List of ViterbiNode which consist best path */ public List<ViterbiNode> search(ViterbiNode[][][] lattice) { ViterbiNode[][] startIndexArr = lattice[0]; ViterbiNode[][] endIndexArr = lattice[1]; // ここからコスト計算とパス設定 for (int i = 1; i < startIndexArr.length; i++){ if (startIndexArr[i] == null || endIndexArr[i] == null){ // continue since no array which contains ViterbiNodes exists. Or no previous node exists. continue; } // 現在地から始まるViterbiNodeを順番に見る for (ViterbiNode node : startIndexArr[i]) { if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index break; } int backwardConnectionId = node.getLeftId(); int wordCost = node.getWordCost(); int leastPathCost = DEFAULT_COST; // ひとつ前となりうるViterbiNodeを順番に見る for (ViterbiNode leftNode : endIndexArr[i]) { if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index break; } int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost] // "Search mode". Add extra costs if it is long node. // ここからサーチモードの処理 if (searchMode) { // System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false. String surfaceForm = node.getSurfaceForm(); int length = surfaceForm.length(); if (length > searchModeKanjiLength) { boolean allKanji = true; // check if node consists of only kanji for (int pos = 0; pos < length; pos++) { if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){ allKanji = false; break; } } if (allKanji) { // Process only Kanji keywords pathCost += (length - searchModeKanjiLength) * searchModeKanjiPenalty; } else if (length > searchModeOtherLength) { pathCost += (length - searchModeOtherLength) * searchModeOtherPenalty; // pathCost += searchModePenalty; } } } // ここまでサーチモードの処理 // 現時点での最小コストとなったなら、ひとつ前のViterbiNodeを記憶しておく if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left). leastPathCost = pathCost; node.setPathCost(leastPathCost); node.setLeftNode(leftNode); } } } } // ここまでコスト計算とパス設定 // track best path // バックトラック ViterbiNode node = endIndexArr[0][0]; // EOS LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>(); result.add(node);// 最小コストのパスはEOSへとつながっているので、ベストパスの最後は必ずEOSになっている while (true) { ViterbiNode leftNode = node.getLeftNode(); if (leftNode == null) { break; } // EXTENDED mode convert unknown word into unigram node // EXTENDEDモードの処理。未知語を1文字ずつ分割? if (extendedMode && leftNode.getType() == Type.UNKNOWN) { int unigramWordId = CharacterClass.NGRAM.getId(); int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required String surfaceForm = leftNode.getSurfaceForm(); for (int i = surfaceForm.length(); i > 0; i--) { ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN); result.addFirst(uniGramNode); } // ここまでEXTENDEDモードの処理 } else { result.addFirst(leftNode); } node = leftNode; } return result; } /** * Build lattice from input text * @param text * @return */ public ViterbiNode[][][] build(String text) { int textLength = text.length(); // startIndexArr[n]には、textのn文字目(1<=n<=text.length)から始まるViterbiNodeが入ってる。[0][0]にはBOS,[text.length()+1][0]にはEOS ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS // endIndexArr[n]には、textのn-1文字目(2<=n<=text.length()+1)で終わるViterbiNodeが入ってる。 ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS // startSizeArr[n]は、startIndexArr[n]の長さ int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr // endSizeArr[n]は、endIndexArr[n]の長さ int[] endSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in endIndexArr ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, 0, 0, -1, Type.KNOWN); addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr); // Process user dictionary; // 先にユーザ辞書の単語をすべて登録 if (useUserDictionary) { processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr); } int unknownWordEndIndex = -1; // index of the last character of unknown word for (int startIndex = 0; startIndex < textLength; startIndex++) { // If no token ends where current token starts, skip this index if (endSizeArr[startIndex + 1] == 0) { continue; } String suffix = text.substring(startIndex); boolean found = false; for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) { String prefix = suffix.substring(0, endIndex); // startIndex~endIndexの部分文字列になる int result; if(convertsSize) result = trie.lookup(StringSizeConverter.getFullString(prefix)); else result = trie.lookup(prefix); if (result > 0) { // Found match in double array trie found = true; // Don't produce unknown word starting from this index for (int wordId : dictionary.lookupWordIds(result)) { ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN); addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr); } } else if(result < 0) { // If result is less than zero, continue to next position break; } } // In the case of normal mode, it doesn't process unknown word greedily. if(!searchMode && unknownWordEndIndex > startIndex){ continue; } // Process Unknown Word int unknownWordLength = 0; char firstCharacter = suffix.charAt(0); boolean isInvoke = characterDefinition.isInvoke(firstCharacter); if (isInvoke){ // Process "invoke" unknownWordLength = unkDictionary.lookup(suffix); } else if (found == false){ // Process not "invoke" unknownWordLength = unkDictionary.lookup(suffix); } if (unknownWordLength > 0) { // found unknown word String unkWord = suffix.substring(0, unknownWordLength); int characterId = characterDefinition.lookup(firstCharacter); int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same for (int wordId : wordIds) { // TONIXY 未知語の部分文字列をとって、すべて辞書に追加するよう変更。これによって、未知語部分をできるだけ短くする。 // また、未知語直後にユーザ辞書内単語が来た時、ユーザ辞書内単語以前の結果が消滅する現象も修正できる。 if (unknownFixMode) for (int i = 1; i <= unknownWordLength; i++) { ViterbiNode node = new ViterbiNode(wordId, unkWord.substring(0, i), unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN); addToArrays(node, startIndex + 1, startIndex + 1 + i, startIndexArr, endIndexArr, startSizeArr, endSizeArr); } else{ ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN); addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr); } } unknownWordEndIndex = startIndex + unknownWordLength; } } ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, 0, 0, textLength + 1, Type.KNOWN); addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0 ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr}; return result; } /** * Find token(s) in input text and set found token(s) in arrays as normal tokens * ユーザ辞書にある単語を探してstartIndexArr, endIndexArrに登録 * * TONIXY サブクラスからの参照を可能にするため、privateをprotectedに変更 * * @param text * @param startIndexArr * @param endIndexArr * @param startSizeArr * @param endSizeArr */ protected void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) { int[][] result = userDictionary.lookup(text); for(int[] segmentation : result) { int wordId = segmentation[0]; int index = segmentation[1]; int length = segmentation[2]; ViterbiNode node = new ViterbiNode(wordId, text.substring(index, index + length), userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER); addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr); } } /** * Add node to arrays and increment count in size array * startIndexArrとendIndexArrにnodeを追加する。 * * TONIXY サブクラスからの参照を可能にするため、privateをprotectedに変更 * * @param node * @param startIndex * @param endIndex * @param startIndexArr * @param endIndexArr * @param startSizeArr * @param endSizeArr */ protected void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) { if(printsNodes) { Dictionary dic = null; try { switch (node.getType()) { case KNOWN: dic = dictionary; break; case UNKNOWN: dic = unkDictionary; break; case USER: dic = userDictionary; break; } } catch (Exception e) { } String[] features = { "?" }; String pos = ""; int wordId = node.getWordId(); if(dic != null){ features = dic.getAllFeaturesArray(wordId); pos = dic.getPartOfSpeech(wordId); } System.out.println(" * " + node.getSurfaceForm() + " <" + features[features.length - 1] + ">【" + pos + "】(" + startIndex + ", " + endIndex + ") " + node.getWordCost()); } int startNodesCount = startSizeArr[startIndex]; int endNodesCount = endSizeArr[endIndex]; // 必要に応じて初期化 if (startNodesCount == 0) { startIndexArr[startIndex] = new ViterbiNode[10]; } if (endNodesCount == 0) { endIndexArr[endIndex] = new ViterbiNode[10]; } // 必要に応じて配列を長くする if (startIndexArr[startIndex].length <= startNodesCount){ startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]); } if (endIndexArr[endIndex].length <= endNodesCount){ endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]); } // 格納 startIndexArr[startIndex][startNodesCount] = node; endIndexArr[endIndex][endNodesCount] = node; // 配列の長さを修正 startSizeArr[startIndex] = startNodesCount + 1; endSizeArr[endIndex] = endNodesCount + 1; } /** * Return twice as big array which contains value of input array * * @param array * @return */ private ViterbiNode[] extendArray(ViterbiNode[] array) { // extend array ViterbiNode[] newArray = new ViterbiNode[array.length * 2]; System.arraycopy(array, 0, newArray, 0, array.length); return newArray; } /** * ユーザー辞書を変更します。TONIXY とにぃによる追記。 * * @param userDictionary */ public void setUserDictionary(UserDictionary userDictionary) { this.userDictionary = userDictionary; if (userDictionary == null) { this.useUserDictionary = false; } else { this.useUserDictionary = true; } } }