Viterbi.java example

Explorer
MoCA-master
- MoCA
  - src
    - jp
      - ac
        waseda
        info
        kake
        moca
        MoCA.java
        MocaTokenizer.java
        dict
        MocaDictionaries.java
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        syllable
        SyllabifiedString.java
        Syllable.java
        system
        InputMain.java
        PrintIntegerMaker.java
        tools
        MocaDictionaryBuilder.java
        MocaTokenInfoDictionaryBuilder.java
        viterbi
        MocaViterbi.java
        wordcost
        AbstractWordCostAdjuster.java
        BaseWordCostAdjuster.java
        KanaWordCostAdjuster.java
        LevenshteinWordCostAdjuster.java
        MultipliedWordCostAdjuster.java
        UnknownWordCostAdjuster.java
- TonixyKuromoji
  - src
    - TonixyKuromoji.java
    - jp
      - ac
        waseda
        info
        kake
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        system
        InputMain.java
        PrintIntegerMaker.java
    - org
      - atilika
        kuromoji
        DebugTokenizer.java
        Token.java
        Tokenizer.java
        TokenizerRunner.java
        dict
        CharacterDefinition.java
        ConnectionCosts.java
        Dictionaries.java
        Dictionary.java
        TokenInfoDictionary.java
        UnknownDictionary.java
        UserDictionary.java
        trie
        DoubleArrayTrie.java
        Trie.java
        util
        CSVUtil.java
        ConnectionCostsBuilder.java
        DictionaryBuilder.java
        DoubleArrayTrieBuilder.java
        TokenInfoDictionaryBuilder.java
        UnknownDictionaryBuilder.java
        viterbi
        Viterbi.java
        ViterbiFormatter.java
        ViterbiNode.java
/**
 * Copyright © 2010-2012 Atilika Inc.  All rights reserved.
 *
 * Atilika Inc. licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with
 * the License.  A copy of the License is distributed with this work in the
 * LICENSE.txt file.  You may also obtain a copy of the License from
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.atilika.kuromoji.viterbi;

import java.util.LinkedList;
import java.util.List;

import org.atilika.kuromoji.Tokenizer.Mode;
import org.atilika.kuromoji.dict.CharacterDefinition;
import org.atilika.kuromoji.dict.CharacterDefinition.CharacterClass;
import org.atilika.kuromoji.dict.ConnectionCosts;
import org.atilika.kuromoji.dict.Dictionary;
import org.atilika.kuromoji.dict.TokenInfoDictionary;
import org.atilika.kuromoji.dict.UnknownDictionary;
import org.atilika.kuromoji.dict.UserDictionary;
import org.atilika.kuromoji.trie.DoubleArrayTrie;
import org.atilika.kuromoji.viterbi.ViterbiNode.Type;

import jp.ac.waseda.info.kake.string.StringSizeConverter;


/**
 * TONIXY: bug fixes.
 * TONIXY: visibility and other aspects were changed substantially to make subclassing easier.
 *
 * @author Masaru Hasegawa
 * @author Christian Moen
 */
public class Viterbi {

	// TONIXY: set to true to print information about every node as it is added to the lattice
	private static final boolean printsNodes = false;

	// TONIXY: changed from private to protected so that subclasses can access it
	protected final DoubleArrayTrie trie;

	protected final TokenInfoDictionary dictionary;

	protected final UnknownDictionary unkDictionary;

	protected final ConnectionCosts costs;

	// TONIXY: no longer final because the user dictionary can now be replaced. Changed from private to protected so that subclasses can access it
	protected UserDictionary userDictionary;

	// TONIXY: changed from private to protected so that subclasses can access it
	protected final CharacterDefinition characterDefinition;

	// TONIXY: no longer final because the user dictionary can now be replaced
	private boolean useUserDictionary;

	// TONIXY: changed from private to protected so that subclasses can access it
	protected final boolean searchMode;

	protected final boolean extendedMode;

	protected static final int DEFAULT_COST = 10000000;

	protected static final int SEARCH_MODE_KANJI_LENGTH_DEFAULT = 2;

	protected static final int SEARCH_MODE_OTHER_LENGTH_DEFAULT = 7;

	protected static final int SEARCH_MODE_KANJI_PENALTY_DEFAULT = 3000;

	protected static final int SEARCH_MODE_OTHER_PENALTY_DEFAULT = 1700;

	protected final int searchModeKanjiPenalty;

	protected final int searchModeOtherPenalty;

	protected final int searchModeOtherLength;

	protected final int searchModeKanjiLength;

	protected static final String BOS = "BOS";

	protected static final String EOS = "EOS";

	/**
	 * TONIXY: switches unknown-word decomposition mode on or off, i.e. whether the
	 * substrings of an unknown word are added to the lattice. Setting this to true is recommended.
	 */
	protected final boolean unknownFixMode;

	/**
	 * TONIXY: switches full-width conversion on or off. Setting this to true is recommended.
	 */
	protected final boolean convertsSize;

	/**
	 * Returns the user dictionary currently held by this instance.
	 * TONIXY: accessor added so that subclasses can read the field.
	 *
	 * @return the current user dictionary, or {@code null} when none has been set
	 */
	protected UserDictionary getUserDictionary() {
		return this.userDictionary;
	}

	/**
	 * Tells whether a user dictionary is consulted when the lattice is built.
	 *
	 * @return {@code true} when the most recently supplied user dictionary was non-null
	 */
	protected boolean usesUserDictionary() {
		return this.useUserDictionary;
	}

	/**
	 * Tells whether unknown-word decomposition mode is enabled.
	 *
	 * @return the value of the {@code unknownFixMode} flag given at construction time
	 */
	protected boolean isUnknownFixMode() {
		return this.unknownFixMode;
	}

	/**
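	 * TONIXY: deprecated. Unknown-word decomposition mode is switched off. This constructor exists so that callers keep working as before after unknown-word decomposition mode was introduced.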
	 *
	 * @param trie
	 * @param dictionary
	 * @param unkDictionary
	 * @param costs
	 * @param userDictionary
	 * @param mode
	 * @param searchModeKanjiPenalty
	 * @param searchModeOtherPenalty
	 * @param searchModeKanjiLength
	 * @param searchModeOtherLength
	 */
/*	public Viterbi(DoubleArrayTrie trie,
			   TokenInfoDictionary dictionary,
			   UnknownDictionary unkDictionary,
			   ConnectionCosts costs,
			   UserDictionary userDictionary,
			   Mode mode,
			   int searchModeKanjiPenalty,
			   int searchModeOtherPenalty,
			   int searchModeKanjiLength,
			   int searchModeOtherLength){
		this(trie, dictionary, unkDictionary, costs, userDictionary, mode, searchModeKanjiPenalty, searchModeOtherPenalty, searchModeKanjiLength, searchModeOtherLength, false, false);
	}
*/
	/**
	 * Constructor
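	 * TONIXY: deprecated. Unknown-word decomposition mode is switched off. This constructor exists so that callers keep working as before after unknown-word decomposition mode was introduced.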
	 *
	 * @param trie
	 * @param targetMap
	 * @param dictionary
	 * @param unkDictionary
	 * @param costs
	 * @param userDictionary
	 */
/*	public Viterbi(DoubleArrayTrie trie,
				   TokenInfoDictionary dictionary,
				   UnknownDictionary unkDictionary,
				   ConnectionCosts costs,
				   UserDictionary userDictionary,
				   Mode mode) {
		this(trie, dictionary, unkDictionary, costs, userDictionary, mode, false, false);
	}
*/
	/**
	 * Constructor
	 * TONIXY 未知語分解モードの搭載のため、引数追加
	 *
	 * @param trie
	 * @param dictionary
	 * @param unkDictionary
	 * @param costs
	 * @param userDictionary
	 * @param mode
	 * @param searchModeKanjiPenalty
	 * @param searchModeOtherPenalty
	 * @param searchModeKanjiLength
	 * @param searchModeOtherLength
	 * @param unknownFixMode
	 * @param convertsSize
	 */
	public Viterbi(DoubleArrayTrie trie,
			   TokenInfoDictionary dictionary,
			   UnknownDictionary unkDictionary,
			   ConnectionCosts costs,
			   UserDictionary userDictionary,
			   Mode mode,
			   int searchModeKanjiPenalty,
			   int searchModeOtherPenalty,
			   int searchModeKanjiLength,
			   int searchModeOtherLength,
			   boolean unknownFixMode,
			   boolean convertsSize) {
		this.trie = trie;
		this.dictionary = dictionary;
		this.unkDictionary = unkDictionary;
		this.costs = costs;
		this.userDictionary = userDictionary;

		this.searchModeKanjiPenalty = searchModeKanjiPenalty;
		this.searchModeOtherPenalty = searchModeOtherPenalty;
		this.searchModeKanjiLength = searchModeKanjiLength;
		this.searchModeOtherLength = searchModeOtherLength;
		this.unknownFixMode = unknownFixMode;
		this.convertsSize = convertsSize;

		if (userDictionary == null) {
			this.useUserDictionary = false;
		} else {
			this.useUserDictionary = true;
		}

		switch (mode) {
		case SEARCH:
			searchMode = true;
			extendedMode = false;
			break;
		case EXTENDED:
			searchMode = true;
			extendedMode = true;
			break;
		default:
			searchMode = false;
			extendedMode = false;
			break;
		}

		this.characterDefinition = unkDictionary.getCharacterDefinition();
	}

	/**
	 * Constructor that uses the default search-mode penalties and length thresholds
	 * ({@code SEARCH_MODE_KANJI_PENALTY_DEFAULT}, {@code SEARCH_MODE_OTHER_PENALTY_DEFAULT},
	 * {@code SEARCH_MODE_KANJI_LENGTH_DEFAULT}, {@code SEARCH_MODE_OTHER_LENGTH_DEFAULT}).
	 * TONIXY: arguments added to support unknown-word decomposition mode.
	 *
	 * @param trie trie used to look up dictionary words in the input text
	 * @param dictionary dictionary of known words
	 * @param unkDictionary dictionary of unknown words
	 * @param costs connection costs between adjacent nodes
	 * @param userDictionary user dictionary, or {@code null} to work without one
	 * @param mode tokenizer mode
	 * @param unknownFixMode whether substrings of unknown words are added to the lattice
	 * @param convertsSize whether text is converted to full-width form before trie lookup
	 */
	public Viterbi(DoubleArrayTrie trie,
				   TokenInfoDictionary dictionary,
				   UnknownDictionary unkDictionary,
				   ConnectionCosts costs,
				   UserDictionary userDictionary,
				   Mode mode, boolean unknownFixMode, boolean convertsSize) {
		// Delegate to the full constructor, filling in the default search-mode settings.
		this(trie, dictionary, unkDictionary, costs, userDictionary, mode,
			 SEARCH_MODE_KANJI_PENALTY_DEFAULT, SEARCH_MODE_OTHER_PENALTY_DEFAULT,
			 SEARCH_MODE_KANJI_LENGTH_DEFAULT, SEARCH_MODE_OTHER_LENGTH_DEFAULT, unknownFixMode, convertsSize);
	}

	/**
	 * Find best path from input lattice.
	 *
	 * <p>Forward pass: for every node, the left neighbour minimising
	 * [path cost of left node] + [connection cost] + [word cost] (+ search-mode penalty)
	 * is recorded on the node. Backward pass: the recorded left nodes are followed
	 * from EOS until a node without a left node is reached.
	 *
	 * @param lattice the result of the build method; {@code lattice[0]} holds nodes grouped
	 *                by start index and {@code lattice[1]} nodes grouped by end index
	 * @return list of ViterbiNode which make up the best path; the last element is the EOS node
	 */
	public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
		ViterbiNode[][] startIndexArr = lattice[0];
		ViterbiNode[][] endIndexArr = lattice[1];

		// Cost calculation and path setting start here
		for (int i = 1; i < startIndexArr.length; i++) {

			// Skip when no node starts here, or when no previous node ends here.
			if (startIndexArr[i] == null || endIndexArr[i] == null) {
				continue;
			}

			// Look at each node that starts at the current position
			for (ViterbiNode node : startIndexArr[i]) {
				if (node == null) {	// the rest of the array is unused
					break;
				}

				int backwardConnectionId = node.getLeftId();
				// The word cost and the search-mode penalty depend only on the current node,
				// so they are computed once here instead of once per candidate left node.
				int nodeCost = node.getWordCost() + searchModePenalty(node);
				int leastPathCost = DEFAULT_COST;

				// Look at each node that could precede the current one
				for (ViterbiNode leftNode : endIndexArr[i]) {
					if (leftNode == null) {	// the rest of the array is unused
						break;
					}

					// cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost incl. search-mode penalty]
					int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + nodeCost;

					// Remember the previous node whenever it gives the lowest cost seen so far
					if (pathCost < leastPathCost) {
						leastPathCost = pathCost;
						node.setPathCost(leastPathCost);
						node.setLeftNode(leftNode);
					}
				}
			}
		}
		// Cost calculation and path setting end here

		// Track best path (backtrack)
		ViterbiNode node = endIndexArr[0][0];	// EOS
		LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
		result.add(node);	// the lowest-cost path leads into EOS, so the best path always ends with EOS
		while (true) {
			ViterbiNode leftNode = node.getLeftNode();
			if (leftNode == null) {
				break;
			}

			// EXTENDED mode: replace an unknown word by one unigram node per character
			if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
				int unigramWordId = CharacterClass.NGRAM.getId();
				int unigramLeftId = unkDictionary.getLeftId(unigramWordId);
				// Fixed: this previously called getLeftId() for the right id as well.
				int unigramRightId = unkDictionary.getRightId(unigramWordId);
				int unigramWordCost = unkDictionary.getWordCost(unigramWordId);
				String surfaceForm = leftNode.getSurfaceForm();
				// Walk backwards so that addFirst() leaves the characters in text order
				for (int i = surfaceForm.length(); i > 0; i--) {
					ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
					result.addFirst(uniGramNode);
				}
			} else {
				result.addFirst(leftNode);
			}
			node = leftNode;
		}

		return result;
	}

	/**
	 * Compute the extra cost that "search mode" adds to a long node.
	 *
	 * <p>Nodes longer than {@code searchModeKanjiLength} that consist only of kanji are
	 * penalised per character above that length; other nodes are penalised per character
	 * above {@code searchModeOtherLength}.
	 *
	 * @param node node whose surface form is inspected
	 * @return the penalty to add to the path cost; 0 when search mode is off or the node is short
	 */
	private int searchModePenalty(ViterbiNode node) {
		if (!searchMode) {
			return 0;
		}

		String surfaceForm = node.getSurfaceForm();
		int length = surfaceForm.length();
		if (length <= searchModeKanjiLength) {
			return 0;
		}

		// check if node consists of only kanji
		boolean allKanji = true;
		for (int pos = 0; pos < length; pos++) {
			if (!characterDefinition.isKanji(surfaceForm.charAt(pos))) {
				allKanji = false;
				break;
			}
		}

		if (allKanji) {
			return (length - searchModeKanjiLength) * searchModeKanjiPenalty;
		}
		if (length > searchModeOtherLength) {
			return (length - searchModeOtherLength) * searchModeOtherPenalty;
		}
		return 0;
	}


	/**
	 * Build lattice from input text.
	 *
	 * <p>The lattice has {@code text.length() + 2} slots. A node covering text positions
	 * {@code [s, e)} (0-based) is registered under start slot {@code s + 1} and end slot
	 * {@code e + 1}. BOS is registered under start slot 0 / end slot 1, EOS under start slot
	 * {@code text.length() + 1} / end slot 0.
	 *
	 * @param text text to analyse
	 * @return two-element array: index 0 holds the nodes grouped by start slot,
	 *         index 1 the nodes grouped by end slot
	 */
	public ViterbiNode[][][] build(String text) {
		final int textLength = text.length();
		final int slotCount = textLength + 2;	// text length + BOS and EOS

		ViterbiNode[][] startIndexArr = new ViterbiNode[slotCount][];	// nodes grouped by where they start
		ViterbiNode[][] endIndexArr = new ViterbiNode[slotCount][];	// nodes grouped by where they end
		int[] startSizeArr = new int[slotCount];	// number of nodes stored in each startIndexArr slot
		int[] endSizeArr = new int[slotCount];	// number of nodes stored in each endIndexArr slot

		ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, 0, 0, -1, Type.KNOWN);
		addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);

		// Register all user dictionary words first
		if (useUserDictionary) {
			processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
		}

		int unknownWordEndIndex = -1;	// index of the last character of unknown word

		for (int startIndex = 0; startIndex < textLength; startIndex++) {
			final int startSlot = startIndex + 1;

			// Nothing ends where a token would start here, so this position is unreachable
			if (endSizeArr[startSlot] == 0) {
				continue;
			}

			String suffix = text.substring(startIndex);
			final int suffixLength = suffix.length();

			// Known words: try every prefix of the remaining text against the trie
			boolean found = false;
			for (int endIndex = 1; endIndex <= suffixLength; endIndex++) {
				String prefix = suffix.substring(0, endIndex);

				int result;
				if (convertsSize) {
					result = trie.lookup(StringSizeConverter.getFullString(prefix));
				} else {
					result = trie.lookup(prefix);
				}

				if (result < 0) {	// negative result: stop trying longer prefixes
					break;
				}
				if (result == 0) {	// no word for this prefix; try the next length
					continue;
				}

				found = true;	// a dictionary word starts at this index
				for (int wordId : dictionary.lookupWordIds(result)) {
					ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
					addToArrays(node, startSlot, startSlot + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
				}
			}

			// In the case of normal mode, it doesn't process unknown word greedily.
			if (!searchMode && unknownWordEndIndex > startIndex) {
				continue;
			}

			// Unknown words: always looked up for "invoke" characters, otherwise only when no known word matched
			char firstCharacter = suffix.charAt(0);
			boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
			int unknownWordLength = 0;
			if (isInvoke || !found) {
				unknownWordLength = unkDictionary.lookup(suffix);
			}
			if (unknownWordLength <= 0) {
				continue;
			}

			String unkWord = suffix.substring(0, unknownWordLength);
			int characterId = characterDefinition.lookup(firstCharacter);
			int[] wordIds = unkDictionary.lookupWordIds(characterId);	// characters in input text are supposed to be the same

			for (int wordId : wordIds) {
				if (unknownFixMode) {
					// TONIXY: add every leading substring of the unknown word to the lattice, which keeps
					// the unknown part as short as possible. It also fixes the case where results before a
					// user dictionary word disappeared when that word directly followed an unknown word.
					for (int len = 1; len <= unknownWordLength; len++) {
						ViterbiNode node = new ViterbiNode(wordId, unkWord.substring(0, len),
								unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId),
								unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
						addToArrays(node, startSlot, startSlot + len, startIndexArr, endIndexArr,
								startSizeArr, endSizeArr);
					}
				} else {
					ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
					addToArrays(node, startSlot, startSlot + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
				}
			}
			unknownWordEndIndex = startIndex + unknownWordLength;
		}

		ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
		// EOS goes into endIndexArr at index 0
		addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr);

		return new ViterbiNode[][][] { startIndexArr, endIndexArr };
	}

	/**
	 * Look up user dictionary words in the input text and register each match in the
	 * lattice arrays as a node of type {@code USER}.
	 *
	 * TONIXY: changed from private to protected so that subclasses can access it.
	 *
	 * @param text input text
	 * @param startIndexArr nodes grouped by start index
	 * @param endIndexArr nodes grouped by end index
	 * @param startSizeArr number of nodes stored per start index
	 * @param endSizeArr number of nodes stored per end index
	 */
	protected void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
		// Each match is read as {word id, start offset in text, length}
		for (int[] match : userDictionary.lookup(text)) {
			final int wordId = match[0];
			final int begin = match[1];
			final int end = begin + match[2];

			String surface = text.substring(begin, end);
			ViterbiNode userNode = new ViterbiNode(wordId, surface, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), begin, Type.USER);

			// Lattice indices are shifted by one to leave room for BOS
			addToArrays(userNode, begin + 1, end + 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
		}
	}

	/**
	 * Add node to both lattice arrays and increment the matching counters.
	 *
	 * TONIXY: changed from private to protected so that subclasses can access it.
	 *
	 * @param node node to store
	 * @param startIndex slot of {@code startIndexArr} the node is stored in
	 * @param endIndex slot of {@code endIndexArr} the node is stored in
	 * @param startIndexArr nodes grouped by start index
	 * @param endIndexArr nodes grouped by end index
	 * @param startSizeArr number of nodes stored per start index
	 * @param endSizeArr number of nodes stored per end index
	 */
	protected void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
		if (printsNodes) {
			printNode(node, startIndex, endIndex);
		}

		final int startCount = startSizeArr[startIndex];
		final int endCount = endSizeArr[endIndex];

		// Allocate a slot the first time it is used
		if (startCount == 0) {
			startIndexArr[startIndex] = new ViterbiNode[10];
		}
		if (endCount == 0) {
			endIndexArr[endIndex] = new ViterbiNode[10];
		}

		// Start side: grow when full, then store and count
		if (startIndexArr[startIndex].length <= startCount) {
			startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
		}
		// End side: grow when full
		if (endIndexArr[endIndex].length <= endCount) {
			endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
		}

		startIndexArr[startIndex][startCount] = node;
		endIndexArr[endIndex][endCount] = node;

		startSizeArr[startIndex] = startCount + 1;
		endSizeArr[endIndex] = endCount + 1;
	}

	/**
	 * Debug helper: print one line describing a node that is being added to the lattice.
	 *
	 * @param node node being added
	 * @param startIndex start slot of the node
	 * @param endIndex end slot of the node
	 */
	private void printNode(ViterbiNode node, int startIndex, int endIndex) {
		// Pick the dictionary the node came from; stays null when it cannot be determined
		Dictionary source = null;
		try {
			switch (node.getType()) {
			case KNOWN:
				source = dictionary;
				break;
			case UNKNOWN:
				source = unkDictionary;
				break;
			case USER:
				source = userDictionary;
				break;
			}
		} catch (Exception ignored) {
			// Best effort only: debug output falls back to placeholders below.
		}

		final int wordId = node.getWordId();
		String[] features;
		String pos;
		if (source == null) {
			features = new String[] { "?" };
			pos = "";
		} else {
			features = source.getAllFeaturesArray(wordId);
			pos = source.getPartOfSpeech(wordId);
		}

		String lastFeature = features[features.length - 1];
		System.out.println(" * " + node.getSurfaceForm() + " <" + lastFeature + ">【" + pos + "】(" + startIndex
				+ ", " + endIndex + ") " + node.getWordCost());
	}


	/**
	 * Create an array with double the capacity of the given one and copy the
	 * existing elements into its front.
	 *
	 * @param array array that has run out of room
	 * @return a new array of length {@code array.length * 2} starting with the elements of {@code array}
	 */
	private ViterbiNode[] extendArray(ViterbiNode[] array) {
		final int oldLength = array.length;
		ViterbiNode[] grown = new ViterbiNode[oldLength * 2];
		System.arraycopy(array, 0, grown, 0, oldLength);
		return grown;
	}

	/**
	 * Replace the user dictionary. TONIXY: added by Tonii.
	 *
	 * @param userDictionary new user dictionary; passing {@code null} switches user dictionary processing off
	 */
	public void setUserDictionary(UserDictionary userDictionary) {
		this.userDictionary = userDictionary;
		// The dictionary is consulted only while one is present
		this.useUserDictionary = (userDictionary != null);
	}
}