package jp.ac.waseda.info.kake.moca.viterbi;
import jp.ac.waseda.info.kake.moca.MocaTokenizer;
import jp.ac.waseda.info.kake.moca.MocaTokenizer.MocaMode;
import jp.ac.waseda.info.kake.moca.syllable.SyllabifiedString;
import jp.ac.waseda.info.kake.moca.syllable.Syllable;
import jp.ac.waseda.info.kake.moca.viterbi.wordcost.AbstractWordCostAdjuster;
import jp.ac.waseda.info.kake.moca.viterbi.wordcost.KanaWordCostAdjuster;
import jp.ac.waseda.info.kake.moca.viterbi.wordcost.LevenshteinWordCostAdjuster;
import jp.ac.waseda.info.kake.moca.viterbi.wordcost.BaseWordCostAdjuster;
import jp.ac.waseda.info.kake.moca.viterbi.wordcost.MultipliedWordCostAdjuster;
import jp.ac.waseda.info.kake.moca.viterbi.wordcost.UnknownWordCostAdjuster;
import jp.ac.waseda.info.kake.string.KanaConverter;
import jp.ac.waseda.info.kake.string.StringSizeConverter;
import org.atilika.kuromoji.dict.ConnectionCosts;
import org.atilika.kuromoji.dict.TokenInfoDictionary;
import org.atilika.kuromoji.dict.UnknownDictionary;
import org.atilika.kuromoji.dict.UserDictionary;
import org.atilika.kuromoji.trie.DoubleArrayTrie;
import org.atilika.kuromoji.viterbi.Viterbi;
import org.atilika.kuromoji.viterbi.ViterbiNode;
import org.atilika.kuromoji.viterbi.ViterbiNode.Type;
/**
* ViterbiをMoCA用に改変したものです。SyllabifiedStringを用い、音節変形を考慮した形態素解析を行います。
*
* @author Tony
*
*/
public class MocaViterbi extends Viterbi {
private MocaMode mode;
/**
* 音節連結をした場合のコスト倍率
*/
private static final int AMPLIFICATION_CONNECT = 2;
private static enum PrefixType {
LITERARY(new BaseWordCostAdjuster()), SYLLABIFIED(new LevenshteinWordCostAdjuster()), KANACONVERTED(
new KanaWordCostAdjuster()), SYLLABIFIED_AND_KANACONVERTED(new MultipliedWordCostAdjuster(
SYLLABIFIED.adjuster, KANACONVERTED.adjuster));
public final AbstractWordCostAdjuster adjuster;
private PrefixType(AbstractWordCostAdjuster adjuster) {
this.adjuster = adjuster;
}
};
private static UnknownWordCostAdjuster unkAdjuster = new UnknownWordCostAdjuster();
public MocaViterbi(DoubleArrayTrie trie, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary,
ConnectionCosts costs, UserDictionary userDictionary, MocaMode mode) {
super(trie, dictionary, unkDictionary, costs, userDictionary, MocaTokenizer.kuroMode,
MocaTokenizer.unknownFixMode, MocaTokenizer.convertsSize);
this.mode = mode;
}
@Override
public ViterbiNode[][][] build(String text) {
int textLength = text.length();
// startIndexArr[n]には、textのn文字目(1<=n<=text.length)から始まるViterbiNodeが入ってる。[0][0]にはBOS,[text.length()+1][0]にはEOS
ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][];
// endIndexArr[n]には、textのn-1文字目(2<=n<=text.length()+1)で終わるViterbiNodeが入ってる。
ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][];
// startSizeArr[n]は、startIndexArr[n]の長さ
int[] startSizeArr = new int[textLength + 2];
// endSizeArr[n]は、endIndexArr[n]の長さ
int[] endSizeArr = new int[textLength + 2];
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
// Process user dictionary;
// 先にユーザ辞書の単語をすべて登録
if (usesUserDictionary()) {
processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
SyllabifiedString syls = new SyllabifiedString(text);
int unknownWordEndIndex = -1; // index of the last character of unknown
// word
int katakanaIndex = -1;// カタカナ語登録に使用
// 開始文字位置のループ
for (int startIndex = 0; startIndex < textLength; startIndex++) {
// If no token ends where current token starts, skip this index
if (endSizeArr[startIndex + 1] == 0) {
continue;
}
int result;
boolean alreadyFound = false; // この開始文字位置で既に単語が見つかっているか否か
// 終了文字位置のループ
for (int endIndex = startIndex + 1; endIndex <= textLength; endIndex++) {
boolean found = false; // 一致があるかどうか
boolean prefixFound = false; // 前方一致があるかどうか
boolean readingPrefixFound = false; // 読み仮名検索による前方一致があるかどうか
String[] prefixes;
if (mode.analysesSyllables)
prefixes = syls.substrings(startIndex, endIndex); // startIndex~endIndexの部分文字列になる
else
prefixes = new String[] { text.substring(startIndex, endIndex) };
// 変換候補のループ
for (String prefix : prefixes) {
prefix = StringSizeConverter.getFullString(prefix);
PrefixType prefixType = PrefixType.SYLLABIFIED;
if (prefix.equals(prefixes[0]))
prefixType = PrefixType.LITERARY;
result = trie.lookup(prefix);
if (result >= 0) {
readingPrefixFound = true;
if (!KanaConverter.isKatakana(prefix))
prefixFound = true;
}
// 表層表現がヒットしなかったら、読み検索
if (mode.analysesReading && result <= 0) {
result = trie.lookup(KanaConverter.getKatakana(prefix));
if (result >= 0) {
readingPrefixFound = true;
if (prefixType == PrefixType.SYLLABIFIED)
prefixType = PrefixType.SYLLABIFIED_AND_KANACONVERTED;
else
prefixType = PrefixType.KANACONVERTED;
}
}
if (result > 0) { // Found match in double array trie
for (int wordId : dictionary.lookupWordIds(result)) {
String[] features = dictionary.getAllFeaturesArray(wordId);
// 読み検索用の単語でないかチェック
PrefixType wordType = prefixType;
if (!prefix.equals(features[features.length - 1])) {
if (wordType == PrefixType.SYLLABIFIED)
wordType = PrefixType.SYLLABIFIED_AND_KANACONVERTED;
else if (wordType == PrefixType.LITERARY)
wordType = PrefixType.KANACONVERTED;
}
// MoCAの設定が読み仮名検索を許可しているかチェック
if ((wordType == PrefixType.KANACONVERTED || wordType == PrefixType.SYLLABIFIED_AND_KANACONVERTED)
&& !mode.analysesReading) {
continue;
}
found = true;
prefixFound = true;
readingPrefixFound = true;
alreadyFound = true;
// 形態素の表層表現設定
String surface = prefixes[0];
if (!mode.returnsColloquial) {
surface = features[features.length - 1];
}
ViterbiNode node = new ViterbiNode(wordId, surface, dictionary.getLeftId(wordId),
dictionary.getRightId(wordId), wordType.adjuster.getWordCost(prefixes[0], prefix,
dictionary, wordId), startIndex, Type.KNOWN);
addToArrays(node, startIndex + 1, endIndex + 1, startIndexArr, endIndexArr, startSizeArr,
endSizeArr);
// 音節連結処理
connect: if (mode.analysesSyllables && endIndex < textLength) {
Syllable before = syls.getSyllable(endIndex - 1);
Syllable next = syls.getSyllable(endIndex);
check: switch (next.type) {
case Syllable.VOWEL:
// 漢字音節後に母音音節が続くとき、漢字音節の読みによって挙動を変化させる
if (before.type != Syllable.KANJI || before.endsWithNasal()
|| before.endsWithLong())
break connect;// 漢字直後に撥音がついている場合は連結処理しない
// TODO かなの時に合わせ、長音記号が挟まる場合連結してない。これでいい?
Syllable[] reading = Syllable.createSyllables(dictionary.getReading(wordId));
Syllable readLast = reading[reading.length - 1];
if (readLast.endsWithNasal())
break connect;// 万が一、読みの最後に撥音がついている場合は連結処理しない
switch (readLast.getVowel()) {
case Syllable.UNREADABLE:
break connect;
case 'e':
if (next.getVowel() == 'e')
break;
// 漢字の読みの最後音節がエ/オ母音かつ長音節である場合のみ、イ/ウ母音への連結を許す
if ((before.endsWithLong() || readLast.endsWithLong())
&& next.getVowel() == 'i')
// beforeは、漢字直後に長音記号が来た時も長音節であると扱うために必要
break;
break connect;
case 'o':
if ((before.endsWithLong() || readLast.endsWithLong())
&& next.getVowel() == 'u')
break;
case 'u':
case 'i':
case 'a':
if (next.getVowel() == readLast.getVowel())
break;
default:
break connect;
}
break check;
case Syllable.LONG:
// LONGは、記号以外のものなら繋がりうる。記号以外にLONG音節が作られる条件=直前に撥促音があること
if (before.endsWithNasal())
break check;
break connect;
case Syllable.NASAL:
// NASALは、記号以外のものなら繋がりうる。記号以外にNASAL音節が作られる条件=直前に長音があること
// TODO どっちをとる?
// 今のルールは顔文字内の「っ」が記号とくっつかなくなるが、「、。」等の後のが省略されない
// break check;
if (before.endsWithLong())
break check;
break connect;
default:
break connect;
}
// コスト計算、表層表現を修正する必要がある
String substring = text.substring(startIndex, endIndex + next.length);
surface = substring;
if (!mode.returnsColloquial) {
surface = features[features.length - 1];
}
node = new ViterbiNode(wordId, surface, dictionary.getLeftId(wordId),
dictionary.getRightId(wordId), prefixType.adjuster.getWordCost(substring,
prefix, dictionary, wordId) * AMPLIFICATION_CONNECT, startIndex,
Type.KNOWN);
addToArrays(node, startIndex + 1, endIndex + next.length + 1, startIndexArr,
endIndexArr, startSizeArr, endSizeArr);
}
// ここまで、漢字音節→母音音節の連結処理
}
}
}
// 前方一致すらない場合は、未知語処理して終了文字位置ループを抜ける
if (!(found || (prefixFound && endIndex < textLength))) {
if (mode.analysesColloquial && !alreadyFound)
// MoCA版未知語処理。
for (int wordId : unkDictionary
.lookupWordIds(characterDefinition.lookup(prefixes[0].charAt(0))))
for (int i = 1; i <= prefixes[0].length(); i++) {
ViterbiNode node = new ViterbiNode(wordId, prefixes[0].substring(0, i),
unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId),
unkDictionary.getWordCost(wordId) * i, startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, startIndex + i + 1, startIndexArr, endIndexArr,
startSizeArr, endSizeArr);
}
if (!readingPrefixFound)
break;
}
}
// 辞書にないカタカナ語を無条件で未知語として追加
// 全体を登録するパターン
k: if (KanaConverter.isKatakana(text.charAt(startIndex))
&& (!KanaConverter.isSpecialCharacter(text.charAt(startIndex))) && startIndex >= katakanaIndex) {
katakanaIndex = startIndex + 1;
while (katakanaIndex < textLength && KanaConverter.isKatakana(text.charAt(katakanaIndex))) {
katakanaIndex++;
}
String katakana = text.substring(startIndex, katakanaIndex);
result = trie.lookup(katakana);
// 辞書に一致する単語があったら登録しない。読み検索用の単語なら見つかっても無視。
if (result > 0) {
for (int wordId : dictionary.lookupWordIds(result)) {
String[] features = dictionary.getAllFeaturesArray(wordId);
if (katakana.equals(features[features.length - 1]))
break k;
}
}
for (int wordId : unkDictionary.lookupWordIds(characterDefinition.lookup(katakana.charAt(0)))) {
ViterbiNode node = new ViterbiNode(wordId, katakana, unkDictionary.getLeftId(wordId),
unkDictionary.getRightId(wordId), unkAdjuster.getWordCost(katakana, katakana,
unkDictionary, wordId), startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, katakanaIndex + 1, startIndexArr, endIndexArr, startSizeArr,
endSizeArr);
}
}
// 全体を登録するパターン ここまで
/*
//部分文字列全部登録するパターン
katakanaIndex = startIndex;
w: while (katakanaIndex < textLength)
if (KanaConverter.isKatakana(text.charAt(katakanaIndex))) {
katakanaIndex++;
String katakana = text.substring(startIndex, katakanaIndex);
result = trie.lookup(katakana);
// 辞書に一致する単語があったら登録しない。読み検索用の単語なら見つかっても無視。
if (result > 0) {
for (int wordId : dictionary.lookupWordIds(result)) {
String[] features = dictionary.getAllFeaturesArray(wordId);
if (katakana.equals(features[features.length - 1]))
continue w;
}
}
for (int wordId : unkDictionary.lookupWordIds(characterDefinition.lookup(katakana.charAt(0)))) {
ViterbiNode node = new ViterbiNode(wordId, katakana, unkDictionary.getLeftId(wordId),
unkDictionary.getRightId(wordId), unkAdjuster.getWordCost(katakana, katakana,
unkDictionary, wordId), startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, katakanaIndex + 1, startIndexArr, endIndexArr, startSizeArr,
endSizeArr);
}
} else
break;
//部分文字列全部登録するパターン ここまで
*/
// Kuromoji版未知語処理。コスト以外は完全にViterbiのコピペ
// TODO Kuromoji版未知語処理を常に使うべきか決める
// if (true) {
if (!mode.analysesColloquial) {
if (unknownWordEndIndex > startIndex) {
continue;
}
// Process Unknown Word
int unknownWordLength = 0;
String suffix = text.substring(startIndex);
char firstCharacter = suffix.charAt(0);
boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
if (isInvoke) { // Process "invoke"
unknownWordLength = unkDictionary.lookup(suffix);
} else if (alreadyFound == false) { // Process not "invoke"
unknownWordLength = unkDictionary.lookup(suffix);
}
if (unknownWordLength > 0) { // found unknown word
String unkWord = suffix.substring(0, unknownWordLength);
int characterId = characterDefinition.lookup(firstCharacter);
int[] wordIds = unkDictionary.lookupWordIds(characterId);
for (int wordId : wordIds) {
if (unknownFixMode)
for (int i = 1; i <= unknownWordLength; i++) {
ViterbiNode node = new ViterbiNode(wordId, unkWord.substring(0, i),
unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId),
unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + i, startIndexArr, endIndexArr,
startSizeArr, endSizeArr);
}
else {
ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId),
unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex,
Type.UNKNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr,
endIndexArr, startSizeArr, endSizeArr);
}
}
unknownWordEndIndex = startIndex + unknownWordLength;
}
}
}
ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
return new ViterbiNode[][][] { startIndexArr, endIndexArr };
}
@Override
protected void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr,
ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
/*
String[] features = { "?" };
try {
switch (node.getType()) {
case KNOWN:
features = dictionary.getAllFeaturesArray(node.getWordId());
case UNKNOWN:
features = unkDictionary.getAllFeaturesArray(node.getWordId());
case USER:
features = userDictionary.getAllFeaturesArray(node.getWordId());
}
} catch (Exception e) {
}
System.out.println(" * " + node.getSurfaceForm() + " <" + features[features.length - 1] + "> (" + startIndex
+ ", " + endIndex + ") " + node.getWordCost());
*/
// ↑を有効にすると、単語候補が一覧される
super.addToArrays(node, startIndex, endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
/**
* MoCAモードの切り替えを行います。
*
* @param mode
*/
public void setMocaMode(MocaMode mode) {
this.mode = mode;
}
}