//The MIT License // //Copyright (c) 2009 nodchip // //Permission is hereby granted, free of charge, to any person obtaining a copy //of this software and associated documentation files (the "Software"), to deal //in the Software without restriction, including without limitation the rights //to use, copy, modify, merge, publish, distribute, sublicense, and/or sell //copies of the Software, and to permit persons to whom the Software is //furnished to do so, subject to the following conditions: // //The above copyright notice and this permission notice shall be included in //all copies or substantial portions of the Software. // //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, //FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE //AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER //LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, //OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN //THE SOFTWARE. package tv.dyndns.kishibe.qmaclone.server.relevance; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; import tv.dyndns.kishibe.qmaclone.server.util.Normalizer; import com.google.common.collect.Lists; public class Trie { public interface Factory { Trie create(); } private int[][] array; /** * Trie木を作成する * * @param words *  Trie木に挿入する単語 * @throws Exception */ public void build(Collection<String> words) { // Trie木に単語を挿入する List<Map<Integer, Integer>> trie = Lists.newArrayList(); trie.add(new TreeMap<Integer, Integer>()); int wordIndex = 0; for (String word : words) { addWord(word, trie, wordIndex++); } // Trie木を2次元配列に圧縮する // array[Trie木のノード番号][偶数:次の文字 奇数:次のノード番号] // 次の文字が0の場合は単語の終端を表す。次のノード番号には単語のインデックスが入る int[][] array = new int[trie.size()][]; for (int nodeIndex = 0; nodeIndex < trie.size(); ++nodeIndex) { array[nodeIndex] = new int[trie.get(nodeIndex).size() * 2]; int leafIndex = 0; for (Entry<Integer, Integer> entry : trie.get(nodeIndex).entrySet()) { array[nodeIndex][leafIndex++] = entry.getKey(); array[nodeIndex][leafIndex++] = entry.getValue(); } } this.array = array; } private static boolean isValidWord(String word) { int length = word.length(); if (length <= 1) { return false; } // ○のみからなる文字列は無視 boolean isCircles = true; for (int i = 0; i < length && isCircles; ++i) { isCircles = word.charAt(i) == '○'; } if (isCircles) { return false; } return true; } /** * Trie木に単語を挿入する * * @param word * 単語 * @param trie * Trie木 */ private void addWord(String word, List<Map<Integer, Integer>> trie, int wordIndex) { if (!isValidWord(word)) { return; } word = Normalizer.normalize(word); int currentNode = 0; for (char c : word.toCharArray()) { if (trie.get(currentNode).containsKey(c & 0xffff)) { currentNode = trie.get(currentNode).get(c & 0xffff); } else { trie.get(currentNode).put(c & 0xffff, trie.size()); currentNode = trie.size(); trie.add(new TreeMap<Integer, Integer>()); } } trie.get(currentNode).put(0, wordIndex); } /** * 文字列をパースし、含まれている単語を抽出する。vitabiアルゴリズムのようなもの * * @param string * 文字列 * @param words * 含まれている単語のインデックス */ public void parse(String string, List<Integer> words, List<Integer> offsets, List<Integer> lengths) { string = Normalizer.normalize(string); char[] s = string.toCharArray(); int n = s.length; int[] memo = new int[n + 1]; int[] edge = new int[n + 1]; int[] wordIndexes = new int[n + 1]; // 動的計画法 for (int startIndex = 0; startIndex < n; ++startIndex) { // 一致する文字が無い場合は何もせずに1マス進む if (memo[startIndex + 1] < memo[startIndex]) { memo[startIndex + 1] = memo[startIndex]; edge[startIndex + 1] = 0; } int nodeIndex = 0; for (int currentIndex = startIndex; currentIndex < n; ++currentIndex) { // 葉が無かったら if (array[nodeIndex].length == 0) { break; } // 終端文字が見つかったら if (array[nodeIndex][0] == 0 && currentIndex - startIndex != 1) { int cost = memo[startIndex] + (currentIndex - startIndex) * (currentIndex - startIndex); if (memo[currentIndex] < cost) { memo[currentIndex] = cost; edge[currentIndex] = currentIndex - startIndex; // 単語インデックスを記録する wordIndexes[currentIndex] = array[nodeIndex][1]; } } // 次の葉に移動する int c = s[currentIndex] & 0xffff; int l = 0; int r = array[nodeIndex].length / 2; while (l + 1 < r) { int m = (l + r) >> 1; // lowerBound if (array[nodeIndex][m * 2] <= c) { l = m; } else { r = m; } } if (l == array[nodeIndex].length / 2 || array[nodeIndex][l * 2] != c) { break; } nodeIndex = array[nodeIndex][l * 2 + 1]; } } // 動的計画法のトラックバック int currentIndex = n; while (currentIndex > 0) { if (edge[currentIndex] == 0) { currentIndex--; } else { if (words != null) { words.add(wordIndexes[currentIndex]); } if (lengths != null) { lengths.add(edge[currentIndex]); } currentIndex -= edge[currentIndex]; if (offsets != null) { offsets.add(currentIndex); } } } if (words != null) { Collections.reverse(words); } if (offsets != null) { Collections.reverse(offsets); } if (lengths != null) { Collections.reverse(lengths); } } public void save(File file) throws FileNotFoundException, IOException { try (ObjectOutputStream outputStream = new ObjectOutputStream(new DeflaterOutputStream( new BufferedOutputStream(new FileOutputStream(file))))) { outputStream.writeObject(array); } } public void load(File file) throws FileNotFoundException, IOException, ClassNotFoundException { try (ObjectInputStream inputStream = new ObjectInputStream(new InflaterInputStream( new BufferedInputStream(new FileInputStream(file))))) { array = (int[][]) inputStream.readObject(); } } }