package tv.dyndns.kishibe.qmaclone.server.relevance;

import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import tv.dyndns.kishibe.qmaclone.server.util.Normalizer;

import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
import com.google.common.collect.Sets;
import com.google.inject.Inject;

/**
 * Finds every dictionary word that occurs in a sentence, using a trie with failure links
 * (Aho-Corasick style multi-pattern matching).
 *
 * <p>The automaton is built lazily on the first call to {@link #segment(String)} from the union
 * of the words of all injected dictionaries. The lazy initialization is not synchronized.
 */
public class PatternMatchingAutomaton {

  /** One state of the automaton. */
  private static final class PMA {
    /** Trie edges only; failure links are deliberately kept out of this map. */
    final Map<Character, PMA> next = Maps.newHashMap();
    /** Words that end at this state, including those inherited through the failure link. */
    final List<String> accept = Lists.newArrayList();
    /**
     * State to fall back to when no edge matches the next character. Not part of
     * {@link #toString()} because failure links make the graph cyclic.
     */
    PMA failure;

    @Override
    public String toString() {
      return MoreObjects.toStringHelper(this).add("next", next).add("accept", accept).toString();
    }
  }

  private static final Logger logger = Logger.getLogger(PatternMatchingAutomaton.class.getName());
  private final Set<Dictionary> dictionaries;
  /** Root of the automaton; {@code null} until the first call to {@link #segment(String)}. */
  private PMA pma;

  /**
   * @param dictionaries sources of the words to search for; must not be {@code null}
   */
  @Inject
  public PatternMatchingAutomaton(Set<Dictionary> dictionaries) {
    this.dictionaries = Preconditions.checkNotNull(dictionaries);
  }

  /**
   * Returns all dictionary words occurring in the given sentence.
   *
   * <p>The sentence is normalized and stripped of spaces before matching. Matches are reported
   * in order of the position at which they end; overlapping and nested matches are all reported.
   *
   * @param sentence text to scan
   * @return the matched words, possibly with repetitions; empty if nothing matches
   */
  public List<String> segment(String sentence) {
    if (pma == null) {
      Set<String> words = Sets.newHashSet();
      for (Dictionary dictionary : dictionaries) {
        words.addAll(dictionary.getWords());
      }
      pma = build(words);
    }

    // Literal replacement, no regex needed.
    // NOTE(review): the original ran the same ASCII-space replaceAll twice; the second pass was
    // presumably meant for the ideographic space (U+3000) - confirm against the repository copy.
    String text = Normalizer.normalize(sentence).replace(" ", "").replace("\u3000", "");

    final PMA root = pma;
    List<String> found = Lists.newArrayList();
    PMA state = root;
    for (int i = 0; i < text.length(); ++i) {
      Character c = text.charAt(i);
      // Fall back along failure links until some state can consume c, or we are at the root.
      while (state != root && !state.next.containsKey(c)) {
        state = state.failure;
      }
      PMA target = state.next.get(c);
      // A character with no edge even at the root is skipped by staying at the root.
      state = (target != null) ? target : root;
      found.addAll(state.accept);
    }
    return found;
  }

  /**
   * Builds the automaton for the given words.
   *
   * @param words patterns to recognize
   * @return the root state
   */
  private PMA build(Set<String> words) {
    PMA root = new PMA();

    // Phase 1: insert every word into the trie.
    int counter = 0;
    for (String word : words) {
      if (++counter % 10000 == 0) {
        logger.log(Level.INFO, "trie: " + counter);
      }
      PMA node = root;
      for (int i = 0; i < word.length(); ++i) {
        Character c = word.charAt(i);
        PMA child = node.next.get(c);
        if (child == null) {
          child = new PMA();
          node.next.put(c, child);
        }
        node = child;
      }
      node.accept.add(word);
    }

    // Phase 2: compute failure links breadth-first, so that a state's failure target (always
    // strictly shallower) is complete before the state itself is processed. Edges are taken
    // from each state's own map, so any character is supported, not just 'a'..'z'.
    Deque<PMA> queue = Queues.newArrayDeque();
    root.failure = root;
    for (PMA child : root.next.values()) {
      child.failure = root;
      queue.addLast(child);
    }

    counter = 0;
    while (!queue.isEmpty()) {
      if (++counter % 10000 == 0) {
        logger.log(Level.INFO, "failure link: " + counter);
      }
      PMA node = queue.removeFirst();
      for (Map.Entry<Character, PMA> edge : node.next.entrySet()) {
        Character c = edge.getKey();
        PMA child = edge.getValue();
        queue.addLast(child);

        PMA fallback = node.failure;
        while (fallback != root && !fallback.next.containsKey(c)) {
          fallback = fallback.failure;
        }
        PMA target = fallback.next.get(c);
        child.failure = (target != null) ? target : root;
        // Every word ending at the failure target is a suffix of the path to child.
        child.accept.addAll(child.failure.accept);
      }
    }
    return root;
  }
}