package com.antbrains.wordseg;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;
import java.util.Map.Entry;
import com.antbrains.crf.hadoop.FileTools;
import com.antbrains.datrie.DoubleArrayTrie;
/**
 * Reverse maximum matching (RMM) word segmenter.
 *
 * <p>Dictionary words are stored reversed in a {@link DoubleArrayTrie}. A sentence to be
 * segmented is reversed as well, so a forward lookup in the trie over the reversed text
 * matches words from the end of the original sentence towards its beginning.
 *
 * @author lili
 */
public class RMMSeg {
    /** Chinese numeral characters that may be merged into a single number token. */
    private static final String CN_NUMBERS = "零一二三四五六七八九十百千万亿";

    /** Trie holding the dictionary words in reversed character order. */
    private final DoubleArrayTrie trie;

    /**
     * When {@code true}, a run of two or more consecutive Chinese numeral characters is emitted
     * as one token, provided the run is longer than the dictionary match at that position.
     * Arabic digits are not handled here; per the original author's note they have already
     * been processed by the upstream Lucene analyzer.
     */
    public boolean processNumber = false;

    /**
     * Creates a segmenter backed by an existing trie.
     *
     * @param trie trie to look words up in; the other constructors fill it with reversed
     *             words, so a trie passed here is expected to be built the same way
     */
    public RMMSeg(DoubleArrayTrie trie) {
        this.trie = trie;
    }

    /**
     * Creates a segmenter from a list of dictionary words.
     *
     * <p>Each word is trimmed; {@code null} entries and words shorter than two characters
     * after trimming are skipped. Every remaining word is inserted reversed with value 0.
     *
     * @param wordList dictionary words
     */
    public RMMSeg(List<String> wordList) {
        trie = new DoubleArrayTrie();
        for (String word : wordList) {
            if (word == null) {
                continue;
            }
            String trimmed = word.trim();
            if (trimmed.length() < 2) {
                continue;
            }
            trie.coverInsert(reverseString(trimmed), 0);
        }
    }

    /**
     * Creates a segmenter from a dictionary file read as UTF-8.
     *
     * <p>The lines of the file are handled exactly like the entries passed to
     * {@link #RMMSeg(List)}. If the file cannot be read, the stack trace is printed and the
     * segmenter is created with an empty dictionary.
     *
     * @param dictPath path of the dictionary file
     */
    public RMMSeg(String dictPath) {
        // Delegating guarantees the trie is always initialised and that file-based
        // dictionaries get the same trimming/filtering as list-based ones.
        this(readWords(dictPath));
    }

    /** Reads the dictionary file, returning an empty list when reading fails. */
    private static List<String> readWords(String dictPath) {
        try {
            return FileTools.readFile2List(dictPath, "UTF-8");
        } catch (IOException e) {
            // Best effort, as in the original code: report the failure and continue
            // with an empty dictionary instead of aborting construction.
            e.printStackTrace();
            return new ArrayList<String>();
        }
    }

    /**
     * Reverses a string one {@code char} at a time.
     *
     * <p>The reversal is deliberately done per UTF-16 code unit so that index {@code i} in the
     * result always corresponds to index {@code length - 1 - i} in the input; {@link #seg}
     * relies on this when converting offsets back.
     */
    private static String reverseString(String s) {
        StringBuilder sb = new StringBuilder(s.length());
        for (int i = s.length() - 1; i >= 0; i--) {
            sb.append(s.charAt(i));
        }
        return sb.toString();
    }

    /**
     * Segments a sentence by reverse maximum matching.
     *
     * <p>Tokens are created as {@code new Token(sentence, start, end)}, where {@code start}
     * and {@code end} are offsets into the original (non-reversed) sentence.
     *
     * @param sentence text to segment; {@code null} is treated like an empty string
     * @return the tokens in left-to-right sentence order; empty for a {@code null} or empty
     *         sentence
     */
    public List<Token> seg(String sentence) {
        List<Token> tokens = new ArrayList<Token>();
        if (sentence == null) {
            return tokens;
        }
        String s = reverseString(sentence);
        int length = sentence.length();

        // Scanning the reversed text yields tokens from the end of the sentence to its start.
        List<Token> backward = new ArrayList<Token>();
        for (int i = 0; i < length; i++) {
            // The first element of the lookup result is used as the length of the
            // dictionary match starting at position i of the reversed text.
            int len = trie.find(s, i)[0];

            if (processNumber && CN_NUMBERS.indexOf(s.charAt(i)) >= 0) {
                // Handle numbers written with Chinese characters; Arabic digits were
                // already handled by the upstream Lucene analyzer.
                int j = i + 1;
                while (j < length && CN_NUMBERS.indexOf(s.charAt(j)) >= 0) {
                    j++;
                }
                int run = j - i;
                // Prefer the numeral run only if it beats the dictionary match.
                if (run > 1 && run > len) {
                    backward.add(new Token(sentence, length - j, length - i));
                    i = j - 1; // the loop increment moves past the run
                    continue;
                }
            }

            if (len > 1) {
                backward.add(new Token(sentence, length - i - len, length - i));
                i += len - 1; // the loop increment moves past the matched word
            } else {
                // No multi-character match: emit the single character.
                backward.add(new Token(sentence, length - i - 1, length - i));
            }
        }

        // Restore left-to-right order.
        for (int k = backward.size() - 1; k >= 0; k--) {
            tokens.add(backward.get(k));
        }
        return tokens;
    }
}