package com.antbrains.wordseg;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import com.antbrains.crf.BESB1B2MTagConvertor;
import com.antbrains.crf.CrfModel;
import com.antbrains.crf.SgdCrf;
import com.antbrains.crf.TagConvertor;
import com.antbrains.crf.hadoop.FileTools;
import com.antbrains.wordseg.Token.Type;
import com.antbrains.wordseg.luceneanalyzer.CharTermAttribute;
import com.antbrains.wordseg.luceneanalyzer.OffsetAttribute;
import com.antbrains.wordseg.luceneanalyzer.StandardTokenizer;
import com.antbrains.wordseg.luceneanalyzer.TypeAttribute;
import com.antbrains.wordseg.luceneanalyzer.Version;
public class ChineseSegmenter {
private ChineseSegmenter(){
try{
InputStream is = ChineseSegmenter.class.getResourceAsStream("/crf.model");
if(is==null) throw new RuntimeException("can't find /crf.model");
model=SgdCrf.loadModel(is);
is.close();
is = ChineseSegmenter.class.getResourceAsStream("/segdict.txt");
List<String> words=FileTools.read2List(is, "UTF8");
this.mmseg=new MMSeg(words);
this.rmmseg=new RMMSeg(words);
}catch(Exception e){
}
}
private static ChineseSegmenter instance;
static{
instance=new ChineseSegmenter();
}
public static ChineseSegmenter getInstance(){
return instance;
}
public MMSeg getMmseg() {
return mmseg;
}
public RMMSeg getRmmseg() {
return rmmseg;
}
private MMSeg mmseg;
private RMMSeg rmmseg;
private TagConvertor tc = new BESB1B2MTagConvertor();
private CrfModel model;
public CrfModel getModel() {
return model;
}
public List<List<Token>> processByLuceneAnalyzer(String sen) {
List<List<Token>> result = new ArrayList<List<Token>>();
StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_29, new StringReader(sen));
CharTermAttribute termAtt = (CharTermAttribute) tokenizer.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.addAttribute(OffsetAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) tokenizer.addAttribute(TypeAttribute.class);
List<Token> subSen = new ArrayList<Token>();
try {
int lastPos = 0;
boolean lastIsCn = false;
while (tokenizer.incrementToken()) {
int start = offsetAtt.startOffset();
int end = offsetAtt.endOffset();
if (lastPos < start) {// 被StandardAnalyzer扔掉的都认为是标点,不用参与分词
for (int i = lastPos; i < start; i++) {
if (subSen.size() > 0) {
result.add(subSen);
}
subSen = new ArrayList<Token>();
subSen.add(new Token(null, sen, i, i + 1, Type.PUNCT));
}
lastIsCn = false;
}
lastPos = end;
String wordType = typeAtt.type();
Token token = new Token(sen, start, end);
if (wordType.equals("<IDEOGRAPHIC>")) {// 汉字
token.setType(Type.CWORD);
if (!lastIsCn) {
if (subSen.size() > 0) {
result.add(subSen);
subSen = new ArrayList<Token>();
}
lastIsCn = true;
}
} else {
lastIsCn = false;
if (subSen.size() > 0) {
result.add(subSen);
}
subSen = new ArrayList<Token>();
if (wordType.equals("<ALPHANUM>")) {
token.setType(Type.ALPHA);
} else if (wordType.equals("<NUM>")) {
token.setType(Type.NUMBER);
}
}
subSen.add(token);
}
if (subSen.size() > 0) {
result.add(subSen);
}
} catch (IOException e) {
e.printStackTrace();
}
return result;
}
public List<Token> seg(String sen) {
List<Token> result = new ArrayList<Token>();
// 先用StandardAnalyzer处理一遍
List<List<Token>> subSens = this.processByLuceneAnalyzer(sen);
for (List<Token> subSen : subSens) {
if (subSen.size() < 2) {
result.addAll(subSen);
} else {
for (Token tk : this.segmentSentence(subSen)) {
result.add(tk);
}
}
}
return result;
}
private String tokens2String(List<Token> tokens) {
StringBuilder sb = new StringBuilder();
for (Token tk : tokens) {
sb.append(tk.getNormalizedText());
}
return sb.toString();
}
private boolean isEqual(List<Token> tks1, List<Token> tks2) {
if (tks1.size() != tks2.size())
return false;
for (int i = 0; i < tks1.size(); i++) {
if (tks1.get(i).getLength() != tks2.get(i).getLength()) {
return false;
}
}
return true;
}
private List<int[]> compareResult(List<Token> tks1, List<Token> tks2) {
List<int[]> result = new ArrayList<int[]>();
int i = 0;
int j = 0;
int pos1 = 0;
int pos2 = 0;
int lastAlignPos = 0;
int lastAlignI = 0;
int lastAlignJ = 0;
while (true) {
if (i >= tks1.size() || j >= tks2.size())
break;
Token tk1 = tks1.get(i);
Token tk2 = tks2.get(j);
if (tk1.getLength() == tk2.getLength()) {
i++;
pos1 += tk1.getLength();
j++;
pos2 += tk2.getLength();
continue;
}
// 出现没对齐的情况
lastAlignI = i;
lastAlignJ = j;
lastAlignPos = pos1;
// i++;
pos1 += tk1.getLength();
// j++;
pos2 += tk2.getLength();
while (pos1 != pos2) {
if (pos1 < pos2) {
i++;
// if(i==tks1.size()) break L;
pos1 += tks1.get(i).getLength();
} else {
j++;
// if(j==tks2.size()) break L;
pos2 += tks2.get(j).getLength();
}
}
if (pos1 != pos2)
break;
// 再次对齐
result.add(new int[] { lastAlignPos, pos1, lastAlignI, lastAlignJ, i, j });
i++;
j++;
}
if (i < tks1.size() || j < tks2.size()) {
System.err.println("Unexpected here ");
for (Token tk : tks1) {
System.out.print(tk.getOrigText() + " ");
}
System.out.println();
for (Token tk : tks2) {
System.out.print(tk.getOrigText() + " ");
}
System.out.println();
}
return result;
}
private List<Token> segmentSentence(List<Token> tokens) {
// 首先用MMSeg和RMMSeg分词,如果不一致,就用CRFs消歧
String s = this.tokens2String(tokens);
List<Token> tks1 = this.mmseg.seg(s);
List<Token> tks2 = this.rmmseg.seg(s);
if (this.isEqual(tks1, tks2)) {
return tks1;
}
List<int[]> diff = this.compareResult(tks1, tks2);
List<Token> result = new ArrayList<Token>();
int lastPos = 0;
for (int[] arr : diff) {
for (int i = lastPos; i < arr[2]; i++) {
result.add(tks1.get(i));
}
boolean hasLeftContext = arr[2] > 0;
boolean hasRightContext = arr[4] < tks1.size() - 2;
if (hasLeftContext) {
if (tks1.get(arr[2] - 1).getLength() != tks2.get(arr[3] - 1).getLength()) {
hasLeftContext = false;
}
}
if (hasRightContext) {
if (tks1.get(arr[4] + 2).getLength() != tks2.get(arr[5] + 2).getLength()) {
hasRightContext = false;
}
}
int start1 = hasLeftContext ? arr[2] - 1 : arr[2];
int end1 = hasRightContext ? arr[4] + 2 : arr[4] + 1;
// 如果上文有歧义,暂时不考虑上下文,主要原因是实现起来有些繁琐,而且上下文影响不是很大
// 比如:一千 年 来人 类 历史
// 一 千年 来 人类 历史
// 当考虑 “来人类” 的时候 ,左边的上下文是不确定的
int start2 = hasLeftContext ? arr[3] - 1 : arr[3];
int end2 = hasRightContext ? arr[5] + 2 : arr[5] + 1;
List<Token> subList1 = tks1.subList(start1, end1);
List<Token> subList2 = tks2.subList(start2, end2);
double score1 = SgdCrf.getScore(this.token2Array(subList1), tc, model);
double score2 = SgdCrf.getScore(this.token2Array(subList2), tc, model);
if (score1 >= score2) {
result.addAll(tks1.subList(arr[2], arr[4] + 1));
} else {
result.addAll(tks2.subList(arr[3], arr[5] + 1));
}
lastPos = arr[4] + 1;
}
for (int i = lastPos; i < tks1.size(); i++) {
result.add(tks1.get(i));
}
return result;
}
private String[] token2Array(List<Token> tokens){
String[] result=new String[tokens.size()];
int i=0;
for(Token token:tokens){
result[i++]=token.getOrigText();
}
return result;
}
}