package org.ansj.splitWord;
import static org.ansj.library.DATDictionary.status;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.AmbiguityLibrary;
import org.ansj.library.DicLibrary;
import org.ansj.splitWord.impl.GetWordsImpl;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.GetWord;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.WordAlert;
/**
* 基本分词+人名识别
*
* @author ansj
*
*/
public abstract class Analysis {
/**
* 用来记录偏移量
*/
public int offe;
/**
* 分词的类
*/
private GetWordsImpl gwi = new GetWordsImpl();
protected Forest[] forests = null;
private Forest ambiguityForest = AmbiguityLibrary.get();
// 是否开启人名识别
protected Boolean isNameRecognition = true;
// 是否开启数字识别
protected Boolean isNumRecognition = true;
// 是否数字和量词合并
protected Boolean isQuantifierRecognition = true;
// 是否显示真实词语
protected Boolean isRealName = false;
/**
* 文档读取流
*/
private AnsjReader br;
protected Analysis() {
this.forests = new Forest[] { DicLibrary.get() };
this.isNameRecognition = MyStaticValue.isNameRecognition;
this.isNumRecognition = MyStaticValue.isNumRecognition;
this.isQuantifierRecognition = MyStaticValue.isQuantifierRecognition;
this.isRealName = MyStaticValue.isRealName;
};
private LinkedList<Term> terms = new LinkedList<Term>();
/**
* while 循环调用.直到返回为null则分词结束
*
* @return
* @throws IOException
*/
public Term next() throws IOException {
Term term = null;
if (!terms.isEmpty()) {
term = terms.poll();
term.updateOffe(offe);
return term;
}
String temp = br.readLine();
offe = br.getStart();
while (StringUtil.isBlank(temp)) {
if (temp == null) {
return null;
} else {
temp = br.readLine();
}
}
// 歧异处理字符串
fullTerms(temp);
if (!terms.isEmpty()) {
term = terms.poll();
term.updateOffe(offe);
return term;
}
return null;
}
/**
* 填充terms
*/
private void fullTerms(String temp) {
List<Term> result = analysisStr(temp);
terms.addAll(result);
}
/**
* 一整句话分词,用户设置的歧异优先
*
* @param temp
* @return
*/
private List<Term> analysisStr(String temp) {
Graph gp = new Graph(temp);
int startOffe = 0;
if (this.ambiguityForest != null) {
GetWord gw = new GetWord(this.ambiguityForest, gp.chars);
String[] params = null;
while ((gw.getFrontWords()) != null) {
if (gw.offe > startOffe) {
analysis(gp, startOffe, gw.offe);
}
params = gw.getParams();
startOffe = gw.offe;
for (int i = 0; i < params.length; i += 2) {
gp.addTerm(new Term(params[i], startOffe, new TermNatures(new TermNature(params[i + 1], 1))));
startOffe += params[i].length();
}
}
}
if (startOffe < gp.chars.length ) {
analysis(gp, startOffe, gp.chars.length);
}
List<Term> result = this.getResult(gp);
return result;
}
private void analysis(Graph gp, int startOffe, int endOffe) {
int start = 0;
int end = 0;
char[] chars = gp.chars;
String str = null;
for (int i = startOffe; i < endOffe; i++) {
switch (status(chars[i])) {
case 4:
start = i;
end = 1;
while (++i < endOffe && status(chars[i]) == 4) {
end++;
}
str = WordAlert.alertEnglish(chars, start, end);
gp.addTerm(new Term(str, start, TermNatures.EN));
i--;
break;
case 5:
start = i;
end = 1;
while (++i < endOffe && status(chars[i]) == 5) {
end++;
}
str = WordAlert.alertNumber(chars, start, end);
gp.addTerm(new Term(str, start, TermNatures.M));
i--;
break;
default:
start = i;
end = i;
int status = 0;
do {
end = ++i;
if (i >= endOffe) {
break;
}
status = status(chars[i]);
} while (status < 4);
if (status > 3) {
i--;
}
gwi.setChars(chars, start, end);
int max = start;
while ((str = gwi.allWords()) != null) {
Term term = new Term(str, gwi.offe, gwi.getItem());
int len = term.getOffe() - max;
if (len > 0) {
for (; max < term.getOffe();) {
gp.addTerm(new Term(String.valueOf(chars[max]), max, TermNatures.NULL));
max++;
}
}
gp.addTerm(term);
max = term.toValue();
}
int len = end - max;
if (len > 0) {
for (; max < end;) {
gp.addTerm(new Term(String.valueOf(chars[max]), max, TermNatures.NULL));
max++;
}
}
break;
}
}
}
/**
* 将为标准化的词语设置到分词中
*
* @param gp
* @param result
*/
protected void setRealName(Graph graph, List<Term> result) {
if (!MyStaticValue.isRealName) {
return;
}
String str = graph.realStr;
for (Term term : result) {
term.setRealName(str.substring(term.getOffe(), term.getOffe() + term.getName().length()));
}
}
/**
* 一句话进行分词并且封装
*
* @param temp
* @return
*/
public Result parseStr(String temp) {
return new Result(analysisStr(temp));
}
/**
* 通过构造方法传入的reader直接获取到分词结果
*
* @return
* @throws IOException
*/
public Result parse() throws IOException {
List<Term> list = new ArrayList<Term>();
Term temp = null;
while ((temp = next()) != null) {
list.add(temp);
}
Result result = new Result(list);
return result;
}
protected abstract List<Term> getResult(Graph graph);
public abstract class Merger {
public abstract List<Term> merger();
}
/**
* 重置分词器
*
* @param br
*/
public void resetContent(AnsjReader br) {
this.offe = 0;
this.br = br;
}
public void resetContent(Reader reader) {
this.offe = 0;
this.br = new AnsjReader(reader);
}
public void resetContent(Reader reader, int buffer) {
this.offe = 0;
this.br = new AnsjReader(reader, buffer);
}
public Forest getAmbiguityForest() {
return ambiguityForest;
}
public Analysis setAmbiguityForest(Forest ambiguityForest) {
this.ambiguityForest = ambiguityForest;
return this;
}
public Analysis setForests(Forest... forests) {
this.forests = forests;
return this;
}
public Analysis setIsNameRecognition(Boolean isNameRecognition) {
this.isNameRecognition = isNameRecognition;
return this;
}
public Analysis setIsNumRecognition(Boolean isNumRecognition) {
this.isNumRecognition = isNumRecognition;
return this;
}
public Analysis setIsQuantifierRecognition(Boolean isQuantifierRecognition) {
this.isQuantifierRecognition = isQuantifierRecognition;
return this;
}
public Analysis setIsRealName(Boolean isRealName) {
this.isRealName = isRealName;
return this;
}
}