package org.ansj.splitWord.analysis; import java.io.Reader; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import org.ansj.app.crf.SplitWord; import org.ansj.dic.LearnTool; import org.ansj.domain.Nature; import org.ansj.domain.NewWord; import org.ansj.domain.Result; import org.ansj.domain.Term; import org.ansj.domain.TermNatures; import org.ansj.library.CrfLibrary; import org.ansj.recognition.arrimpl.AsianPersonRecognition; import org.ansj.recognition.arrimpl.ForeignPersonRecognition; import org.ansj.recognition.arrimpl.NewWordRecognition; import org.ansj.recognition.arrimpl.NumRecognition; import org.ansj.recognition.arrimpl.UserDefineRecognition; import org.ansj.recognition.impl.NatureRecognition; import org.ansj.splitWord.Analysis; import org.ansj.util.AnsjReader; import org.ansj.util.Graph; import org.ansj.util.NameFix; import org.ansj.util.TermUtil; import org.ansj.util.TermUtil.InsertTermType; import org.nlpcn.commons.lang.tire.domain.Forest; import org.nlpcn.commons.lang.util.MapCount; import org.nlpcn.commons.lang.util.WordAlert; import org.nlpcn.commons.lang.util.logging.Log; import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 自然语言分词,具有未登录词发现功能。建议在自然语言理解中用。搜索中不要用 * * @author ansj * */ public class NlpAnalysis extends Analysis { private static final Log LOG = LogFactory.getLog(NlpAnalysis.class); private LearnTool learn = null; private static final String TAB = "\t"; private static final int CRF_WEIGHT = 6; private SplitWord splitWord = CrfLibrary.get(); @Override protected List<Term> getResult(final Graph graph) { Merger merger = new Merger() { @Override public List<Term> merger() { if (learn == null) { learn = new LearnTool(); } graph.walkPath(); learn.learn(graph, splitWord ,forests); // 姓名识别 if (graph.hasPerson && isNameRecognition) { // 亚洲人名识别 new AsianPersonRecognition().recognition(graph.terms); graph.walkPathByScore(); NameFix.nameAmbiguity(graph.terms); // 外国人名识别 new ForeignPersonRecognition().recognition(graph.terms); graph.walkPathByScore(); } if (splitWord != null) { MapCount<String> mc = new MapCount<String>(); // 通过crf分词 List<String> words = splitWord.cut(graph.chars); Term tempTerm = null; int tempOff = 0; if (words.size() > 0) { String word = words.get(0); if (!isRuleWord(word)) { mc.add("始##始" + TAB + word, CRF_WEIGHT); } } for (String word : words) { TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word); // 尝试从词典获取词性 Term term = null; if (termNatures != TermNatures.NULL) { term = new Term(word, tempOff, termNatures); } else { term = new Term(word, tempOff, TermNatures.NW); term.setNewWord(true); } tempOff += word.length(); // 增加偏移量 if (isRuleWord(word)) { // 如果word不对那么不要了 tempTerm = null; continue; } if (term.isNewWord()) { // 尝试猜测词性 termNatures = NatureRecognition.guessNature(word); term.updateTermNaturesAndNature(termNatures); } TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT); // 对于非词典中的词持有保守态度 if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) { mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT); } tempTerm = term; if (term.isNewWord()) { learn.addTerm(new NewWord(word, Nature.NW)); } } if (tempTerm != null && !tempTerm.isNewWord()) { mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT); } graph.walkPath(mc.get()); } else { LOG.warn("not find any crf model, make sure your config right? "); } // 数字发现 if (graph.hasNum && isNumRecognition) { new NumRecognition().recognition(graph.terms); } // 词性标注 List<Term> result = getResult(); // 用户自定义词典的识别 new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms); graph.rmLittlePath(); graph.walkPathByScore(); // 进行新词发现 new NewWordRecognition(learn).recognition(graph.terms); graph.walkPathByScore(); // 优化后重新获得最优路径 result = getResult(); // 激活辞典 for (Term term : result) { learn.active(term.getName()); } setRealName(graph, result); return result; } private List<Term> getResult() { List<Term> result = new ArrayList<Term>(); int length = graph.terms.length - 1; for (int i = 0; i < length; i++) { if (graph.terms[i] == null) { continue; } result.add(graph.terms[i]); } return result; } }; return merger.merger(); } // 临时处理新词中的特殊字符 private static final Set<Character> filter = new HashSet<Character>(); static { filter.add(':'); filter.add(' '); filter.add(':'); filter.add(' '); filter.add(','); filter.add('”'); filter.add('“'); filter.add('?'); filter.add('。'); filter.add('!'); filter.add('。'); filter.add(','); filter.add('.'); filter.add('、'); filter.add('\\'); filter.add(';'); filter.add(';'); filter.add('?'); filter.add('?'); filter.add('!'); filter.add('\"'); filter.add('('); filter.add(')'); filter.add('('); filter.add(')'); filter.add('…'); filter.add('…'); filter.add('—'); filter.add('-'); filter.add('-'); filter.add('—'); filter.add('《'); filter.add('》'); } /** * 判断新词识别出来的词是否可信 * * @param word * @return */ public static boolean isRuleWord(String word) { char c = 0; for (int i = 0; i < word.length(); i++) { c = word.charAt(i); if (c != '·') { if (c < 256 || filter.contains(c) || (c = WordAlert.CharCover(word.charAt(i))) > 0) { return true; } } } return false; } public NlpAnalysis setCrfModel(SplitWord splitWord) { this.splitWord = splitWord; return this; } public NlpAnalysis setLearnTool(LearnTool learn) { this.learn = learn; return this; } public NlpAnalysis() { super(); } public NlpAnalysis(Reader reader) { super.resetContent(new AnsjReader(reader)); } public static Result parse(String str) { return new NlpAnalysis().parseStr(str); } public static Result parse(String str, Forest... forests) { return new NlpAnalysis().setForests(forests).parseStr(str); } }