package org.ansj.app.keyword;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.nlpcn.commons.lang.util.StringUtil;

public class KeyWordComputer<T extends Analysis> {

    private static final Map<String, Double> POS_SCORE = new HashMap<String, Double>();

    private T analysisType;

    static {
        POS_SCORE.put("null", 0.0);
        POS_SCORE.put("w", 0.0);
        POS_SCORE.put("en", 0.0);
        POS_SCORE.put("m", 0.0);
        POS_SCORE.put("num", 0.0);
        POS_SCORE.put("nr", 3.0);
        POS_SCORE.put("nrf", 3.0);
        POS_SCORE.put("nw", 3.0);
        POS_SCORE.put("nt", 3.0);
        POS_SCORE.put("l", 0.2);
        POS_SCORE.put("a", 0.2);
        POS_SCORE.put("nz", 3.0);
        POS_SCORE.put("v", 0.2);
        POS_SCORE.put("kw", 6.0); // dedicated "keyword" part of speech
    }

    private int nKeyword = 5;

    public KeyWordComputer() {
    }

    public void setAnalysisType(T analysisType) {
        this.analysisType = analysisType;
    }

    /**
     * @param nKeyword the number of keywords to return
     */
    @SuppressWarnings("unchecked")
    public KeyWordComputer(int nKeyword) {
        this.nKeyword = nKeyword;
        this.analysisType = (T) new NlpAnalysis(); // defaults to the NLP tokenizer
    }

    public KeyWordComputer(int nKeyword, T analysisType) {
        this.nKeyword = nKeyword;
        this.analysisType = analysisType;
    }

    /**
     * @param content     body text, optionally prefixed by the title
     * @param titleLength length of the title prefix; terms whose offset falls
     *                    inside it receive a fixed positional boost
     * @return the top-n keywords ordered by weight
     */
    private List<Keyword> computeArticleTfidf(String content, int titleLength) {
        Map<String, Keyword> tm = new HashMap<String, Keyword>();

        List<Term> parse = analysisType.parseStr(content).getTerms();
        // FIXME: this depends on the part of speech coming from the
        // user-defined dictionary, so another method is needed..
        // parse = FilterModifWord.updateNature(parse);

        for (Term term : parse) {
            double weight = getWeight(term, content.length(), titleLength);
            if (weight == 0) {
                continue;
            }
            Keyword keyword = tm.get(term.getName());
            if (keyword == null) {
                keyword = new Keyword(term.getName(), term.natrue().allFrequency, weight);
                tm.put(term.getName(), keyword);
            } else {
                keyword.updateWeight(1);
            }
        }

        TreeSet<Keyword> treeSet = new TreeSet<Keyword>(tm.values());
        ArrayList<Keyword> arrayList = new ArrayList<Keyword>(treeSet);
        if (treeSet.size() <= nKeyword) {
            return arrayList;
        } else {
            return arrayList.subList(0, nKeyword);
        }
    }

    /**
     * @param title   the title
     * @param content the body text
     * @return the top-n keywords
     */
    public List<Keyword> computeArticleTfidf(String title, String content) {
        if (StringUtil.isBlank(title)) {
            title = "";
        }
        if (StringUtil.isBlank(content)) {
            content = "";
        }
        return computeArticleTfidf(title + "\t" + content, title.length());
    }

    /**
     * Body text only, no title.
     *
     * @param content the body text
     * @return the top-n keywords
     */
    public List<Keyword> computeArticleTfidf(String content) {
        return computeArticleTfidf(content, 0);
    }

    /**
     * Weight of a single term: its part-of-speech score, boosted if the term
     * occurs in the title, otherwise decaying linearly with its offset.
     */
    private double getWeight(Term term, int length, int titleLength) {
        if (term.getName().trim().length() < 2) {
            // drop single-character terms
            return 0;
        }
        String pos = term.natrue().natureStr;
        Double posScore = POS_SCORE.get(pos);
        if (posScore == null) {
            posScore = 1.0;
        } else if (posScore == 0) {
            return 0;
        }
        if (titleLength > term.getOffe()) {
            // the term occurs inside the title: fixed boost
            return 5 * posScore;
        }
        // earlier occurrences in the body weigh more than later ones
        return (length - term.getOffe()) * posScore / length;
    }
}
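/*
 * Usage sketch, kept as a comment so the file stays compilable. This is a
 * hypothetical example, not part of the original source: it assumes the
 * ansj_seg library is on the classpath, that T = NlpAnalysis matches the
 * default constructed by KeyWordComputer(int), and that Keyword exposes
 * getName() and getScore() accessors.
 *
 *     // ask for the 5 highest-weighted keywords of a title + body pair
 *     KeyWordComputer<NlpAnalysis> kwc = new KeyWordComputer<NlpAnalysis>(5);
 *     List<Keyword> keywords =
 *             kwc.computeArticleTfidf("some title", "some body text ...");
 *     for (Keyword kw : keywords) {
 *         System.out.println(kw.getName() + "\t" + kw.getScore());
 *     }
 *
 * Terms found inside the title get the fixed 5 * posScore boost from
 * getWeight(), so title words tend to dominate the returned list.
 */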