package com.yc.nlp;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.yc.nlp.normal.Normal;
import com.yc.nlp.pojo.Result;
import com.yc.nlp.seg.InitSeg;
import com.yc.nlp.sentiment.Sentiment;
import com.yc.nlp.sim.BM25;
import com.yc.nlp.tag.Tag;
import com.yc.nlp.textrank.KeyWordTextRank;
import com.yc.nlp.textrank.TextRank;
public class NLP {
private static Logger logger = LoggerFactory.getLogger(NLP.class);
private String doc;
private BM25 bm25;
private InitSeg seg;
private Normal normal;
private Sentiment sentiment;
private Tag tag;
private TextRank textRank;
private KeyWordTextRank kvTextRank;
@SuppressWarnings("unchecked")
public NLP(Object doc) {
if (doc instanceof String) {
this.doc = doc.toString();
} else {
try {
List<List<String>> convert = (List<List<String>>) doc;
this.bm25 = new BM25(convert);
} catch (Exception e) {
logger.error("暂不支持第三种数据格式");
}
}
this.seg = Setup.getSeg();
this.normal = Setup.getNormal();
this.sentiment = Setup.getSentiment();
this.tag = Setup.getTag();
}
public List<String> words() {
return seg.seg(doc);
}
public List<String> sentences() {
return normal.getSentence(this.doc);
}
public String han() {
return normal.zh2hans(this.doc);
}
public String pinyin() {
List<String> words = this.words();
String ret = "";
for (String word : words) {
ret += normal.getPinyin(word) + " ";
}
return ret;
}
public double sentiments() {
return sentiment.classify(doc);
}
public List<Result> tags() throws Exception {
List<String> words = new ArrayList<String>(this.words());
List<String> tags = this.tag.tag(words);
List<Result> result = new ArrayList<Result>();
if (words.size() == tags.size()) {
for (int i = 0; i < words.size(); i++) {
result.add(new Result(words.get(i), tags.get(i)));
}
}
return result;
}
public List<Map<String, Integer>> tf() {
return this.bm25.getF();
}
public Map<String, Double> idf() {
return this.bm25.getIdf();
}
public List<Double> sim(List<String> doc) {
return this.bm25.simall(doc);
}
public List<String> summary(Integer... limit) {
Integer size = 0;
if (limit == null || limit.length == 0) {
size = 5;
} else {
size = limit[0];
}
List<List<String>> doc = new ArrayList<List<String>>();
List<String> sents = this.sentences();
for (String sent : sents) {
List<String> words = this.seg.seg(sent);
words = this.normal.filterStop(words);
doc.add(words);
}
this.textRank = new TextRank(doc);
this.textRank.solve();
List<String> ret = new ArrayList<String>();
List<Integer> indexes = this.textRank.topIndex(size);
for (Integer index : indexes) {
ret.add(sents.get(index));
}
return ret;
}
public List<String> keywords(Integer... limit) {
Integer size = 0;
if (limit == null || limit.length == 0) {
size = 5;
} else {
size = limit[0];
}
List<List<String>> doc = new ArrayList<List<String>>();
List<String> sents = this.sentences();
for (String sent : sents) {
List<String> words = this.seg.seg(sent);
words = this.normal.filterStop(words);
doc.add(words);
}
this.kvTextRank = new KeyWordTextRank(doc);
this.kvTextRank.solve();
List<String> ret = new ArrayList<String>();
List<String> indexes = this.kvTextRank.topIndex(size);
for (String index : indexes) {
ret.add(index);
}
return ret;
}
}