package com.kingschan.blog.test;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.IndexTokenizer;
import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.util.List;
// Needed only if the commented-out demos below are re-enabled:
// import com.hankcs.hanlp.tokenizer.StandardTokenizer;
// import com.hankcs.hanlp.tokenizer.NLPTokenizer;
// import com.hankcs.hanlp.seg.Segment;
// import com.hankcs.hanlp.seg.NShort.NShortSegment;
// import com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment;
// import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;
/**
 * Created by kingschan on 2017/2/23.
 * Lucene segmentation test: HanLP analyzers from hanlp-lucene-plugin,
 * with a commented-out mmseg4j comparison.
 */
public class LuceneTest {

    public static void main(String[] args) throws Exception {
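        // Chinese sample text, kept as-is because it is the segmentation input under test.
        // Gloss: "Most of us in software development can't avoid working with databases,
        // especially in ERP development, where stored procedures easily run past a
        // thousand lines. With large data volumes and staff turnover, can we still keep
        // the system running smoothly, and can the next person understand our stored
        // procedures? Drawing on company training and my own work experience, I'd like
        // to share a few notes, hoping they help."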
        String text = "我们做软件开发的,大部分人都离不开跟数据库打交道,特别是erp开发的,跟数据库打交道更是频繁,存储过程动不动就是上千行,如果数据量大,人员流动大,那么我们还能保证下一段时间系统还能流畅的运行吗?我们还能保证下一个人能看懂我们的存储过程吗?那么我结合公司平时的培训和平时个人工作经验和大家分享一下,希望对大家有帮助。";
        /*
        System.out.println(HanLP.segment(text));
        // Standard tokenizer
        List<Term> termList = StandardTokenizer.segment(text);
        System.out.println(termList);
        // NLP tokenizer
        List<Term> termList2 = NLPTokenizer.segment(text);
        System.out.println(termList2);
        */
        // Index tokenizer
        /*
        List<Term> termList1 = IndexTokenizer.segment(text);
        int index = 0;
        for (Term term : termList1) {
            System.out.println(index + ":" + term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]");
            index++;
        }
        // Keyword extraction (top 5)
        List<String> keywordList = HanLP.extractKeyword(text, 5);
        System.out.println(keywordList);
        // Automatic summarization (top 5 sentences)
        List<String> sentenceList = HanLP.extractSummary(text, 5);
        System.out.println(sentenceList);
        */
        /*
        Segment nShortSegment = new NShortSegment().enableCustomDictionary(false).enablePlaceRecognize(true).enableOrganizationRecognize(true);
        Segment shortestSegment = new DijkstraSegment().enableCustomDictionary(false).enablePlaceRecognize(true).enableOrganizationRecognize(true);
        System.out.println("N-shortest-path segmentation: " + nShortSegment.seg(text) + "\nShortest-path (Dijkstra) segmentation: " + shortestSegment.seg(text));
        */
        for (int i = 0; i < text.length(); ++i) {
            System.out.print(text.charAt(i) + "" + i + " ");
        }
        System.out.println();
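        // Index-style analysis: HanLPIndexAnalyzer, like IndexTokenizer above, emits
        // fine-grained, potentially overlapping tokens, which is what the offset and
        // position-increment output below illustrates.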
        Analyzer analyzer1 = new HanLPIndexAnalyzer();
        TokenStream tokenStream1 = analyzer1.tokenStream("field", text);
        tokenStream1.reset();
        int _index = 1;
        while (tokenStream1.incrementToken()) {
            CharTermAttribute attribute = tokenStream1.getAttribute(CharTermAttribute.class);
            // Offset of the term within the original text
            OffsetAttribute offsetAtt = tokenStream1.getAttribute(OffsetAttribute.class);
            // Position increment relative to the previous token
            PositionIncrementAttribute positionAttr = tokenStream1.getAttribute(PositionIncrementAttribute.class);
            // Part of speech, exposed as the Lucene token type
            TypeAttribute typeAttr = tokenStream1.getAttribute(TypeAttribute.class);
            // if (offsetAtt.endOffset() - offsetAtt.startOffset() == 1) continue; // skip single-char terms
            /*
            System.out.print(_index + ":");
            System.out.println(attribute);
            _index++;
            */
            // Output format: [startOffset:endOffset positionIncrement] term/type
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // End and release the stream once consumed, as the TokenStream contract requires
        tokenStream1.end();
        tokenStream1.close();
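        // For comparison, a minimal sketch using the coarse-grained HanLPAnalyzer that is
        // imported above (assuming its default no-arg constructor): unlike
        // HanLPIndexAnalyzer, it yields a single non-overlapping token sequence.
        Analyzer analyzer2 = new HanLPAnalyzer();
        TokenStream tokenStream2 = analyzer2.tokenStream("field", text);
        CharTermAttribute term2 = tokenStream2.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset2 = tokenStream2.addAttribute(OffsetAttribute.class);
        tokenStream2.reset();
        while (tokenStream2.incrementToken()) {
            // Same offset bookkeeping as above, minus position increments and types
            System.out.printf("[%d:%d] %s%n", offset2.startOffset(), offset2.endOffset(), term2);
        }
        tokenStream2.end();
        tokenStream2.close();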
        /*
        System.out.println("------------------mmseg4j--------------------------------------------");
        // Requires mmseg4j on the classpath (see commented import above)
        Analyzer analyzer = new MaxWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            int increment = positionIncrementAttribute.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
                System.out.print(position + ":");
            }
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            System.out.println(term);
            // System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
        }
        tokenStream.end();
        tokenStream.close();
        */
    }
}