package com.fpcms.common.util;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.junit.Assert;
import org.junit.Test;
import com.github.rapid.common.util.Profiler;
import com.fpcms.common.util.ChineseSegmenterUtil.TokenCount;
/**
 * JUnit tests for {@link ChineseSegmenterUtil}: keyword extraction, token counting,
 * sorted token-count lists, and a simple throughput measurement.
 *
 * <p>{@link #test_163_sina_qq_news()} downloads live web pages and writes to
 * {@code /tmp/unuse_keywords.txt}, so it needs network access and a writable {@code /tmp}.
 */
public class ChineseSegmenterUtilTest extends Assert {

    /**
     * Matches a string that contains at least one character in the range U+4E00..U+9FA5.
     * Compiled once instead of on every loop iteration via {@code String.matches}.
     */
    private static final Pattern CONTAINS_CJK = Pattern.compile(".*[\u4e00-\u9fa5].*");

    /**
     * Checks the keyword lists returned by {@code getMinLengthKeywords} for one sentence
     * under three combinations of its numeric and boolean arguments.
     * NOTE(review): the numeric argument looks like a minimum keyword length and the
     * boolean like a "smart" segmentation switch - confirm against ChineseSegmenterUtil.
     */
    @Test
    public void test_getMinLengthKeywords() {
        final String sentence = "中国人民银行是个好银行";

        // assertEquals takes (expected, actual); keeping that order makes failure messages correct.
        List<String> keywords = ChineseSegmenterUtil.getMinLengthKeywords(sentence, 2, false);
        assertEquals("[人民, 国人, 中国人民银行, 中国人, 人民银行, 中国人民, 银行, 中国]", keywords.toString());

        keywords = ChineseSegmenterUtil.getMinLengthKeywords(sentence, 2, true);
        assertEquals("[中国人民银行, 银行]", keywords.toString());

        keywords = ChineseSegmenterUtil.getMinLengthKeywords(sentence, 1, true);
        assertEquals("[个, 中国人民银行, 是, 银行, 好]", keywords.toString());
    }

    /**
     * Fetches four news pages, segments the concatenated content, and writes every token
     * that is at least two characters long and contains a character in U+4E00..U+9FA5
     * to both standard output and {@code /tmp/unuse_keywords.txt}.
     *
     * @throws IOException if the output file cannot be opened, or if a call that declares it fails
     */
    @Test
    public void test_163_sina_qq_news() throws IOException {
        String content = NetUtil.httpGet("http://news.163.com");
        content += NetUtil.httpGet("http://news.sina.com.cn");
        content += NetUtil.httpGet("http://news.163.com/rank/");
        content += NetUtil.httpGet("http://news.qq.com");

        Map<String, Integer> map =
                ChineseSegmenterUtil.segmenteForTokenCount(new StringReader(content), false);
        Set<String> validKeywords = new LinkedHashSet<String>();

        PrintWriter writer = new PrintWriter(new FileWriter("/tmp/unuse_keywords.txt"));
        try {
            for (String key : map.keySet()) {
                if (key.length() >= 2 && CONTAINS_CJK.matcher(key).matches()) {
                    validKeywords.add(key);
                    System.out.println(key);
                    writer.println(key);
                }
            }
        } finally {
            // Closing flushes buffered output; without it the file may be left incomplete.
            writer.close();
        }

        System.out.println("validKeywords.size:" + validKeywords.size());
        System.out.println(map.size());
    }

    /**
     * Segments a fixed sentence and prints the resulting token-count map.
     * Makes no assertions; it only verifies that the call completes.
     *
     * @throws IOException if the segmenter call fails with it
     */
    @Test
    public void test_segmenteForTokenCount() throws IOException {
        String string = "中国人民银行是一家好国家,中国太棒了,一,一,一";
        Map<String, Integer> map =
                ChineseSegmenterUtil.segmenteForTokenCount(new StringReader(string), false);
        System.out.println(map);
    }

    /**
     * Verifies that the first two entries of the sorted token-count list for a fixed
     * sentence are the expected tokens.
     *
     * @throws IOException if the segmenter call fails with it
     */
    @Test
    public void test_toTokenCountList() throws IOException {
        String string = "中国人民银行是一家好国家,中国太棒了,一,一,一";
        // Raw Map kept deliberately: the parameter type of toSortedTokenCountList is not
        // visible from this test, so narrowing the type here could break compilation.
        Map map = ChineseSegmenterUtil.segmenteForTokenCount(new StringReader(string), false);
        List<TokenCount> tokenCountList = ChineseSegmenterUtil.toSortedTokenCountList(map);
        System.out.println("tokenCountList:" + tokenCountList);
        assertEquals("一", tokenCountList.get(0).getToken());
        assertEquals("中国", tokenCountList.get(1).getToken());
    }

    /**
     * Runs the segmenter 10000 times on a fixed sentence inside a {@link Profiler}
     * section and prints the profiler dump. Makes no assertions.
     *
     * @throws IOException if the segmenter call fails with it
     */
    @Test
    public void test_perf() throws IOException {
        String string = "中国人民银行是一家好国家,中国太棒了,一,一,一";
        int loopCount = 10000;
        Profiler.start("ChineseSegmenterUtil.test_perf", loopCount);
        for (int i = 0; i < loopCount; i++) {
            // The result is intentionally discarded; only the elapsed time matters here.
            ChineseSegmenterUtil.segmenteForTokenCount(new StringReader(string), false);
        }
        Profiler.release();
        System.out.println(Profiler.dump());
    }
}