package ruc.irm.tendency.word;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import ruc.irm.similarity.util.BlankUtils;
import ruc.irm.similarity.word.hownet2.concept.Concept;
import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
/**
* 临时训练及测试类
*
* @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
* @organization 中国人民大学信息资源管理学院 知识工程实验室
*/
public class Training {
void test(boolean testPositive) throws IOException{
WordTendency tendency = new HownetWordTendency();
File f = new File("./dict/sentiment/负面情感词语(中文).txt");
if(testPositive){
//f = new File("./dict/sentiment/正面情感词语(中文).txt");
f = new File("./dict/sentiment/正面评价词语(中文).txt");
}
String encoding = "utf-8";
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
String line;
int wordCount = 0;
int correctCount = 0;
while ((line = in.readLine()) != null) {
if(line.length()>5) continue;
wordCount++;
double value =tendency.getTendency(line.trim());
if(value>0 && testPositive){
correctCount++;
}else if(value<0 && !testPositive){
correctCount++;
}else{
System.out.println("error:" + line + "\t value:" + value);
}
}
System.out.println("correct:" + correctCount);
System.out.println("total:" + wordCount);
System.out.println("ratio:" + correctCount*1.0/wordCount);
}
/**
* 该方法用于统计知网提供的情感词集合所涉及的义原以及出现频度
* @throws IOException
*/
/**
* @throws IOException
*/
void countSentimentDistribution() throws IOException{
Map<String, Integer> sememeMap = new HashMap<String, Integer>();
File f = new File("./dict/sentiment/负面情感词语(中文).txt");
String encoding = "utf-8";
boolean autoCombineConcept = false;
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
XiaConceptParser parser = new XiaConceptParser(new XiaSememeParser());
String line = null;
int conceptCount = 0;
int wordCount = 0;
while ((line = in.readLine()) != null) {
if(line.length()>5) continue;
wordCount++;
String word = line.trim();
Collection<Concept> concepts = parser.getInnerConcepts(word);
//由于目前的词典为知网2000版本,所以默认情况下仅对词典中出现的概念进行统计
if(BlankUtils.isBlank(concepts) && autoCombineConcept ){
concepts = parser.autoCombineConcepts(word, null);
}
for(Concept c: concepts){
conceptCount++;
List<String> names = new ArrayList<String>();
//加入主义原
names.add(c.getMainSememe());
//加入关系义原
for(String item:c.getRelationSememes()){
names.add(item.substring(item.indexOf("=") + 1));
}
//加入符号义原
for(String item:c.getSymbolSememes()){
names.add(item.substring(1));
}
//加入其他义原集合
for(String item:c.getSecondSememes()){
names.add(item);
}
for(String item:names){
Integer count = sememeMap.get(item);
if(count==null){
sememeMap.put(item, 1);
}else{
sememeMap.put(item, count+1);
}
}
}
}
in.close();
//以下是为了按照义原出现的数量进行排序的代码
Multimap<Integer, String> map2 = HashMultimap.create();
for(String key:sememeMap.keySet()){
map2.put(sememeMap.get(key), key);
}
List<Integer> keys = new ArrayList<Integer>();
for(Integer key: map2.keySet()){
keys.add(key);
}
Collections.sort(keys);
int smallSememeCount = 0; //较少出现的不同义原数量
int smallAppearTotal = 0; //较少出现的义原在概念众出现的次数总和
for(int index=(keys.size()-1); index>=0; index--){
Integer key = keys.get(index);
Collection<String> values = map2.get(key);
double ratio = (key*100.0/conceptCount);
System.out.print(key + "(" + ratio + "%): ");
for(String v:values){
System.out.print(v+ "\t");
}
System.out.println();
if(ratio<0.7){
smallSememeCount += values.size();
smallAppearTotal += key*values.size();
}
}
System.out.println("small info: ");
System.out.println("\tdifferent sememes:" + smallSememeCount);
System.out.println("\tappear count:" + smallAppearTotal);
System.out.println("\tratio:" + smallAppearTotal*100.0/conceptCount);
System.out.println("wordCount:" + wordCount);
System.out.println("conceptCount:" + conceptCount);
}
public static void main(String[] args) throws IOException {
Training training = new Training();
training.countSentimentDistribution();
// System.out.println("test positive:");
// training.test(true);
//
// System.out.println("test negative:");
//training.test(false);
}
}