package ruc.irm.similarity.statistic;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
/**
* 用于统计分词词典文件中的概念出现数量
*
* @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
* @organization 中国人民大学信息资源管理学院 知识工程实验室
*/
public class DictStatistic {
/**
* 从指定的xml文件加载词典文件
* @param xmlFile
* @param gzCompressed 是否再用gz格式对词典进行了压缩
* @return
*/
public void testFromXml(String xmlFile, boolean gzCompressed) {
File file = new File(xmlFile);
if (!file.canRead()){
System.out.println("无法读取文件:" + xmlFile);
return;// fail while opening the file
}
int count = 0, conceptCount=0;
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
InputStream input = null;
try {
if(gzCompressed){
input = new GZIPInputStream(new FileInputStream(file));
}else{
input = new FileInputStream(file);
}
XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
while (xmlEventReader.hasNext()) {
XMLEvent event = xmlEventReader.nextEvent();
if (event.isStartElement()) {
StartElement startElement = event.asStartElement();
if(startElement.getName().toString().equals("table")){
String head = startElement.getAttributeByName(QName.valueOf("head")).getValue();
while (xmlEventReader.hasNext()) {
XMLEvent itemEvent = xmlEventReader.nextEvent();
if(itemEvent.isStartElement()){
StartElement itemStartElement = itemEvent.asStartElement();
if(!itemStartElement.getName().toString().equals("item")) continue;
String word = itemStartElement.getAttributeByName(QName.valueOf("word")).getValue();
word = head + word;
if(XiaConceptParser.getInstance().isConcept(word)){
conceptCount++;
}
count++;
if(count%1000==0){
System.out.println("process words " + count + "...");
}
}
}
}
}
}
input.close();
System.out.println(count + "\t" + conceptCount);
return;
} catch (Exception e) {
e.printStackTrace();
}
}
}