package ruc.irm.similarity.statistic;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
/**
 * Command-line statistic over a set of LCMC corpus XML files.
 *
 * <p>For each file, every {@code <w>} element is read as one word, and the
 * number of words for which {@link XiaConceptParser#isConcept} returns
 * {@code true} is counted alongside the total number of words.
 */
public class LCMC {

    /** Element name that marks a single word in the corpus XML. */
    private static final String WORD_ELEMENT = "w";

    /** Directory that holds the corpus files processed by {@link #main}. */
    private static final String CORPUS_DIR = "./db/lcmc/";

    /**
     * Letters identifying the corpus files processed by {@link #main}, in
     * processing order. Each letter is expanded to {@code LCMC_<letter>.XML}.
     */
    private static final String SECTIONS = "ABCDEFGHJKLMNPR";

    /**
     * Scans one XML file and prints {@code totalCount + "\t" + conceptCount}
     * to standard output, where {@code totalCount} is the number of
     * {@code <w>} elements found and {@code conceptCount} is the number of
     * those whose text is accepted by {@code XiaConceptParser.isConcept}.
     *
     * <p>The file stream and the StAX reader are released even when parsing
     * fails part-way through.
     *
     * @param xmlFile the XML file to scan
     * @throws Exception if the file cannot be opened or read, or if the XML
     *         cannot be parsed
     */
    public void countUnConceptWords(File xmlFile) throws Exception {
        int totalCount = 0;
        int conceptCount = 0;

        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        InputStream input = new FileInputStream(xmlFile);
        try {
            XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
            try {
                while (xmlEventReader.hasNext()) {
                    XMLEvent event = xmlEventReader.nextEvent();
                    if (!event.isStartElement()) {
                        continue;
                    }

                    StartElement startElement = event.asStartElement();
                    // Only the start of a word element is of interest.
                    if (WORD_ELEMENT.equals(startElement.getName().toString())) {
                        String word = xmlEventReader.getElementText();
                        totalCount++;
                        if (XiaConceptParser.getInstance().isConcept(word)) {
                            conceptCount++;
                        }
                    }
                }
            } finally {
                // Closing the reader does not close the underlying stream,
                // so the stream is closed separately in the outer finally.
                xmlEventReader.close();
            }
        } finally {
            input.close();
        }

        System.out.println(totalCount + "\t" + conceptCount);
    }

    /**
     * Runs {@link #countUnConceptWords(File)} over every corpus file named by
     * {@link #SECTIONS}, printing one line of counts per file.
     *
     * @param args ignored
     * @throws Exception if any corpus file cannot be read or parsed;
     *         processing stops at the first failure
     */
    public static void main(String[] args) throws Exception {
        LCMC lcmc = new LCMC();
        for (int i = 0; i < SECTIONS.length(); i++) {
            String fileName = "LCMC_" + SECTIONS.charAt(i) + ".XML";
            lcmc.countUnConceptWords(new File(CORPUS_DIR + fileName));
        }
    }
}