package ruc.irm.similarity.word.hownet2.sememe;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ruc.irm.similarity.Similaritable;
import ruc.irm.similarity.word.hownet.HownetMeta;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import sun.rmi.runtime.Log;
/**
 * Base class for sememe parsers. All sememes are stored in an XML file
 * (the gzip-compressed {@code sememe.xml.gz} resource located in the package
 * of the concrete parser class).<br/>
 * For the core idea of the algorithm, see the paper "Research on Semantic
 * Similarity Computation of Chinese Words" (《汉语词语语义相似度计算研究》) or
 * chapter 3 of the book "Theory and Methods of Chinese Information Similarity
 * Computation" (《中文信息相似度计算理论与方法》).<br/>
 *
 * To speed things up, only the Chinese definition of each sememe and its
 * corresponding id are loaded into a {@link Multimap}; according to the
 * original author, similarity between sememes is then computed from their
 * hierarchical ids.<br/>
 *
 * NOTE: the sememe table is a lazily initialised static shared by all
 * instances; its initialisation is not synchronized.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">Xia Tian</a>
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 *
 * @see ruc.irm.similarity.Similaritable
 */
public abstract class BaseSememeParser implements HownetMeta, Similaritable {

    /** Logger named after the concrete (runtime) parser class. */
    protected Logger LOG = LoggerFactory.getLogger(this.getClass());

    /**
     * All sememes, shared by every parser instance. Key: the Chinese
     * definition of a sememe; value: the sememe id. {@code null} until the
     * first successful load.
     */
    protected static Multimap<String, String> SEMEMES = null;

    /**
     * Creates a parser, loading the shared sememe table on first use.
     * If the table is already loaded, this constructor does nothing.
     *
     * @throws IOException if the {@code sememe.xml.gz} resource cannot be
     *                     found, decompressed or parsed
     */
    public BaseSememeParser() throws IOException {
        if (SEMEMES != null) {
            return;
        }

        // The map is created before load() runs so that load() (or an
        // override of it) always sees a non-null table.
        SEMEMES = HashMultimap.create();
        boolean loaded = false;
        try {
            // NOTE(review): the path is derived from the runtime class, so a
            // subclass in another package looks in its own package - confirm
            // this is intended.
            String sememeFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/sememe.xml.gz";
            InputStream raw = this.getClass().getClassLoader().getResourceAsStream(sememeFile);
            if (raw == null) {
                throw new IOException("sememe resource not found on classpath: " + sememeFile);
            }

            InputStream input;
            try {
                input = new GZIPInputStream(raw);
            } catch (IOException e) {
                // The gzip wrapper was never created, so the raw stream must
                // be released here.
                closeQuietly(raw);
                throw e;
            }

            // load() closes the stream on every path.
            load(input);
            loaded = true;
        } finally {
            if (!loaded) {
                // Do not leave an empty table behind: later instances would
                // mistake it for a completed load and skip loading.
                SEMEMES = null;
            }
        }
    }

    /**
     * Loads sememe knowledge from an XML stream. Every {@code sememe} element
     * contributes one entry mapping its {@code cn} attribute to its
     * {@code id} attribute. Entries are merged into the shared table only
     * after the whole stream has been parsed successfully. The stream is
     * always closed before this method returns.
     *
     * @param input the (already decompressed) XML stream to read
     * @throws IOException if the stream cannot be read or parsed, or if a
     *                     {@code sememe} element lacks a required attribute
     */
    public void load(InputStream input) throws IOException {
        LOG.info("loading sememes...");
        long start = System.currentTimeMillis();

        // Parse into a private map first so that a failure half-way through
        // does not leave partial data in the shared table.
        Multimap<String, String> parsed = HashMultimap.create();
        int count = 0;
        XMLEventReader xmlEventReader = null;
        try {
            XMLInputFactory inputFactory = XMLInputFactory.newInstance();
            xmlEventReader = inputFactory.createXMLEventReader(input);

            while (xmlEventReader.hasNext()) {
                XMLEvent event = xmlEventReader.nextEvent();
                if (!event.isStartElement()) {
                    continue;
                }
                StartElement startElement = event.asStartElement();
                if (!startElement.getName().toString().equals("sememe")) {
                    continue;
                }
                String cnWord = requireAttribute(startElement, "cn");
                String id = requireAttribute(startElement, "id");
                parsed.put(cnWord, id);
                count++;
            }
        } catch (IOException e) {
            throw e;
        } catch (Exception e) {
            // Keep the historical contract: every parse failure, including
            // runtime exceptions from the XML layer, surfaces as IOException.
            throw new IOException("failed to parse sememe XML", e);
        } finally {
            if (xmlEventReader != null) {
                try {
                    xmlEventReader.close();
                } catch (javax.xml.stream.XMLStreamException ignored) {
                    // Nothing useful can be done if releasing the reader fails.
                }
            }
            closeQuietly(input);
        }

        if (SEMEMES == null) {
            SEMEMES = HashMultimap.create();
        }
        SEMEMES.putAll(parsed);

        long elapsed = System.currentTimeMillis() - start;
        LOG.info("sememes loaded: {} elements in {} ms", count, elapsed);
    }

    /**
     * Computes the degree of association between two sememes.
     * The current implementation is a placeholder that always returns
     * {@code 0.0}.
     *
     * @param sememeName1 name of the first sememe (currently unused)
     * @param sememeName2 name of the second sememe (currently unused)
     * @return always {@code 0.0}
     */
    public double getAssociation(String sememeName1, String sememeName2) {
        return 0.0;
    }

    /** Returns the value of a mandatory attribute, failing with a clear message if it is absent. */
    private static String requireAttribute(StartElement element, String name) throws IOException {
        javax.xml.stream.events.Attribute attribute = element.getAttributeByName(QName.valueOf(name));
        if (attribute == null) {
            throw new IOException("sememe element is missing required attribute '" + name + "'");
        }
        return attribute.getValue();
    }

    /** Closes a stream, ignoring a {@code null} argument and any failure while closing. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best-effort cleanup; the primary outcome has already been decided.
        }
    }
}