package ruc.irm.similarity.word.hownet.sememe;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import ruc.irm.similarity.Similaritable;
import ruc.irm.similarity.util.BlankUtils;
import ruc.irm.similarity.util.FileUtils;
import ruc.irm.similarity.word.hownet.HownetMeta;

/**
 * Sememe parser: loads the sememe dictionary, organizes and indexes the
 * sememes, answers lookups by Chinese word, and computes the tree distance
 * between sememes. Similarity computation itself is left to subclasses.
 * <p>
 * For the core idea of the algorithm see the paper
 * 《汉语词语语义相似度计算研究》.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 *
 * @see ruc.irm.similarity.Similaritable
 * @deprecated
 */
public abstract class SememeParser implements HownetMeta, Similaritable {

    protected Logger LOG = LoggerFactory.getLogger(this.getClass());

    /**
     * All sememes, stored in one array. The lookup code in this class uses a
     * sememe id (and a parent id) directly as an index into this array, so
     * ids are expected to equal array positions.
     */
    protected Sememe[] SEMEMES;

    /**
     * Index from a sememe's Chinese word to the id(s) of the sememes carrying
     * that word; the ids are then resolved through {@link #SEMEMES}.
     */
    private FastSimpleMap<String, Integer> sememeMap = null;

    /**
     * Loads the sememe dictionary from the classpath resource
     * {@code <package-of-runtime-class>/sememe.dat}, decoded as UTF-8.
     * <p>
     * Note that the package is taken from {@code getClass()}, i.e. the
     * runtime (sub)class, not necessarily this class.
     *
     * @throws IOException if the resource cannot be found or cannot be read
     */
    public SememeParser() throws IOException {
        String sememeFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";
        InputStream input = this.getClass().getClassLoader().getResourceAsStream(sememeFile);
        if (input == null) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of handing a null stream to the loader.
            throw new IOException("sememe dictionary resource not found on classpath: " + sememeFile);
        }
        try {
            load(input, "UTF-8");
        } finally {
            // This constructor opened the stream, so it is responsible for releasing it.
            // Closing an already-closed stream has no effect.
            input.close();
        }
    }

    /**
     * Returns the similarity of two sememe description strings.
     *
     * @param sememeName1 the first sememe description
     * @param sememeName2 the second sememe description
     * @see Similaritable
     * @return the similarity value as defined by the subclass
     */
    public abstract double getSimilarity(String sememeName1, String sememeName2);

    /**
     * Returns the similarity of two concrete sememes.
     *
     * @param sememe1 the first sememe
     * @param sememe2 the second sememe
     * @return the similarity value as defined by the subclass
     */
    public abstract double getSimilarity(Sememe sememe1, Sememe sememe2);

    /**
     * Loads the sememe knowledge from the given stream and rebuilds the
     * Chinese-word index. The stream is not closed by this method itself.
     *
     * @param input    the stream containing the sememe dictionary
     * @param encoding the character encoding of the stream
     * @throws IOException if reading the dictionary fails
     */
    public void load(InputStream input, String encoding) throws IOException {
        SememeDictTraverseEvent event = new SememeDictTraverseEvent();
        LOG.info("loading sememe dictionary...");
        long time = System.currentTimeMillis();

        FileUtils.traverseLines(input, encoding, event);
        this.SEMEMES = event.getSememes();

        // Build the index: Chinese word -> sememe id (one entry per sememe).
        String[] keys = new String[SEMEMES.length];
        Integer[] values = new Integer[SEMEMES.length];
        for (int i = 0; i < SEMEMES.length; i++) {
            keys[i] = SEMEMES[i].getCnWord();
            values[i] = SEMEMES[i].getId();
        }
        sememeMap = new FastSimpleMap<String, Integer>(keys, values);

        time = System.currentTimeMillis() - time;
        LOG.info("sememe dictionary load completely. time elapsed: {}", time);
    }

    /**
     * Computes the distance between two sememes identified by their Chinese
     * words. {@code Integer.MAX_VALUE} means the distance is infinite.
     * <br/>Several sememes may share the same Chinese word, so the result is
     * the smallest distance over all candidate pairs.
     *
     * @param key1 the Chinese word of the first sememe
     * @param key2 the Chinese word of the second sememe
     * @return {@code 0} if the two words are equal, the minimal tree distance
     *         otherwise, or {@code Integer.MAX_VALUE} if either word is not a
     *         known sememe or no pair shares an ancestor
     */
    public int getDistance(String key1, String key2) {
        int distance = Integer.MAX_VALUE;

        // Identical strings: distance is 0 without consulting the dictionary.
        if (key1.equals(key2)) {
            return 0;
        }

        Integer[] semArray1 = getSememes(key1);
        Integer[] semArray2 = getSememes(key2);

        // If either key is not a sememe (and the keys differ), the distance is infinite.
        if (semArray1.length == 0 || semArray2.length == 0) {
            return Integer.MAX_VALUE;
        }

        for (int i : semArray1) {
            for (int j : semArray2) {
                int d = getDistance(SEMEMES[i], SEMEMES[j]);
                if (d < distance) {
                    distance = d;
                }
            }
        }

        return distance;
    }

    /**
     * Computes the distance between two sememes in the sememe tree, counted
     * as the number of parent links on the path joining them.
     *
     * @param sem1 the first sememe
     * @param sem2 the second sememe
     * @return the distance between the two sememes, or
     *         {@code Integer.MAX_VALUE} if either argument is {@code null} or
     *         the two sememes have no common ancestor
     */
    public int getDistance(Sememe sem1, Sememe sem2) {
        Sememe mysem1 = sem1;
        Sememe mysem2 = sem2;
        int distance = 0;

        if (mysem1 == null || mysem2 == null) {
            return Integer.MAX_VALUE;
        }

        // Bring both nodes to the same depth by lifting the deeper one.
        int level = mysem1.getDepth() - mysem2.getDepth();
        int steps = Math.abs(level);
        for (int i = 0; i < steps; i++) {
            if (level > 0) {
                mysem1 = SEMEMES[mysem1.getParentId()];
            } else {
                mysem2 = SEMEMES[mysem2.getParentId()];
            }
            distance++;
        }

        // From equal depth, climb both branches in lock-step until they meet.
        while (mysem1.getId() != mysem2.getId()) {
            // A node whose parent id equals its own id is a root. If a root is reached
            // while the nodes still differ, there is no common ancestor: infinite distance.
            if (mysem1.getId() == mysem1.getParentId() || mysem2.getId() == mysem2.getParentId()) {
                distance = Integer.MAX_VALUE;
                break;
            }

            mysem1 = SEMEMES[mysem1.getParentId()];
            mysem2 = SEMEMES[mysem2.getParentId()];
            distance += 2; // one step up on each side
        }

        return distance;
    }

    /**
     * Builds a string describing the path from the root down to the sememe
     * with the given Chinese word, in the form
     * {@code START->root->...->sememe}. If the word is unknown the result is
     * just {@code START}.
     *
     * @param key the Chinese word of the sememe
     * @return the path representation
     */
    public String getPath(String key) {
        StringBuilder path = new StringBuilder();

        Sememe sem = getSememe(key);
        // Walk upwards, prepending each node, until the root (parent id == own id).
        while (sem != null && sem.getId() != sem.getParentId()) {
            path.insert(0, "->" + sem.getCnWord());
            sem = SEMEMES[sem.getParentId()];
        }

        // Prepend the root itself.
        if (sem != null) {
            path.insert(0, "->" + sem.getCnWord());
        }
        path.insert(0, "START");

        return path.toString();
    }

    /**
     * Looks up the ids of all sememes carrying the given name. One name may
     * correspond to several sememes; all of them are returned.
     *
     * @param sememeName the Chinese word of the sememe
     * @return the ids of the matching sememes; an empty array if there is no
     *         match
     */
    public Integer[] getSememes(String sememeName) {
        Collection<Integer> ids = sememeMap.get(sememeName);
        if (ids == null) {
            // Defensive: treat a null lookup result as "no match" so callers that check
            // for an empty array (getDistance, getSememe) keep working.
            return new Integer[0];
        }
        return ids.toArray(new Integer[ids.size()]);
    }

    /**
     * Returns one sememe for the given name (the first match). Most names map
     * to exactly one sememe.
     *
     * @param sememeName the Chinese word of the sememe
     * @return the first matching sememe, or {@code null} if there is none
     */
    public Sememe getSememe(String sememeName) {
        Integer[] ids = getSememes(sememeName);
        if (BlankUtils.isBlank(ids)) {
            return null;
        }
        return SEMEMES[ids[0]];
    }

    /**
     * Filters a sememe string by dropping everything up to and including the
     * first {@code '|'} (the English part), leaving the remainder. A string
     * without {@code '|'} is returned unchanged.
     *
     * @param sememeString the raw sememe string
     * @return the part after the first {@code '|'}, or the input itself
     */
    protected String filterSememeString(String sememeString) {
        int pos = sememeString.indexOf("|");
        if (pos >= 0) {
            sememeString = sememeString.substring(pos + 1);
        }

        return sememeString;
    }
}