package ruc.irm.similarity.word.pinyin;

import java.util.Set;

import ruc.irm.similarity.Similaritable;
import ruc.irm.similarity.util.EditDistance;
import ruc.irm.similarity.util.PinyinUtils;

/**
 * Decides how similar two words are by comparing their pinyin. The closeness
 * of two pinyin strings is measured with the edit distance algorithm and then
 * normalized by the length of the longer string.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">Xia Tian</a>
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 */
public class PinyinSimilarity implements Similaritable {

    /**
     * Returns the best pinyin similarity between the two items.
     *
     * <p>Every pinyin string obtained for {@code item1} is compared with every
     * pinyin string obtained for {@code item2}; the highest normalized score
     * over all pairs is returned. If either set is empty the result is
     * {@code 0.0}.
     *
     * @param item1 the first word
     * @param item2 the second word
     * @return the highest pairwise score, starting from {@code 0.0}; the
     *         search stops as soon as a score of exactly {@code 1.0} is found
     */
    public double getSimilarity(String item1, String item2) {
        Set<String> pinyinSet1 = PinyinUtils.getInstance().getPinyin(item1);
        Set<String> pinyinSet2 = PinyinUtils.getInstance().getPinyin(item2);

        double max = 0.0;
        for (String pinyin1 : pinyinSet1) {
            for (String pinyin2 : pinyinSet2) {
                double similarity = normalizedSimilarity(pinyin1, pinyin2);
                // Strict comparison: a non-finite score can never replace a valid maximum.
                if (similarity > max) {
                    max = similarity;
                }
                // A perfect match cannot be improved on, so stop searching.
                if (max == 1.0) {
                    return max;
                }
            }
        }
        return max;
    }

    /**
     * Computes {@code 1 - editDistance / longerLength} for one pair of pinyin strings.
     *
     * <p>Two empty strings are treated as identical and score {@code 1.0}.
     * Without this guard the division would be {@code 0.0 / 0}, which is
     * {@code NaN} and would leak out of {@link #getSimilarity(String, String)}.
     */
    private static double normalizedSimilarity(String pinyin1, String pinyin2) {
        int longerLength = Math.max(pinyin1.length(), pinyin2.length());
        if (longerLength == 0) {
            return 1.0;
        }
        double distance = new EditDistance().getEditDistance(pinyin1, pinyin2);
        return 1 - distance / longerLength;
    }
}