package ruc.irm.similarity.word;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import ruc.irm.similarity.Similaritable;

/**
 * Character-level ("literal") similarity between two words.
 *
 * <p>The score is the sum of two parts:
 * <ul>
 *   <li>an overlap part, weighted by {@code ALPHA}: the number of characters
 *       the two words share, divided by each word's length, averaged over
 *       the two words;</li>
 *   <li>a positional part, weighted by {@code BETA}: for each word, the
 *       fraction of its total 1-based position weight that falls on
 *       characters occurring in both words, averaged over the two words and
 *       scaled by the ratio of the shorter length to the longer one.</li>
 * </ul>
 *
 * <p>Characters are compared as UTF-16 {@code char} values (via
 * {@link String#charAt(int)}), so a supplementary code point counts as two
 * characters.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">Xia Tian</a>
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 */
public class CharBasedSimilarity implements Similaritable {

    /** Weight of the character-overlap part of the score. */
    private static final double ALPHA = 0.6;

    /** Weight of the position-weighted part of the score. */
    private static final double BETA = 0.4;

    /**
     * Computes the character-level similarity of two words.
     *
     * <p>Shared characters are counted as a multiset intersection: each
     * occurrence in one word is matched with at most one occurrence in the
     * other. This keeps the result within 0.0 to 1.0 and independent of the
     * argument order even when a word contains repeated characters.
     *
     * @param word1 first word; may be {@code null}
     * @param word2 second word; may be {@code null}
     * @return {@code 1.0} when both words are blank ({@code null}, empty or
     *         whitespace only), {@code 0.0} when exactly one of them is blank,
     *         otherwise a score between 0.0 and 1.0
     */
    @Override
    public double getSimilarity(String word1, String word2) {
        boolean blank1 = isBlank(word1);
        boolean blank2 = isBlank(word2);
        if (blank1 && blank2) {
            return 1.0;
        }
        if (blank1 || blank2) {
            return 0.0;
        }

        // Lengths are taken from the untrimmed inputs, as the blank check only
        // decides whether a word has any content at all.
        int len1 = word1.length();
        int len2 = word2.length();

        int commonCount = countCommonChars(word1, word2);
        Set<Character> sharedChars = sharedChars(word1, word2);

        // Length ratio in (0, 1]: damps the positional part when the word
        // lengths differ a lot.
        double dp = Math.min(1.0 * len1 / len2, 1.0 * len2 / len1);

        double part1 = ALPHA * (1.0 * commonCount / len1 + 1.0 * commonCount / len2) / 2.0;
        double part2 = BETA * dp
                * (getWeightedResult(word1, sharedChars) + getWeightedResult(word2, sharedChars)) / 2.0;
        return part1 + part2;
    }

    /**
     * Counts the characters two strings have in common, matching every
     * occurrence at most once (multiset intersection size).
     */
    private static int countCommonChars(String a, String b) {
        Map<Character, Integer> remaining = new HashMap<Character, Integer>();
        for (int i = 0; i < a.length(); i++) {
            Character ch = Character.valueOf(a.charAt(i));
            Integer seen = remaining.get(ch);
            remaining.put(ch, seen == null ? 1 : seen + 1);
        }

        int common = 0;
        for (int i = 0; i < b.length(); i++) {
            Character ch = Character.valueOf(b.charAt(i));
            Integer left = remaining.get(ch);
            if (left != null && left > 0) {
                // Consume one occurrence so it cannot be matched again.
                remaining.put(ch, left - 1);
                common++;
            }
        }
        return common;
    }

    /** Returns the set of distinct characters that occur in both strings. */
    private static Set<Character> sharedChars(String a, String b) {
        Set<Character> inA = new HashSet<Character>();
        for (int i = 0; i < a.length(); i++) {
            inA.add(Character.valueOf(a.charAt(i)));
        }

        Set<Character> shared = new HashSet<Character>();
        for (int i = 0; i < b.length(); i++) {
            Character ch = Character.valueOf(b.charAt(i));
            if (inA.contains(ch)) {
                shared.add(ch);
            }
        }
        return shared;
    }

    /**
     * Returns the fraction of a word's total position weight that falls on
     * shared characters. The character at index {@code i} has weight
     * {@code i + 1}, so later characters weigh more.
     *
     * @param word        non-empty word to score
     * @param sharedChars characters that occur in both compared words
     * @return a value between 0.0 and 1.0
     */
    private static double getWeightedResult(String word, Set<Character> sharedChars) {
        double top = 0;
        double bottom = 0;
        for (int i = 0; i < word.length(); i++) {
            int weight = i + 1;
            if (sharedChars.contains(Character.valueOf(word.charAt(i)))) {
                top += weight;
            }
            bottom += weight;
        }
        return 1.0 * top / bottom;
    }

    /** Returns true when the string is {@code null}, empty, or whitespace only. */
    private boolean isBlank(String str) {
        return str == null || str.trim().equals("");
    }
}