package ruc.irm.similarity.phrase;

import java.util.ArrayList;
import java.util.List;

import ruc.irm.similarity.Similaritable;

/**
 * A simple phrase-similarity measure based on the positions of shared
 * characters. For the underlying principle see p. 69 of the book
 * 《中文信息相似度计算理论与方法》 (Theory and Methods of Chinese Information
 * Similarity Computation).
 *
 * <p>For every character of one phrase the nearest occurrence of the same
 * character in the other phrase is located; the closer that occurrence is to
 * the character's own index, the higher the character's contribution. The
 * per-character contributions are averaged in each direction and the two
 * directional scores are averaged again to give the final result.
 *
 * <p>Comparison is done on {@code char} values, so a character outside the
 * Basic Multilingual Plane (a surrogate pair) is treated as two separate units.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">夏天 (Xia Tian)</a>
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 */
public class PhraseSimilarity implements Similaritable {

    /**
     * Computes the similarity of two phrases as the mean of the two
     * directional scores {@code getSC(item1, item2)} and
     * {@code getSC(item2, item1)}.
     *
     * <p>{@code null} is treated like the empty string. Two empty phrases are
     * considered identical (1.0); an empty phrase compared with a non-empty
     * one has nothing in common with it (0.0). Without these guards the
     * divisions by the phrase lengths below would produce {@code NaN}.
     *
     * @param item1 the first phrase, may be {@code null}
     * @param item2 the second phrase, may be {@code null}
     * @return a similarity value in the range [0, 1]
     */
    @Override
    public double getSimilarity(String item1, String item2) {
        boolean empty1 = item1 == null || item1.length() == 0;
        boolean empty2 = item2 == null || item2.length() == 0;
        if (empty1 && empty2) {
            return 1.0;
        }
        if (empty1 || empty2) {
            return 0.0;
        }
        return (getSC(item1, item2) + getSC(item2, item1)) / 2.0;
    }

    /**
     * Collects every index in {@code second} at which the character
     * {@code first.charAt(pos)} occurs.
     *
     * @param first  the phrase supplying the character to look for
     * @param second the phrase that is searched
     * @param pos    index into {@code first} of the character to look for
     * @return the matching indexes of {@code second} in ascending order;
     *         empty when the character does not occur
     * @throws IndexOutOfBoundsException if {@code pos} is not a valid index
     *         of {@code first}
     */
    public List<Integer> getC(String first, String second, int pos) {
        List<Integer> results = new ArrayList<Integer>();
        char ch = first.charAt(pos);
        for (int i = 0; i < second.length(); i++) {
            if (ch == second.charAt(i)) {
                results.add(i);
            }
        }
        return results;
    }

    /**
     * Returns the smallest index distance {@code |k - pos|} between
     * {@code pos} and any index {@code k} of {@code second} holding the
     * character {@code first.charAt(pos)}.
     *
     * <p>The result is capped at {@code second.length()}: that value is
     * returned when the character does not occur in {@code second} or when
     * every occurrence is at least that far away.
     *
     * @param first  the phrase supplying the character to look for
     * @param second the phrase that is searched
     * @param pos    index into {@code first} of the character to look for
     * @return the minimal distance, never larger than {@code second.length()}
     * @throws IndexOutOfBoundsException if {@code pos} is not a valid index
     *         of {@code first}
     */
    public int getDistance(String first, String second, int pos) {
        int d = second.length();
        for (int k : getC(first, second, pos)) {
            int value = Math.abs(k - pos);
            if (d > value) {
                d = value;
            }
        }
        return d;
    }

    /**
     * Computes the contribution of the character {@code first.charAt(pos)}:
     * {@code (second.length() - distance) / second.length()}, where
     * {@code distance} is {@link #getDistance(String, String, int)}.
     *
     * <p>The value is 1.0 when the same character sits at the same index in
     * {@code second} and 0.0 when the character is absent from it. For an
     * empty {@code second} the result is 0.0 (rather than the {@code NaN} a
     * division by zero would give).
     *
     * @param first  the phrase supplying the character
     * @param second the phrase compared against
     * @param pos    index into {@code first} of the character being scored
     * @return the contribution of that character, in the range [0, 1]
     * @throws IndexOutOfBoundsException if {@code pos} is not a valid index
     *         of {@code first}
     */
    public double getCC(String first, String second, int pos) {
        int length = second.length();
        // Evaluated before the guard so an invalid pos is still reported.
        int distance = getDistance(first, second, pos);
        if (length == 0) {
            return 0.0;
        }
        return (length - distance) * 1.0 / length;
    }

    /**
     * Computes the directional score of {@code first} against
     * {@code second}: the mean of {@link #getCC(String, String, int)} over
     * every index of {@code first}.
     *
     * <p>An empty {@code first} has no characters to score and yields 0.0
     * (rather than the {@code NaN} a division by zero would give).
     *
     * @param first  the phrase whose characters are scored
     * @param second the phrase compared against
     * @return the directional score, in the range [0, 1]
     */
    public double getSC(String first, String second) {
        int length = first.length();
        if (length == 0) {
            return 0.0;
        }
        double total = 0.0;
        for (int i = 0; i < length; i++) {
            total = total + getCC(first, second, i);
        }
        return total / length;
    }
}