package ruc.irm.similarity.sentence.editdistance;
/**
 * Edit-distance computation used for Chinese sentence similarity.
 *
 * <p>The distance is a weighted edit distance: every insertion, deletion and
 * substitution is priced by the {@link EditUnit} involved instead of a fixed
 * unit cost.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">Xia Tian</a>
 * @organization Knowledge Engineering Lab, School of Information Resource Management,
 *               Renmin University of China
 */
public class StandardEditDistance extends EditDistance {

    /**
     * Computes the weighted edit distance needed to turn {@code X} into {@code Y}.
     *
     * <p>Units of {@code X} that disappear are charged their deletion cost, units of
     * {@code Y} that appear are charged their insertion cost, and a unit of {@code X}
     * replaced by a unit of {@code Y} is charged
     * {@code xUnit.getSubstitutionCost(yUnit)}. When either sequence is empty the
     * result is simply the summed insertion (or deletion) cost of the other one.
     *
     * <p>Uses a full {@code (m + 1) x (n + 1)} cost table, where {@code m} and
     * {@code n} are the lengths of {@code X} and {@code Y}.
     *
     * @param X the source sequence
     * @param Y the target sequence
     * @return the minimal total cost of transforming {@code X} into {@code Y}
     */
    public double getEditDistance(SuperString<? extends EditUnit> X, SuperString<? extends EditUnit> Y) {
        final int m = X.length();
        final int n = Y.length();

        // D[i][j] holds the cheapest cost of turning the first i units of X
        // into the first j units of Y. Java zero-fills the array, so D[0][0] == 0.0.
        final double[][] D = new double[m + 1][n + 1];

        // First column: reaching an empty target means deleting every unit of the X prefix.
        for (int i = 1; i <= m; i++) {
            D[i][0] = D[i - 1][0] + X.elementAt(i - 1).getDeletionCost();
        }

        // First row: starting from an empty source means inserting every unit of the Y prefix.
        for (int j = 1; j <= n; j++) {
            D[0][j] = D[0][j - 1] + Y.elementAt(j - 1).getInsertionCost();
        }

        // The two border loops above already produce the final answer when m == 0 or
        // n == 0, so no separate empty-input branch is needed.
        for (int i = 1; i <= m; i++) {
            final EditUnit xUnit = X.elementAt(i - 1);
            // Does not depend on j; read once per row.
            // NOTE(review): assumes the EditUnit cost getters are side-effect free - confirm.
            final double deletionCost = xUnit.getDeletionCost();

            for (int j = 1; j <= n; j++) {
                final EditUnit yUnit = Y.elementAt(j - 1);

                final double viaInsertion = D[i][j - 1] + yUnit.getInsertionCost();
                final double viaDeletion = D[i - 1][j] + deletionCost;
                final double viaSubstitution = D[i - 1][j - 1] + xUnit.getSubstitutionCost(yUnit);

                D[i][j] = Math.min(Math.min(viaInsertion, viaDeletion), viaSubstitution);
            }
        }

        return D[m][n];
    }

    /**
     * Small manual demo: prints the distance between two sample sentences, once
     * using {@code SuperString.createCharSuperString} and once using
     * {@code SuperString.createWordSuperString} (presumably character-level and
     * word-level segmentation respectively - see {@code SuperString}).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String s1 = "什么是计算机病毒";
        String s2 = "什么是电脑病毒";
        StandardEditDistance ed = new StandardEditDistance();

        System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1), SuperString.createCharSuperString(s2)));
        System.out.println(ed.getEditDistance(SuperString.createWordSuperString(s1), SuperString.createWordSuperString(s2)));
    }
}