package ruc.irm.similarity.sentence.editdistance; /** * 由Gregor提出的考虑块交换(Block Transposition)的编辑距离改进算法 * 时间复杂度为O(m3n3) * 具体实现请参考GregorLeusch,Nicola Ueffing的文章《A Novel String-to-String Distance Measure With * Application to Machine Translation Evaluation》 * 问题:<br/> * 相似度计算的问题会影响句子相似度计算的直观结果,例如“什么是计算机病毒”,“电脑病毒是什么” * 直觉应该是2,即“什么是计算机病毒”首先变为“计算机病毒什么是”,再变为“计算机病毒是什么”, * 编辑代价为2,但实际上,当由“什么是计算机病毒”变为“计算机病毒什么是”后,由于"什么是"与“是什么”的替换代价只有0.2, * 因而不再进行交互,故总的编辑距离为1.2 * * @author <a href="mailto:iamxiatian@gmail.com">夏天</a> * @organization 中国人民大学信息资源管理学院 知识工程实验室 */ public class GregorEditDistance extends EditDistance { /** 块交换代价 */ public static double swapCost = 0.5; private SuperString<? extends EditUnit> S,T; /** 存放字符串从S(i0-i1)到T(j0-j1)的中间运算结果,避免多次运算,提高运算效率*/ private double[][][][] QArray; public double getEditDistance(SuperString<? extends EditUnit> S,SuperString<? extends EditUnit> T){ this.S = S; this.T = T; QArray = new double[S.length()][S.length()][T.length()][T.length()]; for(int i=0;i<S.length();i++){ for(int i2=0;i2<S.length();i2++) for(int j=0;j<T.length();j++) for(int j2=0;j2<T.length();j2++){ QArray[i][i2][j][j2] = Double.MAX_VALUE; } } return Q(0,S.length()-1,0,T.length()-1); } private double Q(int i0,int i1,int j0,int j1){ double cost = 0; if(i1<i0){ for(int j = j0; j<=j1; j++){ cost += T.elementAt(j).getInsertionCost(); } return cost; }else if(j1<j0){ for(int i=i0; i<=i1; i++){ cost += S.elementAt(i).getDeletionCost(); } return cost; }else if(i1==i0 && j1==j0){ cost = S.elementAt(i0).getSubstitutionCost(T.elementAt(j0)); QArray[i0][i1][j0][j1] = cost; return cost; } else if(i1==i0){ double minSubstituteValue = 1.0; int minPosJ = j0; for(int j=j0;j<=j1;j++){ double subsitituteValue = S.elementAt(i0).getSubstitutionCost(T.elementAt(j)); if(minSubstituteValue > subsitituteValue){ minSubstituteValue = subsitituteValue; minPosJ = j; } } for(int j=j0;j<=j1;j++){ if(j == minPosJ){ cost += minSubstituteValue; }else{ cost += T.elementAt(j).getInsertionCost(); } } }else if(j1==j0){ double minSubstituteValue = 1.0; int minPosI = i0; for(int i=i0;i<=i1;i++){ double subsitituteValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j0)); if(minSubstituteValue > subsitituteValue){ minSubstituteValue = subsitituteValue; minPosI = i; } } for(int i=i0;i<=i1;i++){ if(i == minPosI){ cost += minSubstituteValue; }else{ cost += S.elementAt(i).getDeletionCost(); } } }else{ if(QArray[i0][i1][j0][j1]<Double.MAX_VALUE){ return QArray[i0][i1][j0][j1]; } for(int i=i0;i<i1;i++){ for(int j=j0;j<j1;j++){ double c = Math.min(Q(i0,i,j0,j)+Q(i+1,i1,j+1,j1), Q(i0,i,j+1,j1)+Q(i+1,i1,j0,j)+swapCost); if(c<QArray[i0][i1][j0][j1]){ QArray[i0][i1][j0][j1] = c; } } } return QArray[i0][i1][j0][j1]; } QArray[i0][i1][j0][j1] = cost; return cost; } public static void main(String[] argv) { String s1 = "abcxdef"; String s2 = "defxabc"; //String s2 = "我的密码我忘记了,我该怎样做呢?"; GregorEditDistance ed = new GregorEditDistance(); System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1), SuperString.createCharSuperString(s2))); } }