package edu.fudan.nlp.similarity; /** * 计算编辑距离 * @author xpqiu * @version 1.0 * @since 1.0 */ public class EditDistance { public float calcNormalise(String word, String word2) { float distance = calc(word, word2); int len = word.length() > word2.length() ? word.length() : word2.length(); return (len - distance) / len; } /** * 将x转换到y的编辑距离,可以自定义一些代价 * @param cSeq1 * @param cSeq2 * @return 距离 */ public float calc(String cSeq1, String cSeq2) { //+1 : 下标为0节点为动态规划的起点 // cSeq1.length >= cSeq2.length > 1 int xsLength = cSeq1.length() + 1; // > ysLength int ysLength = cSeq2.length() + 1; // > 2 float[] lastSlice = new float[ysLength]; float[] currentSlice = new float[ysLength]; // first slice is just inserts currentSlice[0]=0; for (int y = 1; y < ysLength; ++y) currentSlice[y] = currentSlice[y-1] + costIns(cSeq2.charAt(y-1)); // y inserts down first column of lattice for (int x = 1; x < xsLength; ++x) { char cX = cSeq1.charAt(x-1); ///////////exchange between lastSlice and currentSlice//////////// float[] lastSliceTmp = lastSlice; lastSlice = currentSlice; currentSlice = lastSliceTmp; ///////////////////////////// currentSlice[0] = lastSlice[0]+costDel(cSeq1.charAt(x-1)); // x deletes across first row of lattice for (int y = 1; y < ysLength; ++y) { int yMinus1 = y - 1; char cY = cSeq2.charAt(yMinus1); // unfold this one step further to put 1 + outside all mins on match currentSlice[y] = Math.min(cX == cY ? lastSlice[yMinus1] // match : costReplace(cX,cY) + lastSlice[yMinus1], // 替换代价 Math.min(costDel(cX)+lastSlice[y], // 删除代价 costIns(cY)+currentSlice[yMinus1])); // 插入代价 } } return currentSlice[currentSlice.length-1]; } static String noCostChars = "的 最和"; static String maxCostChars = "不"; /** * @param c * @return 插入代价 */ protected static float costIns(char c) { if(noCostChars.indexOf(c)!=-1) return 0; if(maxCostChars.indexOf(c)!=-1) return 5; return 1; } /** * 删除 * @param c * @return 删除代价 */ protected static float costDel(char c) { if(noCostChars.indexOf(c)!=-1) return 0; if(maxCostChars.indexOf(c)!=-1) return 5; return 1; } static char[][] repCostChars = new char[][]{{'C','G'}}; /** * x和y肯定不同的 * @param x * @param y * @return 代价 */ protected static float costReplace(char x, char y) { int cost = 1; for(char[] xy: repCostChars){ if(xy[0]==x&&xy[1]==y){ cost =2; break; }else if(xy[0]==y&&xy[1]==x){ cost =2; break; } } return cost;//noCostChars.indexOf(c)!=-1?1:0; } public float sim(String str1, String str2) { float ld = calc(str1, str2); return 1 - ld / Math.max(str1.length(), str2.length()); } }