package edu.fudan.nlp.similarity;
import java.util.HashSet;
import edu.fudan.nlp.corpus.CiLin;
/**
 * Edit distance between two strings in which substrings that form a known
 * synonym pair may be substituted for one another at a small fixed cost.
 *
 * <p>A synonym pair is recognised when the key {@code left + "|" + right} is
 * present in the synonym set, where {@code left} is a substring of the first
 * string and {@code right} a substring of the second, each at most
 * {@code wordlen} characters long. Per-character replace / delete / insert
 * costs come from {@code costReplace}, {@code costDel} and {@code costIns},
 * which are not defined in this class (presumably inherited from
 * {@link EditDistance} -- confirm there).
 *
 * @author xpqiu
 * @version 1.0
 * @since 1.0
 */
public class EditDistanceWithSemantic extends EditDistance implements ISimilarity <String>{

    /** Synonym data file used by the no-argument constructor. */
    private static final String DEFAULT_DATA_FILE =
            "\\\\10.11.7.3\\f$\\对于共享版《同义词词林》的改进\\improvedThesaurus.data";

    /** Default maximum length, in characters, of a substring looked up as a synonym. */
    private static final int DEFAULT_WORD_LEN = 2;

    /** Cost charged for replacing a substring with one of its synonyms. */
    private static final float SYNONYM_COST = 0.1f;

    /** Maximum length, in characters, of the substrings tried as synonym candidates. */
    private final int wordlen;

    /** Synonym pairs, each stored as a {@code "left|right"} key. */
    private final HashSet<String> synSet;

    /**
     * Creates an instance that loads its synonym set from the default data file
     * and uses a synonym window of 2 characters.
     */
    public EditDistanceWithSemantic(){
        this(DEFAULT_DATA_FILE);
    }

    /**
     * Creates an instance that loads its synonym set from the given data file
     * and uses a synonym window of 2 characters.
     *
     * @param dataFile path handed to {@code CiLin.buildSynonymSet}
     */
    public EditDistanceWithSemantic(String dataFile){
        this.wordlen = DEFAULT_WORD_LEN;
        this.synSet = (HashSet<String>) CiLin.buildSynonymSet(dataFile);
    }

    /**
     * Creates an instance from an already built synonym set.
     *
     * @param synSet  synonym pairs encoded as {@code "left|right"} keys; must not be null
     * @param wordlen maximum substring length tried on each side of a synonym lookup;
     *                values below 1 disable synonym matching entirely
     * @throws NullPointerException if {@code synSet} is null
     */
    public EditDistanceWithSemantic(HashSet<String> synSet, int wordlen){
        if (synSet == null) {
            throw new NullPointerException("synSet");
        }
        this.wordlen = wordlen;
        this.synSet = synSet;
    }

    /**
     * Computes the edit distance for turning {@code item1} into {@code item2},
     * allowing synonym substitutions at a reduced cost.
     *
     * <p>If either string is empty the length of the other one is returned
     * directly, without consulting the per-character cost functions.
     *
     * @param item1 source string
     * @param item2 target string
     * @return the accumulated cost stored in the last cell of the distance matrix
     */
    public float calc(String item1, String item2) {
        final int n = item1.length();
        final int m = item2.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        // d[i][j] holds the cost of converting the first i chars of item1
        // into the first j chars of item2.
        final float[][] d = new float[n + 1][m + 1];
        for (int i = 0; i <= n; i++) { // first column
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) { // first row
            d[0][j] = j;
        }

        for (int i = 1; i <= n; i++) {
            final char cX = item1.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                // Seed this cell and upcoming cells with synonym-based costs
                // before the regular recurrence is evaluated.
                seedSynonymCosts(item1, item2, d, i, j);

                final char cY = item2.charAt(j - 1);
                float best = (cX == cY ? d[i - 1][j - 1] // match
                        : costReplace(cX, cY) + d[i - 1][j - 1]); // replacement
                // A value above zero means a synonym substitution already
                // reached this cell; keep it if it is cheaper.
                if (d[i][j] > 0) {
                    best = Math.min(best, d[i][j]);
                }
                d[i][j] = Math.min(best,
                        Math.min(costDel(cX) + d[i - 1][j], // deletion
                                costIns(cY) + d[i][j - 1])); // insertion
            }
        }
        return d[n][m];
    }

    /**
     * Looks up every pair of substrings starting at position {@code i} of
     * {@code str1} and position {@code j} of {@code str2} (1-based, each up to
     * {@code wordlen} characters) and, for each pair found in the synonym set,
     * lowers the matrix cell just past both substrings to
     * {@code d[i-1][j-1] + SYNONYM_COST} when that is an improvement.
     */
    private void seedSynonymCosts(String str1, String str2, float[][] d, int i, int j) {
        // d[i-1][j-1] is never written below (all targets have row >= i and
        // column >= j), so the candidate cost can be computed once.
        final float candidate = d[i - 1][j - 1] + SYNONYM_COST;
        for (int ii = 1; ii <= wordlen && i + ii - 1 <= str1.length(); ii++) {
            final int row = i + ii - 1;
            final String left = str1.substring(i - 1, row);
            for (int jj = 1; jj <= wordlen && j + jj - 1 <= str2.length(); jj++) {
                final int col = j + jj - 1;
                final String key = left + "|" + str2.substring(j - 1, col);
                if (!synSet.contains(key)) {
                    continue;
                }
                // Zero marks a cell no synonym has touched yet.
                d[row][col] = d[row][col] > 0
                        ? Math.min(d[row][col], candidate)
                        : candidate;
            }
        }
    }

    /**
     * Small manual check: prints the distance between two sample strings.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        EditDistanceWithSemantic ed = new EditDistanceWithSemantic();
        String str1 = "发行时间 ";
        String str2 = "生日";
        System.out.println("ld="+ed.calc(str1, str2));
    }
}