package ruc.irm.similarity.sentence.morphology;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ruc.irm.similarity.sentence.SegmentProxy;
import ruc.irm.similarity.sentence.SegmentProxy.Word;
import ruc.irm.similarity.sentence.SentenceSimilarity;
import ruc.irm.similarity.word.WordSimilarity;
import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
/**
 * Sentence similarity based on word form and word order, with semantic factors taken
 * into account.<br/>
 * Follows the method described in section 5.4.3 of "Theory and Methods of Chinese
 * Information Similarity Computation". Because OnceWS(A, B) cannot be obtained directly
 * once semantics are considered, words are matched pairwise and the maximum is taken.
 * For the newer, improved algorithm see {@code SemanticSimilarity}.
 *
 * <p>The shared instance is created lazily by {@link #getInstance()} without any
 * synchronization.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 * @organization Knowledge Engineering Lab, School of Information Resource Management,
 *               Renmin University of China
 */
public class MorphoSimilarity implements SentenceSimilarity {

    private static final Logger LOG = LoggerFactory.getLogger(MorphoSimilarity.class);

    /** Weight of the word-form similarity in the overall similarity. */
    private static final double LAMBDA1 = 1.0;

    /** Weight of the word-order similarity in the overall similarity. */
    private static final double LAMBDA2 = 0.0;

    /** Whitespace and punctuation characters that are dropped from segmented sentences. */
    private static final String FILTER_CHARS = " ,。;?《》()|!,.;?<>|_^…!";

    private static MorphoSimilarity instance = null;

    /** Computes the similarity between two words. */
    private final WordSimilarity wordSimilarity;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared {@code MorphoSimilarity} instance
     */
    public static MorphoSimilarity getInstance() {
        if (instance == null) {
            instance = new MorphoSimilarity();
        }
        return instance;
    }

    private MorphoSimilarity() {
        LOG.debug("used hownet wordsimilarity.");
        this.wordSimilarity = XiaConceptParser.getInstance();
    }

    /**
     * Removes whitespace and punctuation tokens from a word list and lower-cases the rest.
     *
     * @param word_list the segmented words of a sentence
     * @return the remaining words, lower-cased, in their original order
     */
    private String[] filter(String[] word_list) {
        List<String> results = new ArrayList<String>();
        for (String w : word_list) {
            if (w == null || isFilterToken(w)) {
                continue;
            }
            // Locale.ROOT keeps the result independent of the JVM's default locale.
            results.add(w.toLowerCase(Locale.ROOT));
        }
        return results.toArray(new String[results.size()]);
    }

    /**
     * Tells whether a token consists solely of characters from {@link #FILTER_CHARS}.
     * An empty token counts as a filter token.
     */
    private static boolean isFilterToken(String token) {
        for (int i = 0; i < token.length(); i++) {
            if (FILTER_CHARS.indexOf(token.charAt(i)) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the similarity of two sentences as
     * {@code LAMBDA1 * wordFormSimilarity + LAMBDA2 * wordOrderSimilarity}, after
     * segmenting both sentences and filtering out punctuation tokens.
     *
     * @param firstSen  the first sentence
     * @param secondSen the second sentence
     * @return the weighted similarity of the two sentences
     * @see ruc.irm.similarity.Similaritable
     */
    public double getSimilarity(String firstSen, String secondSen) {
        String[] firstList = filter(segment(firstSen));
        String[] secondList = filter(segment(secondSen));

        double wordSim = getOccurrenceSimilarity(firstList, secondList);
        double orderSim = getOrderSimilarity(firstList, secondList);

        return LAMBDA1 * wordSim + LAMBDA2 * orderSim;
    }

    /**
     * Computes the word-form similarity of two word lists.
     *
     * <p>Every word of the first list is scored against every word of the second list.
     * Pairs are then chosen greedily: the highest remaining positive score is taken, its
     * two words are removed from further consideration, and the step repeats until no
     * positive score is left. The result is
     * {@code 2 * sumOfChosenScores / (firstList.length + secondList.length)}.
     *
     * @param firstList  the words of the first sentence
     * @param secondList the words of the second sentence
     * @return the word-form similarity; 0 if either list is {@code null} or empty
     */
    public double getOccurrenceSimilarity(String[] firstList, String[] secondList) {
        if (firstList == null || secondList == null
                || firstList.length == 0 || secondList.length == 0) {
            return 0;
        }
        int rows = firstList.length;
        int cols = secondList.length;

        // Score every possible pairing first.
        double[][] scores = new double[rows][cols];
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
            }
        }

        // Track consumed rows/columns instead of rebuilding a smaller matrix per pick.
        boolean[] rowUsed = new boolean[rows];
        boolean[] colUsed = new boolean[cols];
        double total_score = 0;

        // At most min(rows, cols) pairs can be formed.
        int maxPairs = Math.min(rows, cols);
        for (int picked = 0; picked < maxPairs; picked++) {
            double max_score = 0;
            int max_row = -1;
            int max_col = -1;
            // Find the best remaining pair; the strict comparison keeps the first one
            // encountered in row-major order when scores tie.
            for (int i = 0; i < rows; i++) {
                if (rowUsed[i]) {
                    continue;
                }
                for (int j = 0; j < cols; j++) {
                    if (!colUsed[j] && max_score < scores[i][j]) {
                        max_row = i;
                        max_col = j;
                        max_score = scores[i][j];
                    }
                }
            }
            if (max_row < 0) {
                // No positive score remains, so further picks cannot add anything.
                break;
            }
            rowUsed[max_row] = true;
            colUsed[max_col] = true;
            total_score += max_score;
        }

        return (2 * total_score) / (rows + cols);
    }

    /**
     * Computes the word-order similarity of two word lists.
     *
     * <p>Currently always returns 0; the arguments are not examined.
     *
     * @param firstList  the words of the first sentence
     * @param secondList the words of the second sentence
     * @return always {@code 0.0}
     */
    public double getOrderSimilarity(String[] firstList, String[] secondList) {
        double similarity = 0.0;
        return similarity;
    }

    /**
     * Segments a sentence into words via {@link SegmentProxy#segment}.
     *
     * @param sentence the sentence to segment
     * @return the text of each segmented word, in the order returned by the segmenter
     */
    public String[] segment(String sentence) {
        List<Word> list = SegmentProxy.segment(sentence);
        String[] results = new String[list.size()];
        for (int i = 0; i < list.size(); i++) {
            results[i] = list.get(i).getWord();
        }
        return results;
    }
}