/*
* Sifarish: Recommendation Engine
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.sifarish.feature;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
/**
 * Distance between two text values based on edit distance.
 *
 * <p>The edit distance used here allows insertions and deletions only, so for
 * two strings {@code a} and {@code b} it equals
 * {@code a.length() + b.length() - 2 * LCS(a, b)}, where {@code LCS} is the
 * length of their longest common subsequence. It is normalized by
 * {@code a.length() + b.length()}, which keeps the result in {@code [0, 1]}.
 *
 * <p>In token-wise mode both values are split with {@code fieldDelimRegex}
 * (not declared in this class; presumably inherited from
 * {@link DynamicAttrSimilarityStrategy}) and the distance is the average of
 * the normalized distances of tokens at the same position. In field-wise mode
 * the two values are compared as whole strings.
 *
 * @author pranab
 */
public class EditDistanceSimilarity extends DynamicAttrSimilarityStrategy {
    /** When true, compare token by token; otherwise compare whole fields. */
    private final boolean tokenWise;

    /**
     * @param tokenWise true to compare corresponding tokens, false to compare
     *        the two values as whole strings
     */
    public EditDistanceSimilarity(boolean tokenWise) {
        super();
        this.tokenWise = tokenWise;
    }

    /* (non-Javadoc)
     * @see org.sifarish.feature.DynamicAttrSimilarityStrategy#findDistance(java.lang.String, java.lang.String)
     */
    @Override
    public double findDistance(String src, String target) {
        return tokenWise
                ? findDistanceTokenWise(src, target)
                : findDistanceFieldWise(src, target);
    }

    /**
     * Averages the normalized edit distance over tokens at the same position.
     *
     * @param src source text
     * @param target target text
     * @return average normalized distance in {@code [0, 1]}; {@code 1.0} when
     *         the two values split into a different number of tokens
     */
    private double findDistanceTokenWise(String src, String target) {
        String[] srcTerms = src.split(fieldDelimRegex);
        String[] trgTerms = target.split(fieldDelimRegex);

        if (srcTerms.length != trgTerms.length) {
            //unequal number of tokens
            return 1.0;
        }
        if (srcTerms.length == 0) {
            //String.split drops trailing empty strings, so input made only of
            //delimiters yields no tokens; avoid 0/0 and treat as identical
            return 0.0;
        }

        double distance = 0;
        for (int i = 0; i < srcTerms.length; ++i) {
            distance += normalizedEditDistance(srcTerms[i], trgTerms[i]);
        }
        //average over number of tokens
        return distance / srcTerms.length;
    }

    /**
     * Normalized edit distance between the two values taken as whole strings.
     *
     * @param src source text
     * @param target target text
     * @return normalized distance in {@code [0, 1]}
     */
    private double findDistanceFieldWise(String src, String target) {
        return normalizedEditDistance(src, target);
    }

    /**
     * Insert/delete edit distance between two strings divided by the sum of
     * their lengths.
     *
     * @param first first string
     * @param second second string
     * @return {@code 0.0} for equal strings (including two empty strings),
     *         {@code 1.0} when the strings share no character
     */
    private static double normalizedEditDistance(String first, String second) {
        int totalLength = first.length() + second.length();
        //equal strings need no edits; this also guards the 0/0 case
        if (totalLength == 0 || first.equals(second)) {
            return 0.0;
        }
        int editDistance = totalLength - 2 * longestCommonSubsequenceLength(first, second);
        return ((double) editDistance) / totalLength;
    }

    /**
     * Length of the longest common subsequence of two strings, computed with
     * the standard dynamic programming recurrence using two rolling rows.
     *
     * @param first first string
     * @param second second string
     * @return number of characters in the longest common subsequence
     */
    private static int longestCommonSubsequenceLength(String first, String second) {
        int cols = second.length();
        //previous[j] holds the LCS length of first[0..i-1) and second[0..j)
        int[] previous = new int[cols + 1];
        int[] current = new int[cols + 1];

        for (int i = 1; i <= first.length(); ++i) {
            char ch = first.charAt(i - 1);
            for (int j = 1; j <= cols; ++j) {
                if (ch == second.charAt(j - 1)) {
                    current[j] = previous[j - 1] + 1;
                } else {
                    current[j] = Math.max(previous[j], current[j - 1]);
                }
            }
            //reuse the two rows; index 0 stays 0 in both
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return previous[cols];
    }
}