/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.dictionary;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link Distance} implementation for strings.
 *
 * The active metric is the Levenshtein edit distance (see
 * {@link #getDistance(String, String)}). A letter-pair based score is also
 * available through {@link #compareStrings(String, String)}.
 *
 * Huge list of different algos, in Java:
 * http://www.dcs.shef.ac.uk/~sam/stringmetrics.html
 *
 * Simplified "similarity", even more logical matching, very fast:
 * http://www.catalysoft.com/articles/StrikeAMatch.html
 *
 * @author Fuad Efendi
 *
 */
public class DistanceImpl implements Distance {

    public DistanceImpl() {
    }

    /**
     * Computes the distance between two objects, both of which must be
     * strings. Delegates to {@link #getDistance(String, String)}
     * (Levenshtein); {@link #compareStrings(String, String)} is the
     * letter-pair alternative and is not used here.
     *
     * @param object1 first value, must be a {@link String}
     * @param object2 second value, must be a {@link String}
     * @return the Levenshtein distance between the two strings
     * @throws ClassCastException if either argument is not a {@link String}
     * @throws NullPointerException if either argument is {@code null}
     */
    public int getDistance(Object object1, Object object2) {
        String string1 = (String) object1;
        String string2 = (String) object2;
        return getDistance(string1, string2);
    }

    /**
     * Computes the Levenshtein distance between two strings: the minimum
     * number of single-character insertions, deletions and substitutions
     * needed to turn one into the other. Adapted from the Lucene 3.0
     * "contrib" implementation.
     *
     * Only two rows of the cost matrix (each of length
     * {@code target.length() + 1}) are kept and swapped after every
     * character of {@code other}, instead of materialising the whole matrix.
     *
     * @param target first string
     * @param other second string
     * @return the edit distance; {@code 0} when the strings are equal, and
     *         the length of the non-empty string when the other is empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public int getDistance(String target, String other) {
        final char[] sa = target.toCharArray();
        final int n = sa.length;
        final int m = other.length();

        // With an empty side the distance is the number of characters to
        // insert (or delete): 0 if both are empty, otherwise the length of
        // the non-empty string. At least one of n, m is 0 here, so n + m is
        // exactly that length.
        if (n == 0 || m == 0) {
            return n + m;
        }

        int[] p = new int[n + 1]; // 'previous' cost row
        int[] d = new int[n + 1]; // 'current' cost row
        int[] swap;               // placeholder to assist in swapping p and d

        // Distance from the empty prefix of 'other' to each prefix of 'target'.
        for (int i = 0; i <= n; i++) {
            p[i] = i;
        }

        for (int j = 1; j <= m; j++) {
            final char otherChar = other.charAt(j - 1);
            d[0] = j;

            for (int i = 1; i <= n; i++) {
                final int cost = sa[i - 1] == otherChar ? 0 : 1;
                // minimum of cell to the left + 1, to the top + 1,
                // diagonally left and up + cost
                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
            }

            // The row just filled becomes the 'previous' row of the next pass.
            swap = p;
            p = d;
            d = swap;
        }

        // The last action in the loop was to switch d and p, so p holds the
        // most recent cost counts.
        return p[n];
    }

    /**
     * Splits a string into its overlapping two-character substrings.
     * See http://www.catalysoft.com/articles/StrikeAMatch.html
     *
     * @param str the input string
     * @return the adjacent letter pairs of {@code str}, in order; empty when
     *         {@code str} has fewer than two characters
     */
    private static List<String> letterPairs(String str) {
        List<String> pairs = new ArrayList<String>();
        for (int i = 0; i < str.length() - 1; i++) {
            pairs.add(str.substring(i, i + 2));
        }
        return pairs;
    }

    /**
     * Letter-pair based score derived from
     * http://www.catalysoft.com/articles/StrikeAMatch.html
     *
     * Each pair of the first string is matched against at most one pair of
     * the second string (a matched pair is consumed). With
     * {@code total = pairs(str1) + pairs(str2)} and {@code matched} the number
     * of matches, the result is {@code total - 2 * (matched + 1)}. Smaller
     * values mean more shared pairs; two identical strings yield {@code -2}.
     *
     * NOTE(review): this is not the [0,1] similarity ratio of the referenced
     * article, and the reason for the {@code + 1} offset is not visible in
     * this file. Confirm against callers before changing it.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the integer score described above
     * @throws NullPointerException if either argument is {@code null}
     */
    public static final int compareStrings(String str1, String str2) {
        List<String> pairs1 = letterPairs(str1);
        List<String> pairs2 = letterPairs(str2);
        int intersection = 0;
        int union = pairs1.size() + pairs2.size();
        for (int i = 0; i < pairs1.size(); i++) {
            for (int j = 0; j < pairs2.size(); j++) {
                if (pairs1.get(i).equals(pairs2.get(j))) {
                    intersection++;
                    // Remove by index so the same pair cannot match twice.
                    pairs2.remove(j);
                    break;
                }
            }
        }
        return union - (intersection + 1) * 2;
    }
}