package org.gbif.nub.lookup.similarity;
// Copyright (c) 2011, Commonwealth of Australia
// Copied and adapted from ALA:
// https://ala-nsl.googlecode.com/svn/taxamatch/trunk/src/au/org/biodiversity/services/taxamatch/impl/ModifiedDamerauLevenshtein.java
import java.util.Arrays;
/**
 * Modified Damerau-Levenshtein edit distance: a Levenshtein distance that additionally
 * recognises transpositions of adjacent, equal-length character blocks.
 *
 * <p>Instances are immutable. A single 64x64 scratch matrix is shared between all
 * instances to reduce heap use; it is handed to at most one caller at a time, and any
 * concurrent caller (or any input too long for it) works on a privately allocated matrix.
 */
public class ModifiedDamerauLevenshtein implements StringSimilarity {

  /**
   * log2 of the row stride of the pooled matrix. A cell {@code [row][col]} lives at
   * {@code (row << shift) | col}, which is cheaper than a 2-dimensional array.
   */
  private static final int POOLED_SHIFT = 6;

  /** Number of rows and columns of the pooled matrix (so it serves trimmed lengths up to 63). */
  private static final int POOLED_DIM = 1 << POOLED_SHIFT;

  /** The shared scratch matrix, or {@code null} while some caller has borrowed it. */
  private static volatile int[] pooledMatrix = new int[POOLED_DIM * POOLED_DIM];

  /** Guards the check-and-take of {@link #pooledMatrix}. */
  private static final Object POOL_LOCK = new Object();

  /** Upper bound for the length of a transposed block that is looked for. */
  private final int pBlockLimit;

  /**
   * A default MDL with a block limit of 2, i.e. transposed blocks of up to 2 characters
   * are recognised.
   */
  public ModifiedDamerauLevenshtein() {
    this.pBlockLimit = 2;
  }

  /**
   * @param limit the maximum length of a transposed character block to look for;
   *              values below 1 are treated as 1
   */
  public ModifiedDamerauLevenshtein(int limit) {
    this.pBlockLimit = limit;
  }

  /**
   * Computes the edit distance of the two strings and converts it via
   * {@code DistanceUtils.convertEditDistanceToSimilarity}.
   *
   * @param x1 first string, must not be null
   * @param x2 second string, must not be null
   * @return the similarity value produced by {@code DistanceUtils}
   */
  @Override
  public double getSimilarity(String x1, String x2) {
    return DistanceUtils.convertEditDistanceToSimilarity(getEditDistance(x1, x2), x1, x2);
  }

  /**
   * Calculates the modified Damerau-Levenshtein distance between two strings.
   *
   * <p>Inserting, deleting or substituting a character costs 1. Two adjacent blocks of
   * {@code n} characters that appear swapped in the other string are charged
   * {@code n - 1} plus the substitution cost of the current cell, on top of the distance
   * before the two blocks. Common leading and trailing characters are stripped first.
   *
   * @param s1 first string, must not be null
   * @param s2 second string, must not be null
   * @return the edit distance, 0 for equal strings
   * @throws NullPointerException if either argument is null
   * @throws IllegalArgumentException if the inputs are so long that the distance matrix
   *         cannot be addressed with an {@code int} index
   */
  public final int getEditDistance(final String s1, final String s2) {
    if (s1.equals(s2)) {
      return 0;
    } else if (s1.isEmpty() || s2.isEmpty()) {
      return Math.max(s1.length(), s2.length());
    } else if (s1.length() == 1 && s2.length() == 1) {
      return 1;
    }

    // Strip the common prefix, then the common suffix of what remains.
    int start = 0;
    int end1 = s1.length();
    int end2 = s2.length();
    while (start < end1 && start < end2 && s1.charAt(start) == s2.charAt(start)) {
      start++;
    }
    while (end1 > start && end2 > start && s1.charAt(end1 - 1) == s2.charAt(end2 - 1)) {
      end1--;
      end2--;
    }
    final int len1 = end1 - start;
    final int len2 = end2 - start;
    if (len1 == 0 || len2 == 0) {
      return Math.max(len1, len2);
    } else if (len1 == 1 && len2 == 1) {
      return 1;
    }
    final char[] t1 = s1.substring(start, end1).toCharArray();
    final char[] t2 = s2.substring(start, end2).toCharArray();

    // Largest block worth testing: bounded by the configured limit and by half of each
    // string, but never less than 1. It does not depend on the cell, so compute it once.
    final int maxBlock = Math.max(
        Math.min(len1 / 2, Math.min(len2 / 2, Math.max(pBlockLimit, 1))),
        1);

    // The pooled matrix only fits rows/columns 0..63. Longer inputs get their own matrix
    // whose power-of-two stride is wide enough to hold column index len2.
    final boolean pooled = len1 < POOLED_DIM && len2 < POOLED_DIM;
    final int shift = pooled
        ? POOLED_SHIFT
        : Math.max(POOLED_SHIFT, 32 - Integer.numberOfLeadingZeros(len2));
    final int[] matrix;
    if (pooled) {
      matrix = borrowPooledMatrix();
    } else {
      final long cells = ((long) len1 + 1) << shift;
      if (cells > Integer.MAX_VALUE) {
        throw new IllegalArgumentException(
            "Strings too long to compare: " + len1 + " and " + len2 + " characters after trimming");
      }
      matrix = new int[(int) cells];
    }

    try {
      // No clearing needed: row 0 and column 0 are set here and every other cell is
      // written before it is read.
      for (int i = 0; i <= len1; i++) {
        matrix[i << shift] = i;
      }
      for (int j = 0; j <= len2; j++) {
        matrix[j] = j;
      }

      for (int i = 1; i <= len1; i++) {
        final int row = i << shift;
        final int prevRow = (i - 1) << shift;
        for (int j = 1; j <= len2; j++) {
          final int cost = t1[i - 1] == t2[j - 1] ? 0 : 1;
          final int ins = matrix[row | (j - 1)] + 1;
          final int del = matrix[prevRow | j] + 1;

          // Plain Levenshtein step, used when no transposed block ends at this cell.
          int best = minimum(ins, del, matrix[prevRow | (j - 1)] + cost);

          // Look for the longest pair of swapped blocks ending at (i, j). When one is
          // found, the transposition replaces the substitution candidate.
          for (int block = maxBlock; block >= 1; block--) {
            final int span = block * 2;
            if (i >= span && j >= span
                && regionEquals(t1, i - span, t2, j - block, block)
                && regionEquals(t1, i - block, t2, j - span, block)) {
              final int tran = matrix[((i - span) << shift) | (j - span)] + cost + (block - 1);
              best = minimum(ins, del, tran);
              break;
            }
          }
          matrix[row | j] = best;
        }
      }
      return matrix[(len1 << shift) | len2];
    } finally {
      if (pooled) {
        // Hand the buffer back; a single volatile reference write needs no lock.
        pooledMatrix = matrix;
      }
    }
  }

  /**
   * Takes the shared scratch matrix if it is currently available, otherwise allocates a
   * fresh one of the same size.
   */
  private static int[] borrowPooledMatrix() {
    synchronized (POOL_LOCK) {
      final int[] shared = pooledMatrix;
      if (shared != null) {
        pooledMatrix = null;
        return shared;
      }
    }
    return new int[POOLED_DIM * POOLED_DIM];
  }

  /** Returns the smallest of the three values. */
  private static int minimum(int d, int i, int s) {
    return d < i ? (d < s ? d : s) : (i < s ? i : s);
  }

  /**
   * Tests whether {@code length} characters of {@code src1} starting at the 0-based
   * offset {@code from1} equal those of {@code src2} starting at {@code from2}.
   * The caller guarantees both regions lie inside their arrays.
   */
  private static boolean regionEquals(char[] src1, int from1, char[] src2, int from2, int length) {
    for (int k = 0; k < length; k++) {
      if (src1[from1 + k] != src2[from2 + k]) {
        return false;
      }
    }
    return true;
  }
}