package org.apache.lucene.spelt;
/**
* Copyright 2002-2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
/**
* Calculates the edit distance between two strings, with special modifications to
* score transpositions and double-letter changes as lower cost than
* insertion/deletion/replacement.
*
* @author Martin Haye
*/
final class TRStringDistance2 {
final char[] sa;
final int n;
final int[][][] cache=new int[30][][];
/**
* Optimized to run a bit faster than the static getDistance().
* In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
*/
public TRStringDistance2 (String target) {
sa=target.toCharArray();
n=sa.length;
}
/**
* Compute Damerau-Levenstein distance between the target string and
* another string. Damerau-Levenstein is similar to Levenstein except
* that it also accounts for transposition in the set of edit operations.
* This more fully reflects a common source of misspellings.
*/
public final int getDistance (String other) {
int d[][]; // matrix
int baseCost, replaceCost, insertCost, deleteCost;
// First, initialize the matrix.
final char[] ta=other.toCharArray();
final int m=ta.length;
if (n==0)
return m;
if (m==0)
return n;
if (m>=cache.length)
d=form(n, m);
else if (cache[m]!=null)
d=cache[m];
else
d=cache[m]=form(n, m);
// Process each source character
char s_i2 = 0;
for (int i=1; i<=n; i++)
{
final char s_i=sa[i-1];
// Process each target character
char t_j2 = 0;
for (int j=1; j<=m; j++)
{
final char t_j=ta[j-1];
baseCost = s_i == t_j ? 0 : 2;
replaceCost = d[i-1][j-1] + baseCost;
insertCost = d[i-1][j] + (s_i == s_i2 ? 1 : 2);
deleteCost = d[i][j-1] + (t_j == t_j2 ? 1 : 2);
d[i][j]=min3(replaceCost, insertCost, deleteCost);
// Check for transposition
if (s_i != t_j && s_i == t_j2 && t_j == s_i2)
d[i][j] = Math.min(d[i][j], d[i-2][j-2] + 1);
t_j2 = t_j;
}
s_i2 = s_i;
}
// Step 7
return d[n][m];
}
/**
*
*/
private static int[][] form (int n, int m) {
int[][] d=new int[n+1][m+1];
// Step 2
for (int i=0; i<=n; i++) {
d[i][0]=i*2;
}
for (int j=0; j<=m; j++) {
d[0][j]=j;
}
return d;
}
//****************************
// Get minimum of three values
//****************************
private static int min3 (int a, int b, int c) {
int mi=a;
if (b<mi) {
mi=b;
}
if (c<mi) {
mi=c;
}
return mi;
}
}