/*
* Copyright 2017 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.errorprone.names;
/**
 * The Needleman-Wunsch algorithm for finding least-cost string edit distances between pairs of
 * strings. Like Levenshtein, but this version allows for a sequence of adjacent
 * deletions/insertions to cost less than the total cost of each individual deletion/insertion, so
 * that, for example editing {@code Christopher} into {@code Chris} (dropping 6 characters) is not 6
 * times as expensive as editing {@code Christopher} into {@code Christophe}.
 *
 * <p>See http://en.wikipedia.org/wiki/Needleman-Wunsch_algorithm
 *
 * @author alanw@google.com (Alan Wendt)
 */
public final class NeedlemanWunschEditDistance {

  /**
   * Cost assigned to table cells that no edit script can reach. Large enough to lose every {@code
   * min()}, but well below {@link Integer#MAX_VALUE} so that adding a gap cost does not overflow.
   */
  private static final int UNREACHABLE = Integer.MAX_VALUE / 2;

  private NeedlemanWunschEditDistance() {
    /* disallow instantiation */
  }

  /**
   * Returns the edit distance between two strings. Levenshtein charges the same cost for each
   * insertion or deletion. This algorithm is slightly more general in that it charges a sequence of
   * adjacent insertions/deletions an up-front cost plus an incremental cost per insert/delete
   * operation. The idea is that {@code Christopher -> Chris} should be less than 6 times as
   * expensive as {@code Christopher -> Christophe}. The algorithm used to calculate this distance
   * takes time and space proportional to the product of {@code source.length()} and {@code
   * target.length()} to build the 3 arrays.
   *
   * <p>A run of deletions may be immediately followed by a run of insertions (and vice versa);
   * each run pays its own {@code openGapCost}. This makes the result independent of where in the
   * strings the differing region sits.
   *
   * @param source source string.
   * @param target target string
   * @param caseSensitive if true, case is used in comparisons and 'a' != 'A'. If false, both
   *     strings are lower-cased with the locale-independent root locale before comparison.
   * @param changeCost cost of changing one character
   * @param openGapCost cost to open a gap to insert or delete some characters.
   * @param continueGapCost marginal cost to insert or delete next character.
   * @return edit distance between the source and target strings.
   * @throws NullPointerException if {@code source} or {@code target} is null
   */
  public static int getEditDistance(
      String source,
      String target,
      boolean caseSensitive,
      int changeCost,
      int openGapCost,
      int continueGapCost) {
    if (!caseSensitive) {
      source = lowerCase(source);
      target = lowerCase(target);
    }

    int sourceLength = source.length();
    int targetLength = target.length();

    if (sourceLength == 0) {
      return scriptCost(openGapCost, continueGapCost, targetLength);
    }
    if (targetLength == 0) {
      return scriptCost(openGapCost, continueGapCost, sourceLength);
    }

    // mMatrix[i][j] = Cost of aligning source.substring(0,i) with
    // target.substring(0,j), using an edit script ending with
    // matched characters.
    int[][] mMatrix = new int[sourceLength + 1][targetLength + 1];

    // Cost of an alignment that ends with a bunch of deletions.
    // dMatrix[i][j] = best found cost of changing the first i chars
    // of source into the first j chars of target, ending with one
    // or more deletes of source characters.
    int[][] dMatrix = new int[sourceLength + 1][targetLength + 1];

    // Cost of an alignment that ends with one or more insertions.
    int[][] iMatrix = new int[sourceLength + 1][targetLength + 1];

    mMatrix[0][0] = dMatrix[0][0] = iMatrix[0][0] = 0;

    // Any edit script that changes i chars of source into zero
    // chars of target will only involve deletions. So only the
    // d&m Matrix entries are relevant, because dMatrix[i][0] gives
    // the cost of changing an i-length string into a 0-length string,
    // using an edit script ending in deletions.
    for (int i = 1; i <= sourceLength; i++) {
      mMatrix[i][0] = dMatrix[i][0] = scriptCost(openGapCost, continueGapCost, i);
      // No script ending in an insertion can produce an empty target.
      iMatrix[i][0] = UNREACHABLE;
    }

    for (int j = 1; j <= targetLength; j++) {
      // Only the i&m Matrix entries are relevant here, because they represent
      // the cost of changing a 0-length string into a j-length string, using
      // an edit script ending in insertions.
      mMatrix[0][j] = iMatrix[0][j] = scriptCost(openGapCost, continueGapCost, j);
      // No script ending in a deletion can start from an empty source.
      dMatrix[0][j] = UNREACHABLE;
    }

    // Cost of the first character of a freshly opened gap.
    int startGapCost = openGapCost + continueGapCost;

    for (int i = 1; i <= sourceLength; i++) {
      char sourceI = source.charAt(i - 1);
      for (int j = 1; j <= targetLength; j++) {
        char targetJ = target.charAt(j - 1);

        int cost = (sourceI == targetJ) ? 0 : changeCost;

        // Cost of changing i chars of source into j chars of target,
        // using an edit script ending in matched characters.
        mMatrix[i][j] =
            cost
                + Math.min(
                    mMatrix[i - 1][j - 1], Math.min(iMatrix[i - 1][j - 1], dMatrix[i - 1][j - 1]));

        // Cost of an edit script ending in a deletion. A new deletion gap may be
        // opened after a match OR directly after an insertion run; otherwise an
        // existing deletion gap is extended. The min() of m and i is always a
        // reachable (finite) cost, so adding startGapCost cannot overflow.
        dMatrix[i][j] =
            Math.min(
                Math.min(mMatrix[i - 1][j], iMatrix[i - 1][j]) + startGapCost,
                dMatrix[i - 1][j] + continueGapCost);

        // Cost of an edit script ending in an insertion. Symmetric to the
        // deletion case: open after a match or after a deletion run, or extend.
        iMatrix[i][j] =
            Math.min(
                Math.min(mMatrix[i][j - 1], dMatrix[i][j - 1]) + startGapCost,
                iMatrix[i][j - 1] + continueGapCost);
      }
    }

    // Return the minimum cost.
    int costOfEditScriptEndingWithMatch = mMatrix[sourceLength][targetLength];
    int costOfEditScriptEndingWithDelete = dMatrix[sourceLength][targetLength];
    int costOfEditScriptEndingWithInsert = iMatrix[sourceLength][targetLength];
    return Math.min(
        costOfEditScriptEndingWithMatch,
        Math.min(costOfEditScriptEndingWithDelete, costOfEditScriptEndingWithInsert));
  }

  /**
   * Return the worst case edit distance between strings of this length.
   *
   * @param sourceLength length of the source string
   * @param targetLength length of the target string
   * @param changeCost cost of changing one character
   * @param openGapCost cost to open a gap to insert or delete some characters
   * @param continueGapCost marginal cost to insert or delete next character
   * @return the cheaper of "change every overlapping character and insert/delete the length
   *     difference" and "delete the whole source, then insert the whole target"
   */
  public static int getWorstCaseEditDistance(
      int sourceLength, int targetLength, int changeCost, int openGapCost, int continueGapCost) {

    int maxLen = Math.max(sourceLength, targetLength);
    int minLen = Math.min(sourceLength, targetLength);

    // Compute maximum cost of changing one string into another. If the
    // lengths differ, you'll need maxLen - minLen insertions or deletions.
    int totChangeCost =
        scriptCost(openGapCost, continueGapCost, maxLen - minLen) + minLen * changeCost;

    // Another possibility is to just delete the entire source and insert the
    // target, and not do any changes.
    int blowAwayCost =
        scriptCost(openGapCost, continueGapCost, sourceLength)
            + scriptCost(openGapCost, continueGapCost, targetLength);

    return Math.min(totChangeCost, blowAwayCost);
  }

  /**
   * Returns a normalized edit distance between 0 and 1. This is useful if you are comparing or
   * aggregating distances of different pairs of strings.
   *
   * <p>Returns 0 when both strings are empty, and also when the worst-case distance for the given
   * costs is 0 (in which case there is nothing to normalize against).
   *
   * @param source source string.
   * @param target target string
   * @param caseSensitive if true, case is used in comparisons and 'a' != 'A'.
   * @param changeCost cost of changing one character
   * @param openGapCost cost to open a gap to insert or delete some characters.
   * @param continueGapCost marginal cost to insert or delete next character.
   * @return the edit distance divided by the worst-case edit distance for strings of these lengths
   * @throws NullPointerException if {@code source} or {@code target} is null
   */
  public static double getNormalizedEditDistance(
      String source,
      String target,
      boolean caseSensitive,
      int changeCost,
      int openGapCost,
      int continueGapCost) {

    if (source.isEmpty() && target.isEmpty()) {
      return 0.0;
    }

    // Fold case once, up front, so the distance and the worst case are computed
    // from the very same strings: lower-casing can change a string's length.
    if (!caseSensitive) {
      source = lowerCase(source);
      target = lowerCase(target);
    }

    int worstCase =
        getWorstCaseEditDistance(
            source.length(), target.length(), changeCost, openGapCost, continueGapCost);
    if (worstCase == 0) {
      // Avoid 0/0 (NaN) when the supplied costs make every edit free.
      return 0.0;
    }

    // Case has already been folded above when requested, so compare exactly.
    int distance =
        getEditDistance(source, target, true, changeCost, openGapCost, continueGapCost);
    return (double) distance / (double) worstCase;
  }

  /**
   * Return the cost of a script consisting of a contiguous sequence of insertions or a contiguous
   * sequence of deletions.
   */
  private static int scriptCost(int openGapCost, int continueGapCost, int scriptLength) {
    return (scriptLength == 0) ? 0 : openGapCost + scriptLength * continueGapCost;
  }

  /** Lower-cases {@code s} independently of the JVM's default locale. */
  private static String lowerCase(String s) {
    return s.toLowerCase(java.util.Locale.ROOT);
  }
}