/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* AffineMetric.java
* Copyright (C) 2001 Mikhail Bilenko
*
*/
package weka.deduping.metrics;
import java.util.*;
import java.io.Serializable;
import weka.core.*;
/**
 * A measure of distance between two strings based on edit distance with
 * affine gap penalties: opening a gap costs <code>gapStartCost</code> and each
 * further gap character costs <code>gapExtendCost</code>.
 * See D. Gusfield, "Algorithms on Strings, Trees and Sequences",
 * Cambridge University Press, 1997.
 *
 * With the default (negative) match cost the returned distance can be
 * negative, e.g. for identical non-empty strings.
 *
 * @author Mikhail Bilenko
 */
public class AffineMetric extends StringMetric implements OptionHandler, Serializable {

  /** The cost of matching two characters */
  protected double m_matchCost = -1;

  /** The cost of substituting one character for another */
  protected double m_subCost = 2;

  /** The cost of opening a gap */
  protected double m_gapStartCost = 3;

  /** The cost of continuing a gap */
  protected double m_gapExtendCost = 1;

  /** Should the distance be normalized by the sum of the lengths of the strings? */
  protected boolean m_normalized = true;

  /** We can have different ways of converting from distance to similarity */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "similarity = 1-distance"),
    new Tag(CONVERSION_LAPLACIAN, "similarity=1/(1+distance)"),
    new Tag(CONVERSION_EXPONENTIAL, "similarity=exp(-distance)")
  };

  /** The method of converting distance to similarity, by default exponential */
  protected int m_conversionType = CONVERSION_EXPONENTIAL;

  /** A default constructor; all parameters keep their default values */
  public AffineMetric () { }

  /**
   * A metric can be data-dependent (e.g. vector space for IDF).
   * @return false; this metric is computed from the two strings and its cost parameters only
   */
  public boolean isDataDependent() {
    return false;
  }

  /**
   * Obtain the affine-gap distance between two strings.
   *
   * Three dynamic-programming tables are filled, indexed by prefix lengths
   * (i characters of string1, j characters of string2):
   * <ul>
   * <li>D[i][j]: best cost of an alignment that ends with a character of
   *     string1 aligned against a gap;</li>
   * <li>I[i][j]: best cost of an alignment that ends with a character of
   *     string2 aligned against a gap;</li>
   * <li>T[i][j]: best cost overall (match/substitution, or either gap case).</li>
   * </ul>
   *
   * If either string is empty the result is the cost of a single gap of
   * length <code>l1 + l2</code> and is returned without normalization.
   *
   * @param string1 String 1
   * @param string2 String 2
   * @return Affine distance between the two strings, divided by the sum of
   *         their lengths if normalization is switched on
   * @exception Exception declared for compatibility with the metric interface;
   *            this implementation does not throw it explicitly
   */
  public double distance(String string1, String string2) throws Exception {
    char[] s1 = string1.toCharArray();
    char[] s2 = string2.toCharArray();
    int l1 = s1.length, l2 = s2.length;

    // NOTE(review): this shortcut bypasses m_normalized, and for two empty
    // strings it yields gapStartCost - gapExtendCost rather than 0.
    // Behaviour kept as-is because callers may rely on it - confirm intent.
    if (l1 == 0 || l2 == 0) {
      return m_gapStartCost + (l1 + l2 - 1) * m_gapExtendCost;
    }

    // Tables are only allocated once we know the full computation is needed
    double[][] T = new double[l1 + 1][l2 + 1];
    double[][] I = new double[l1 + 1][l2 + 1];
    double[][] D = new double[l1 + 1][l2 + 1];
    int i, j;

    // No gap-ending alignment exists on the borders: mark them as "infinite".
    // Adding a finite cost to Double.MAX_VALUE does not overflow, so these
    // entries never win a minimum below.
    for (j = 0; j <= l2; j++) {
      I[0][j] = Double.MAX_VALUE;
      D[0][j] = Double.MAX_VALUE;
    }
    for (i = 0; i <= l1; i++) {
      I[i][0] = Double.MAX_VALUE;
      D[i][0] = Double.MAX_VALUE;
    }

    // Border of T: aligning a prefix against the empty string is one gap
    T[0][0] = 0;
    T[0][1] = m_gapStartCost;
    T[1][0] = m_gapStartCost;
    for (j = 2; j <= l2; j++) {
      T[0][j] = T[0][j-1] + m_gapExtendCost;
    }
    for (i = 2; i <= l1; i++) {
      T[i][0] = T[i-1][0] + m_gapExtendCost;
    }

    for (i = 1; i <= l1; i++) {
      for (j = 1; j <= l2; j++) {
        // Either extend an existing gap or open a new one, whichever is cheaper
        D[i][j] = Math.min(D[i-1][j] + m_gapExtendCost, T[i-1][j] + m_gapStartCost);
        I[i][j] = Math.min(I[i][j-1] + m_gapExtendCost, T[i][j-1] + m_gapStartCost);

        // Align the two current characters with each other
        double subCost = (s1[i-1] == s2[j-1]) ? m_matchCost : m_subCost;
        double diagonal = T[i-1][j-1] + subCost;

        T[i][j] = Math.min(diagonal, Math.min(D[i][j], I[i][j]));
      }
    }

    // T[l1][l2] is by construction the minimum of the diagonal move, D[l1][l2]
    // and I[l1][l2], so no further comparison against D and I is needed.
    double ret = T[l1][l2];
    if (m_normalized) {
      ret /= l1 + l2;
    }
    return ret;
  }

  /**
   * The computation of a metric can be either based on distance, or on similarity
   * @return true
   */
  public boolean isDistanceBased() {
    return true;
  }

  /**
   * Returns a similarity estimate between two strings. Similarity is obtained by
   * inverting the distance value using one of three methods:
   * CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, CONVERSION_UNIT.
   * @param string1 First string.
   * @param string2 Second string.
   * @return the similarity derived from <code>distance(string1, string2)</code>
   * @exception Exception if similarity could not be estimated, in particular
   *            if the conversion type is not one of the three known constants.
   */
  public double similarity(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN:
      return 1 / (1 + distance(string1, string2));
    case CONVERSION_UNIT:
      // NOTE(review): the tag text for CONVERSION_UNIT reads
      // "similarity = 1-distance" but the value is scaled by 2 here.
      // Left unchanged because callers may depend on it - confirm intent.
      return 2 * (1 - distance(string1, string2));
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-distance(string1, string2));
    default:
      throw new Exception ("Unknown distance to similarity conversion method");
    }
  }

  /** Set the match cost
   * @param matchCost the cost of finding a matching pair of characters
   */
  public void setMatchCost(double matchCost) {
    m_matchCost = matchCost;
  }

  /** Get the match cost
   * @return the cost of finding a matching pair of characters
   */
  public double getMatchCost() {
    return m_matchCost;
  }

  /** Set the substitution cost
   * @param subCost the cost of substituting one character for another
   */
  public void setSubCost(double subCost) {
    m_subCost = subCost;
  }

  /** Get the substitution cost
   * @return the cost of substituting a pair of characters
   */
  public double getSubCost() {
    return m_subCost;
  }

  /** Set the gap opening cost
   * @param gapStartCost the cost of opening a gap
   */
  public void setGapStartCost(double gapStartCost) {
    m_gapStartCost = gapStartCost;
  }

  /** Get the gap opening cost
   * @return the cost of opening a gap
   */
  public double getGapStartCost() {
    return m_gapStartCost;
  }

  /** Set the gap extension cost
   * @param gapExtendCost the cost of extending a gap
   */
  public void setGapExtendCost(double gapExtendCost) {
    m_gapExtendCost = gapExtendCost;
  }

  /** Get the gap extension cost
   * @return the cost of extending a gap
   */
  public double getGapExtendCost() {
    return m_gapExtendCost;
  }

  /** Set the distance to be normalized by the sum of the string's lengths
   * @param normalized if true, distance is normalized by the sum of string's lengths
   */
  public void setNormalized(boolean normalized) {
    m_normalized = normalized;
  }

  /** Get whether the distance is normalized by the sum of the string's lengths
   * @return if true, distance is normalized by the sum of string's lengths
   */
  public boolean getNormalized() {
    return m_normalized;
  }

  /** Create a copy of this metric
   * @return another AffineMetric with the same exact parameters as this metric,
   *         including the distance-to-similarity conversion type
   */
  public Object clone() {
    AffineMetric metric = new AffineMetric();
    metric.setNormalized(m_normalized);
    metric.setMatchCost(m_matchCost);
    metric.setSubCost(m_subCost);
    metric.setGapStartCost(m_gapStartCost);
    metric.setGapExtendCost(m_gapExtendCost);
    // There is no public setter for the conversion type, so copy the field
    // directly; otherwise the copy would silently fall back to the default.
    metric.m_conversionType = m_conversionType;
    return metric;
  }

  /**
   * Gets the current settings of AffineMetric.
   *
   * The returned array always has 10 elements; slots that are not needed
   * are filled with empty strings.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String [] getOptions() {
    String [] options = new String [10];
    int current = 0;

    if (m_normalized) {
      options[current++] = "-N";
    }
    options[current++] = "-m";
    options[current++] = "" + m_matchCost;
    options[current++] = "-s";
    options[current++] = "" + m_subCost;
    options[current++] = "-g";
    options[current++] = "" + m_gapStartCost;
    options[current++] = "-e";
    options[current++] = "" + m_gapExtendCost;

    // Pad the unused tail with empty strings
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Parses a given list of options. Valid options are:<p>
   *
   * -N normalize by length <br>
   * -m matchCost <br>
   * -s subCost <br>
   * -g gapStartCost <br>
   * -e gapExtendCost <p>
   *
   * Normalization is always overwritten with the result of looking up the
   * -N flag; each cost is only changed when its option carries a non-empty
   * value.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option cannot be retrieved; a value that is
   *            not a valid double raises NumberFormatException
   */
  public void setOptions(String[] options) throws Exception {
    setNormalized(Utils.getFlag('N', options));

    String matchCostString = Utils.getOption('m', options);
    if (matchCostString.length() != 0) {
      setMatchCost(Double.parseDouble(matchCostString));
    }

    String subCostString = Utils.getOption('s', options);
    if (subCostString.length() != 0) {
      setSubCost(Double.parseDouble(subCostString));
    }

    String gapStartString = Utils.getOption('g', options);
    if (gapStartString.length() != 0) {
      setGapStartCost(Double.parseDouble(gapStartString));
    }

    String gapExtendString = Utils.getOption('e', options);
    if (gapExtendString.length() != 0) {
      setGapExtendCost(Double.parseDouble(gapExtendString));
    }
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(5);

    newVector.addElement(new Option("\tNormalize the distance by the sum of the string lengths\n",
                                    "N", 0, "-N"));
    newVector.addElement(new Option("\tMatch cost\n",
                                    "m", 1, "-m matchCost"));
    // The synopsis must advertise -s, the flag that setOptions() actually reads
    newVector.addElement(new Option("\tSubstitution cost\n",
                                    "s", 1, "-s subCost"));
    newVector.addElement(new Option("\tGap start cost\n",
                                    "g", 1, "-g gapStartCost"));
    newVector.addElement(new Option("\tGap extend cost\n",
                                    "e", 1, "-e gapExtendCost"));
    return newVector.elements();
  }
}