package com.limegroup.gnutella.util; import com.limegroup.gnutella.Assert; /** * An approximate string matcher. Two strings are considered "approximately * equal" if one can be transformed into the other through some series of * inserts, deletes, and substitutions.<p> * * The approximate matcher has options to ignore case and whitespace. It also * has switches to make it perform better by comparing strings backwards and * reusing a buffer. However, these do <i>not</i> affect the match methods * directly; they only affect the results of the process(String) method. * This method is used to preprocess strings before passing to match(..). * Typical use: * * <pre> * String s1, s2; * ApproximateMatcher matcher=new ApproximateMatcher(); * matcher.setIgnoreCase(true); * matcher.setCompareBackwards(true); * String s1p=matcher.process(s1); //pre-process s1 * String s2p=matcher.process(s2); //pre-process s2 * int matches=matcher.match(s1p, s2p); //compare processed strings * ... * </pre> * * The reason for this design is to reduce the pre-processing overhead when a * string is matched against many other strings. Preprocessing really is * required to support the ignoreWhitespace option; it is simply not possible to * do the k-difference dynamic programming algorithm effienctly in one pass. * * Note that this class is not thread-safe if the buffering constructor is * used. */ final public class ApproximateMatcher { private boolean ignoreCase=false; private boolean ignoreWhitespace=false; private boolean compareBackwards=false; /** For avoiding allocations. This can only be used by one thread at a * time. INVARIANT: buffer!=null => buffer is a bufSize by bufSize array. */ private volatile int[][] buffer; private volatile int bufSize; /* * Creates a new approximate matcher that compares respects case and * whitespace, and compares forwards. Compared to ApproximateMatcher(int), * This constructor is useful if the matcher is used infrequently and memory * is at a premium. */ public ApproximateMatcher() { this.buffer=null; } /** * Like ApproximateMatcher() except that the new matcher can compare strings * of the given size without any significant allocations. This is a useful * optimization if you need to make many comparisons with one matcher. The * matcher will still be able to compare larger strings, but it will require * an allocation. The buffer is not released until this is garbage * collected. <b>This method breaks thread safety; only one match(..) * call can be done at a time with a matcher created by this constructor. * </b> */ public ApproximateMatcher(int size) { bufSize=size+1; buffer=new int[bufSize][bufSize]; //need "margins" of 1 on each side } ////////////////////////////// Processing Methods /////////////////////// /* * @param ignoreCase true iff case should be ignored when matching processed * strings. Default value is false. */ public void setIgnoreCase(boolean ignoreCase) { this.ignoreCase=ignoreCase; } /* * @param ignoreWhitespace true iff the characters ' ' and '_' should be * ignored when matching processed strings. Default value is false. */ public void setIgnoreWhitespace(boolean ignoreWhitespace) { this.ignoreWhitespace=ignoreWhitespace; } /* * @param compareBackwards true iff the comparison should be done backwards * when matching processed strings. This is solely an optimization if you * expect more differences at the end of the word than the beginning. * Default value is false. */ public void setCompareBackwards(boolean compareBackwards) { this.compareBackwards=compareBackwards; } /** * Returns a version of s suitable for passing to match(..). This * means that s could be stripped of whitespace, lower-cased, or reversed * depending on the calls to setIgnoreWhitespace, setIgnoreWhitespace, and * setCompareBackwards. The returned value may be == to s. */ public String process(String s) { //Optimize for special case. if (! (ignoreCase || compareBackwards || ignoreWhitespace)) return s; StringBuffer buf=new StringBuffer(s.length()); if (compareBackwards) { for (int i=0; i<s.length(); i++) { char c=s.charAt(s.length()-i-1); if (ignoreCase) c=Character.toLowerCase(c); if (ignoreWhitespace) if (c==' ' || c=='_') continue; buf.append(c); } } else { //Exactly like above, but forward. for (int i=0; i<s.length(); i++) { char c=s.charAt(i); if (ignoreCase) c=Character.toLowerCase(c); if (ignoreWhitespace) if (c==' ' || c=='_') continue; buf.append(c); } } return buf.toString(); } ///////////////////////// Public Matching Methods ////////////////////////// /* * Returns the edit distance between s1 and s2. That is, returns the number * of insertions, deletions, or replacements necessary to transform s1 into * s2. A value of 0 means the strings match exactly.<p> * * If you want to ignore case or whitespace, or compare backwards, s1 and s2 * should be the return values of a call to process(..). */ public final int match(String s1, String s2) { //Let m=s1.length(), n=s2.length(), and k be the edit difference between //s1 and s2. It's possible to reduce the time from O(mn) time to O(kn) //time by repeated iterations of the the k-difference algorithm. But //this is a bit complicated. return matchInternal(s1, s2, Integer.MAX_VALUE); } /** * Returns true if the edit distance between s1 and s2 is less than or equal * to maxOps. That is, returns true if s1 can be transformed into s2 * through no more than maxOps insertions, deletions, or replacements. This * method is generally more efficient than match(..) if you only care * whether two strings approximately match.<p> * * If you want to ignore case or whitespace, or compare backwards, s1 and s2 * should be the return values of a call to process(..). */ public final boolean matches(String s1, String s2, int maxOps) { return matchInternal(s1, s2, maxOps)<=maxOps; } /** * Returns true if s1 can be transformed into s2 without changing more than * the given fraction of s1's letters. For example, matches(1.) is the same * as an exact comparison, while matches(0.) always returns true as long as * |s1|>=|s2|. matches(0.9) means "s1 and s2 match pretty darn closely".<p> * * If you want to ignore case or whitespace, or compare backwards, s1 and s2 * should be the return values of a call to process(..). * * @requires 0.<=match<=1. */ public final boolean matches(String s1, String s2, float precision) { int s1n=s1.length(); int n=(int)(precision*((float)s1n)); //number UNchanged int maxOps=s1n-n; //number changed return matches(s1, s2, maxOps); } /** * If the edit distance between s1 and s2 is less than or equal to maxOps, * returns the edit distance. Otherwise returns some number greater than * maxOps. */ private int matchInternal(String s1, String s2, int maxOps) { //Swap if necessary to ensure |s1|<=|s2|. if (s1.length()<=s2.length()) return matchInternalProcessed(s1, s2, maxOps); else return matchInternalProcessed(s2, s1, maxOps); } ///////////////////////////// Core algorithm ////////////////////////// /** * Same as matchInternal, but with weaker precondition. * @requires |s1|<=|s2| */ private int matchInternalProcessed( String s1, String s2, final int maxOps) { //A classic implementation using dynamic programming. d[i,j] is the //edit distance between s1[0..i-1] and s2[0..j-1] and is defined //recursively. Note that there are "margins" of 1 on the left and //top of this matrix. See Chapter 11 of _Algorithms on Strings, Trees, //and Sequences_ by Dan Gusfield for a complete discussion. // //A key optimization is that we only fill in part of the row. This is //based on the observation that any maxOps-difference global alignment //must not contain any cell (i, i+l) or (i,i-l), where l>maxOps. // //There are two additional twists to the usual algorithm. First, we fill in //the matrix anti-diagonally instead of one row at a time. Secondly, we //stop if the minimum value of the last two diagonals is greater than //maxOps. final int s1n=s1.length(); final int s2n=s2.length(); Assert.that(s1n<=s2n); if (maxOps<=0) return (s1.equals(s2)) ? 0 : 1; //Strings of vastly differing lengths don't match. This is necessary to //prevent the last return statement below from incorrectly returning //zero. else if (Math.abs(s1n-s2n) > maxOps) { return maxOps+1; } //If one of the strings is empty, the distance is trivial to calculate. else if (s1n==0) { //s2n==0 ==> s1n==0 return s2n; } //Optimization: recycle buffer for matrix if possible. int[][] d; if (buffer!=null && (bufSize >= Math.max(s1n+1, s2n+1))) d=buffer; else d=new int[s1n+1][s2n+1]; //Note d[0][0]==0 int diagonals=2*Math.min(s1n+1, s2n+1)-1 +Math.min(s2n-s1n, maxOps); int minThisDiag; //The min value of this diagonal int minLastDiag=0; //The min value of last diagonal //For each k'th anti-diagonal except first (measured from the origin)... for (int k=1; k<diagonals; k++) { //1. Calculate indices of left corner of diagonal (i1, j1) and upper //right corner (i2, j2). This is black magic. You really need to //look at a diagram to see why it works. int i1=k/2+maxOps/2; int j1=k/2-maxOps/2; int i2=k/2-maxOps/2; int j2=k/2+maxOps/2; if ((k%2)!=0) { //odd k? if ((maxOps%2)==0) { //even maxOps? //out and away from last endpoint j1++; i2++; } else { //in towards the diagonal i1++; j2++; } } //If endpoints don't fall on board, adjust accordingly if (j1<0 || i1>s1n) { i1=Math.min(k, s1n); j1=k-i1; } if (i2<0 || j2>s2n) { j2=Math.min(k, s2n); i2=k-j2; } //2. Calculate matrix values for corners. This is just like the loop //below except (1) we need to be careful of array index problems //and (2) we don't bother looking to the left of (i1, j1) or above //(i2, j2) if it's on the outer diagonal. Assert.that(i1>0, "Zero i1"); //j1 may be zero Assert.that(j2>0, "Zero j2"); //i2 may be zero // a) Look in towards diagonal d[i1][j1]=d[i1-1][j1]+1; d[i2][j2]=d[i2][j2-1]+1; // b) Look along the diagonal, unless on edge of matrix if (j1>0) d[i1][j1]=Math.min(d[i1][j1], d[i1-1][j1-1] + diff(s1.charAt(i1-1), s2.charAt(j1-1))); if (i2>0) d[i2][j2]=Math.min(d[i2][j2], d[i2-1][j2-1] + diff(s1.charAt(i2-1), s2.charAt(j2-1))); // c) Look out away from the diagonal if "inner diagonal" or on // bottom row, unless on edge of matrix. boolean innerDiag=(k%2)!=(maxOps%2); if ((innerDiag || i1==s1n) && j1>0) d[i1][j1]=Math.min(d[i1][j1], d[i1][j1-1]+1); if (innerDiag && i2>0) d[i2][j2]=Math.min(d[i2][j2], d[i2-1][j2]+1); minThisDiag=Math.min(d[i1][j1], d[i2][j2]); //3. Calculate matrix value for each element of the diagonal except //the endpoints... int i=i1-1; int j=j1+1; while (i>i2 && j<j2) { d[i][j]=1; //Fill in d[i][j] using previous calculated values int dij=min3(d[i-1][j-1] + diff(s1.charAt(i-1), s2.charAt(j-1)), d[i-1][j] + 1, d[i][j-1] + 1); d[i][j]=dij; minThisDiag=Math.min(minThisDiag, dij); //Move up and to the right in the matrix. i--; j++; } //If min value on last two diags is too big, quit. if (minThisDiag>maxOps && minLastDiag>maxOps) { return minThisDiag; } minLastDiag=minThisDiag; } return d[s1n][s2n]; } /** Returns 0 if a==b, or 1 otherwise. */ private static int diff(char a, char b) { if (a==b) return 0; else return 1; } private static int min3(int n1, int n2, int n3) { return( Math.min( n1, Math.min( n2, n3 ) ) ); } }