package org.webcat.diff; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; public class DiffMatcher<T> { //~ Constructor ........................................................... // ---------------------------------------------------------- public DiffMatcher(List<T> text, List<T> pattern, int loc) { this(text, pattern, loc, null); } // ---------------------------------------------------------- public DiffMatcher(List<T> text, List<T> pattern, int loc, Comparator<T> comp) { this.comparator = comp; bestMatchIndex = doMatch(text, pattern, loc); } //~ Methods ............................................................... // ---------------------------------------------------------- public int getBestMatchIndex() { return bestMatchIndex; } // ---------------------------------------------------------- /** * Locate the best instance of 'pattern' in 'text' near 'loc'. Returns -1 if * no match found. * * @param text * The text to search. * @param pattern * The pattern to search for. * @param loc * The location to search around. * @return Best match index or -1. */ private int doMatch(List<T> text, List<T> pattern, int loc) { // Check for null inputs. if (text == null || pattern == null) { throw new IllegalArgumentException("Null inputs. (match_main)"); } loc = Math.max(0, Math.min(loc, text.size())); if (DiffUtils.listsEqual(text, pattern, comparator)/*text.equals(pattern)*/) { // Shortcut (potentially not guaranteed by the algorithm) return 0; } else if (text.size() == 0) { // Nothing to match. return -1; } else if (loc + pattern.size() <= text.size() && DiffUtils.listsEqual(text.subList(loc, loc + pattern.size()), pattern, comparator)/*text.subList(loc, loc + pattern.size()).equals(pattern)*/) { // Perfect match at the perfect spot! (Includes case of null pattern) return loc; } else { // Do a fuzzy compare. return bitap(text, pattern, loc); } } // ---------------------------------------------------------- /** * Locate the best instance of 'pattern' in 'text' near 'loc' using the * Bitap algorithm. Returns -1 if no match found. * * @param text * The text to search. * @param pattern * The pattern to search for. * @param loc * The location to search around. * @return Best match index or -1. */ private int bitap(List<T> text, List<T> pattern, int loc) { assert (Match_MaxBits == 0 || pattern.size() <= Match_MaxBits) : "Pattern too long for this application."; // Initialize the alphabet. Map<T, Integer> s = makeAlphabet(pattern); // Highest score beyond which we give up. double score_threshold = Match_Threshold; // Is there a nearby exact match? (speedup) int best_loc = DiffUtils.listIndexOf(text, pattern, loc, comparator); if (best_loc != -1) { score_threshold = Math.min(bitapScore(0, best_loc, loc, pattern), score_threshold); // What about in the other direction? (speedup) best_loc = DiffUtils.listLastIndexOf(text, pattern, loc + pattern.size(), comparator); if (best_loc != -1) { score_threshold = Math.min(bitapScore(0, best_loc, loc, pattern), score_threshold); } } // Initialize the bit arrays. int matchmask = 1 << (pattern.size() - 1); best_loc = -1; int bin_min, bin_mid; int bin_max = pattern.size() + text.size(); // Empty initialization added to appease Java compiler. int[] last_rd = new int[0]; for (int d = 0; d < pattern.size(); d++) { // Scan for the best match; each iteration allows for one more error. // Run a binary search to determine how far from 'loc' we can stray at // this error level. bin_min = 0; bin_mid = bin_max; while (bin_min < bin_mid) { if (bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold) { bin_min = bin_mid; } else { bin_max = bin_mid; } bin_mid = (bin_max - bin_min) / 2 + bin_min; } // Use the result from this iteration as the maximum for the next. bin_max = bin_mid; int start = Math.max(1, loc - bin_mid + 1); int finish = Math.min(loc + bin_mid, text.size()) + pattern.size(); int[] rd = new int[finish + 2]; rd[finish + 1] = (1 << d) - 1; for (int j = finish; j >= start; j--) { int charMatch; if (text.size() <= j - 1 || !s.containsKey(text.get(j - 1))) { // Out of range. charMatch = 0; } else { charMatch = s.get(text.get(j - 1)); } if (d == 0) { // First pass: exact match. rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; } else { // Subsequent passes: fuzzy match. rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; } if ((rd[j] & matchmask) != 0) { double score = bitapScore(d, j - 1, loc, pattern); // This match will almost certainly be better than any existing // match. But check anyway. if (score <= score_threshold) { // Told you so. score_threshold = score; best_loc = j - 1; if (best_loc > loc) { // When passing loc, don't exceed our current distance from loc. start = Math.max(1, 2 * loc - best_loc); } else { // Already passed loc, downhill from here on in. break; } } } } if (bitapScore(d + 1, loc, loc, pattern) > score_threshold) { // No hope for a (better) match at greater error levels. break; } last_rd = rd; } return best_loc; } // ---------------------------------------------------------- /** * Compute and return the score for a match with e errors and x location. * * @param e * Number of errors in match. * @param x * Location of match. * @param loc * Expected location of match. * @param pattern * Pattern being sought. * @return Overall score for match (0.0 = good, 1.0 = bad). */ private double bitapScore(int e, int x, int loc, List<T> pattern) { float accuracy = (float) e / pattern.size(); int proximity = Math.abs(loc - x); if (Match_Distance == 0) { // Dodge divide by zero error. return proximity == 0 ? accuracy : 1.0; } return accuracy + (proximity / (float) Match_Distance); } // ---------------------------------------------------------- /** * Initialize the alphabet for the Bitap algorithm. * * @param pattern * The text to encode. * @return Hash of character locations. */ private Map<T, Integer> makeAlphabet(List<T> pattern) { Map<T, Integer> s = new HashMap<T, Integer>(); for (T c : pattern) { s.put(c, 0); } int i = 0; for (T c : pattern) { s.put(c, s.get(c) | (1 << (pattern.size() - i - 1))); i++; } return s; } //~ Static/instance variables ............................................. // At what point is no match declared (0.0 = perfection, 1.0 = very loose). private float Match_Threshold = 0.5f; // How far to search for a match (0 = exact location, 1000+ = broad match). // A match this many characters away from the expected location will add 1.0 // to the score (0.0 is a perfect match). private int Match_Distance = 1000; // The number of bits in an int. private short Match_MaxBits = 32; private Comparator<T> comparator; private int bestMatchIndex = -1; }