package org.webcat.diff; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.ListIterator; import java.util.regex.Matcher; import java.util.regex.Pattern; public class DiffPatcher<T> { //~ Constructors .......................................................... // ---------------------------------------------------------- // public DiffPatcher(List<T> list1, DiffList<T> diffs) // { // this(list1, diffs, null); // } // ---------------------------------------------------------- public DiffPatcher(List<T> list1, DiffList<T> diffs, Comparator<T> comp) { this.comparator = comp; this.patches = makePatch(list1, diffs); } //~ Methods ............................................................... // ---------------------------------------------------------- public PatchList<T> getPatches() { return patches; } // ---------------------------------------------------------- /** * Increase the context until it is unique, but don't let the pattern expand * beyond Match_MaxBits. * * @param patch * The patch to grow. * @param list * Source list. */ private void addContext(Patch<T> patch, List<T> list) { if (list.size() == 0) { return; } List<T> pattern = list.subList(patch.start2, patch.start2 + patch.length1); int padding = 0; // Look for the first and last matches of pattern in list. If two different // matches are found, increase the pattern length. while (list.indexOf(pattern) != list.lastIndexOf(pattern) && pattern.size() < Match_MaxBits - Patch_Margin - Patch_Margin) { padding += Patch_Margin; pattern = list .subList(Math.max(0, patch.start2 - padding), Math.min( list.size(), patch.start2 + patch.length1 + padding)); } // Add one chunk for good luck. padding += Patch_Margin; // Add the prefix. List<T> prefix = list.subList(Math.max(0, patch.start2 - padding), patch.start2); if (prefix.size() != 0) { patch.diffs.addFirst(new Diff<T>(Diff.Operation.EQUAL, prefix)); } // Add the suffix. List<T> suffix = list.subList(patch.start2 + patch.length1, Math.min( list.size(), patch.start2 + patch.length1 + padding)); if (suffix.size() != 0) { patch.diffs.addLast(new Diff<T>(Diff.Operation.EQUAL, suffix)); } // Roll back the start points. patch.start1 -= prefix.size(); patch.start2 -= prefix.size(); // Extend the lengths. patch.length1 += prefix.size() + suffix.size(); patch.length2 += prefix.size() + suffix.size(); } // ---------------------------------------------------------- private static <T> List<T> addLists(List<T>... lists) { ArrayList<T> newList = new ArrayList<T>(); for (List<T> list : lists) { newList.addAll(list); } return newList; } // ---------------------------------------------------------- /** * Compute a list of patches to turn list1 into list2. list2 is not * provided, diffs are the delta between list1 and list2. * * @param list1 * Old list. * @param diffs * Array of diff tuples for list1 to list2. * @return LinkedList of Patch objects. */ private PatchList<T> makePatch(List<T> list1, DiffList<T> diffs) { if (list1 == null || diffs == null) { throw new IllegalArgumentException("Null inputs. (patch_make)"); } PatchList<T> patches = new PatchList<T>(); if (diffs.isEmpty()) { return patches; // Get rid of the null case. } Patch<T> patch = new Patch<T>(); int char_count1 = 0; // Number of characters into the list1 List<T>. int char_count2 = 0; // Number of characters into the list2 List<T>. // Start with list1 (prepatch_list) and apply the diffs until we arrive at // list2 (postpatch_list). We recreate the patches one by one to determine // context info. List<T> prepatch_list = list1; List<T> postpatch_list = list1; for (Diff<T> aDiff : diffs) { if (patch.diffs.isEmpty() && aDiff.operation != Diff.Operation.EQUAL) { // A new patch starts here. patch.start1 = char_count1; patch.start2 = char_count2; } switch (aDiff.operation) { case INSERT: patch.diffs.add(aDiff); patch.length2 += aDiff.list.size(); postpatch_list = addLists(postpatch_list.subList(0, char_count2), aDiff.list, postpatch_list.subList(char_count2, postpatch_list.size())); break; case DELETE: patch.length1 += aDiff.list.size(); patch.diffs.add(aDiff); postpatch_list = addLists(postpatch_list.subList(0, char_count2), postpatch_list.subList(char_count2 + aDiff.list.size(), postpatch_list.size())); break; case EQUAL: if (aDiff.list.size() <= 2 * Patch_Margin && !patch.diffs.isEmpty() && aDiff != diffs.getLast()) { // Small equality inside a patch. patch.diffs.add(aDiff); patch.length1 += aDiff.list.size(); patch.length2 += aDiff.list.size(); } if (aDiff.list.size() >= 2 * Patch_Margin) { // Time for a new patch. if (!patch.diffs.isEmpty()) { addContext(patch, prepatch_list); patches.add(patch); patch = new Patch<T>(); // Unlike Unidiff, our patch lists have a rolling conlist. // http://code.google.com/p/google-diff-match-patch/wiki/Unidiff // Update prepatch list & pos to reflect the application of the // just completed patch. prepatch_list = postpatch_list; char_count1 = char_count2; } } break; } // Update the current character count. if (aDiff.operation != Diff.Operation.INSERT) { char_count1 += aDiff.list.size(); } if (aDiff.operation != Diff.Operation.DELETE) { char_count2 += aDiff.list.size(); } } // Pick up the leftover patch if not empty. if (!patch.diffs.isEmpty()) { addContext(patch, prepatch_list); patches.add(patch); } return patches; } // ---------------------------------------------------------- /** * Merge a set of patches onto the list. Return a patched list, as well as * an array of true/false values indicating which patches were applied. * * @param patches * Array of patch objects * @param list * Old list. * @return Two element Object array, containing the new list and an array of * boolean values. */ public PatchApplication<T> apply(List<T> list) { if (patches.isEmpty()) { return new PatchApplication<T>(list, new boolean[0]); } // Deep copy the patches so that no changes are made to originals. patches = patches.clone(); List<T> nullPadding = addPadding(patches, Patch_Margin); list = addLists(nullPadding, list, nullPadding); splitMax(patches); int x = 0; // delta keeps track of the offset between the expected and actual location // of the previous patch. If there are patches expected at positions 10 and // 20, but the first patch was found at 12, delta is 2 and the second patch // has an effective expected position of 22. int delta = 0; boolean[] results = new boolean[patches.size()]; for (Patch<T> aPatch : patches) { int expected_loc = aPatch.start2 + delta; List<T> list1 = aPatch.diffs.computeFirstList(); int start_loc; int end_loc = -1; if (list1.size() > this.Match_MaxBits) { // patch_splitMax will only provide an oversized pattern in the case of // a monster delete. start_loc = new DiffMatcher<T>(list, list1.subList(0, this.Match_MaxBits), expected_loc, comparator).getBestMatchIndex(); if (start_loc != -1) { end_loc = new DiffMatcher<T>(list, list1.subList(list1.size() - this.Match_MaxBits, list1.size()), expected_loc + list1.size() - this.Match_MaxBits, comparator).getBestMatchIndex(); if (end_loc == -1 || start_loc >= end_loc) { // Can't find valid trailing conlist. Drop this patch. start_loc = -1; } } } else { start_loc = new DiffMatcher<T>(list, list1, expected_loc, comparator).getBestMatchIndex(); } if (start_loc == -1) { // No match found. :( results[x] = false; // Subtract the delta for this failed patch from subsequent patches. delta -= aPatch.length2 - aPatch.length1; } else { // Found a match. :) results[x] = true; delta = start_loc - expected_loc; List<T> list2; if (end_loc == -1) { list2 = list.subList(start_loc, Math.min(start_loc + list1.size(), list.size())); } else { list2 = list.subList(start_loc, Math.min(end_loc + this.Match_MaxBits, list.size())); } if (DiffUtils.listsEqual(list1, list2, comparator)) { // Perfect match, just shove the replacement list in. list = addLists(list.subList(0, start_loc), aPatch.diffs.computeSecondList(), list.subList(start_loc + list1.size(), list.size())); } else { // Imperfect match. Run a diff to get a framework of equivalent // indices. DiffList<T> diffs = new Differ<T>(list1, list2, comparator).getDifferences(); if (list1.size() > this.Match_MaxBits && diffs.getLevenshteinDistance() / (float) list1.size() > this.Patch_DeleteThreshold) { // The end points match, but the content is unacceptably bad. results[x] = false; } else { //diff_cleanupSemanticLossless(diffs); int index1 = 0; for (Diff<T> aDiff : aPatch.diffs) { if (aDiff.operation != Diff.Operation.EQUAL) { int index2 = diffs.translateIndex(index1); if (aDiff.operation == Diff.Operation.INSERT) { // Insertion list = addLists( list.subList(0, start_loc + index2), aDiff.list, list.subList(start_loc + index2, list.size())); } else if (aDiff.operation == Diff.Operation.DELETE) { // Deletion list = addLists(list.subList(0, start_loc + index2), list.subList(start_loc + diffs.translateIndex(index1 + aDiff.list.size()), list.size())); } } if (aDiff.operation != Diff.Operation.DELETE) { index1 += aDiff.list.size(); } } } } } x++; } // Strip the padding off. list = list.subList(nullPadding.size(), list.size() - nullPadding.size()); return new PatchApplication<T>(list, results); } // ---------------------------------------------------------- /** * Add some padding on list start and end so that edges can match something. * Intended to be called only from within patch_apply. * * @param patches * Array of patch objects. * @return The padding List<T> added to each side. */ private static <T> List<T> addPadding(PatchList<T> patches, short margin) { short paddingLength = margin; List<T> nullPadding = new ArrayList<T>(); for (short x = 1; x <= paddingLength; x++) { //nullPadding += String.valueOf((char) x); nullPadding.add(null); } // Bump all the patches forward. for (Patch<T> aPatch : patches) { aPatch.start1 += paddingLength; aPatch.start2 += paddingLength; } // Add some padding on start of first diff. Patch<T> patch = patches.getFirst(); DiffList<T> diffs = patch.diffs; if (diffs.isEmpty() || diffs.getFirst().operation != Diff.Operation.EQUAL) { // Add nullPadding equality. diffs.addFirst(new Diff<T>(Diff.Operation.EQUAL, nullPadding)); patch.start1 -= paddingLength; // Should be 0. patch.start2 -= paddingLength; // Should be 0. patch.length1 += paddingLength; patch.length2 += paddingLength; } else if (paddingLength > diffs.getFirst().list.size()) { // Grow first equality. Diff<T> firstDiff = diffs.getFirst(); int extraLength = paddingLength - firstDiff.list.size(); firstDiff.list = addLists( nullPadding.subList(firstDiff.list.size(), nullPadding.size()), firstDiff.list); patch.start1 -= extraLength; patch.start2 -= extraLength; patch.length1 += extraLength; patch.length2 += extraLength; } // Add some padding on end of last diff. patch = patches.getLast(); diffs = patch.diffs; if (diffs.isEmpty() || diffs.getLast().operation != Diff.Operation.EQUAL) { // Add nullPadding equality. diffs.addLast(new Diff<T>(Diff.Operation.EQUAL, nullPadding)); patch.length1 += paddingLength; patch.length2 += paddingLength; } else if (paddingLength > diffs.getLast().list.size()) { // Grow last equality. Diff<T> lastDiff = diffs.getLast(); int extraLength = paddingLength - lastDiff.list.size(); lastDiff.list = addLists(lastDiff.list, nullPadding.subList(0, extraLength)); patch.length1 += extraLength; patch.length2 += extraLength; } return nullPadding; } // ---------------------------------------------------------- /** * Look through the patches and break up any which are longer than the * maximum limit of the match algorithm. Intended to be called only from * within patch_apply. * * @param patches * LinkedList of Patch objects. */ private void splitMax(PatchList<T> patches) { short patch_size = Match_MaxBits; List<T> precontext, postcontext; Patch<T> patch; int start1, start2; boolean empty; Diff.Operation diff_type; List<T> diff_text; ListIterator<Patch<T>> pointer = patches.listIterator(); Patch<T> bigpatch = pointer.hasNext() ? pointer.next() : null; while (bigpatch != null) { if (bigpatch.length1 <= Match_MaxBits) { bigpatch = pointer.hasNext() ? pointer.next() : null; continue; } // Remove the big old patch. pointer.remove(); start1 = bigpatch.start1; start2 = bigpatch.start2; precontext = new ArrayList<T>(); while (!bigpatch.diffs.isEmpty()) { // Create one of several smaller patches. patch = new Patch<T>(); empty = true; patch.start1 = start1 - precontext.size(); patch.start2 = start2 - precontext.size(); if (precontext.size() != 0) { patch.length1 = patch.length2 = precontext.size(); patch.diffs.add(new Diff<T>(Diff.Operation.EQUAL, precontext)); } while (!bigpatch.diffs.isEmpty() && patch.length1 < patch_size - Patch_Margin) { diff_type = bigpatch.diffs.getFirst().operation; diff_text = bigpatch.diffs.getFirst().list; if (diff_type == Diff.Operation.INSERT) { // Insertions are harmless. patch.length2 += diff_text.size(); start2 += diff_text.size(); patch.diffs.addLast(bigpatch.diffs.removeFirst()); empty = false; } else if (diff_type == Diff.Operation.DELETE && patch.diffs.size() == 1 && patch.diffs.getFirst().operation == Diff.Operation.EQUAL && diff_text.size() > 2 * patch_size) { // This is a large deletion. Let it pass in one chunk. patch.length1 += diff_text.size(); start1 += diff_text.size(); empty = false; patch.diffs.add(new Diff<T>(diff_type, diff_text)); bigpatch.diffs.removeFirst(); } else { // Deletion or equality. Only take as much as we can stomach. diff_text = diff_text.subList(0, Math.min(diff_text .size(), patch_size - patch.length1 - Patch_Margin)); patch.length1 += diff_text.size(); start1 += diff_text.size(); if (diff_type == Diff.Operation.EQUAL) { patch.length2 += diff_text.size(); start2 += diff_text.size(); } else { empty = false; } patch.diffs.add(new Diff<T>(diff_type, diff_text)); if (DiffUtils.listsEqual(diff_text, bigpatch.diffs.getFirst().list, comparator)/*diff_text.equals(bigpatch.diffs.getFirst().list)*/) { bigpatch.diffs.removeFirst(); } else { bigpatch.diffs.getFirst().list = bigpatch.diffs.getFirst().list.subList( diff_text.size(), bigpatch.diffs.getFirst().list.size()); } } } // Compute the head context for the next patch. precontext = patch.diffs.computeSecondList(); precontext = precontext.subList(Math.max(0, precontext .size() - Patch_Margin), precontext.size()); List<T> firstList = bigpatch.diffs.computeFirstList(); // Append the end context for this patch. if (firstList.size() > Patch_Margin) { postcontext = firstList.subList(0, Patch_Margin); } else { postcontext = firstList; } if (postcontext.size() != 0) { patch.length1 += postcontext.size(); patch.length2 += postcontext.size(); if (!patch.diffs.isEmpty() && patch.diffs.getLast().operation == Diff.Operation.EQUAL) { patch.diffs.getLast().list = addLists(patch.diffs.getLast().list, postcontext); } else { patch.diffs.add(new Diff<T>(Diff.Operation.EQUAL, postcontext)); } } if (!empty) { pointer.add(patch); } } bigpatch = pointer.hasNext() ? pointer.next() : null; } } //~ Static/instance variables ............................................. // When deleting a large block of text (over ~64 characters), how close does // the contents have to match the expected contents. (0.0 = perfection, 1.0 // = very loose). Note that Match_Threshold controls how closely the end // points of a delete need to match. private float Patch_DeleteThreshold = 0.5f; // Chunk size for context length. private short Patch_Margin = 4; // The number of bits in an int. private short Match_MaxBits = 32; private Comparator<T> comparator; private PatchList<T> patches; }