/**
*
*/
package org.owasp.webscarab.util;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
 * Calculates the edits necessary to convert a source document into a target
 * document. It does this by finding the longest common substring (LCS) of the
 * two documents, then recursively processing the parts to the left and to the
 * right of that LCS.
 *
 * <p>
 * The documents can either be compared character by character, or they can
 * first be tokenized (see {@link #split(CharSequence, char)}), for example
 * into individual lines or words, and compared token by token.
 *
 * @author rdawes
 *
 */
public class Diff {

    /** Empty text used for the missing side of a pure insertion or deletion. */
    private static final CharSequence DELETE = "";

    private Diff() {
    }

    /**
     * Calculates the character-level edits that transform {@code src} into
     * {@code dst}.
     *
     * @param src
     *            the original text
     * @param dst
     *            the target text
     * @return the edits, ordered by increasing location; empty if the two
     *         texts are identical
     */
    public static List<Edit> getEdits(CharSequence src, CharSequence dst) {
        List<Edit> edits = new LinkedList<Edit>();
        collectEdits(0, src.length(), src, 0, dst.length(), dst, edits);
        return edits;
    }

    /*
     * Appends to "edits" the edits that transform src[srcStart, srcEnd) into
     * dst[dstStart, dstEnd). Edits are appended in increasing location order.
     */
    private static void collectEdits(int srcStart, int srcEnd,
            CharSequence src, int dstStart, int dstEnd, CharSequence dst,
            List<Edit> edits) {
        // skip the common prefix
        while (srcStart < srcEnd && dstStart < dstEnd
                && src.charAt(srcStart) == dst.charAt(dstStart)) {
            srcStart++;
            dstStart++;
        }
        // skip the common suffix
        while (srcStart < srcEnd && dstStart < dstEnd
                && src.charAt(srcEnd - 1) == dst.charAt(dstEnd - 1)) {
            srcEnd--;
            dstEnd--;
        }
        // the two ranges are identical: nothing to record
        if (srcStart == srcEnd && dstStart == dstEnd) {
            return;
        }
        // if the prefix and suffix consumed one side completely, whatever
        // remains on the other side is the whole difference
        if (srcStart == srcEnd) {
            edits.add(new Edit(srcStart, DELETE, dstStart,
                    dst.subSequence(dstStart, dstEnd)));
            return;
        }
        if (dstStart == dstEnd) {
            edits.add(new Edit(srcStart, src.subSequence(srcStart, srcEnd),
                    dstStart, DELETE));
            return;
        }
        // divide the text into left and right parts, separated by the longest
        // common substring, and process the two parts recursively
        LCS lcs = lcs(srcStart, srcEnd, src, dstStart, dstEnd, dst);
        if (lcs.getLength() > 0) {
            collectEdits(srcStart, lcs.getSrcLocation(), src,
                    dstStart, lcs.getDstLocation(), dst, edits);
            collectEdits(lcs.getSrcLocation() + lcs.getLength(), srcEnd, src,
                    lcs.getDstLocation() + lcs.getLength(), dstEnd, dst, edits);
        } else {
            // nothing in common: replace the whole range
            edits.add(new Edit(srcStart, src.subSequence(srcStart, srcEnd),
                    dstStart, dst.subSequence(dstStart, dstEnd)));
        }
    }

    /**
     * Splits {@code orig} into tokens, each ending with (and including) an
     * occurrence of {@code boundary}. Any text after the last boundary forms a
     * final token. Concatenating the tokens yields the original text.
     *
     * @param orig
     *            the text to split
     * @param boundary
     *            the character that terminates a token
     * @return the tokens, in order; an empty array if {@code orig} is empty
     */
    public static CharSequence[] split(CharSequence orig, char boundary) {
        List<CharSequence> tokens = new LinkedList<CharSequence>();
        int length = orig.length();
        int tokenStart = 0;
        for (int index = 0; index < length; index++) {
            if (orig.charAt(index) == boundary) {
                tokens.add(orig.subSequence(tokenStart, index + 1));
                tokenStart = index + 1;
            }
        }
        if (length > tokenStart) {
            tokens.add(orig.subSequence(tokenStart, length));
        }
        return tokens.toArray(new CharSequence[tokens.size()]);
    }

    /**
     * Calculates the edits that transform {@code src} into {@code dst},
     * comparing the texts token by token, where tokens are produced by
     * {@link #split(CharSequence, char)}. The locations in the returned edits
     * are character offsets into the original texts.
     *
     * @param src
     *            the original text
     * @param dst
     *            the target text
     * @param boundary
     *            the character that terminates a token
     * @return the edits, ordered by increasing location
     */
    public static List<Edit> getEdits(CharSequence src, CharSequence dst, char boundary) {
        CharSequence[] srcArray = split(src, boundary);
        CharSequence[] dstArray = split(dst, boundary);
        List<ArrayEdit> edits = getEdits(srcArray, dstArray);
        return convertArrayToOriginal(srcArray, dstArray, edits);
    }

    /**
     * Converts token-index based edits into character-offset based edits.
     *
     * @param src
     *            the source tokens
     * @param dst
     *            the target tokens
     * @param arrayEdits
     *            edits expressed as token index ranges, ordered by increasing
     *            index
     * @return the equivalent edits, with locations expressed as character
     *         offsets into the concatenation of the respective tokens
     */
    public static List<Edit> convertArrayToOriginal(CharSequence[] src, CharSequence[] dst, List<ArrayEdit> arrayEdits) {
        List<Edit> edits = new LinkedList<Edit>();
        // index of the next unconsumed token on each side
        int srcLast = 0, dstLast = 0;
        // character offset of that token on each side
        int srcOffset = 0, dstOffset = 0;
        for (ArrayEdit edit : arrayEdits) {
            // catch up on the unchanged tokens in between edits
            while (srcLast < edit.getSrcStart()) {
                srcOffset += src[srcLast++].length();
            }
            while (dstLast < edit.getDstStart()) {
                dstOffset += dst[dstLast++].length();
            }
            int srcStart = srcOffset;
            int dstStart = dstOffset;
            // concatenate the tokens covered by this edit
            StringBuilder srcEdit = new StringBuilder();
            StringBuilder dstEdit = new StringBuilder();
            while (srcLast < edit.getSrcEnd()) {
                srcEdit.append(src[srcLast++]);
            }
            while (dstLast < edit.getDstEnd()) {
                dstEdit.append(dst[dstLast++]);
            }
            srcOffset += srcEdit.length();
            dstOffset += dstEdit.length();
            edits.add(new Edit(srcStart, srcEdit.toString(), dstStart, dstEdit.toString()));
        }
        return edits;
    }

    /**
     * Calculates the token-level edits that transform the {@code src} tokens
     * into the {@code dst} tokens. Tokens are compared by their character
     * content, regardless of the concrete {@link CharSequence} implementation.
     *
     * @param src
     *            the source tokens
     * @param dst
     *            the target tokens
     * @return the edits, expressed as token index ranges and ordered by
     *         increasing index; empty if the two arrays have equal content
     */
    public static List<ArrayEdit> getEdits(CharSequence src[], CharSequence dst[]) {
        List<ArrayEdit> edits = new LinkedList<ArrayEdit>();
        collectArrayEdits(0, src.length, src, 0, dst.length, dst, edits);
        return edits;
    }

    /*
     * Appends to "edits" the edits that transform the tokens src[srcStart,
     * srcEnd) into the tokens dst[dstStart, dstEnd).
     */
    private static void collectArrayEdits(int srcStart, int srcEnd,
            CharSequence[] src, int dstStart, int dstEnd, CharSequence[] dst,
            List<ArrayEdit> edits) {
        // skip the common prefix
        while (srcStart < srcEnd && dstStart < dstEnd
                && contentEquals(src[srcStart], dst[dstStart])) {
            srcStart++;
            dstStart++;
        }
        // skip the common suffix
        while (srcStart < srcEnd && dstStart < dstEnd
                && contentEquals(src[srcEnd - 1], dst[dstEnd - 1])) {
            srcEnd--;
            dstEnd--;
        }
        // the two ranges are identical: nothing to record
        if (srcStart == srcEnd && dstStart == dstEnd) {
            return;
        }
        // if the prefix and suffix consumed one side completely, whatever
        // remains on the other side is the whole difference
        if (srcStart == srcEnd || dstStart == dstEnd) {
            edits.add(new ArrayEdit(srcStart, srcEnd, dstStart, dstEnd));
            return;
        }
        // divide the tokens into left and right parts, separated by the
        // longest common run of tokens, and process the two parts recursively
        LCS lcs = lcs(srcStart, srcEnd, src, dstStart, dstEnd, dst);
        if (lcs.getLength() > 0) {
            collectArrayEdits(srcStart, lcs.getSrcLocation(), src,
                    dstStart, lcs.getDstLocation(), dst, edits);
            collectArrayEdits(lcs.getSrcLocation() + lcs.getLength(), srcEnd, src,
                    lcs.getDstLocation() + lcs.getLength(), dstEnd, dst, edits);
        } else {
            // nothing in common: replace the whole range
            edits.add(new ArrayEdit(srcStart, srcEnd, dstStart, dstEnd));
        }
    }

    /*
     * Compares two tokens by character content. CharSequence does not define
     * equals(), so a.equals(b) is unreliable when the tokens are not Strings
     * (StringBuilder, for example, only has identity equality).
     */
    private static boolean contentEquals(CharSequence a, CharSequence b) {
        if (a == b) {
            return true;
        }
        if (a == null || b == null) {
            return false;
        }
        if (a instanceof String && b instanceof String) {
            return a.equals(b);
        }
        int length = a.length();
        if (length != b.length()) {
            return false;
        }
        for (int i = 0; i < length; i++) {
            if (a.charAt(i) != b.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Breaks each of the supplied edits down into finer, character-level
     * edits by diffing the region of {@code src} and the region of
     * {@code dst} that the edit covers.
     *
     * @param src
     *            the original text that the edits refer to
     * @param dst
     *            the target text that the edits refer to
     * @param edits
     *            the (typically token-level) edits to refine
     * @return the character-level edits, in the same relative order
     */
    public static List<Edit> refine(CharSequence src, CharSequence dst, List<Edit> edits) {
        List<Edit> refined = new LinkedList<Edit>();
        for (Edit edit : edits) {
            int srcStart = edit.getSrcLocation();
            int srcEnd = srcStart + edit.getSrc().length();
            int dstStart = edit.getDstLocation();
            int dstEnd = dstStart + edit.getDst().length();
            collectEdits(srcStart, srcEnd, src, dstStart, dstEnd, dst, refined);
        }
        return refined;
    }

    /**
     * Computes the total size of a list of edits: the sum, over all edits, of
     * the length of the removed text plus the length of the inserted text.
     *
     * @param edits
     *            the edits to measure
     * @return the accumulated length of all source and destination fragments
     */
    public static int getDistance(List<Edit> edits) {
        int distance = 0;
        // iterate rather than index: the lists built by this class are linked
        // lists, for which get(i) is linear
        for (Edit edit : edits) {
            distance += edit.getSrc().length() + edit.getDst().length();
        }
        return distance;
    }

    /**
     * Applies the edits to {@code src}, producing the target text. This is
     * useful for checking that the edits were properly calculated.
     *
     * @param src
     *            the original text
     * @param edits
     *            the edits to apply, ordered by increasing source location
     * @return the text that results from applying the edits
     */
    public static String apply(CharSequence src, List<Edit> edits) {
        StringBuilder buff = new StringBuilder();
        // end of the source region consumed so far
        int last = 0;
        for (Edit edit : edits) {
            if (edit.getSrcLocation() > last) {
                // catch up on the unchanged text in between edits
                buff.append(src, last, edit.getSrcLocation());
            }
            if (edit.getDst().length() > 0) {
                buff.append(edit.getDst());
            }
            last = edit.getSrcLocation() + edit.getSrc().length();
        }
        if (last < src.length()) {
            buff.append(src, last, src.length());
        }
        return buff.toString();
    }

    /**
     * Reverses the edits on {@code dst}, producing the original text. This is
     * useful for checking that the edits were properly calculated.
     *
     * @param dst
     *            the target text
     * @param edits
     *            the edits to revert, ordered by increasing target location
     * @return the text that results from reverting the edits
     */
    public static String revert(CharSequence dst, List<Edit> edits) {
        StringBuilder buff = new StringBuilder();
        // end of the target region consumed so far
        int last = 0;
        for (Edit edit : edits) {
            if (edit.getDstLocation() > last) {
                // catch up on the unchanged text in between edits
                buff.append(dst, last, edit.getDstLocation());
            }
            if (edit.getSrc().length() > 0) {
                buff.append(edit.getSrc());
            }
            last = edit.getDstLocation() + edit.getDst().length();
        }
        if (last < dst.length()) {
            buff.append(dst, last, dst.length());
        }
        return buff.toString();
    }

    /**
     * Finds the longest common substring of {@code src[srcStart, srcEnd)} and
     * {@code dst[dstStart, dstEnd)}. Note that this is different from the
     * longest common subsequence: the match must be one continuous run of
     * characters.
     * <p>
     * This is a brute-force search that tries every pair of start positions.
     * When several matches share the greatest length, the first one found
     * (scanning {@code src} positions, then {@code dst} positions, in
     * increasing order) is returned.
     *
     * @param srcStart
     *            inclusive start of the source range
     * @param srcEnd
     *            exclusive end of the source range
     * @param src
     *            the source text
     * @param dstStart
     *            inclusive start of the target range
     * @param dstEnd
     *            exclusive end of the target range
     * @param dst
     *            the target text
     * @return the location and length of the longest match; the length is 0
     *         if the two ranges have no character in common
     */
    private static LCS lcs(int srcStart, int srcEnd, CharSequence src,
            int dstStart, int dstEnd, CharSequence dst) {
        int bestSrc = 0, bestDst = 0, bestLen = 0;
        // a start position with no more than bestLen characters left cannot
        // yield a strictly longer match, so stop scanning there
        for (int i = srcStart; i < srcEnd - bestLen; i++) {
            for (int j = dstStart; j < dstEnd - bestLen; j++) {
                int max = Math.min(srcEnd - i, dstEnd - j);
                int len = 0;
                while (len < max && src.charAt(i + len) == dst.charAt(j + len)) {
                    len++;
                }
                // only a strictly longer match replaces the current best
                if (len > bestLen) {
                    bestSrc = i;
                    bestDst = j;
                    bestLen = len;
                }
            }
        }
        return new LCS(bestSrc, bestDst, bestLen);
    }

    /**
     * Finds the longest common run of consecutive tokens of
     * {@code src[srcStart, srcEnd)} and {@code dst[dstStart, dstEnd)}. Tokens
     * are compared by character content.
     * <p>
     * This is a brute-force search that tries every pair of start positions.
     * When several matches share the greatest length, the first one found
     * (scanning {@code src} positions, then {@code dst} positions, in
     * increasing order) is returned.
     *
     * @param srcStart
     *            inclusive start of the source range
     * @param srcEnd
     *            exclusive end of the source range
     * @param src
     *            the source tokens
     * @param dstStart
     *            inclusive start of the target range
     * @param dstEnd
     *            exclusive end of the target range
     * @param dst
     *            the target tokens
     * @return the location and length (in tokens) of the longest match; the
     *         length is 0 if the two ranges have no token in common
     */
    private static LCS lcs(int srcStart, int srcEnd, CharSequence[] src,
            int dstStart, int dstEnd, CharSequence[] dst) {
        int bestSrc = 0, bestDst = 0, bestLen = 0;
        // a start position with no more than bestLen tokens left cannot
        // yield a strictly longer match, so stop scanning there
        for (int i = srcStart; i < srcEnd - bestLen; i++) {
            for (int j = dstStart; j < dstEnd - bestLen; j++) {
                int max = Math.min(srcEnd - i, dstEnd - j);
                int len = 0;
                while (len < max && contentEquals(src[i + len], dst[j + len])) {
                    len++;
                }
                // only a strictly longer match replaces the current best
                if (len > bestLen) {
                    bestSrc = i;
                    bestDst = j;
                    bestLen = len;
                }
            }
        }
        return new LCS(bestSrc, bestDst, bestLen);
    }

    /** Location (in both documents) and length of a common substring. */
    private static final class LCS {

        private final int srcLocation;

        private final int dstLocation;

        private final int length;

        public LCS(int srcLocation, int dstLocation, int len) {
            this.srcLocation = srcLocation;
            this.dstLocation = dstLocation;
            this.length = len;
        }

        public int getDstLocation() {
            return this.dstLocation;
        }

        public int getLength() {
            return this.length;
        }

        public int getSrcLocation() {
            return this.srcLocation;
        }

        @Override
        public String toString() {
            return "(" + srcLocation + "," + dstLocation + "," + length + ")";
        }
    }

    /**
     * A single edit: the text {@code src} found at {@code srcLocation} in the
     * source document is replaced by the text {@code dst}, which is found at
     * {@code dstLocation} in the target document. Either text may be empty,
     * for a pure insertion or a pure deletion.
     */
    public static class Edit {

        private final int srcLocation, dstLocation;

        private final CharSequence src, dst;

        /**
         * @param srcLocation
         *            offset of the replaced text in the source document
         * @param src
         *            the replaced text
         * @param dstLocation
         *            offset of the replacement text in the target document
         * @param dst
         *            the replacement text
         * @throws IllegalArgumentException
         *             if either location is negative
         */
        public Edit(int srcLocation, CharSequence src, int dstLocation,
                CharSequence dst) {
            if (srcLocation < 0)
                throw new IllegalArgumentException(
                        "Src Start may not be negative! " + srcLocation);
            if (dstLocation < 0)
                throw new IllegalArgumentException(
                        "Dst Start may not be negative! " + dstLocation);
            this.srcLocation = srcLocation;
            this.src = src;
            this.dstLocation = dstLocation;
            this.dst = dst;
        }

        public CharSequence getSrc() {
            return this.src;
        }

        public int getSrcLocation() {
            return this.srcLocation;
        }

        public CharSequence getDst() {
            return this.dst;
        }

        public int getDstLocation() {
            return this.dstLocation;
        }

        @Override
        public String toString() {
            return srcLocation + "- '" + src + "', " + dstLocation + "- '"
                    + dst + "'";
        }
    }

    /**
     * A single token-level edit: the source tokens with indices in
     * {@code [srcStart, srcEnd)} are replaced by the target tokens with
     * indices in {@code [dstStart, dstEnd)}.
     */
    public static class ArrayEdit {

        private final int srcStart, srcEnd, dstStart, dstEnd;

        /**
         * @param srcStart
         *            index of the first replaced source token
         * @param srcEnd
         *            index after the last replaced source token
         * @param dstStart
         *            index of the first replacement target token
         * @param dstEnd
         *            index after the last replacement target token
         * @throws IllegalArgumentException
         *             if either start index is negative
         */
        public ArrayEdit(int srcStart, int srcEnd, int dstStart, int dstEnd) {
            if (srcStart < 0)
                throw new IllegalArgumentException(
                        "Src Start may not be negative! " + srcStart);
            if (dstStart < 0)
                throw new IllegalArgumentException(
                        "Dst Start may not be negative! " + dstStart);
            this.srcStart = srcStart;
            this.srcEnd = srcEnd;
            this.dstStart = dstStart;
            this.dstEnd = dstEnd;
        }

        public int getDstEnd() {
            return this.dstEnd;
        }

        public int getDstStart() {
            return this.dstStart;
        }

        public int getSrcEnd() {
            return this.srcEnd;
        }

        public int getSrcStart() {
            return this.srcStart;
        }

        @Override
        public String toString() {
            return srcStart + "-" + srcEnd + ", " + dstStart + "-" + dstEnd;
        }
    }

    /*
     * Diffs src against dst word by word, then reports on stderr whether
     * applying and reverting the edits reproduces dst and src respectively.
     */
    private static void test(String src, String dst) {
        List<Edit> edits = getEdits(src, dst, ' ');
        String result = apply(src, edits);
        if (!result.equals(dst)) {
            System.err.println("Failed applying edits! '" + result + "' != '"
                    + dst + "'");
        } else {
            System.err.println("Success applying!!");
        }
        result = revert(dst, edits);
        if (!result.equals(src)) {
            System.err.println("Failed reverting edits! '" + result + "' != '"
                    + src + "'");
        } else {
            System.err.println("Success reverting!!");
        }
    }

    /** Runs a few round-trip self checks and prints the outcome to stderr. */
    public static void main(String[] args) {
        test("cith", "ttttcithbbbb");
        test("quicklyquicish", "quincequickish");
        test("the cat in the hat box", "cat in the hat");
    }
}