Diff.java example

Explorer
OWASP-WebScarab-master
- src
- test
  - java
    - test
      - unit
        org
        owasp
        webscarab
        plugin
        saml
        SamlTest.java
        util
        SunCertificateUtilsTest.java
/**
 * 
 */
package org.owasp.webscarab.util;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * This class calculates the edits necessary to convert a source document to a
 * target document. It does this by finding the longest common substring (LCS),
 * then recursively checking the parts to the left and right of the LCS.
 * 
 * Optionally, the documents can first be tokenized. For example, they can be
 * separated into individual lines or words prior to being compared.
 * 
 * @author rdawes
 * 
 */
public class Diff {

	/** Empty text used for the absent side of a pure insertion or deletion. */
	private static final CharSequence DELETE = "";

	private Diff() {
	}

	/**
	 * Calculates the character-level edits that transform <code>src</code>
	 * into <code>dst</code>.
	 * 
	 * @param src
	 *            the original text
	 * @param dst
	 *            the target text
	 * @return the edits, ordered by increasing location; empty if the two
	 *         texts have the same content
	 */
	public static List<Edit> getEdits(CharSequence src, CharSequence dst) {
		return getEdits(0, src.length(), src, 0, dst.length(), dst);
	}

	/*
	 * Calculates the edits that will transform the range [srcStart, srcEnd) of
	 * src into the range [dstStart, dstEnd) of dst. Locations in the returned
	 * edits are absolute offsets into src and dst.
	 */
	private static List<Edit> getEdits(int srcStart, int srcEnd,
			CharSequence src, int dstStart, int dstEnd, CharSequence dst) {
		List<Edit> edits = new LinkedList<Edit>();

		// strip the common prefix ...
		while (srcStart < srcEnd && dstStart < dstEnd
				&& src.charAt(srcStart) == dst.charAt(dstStart)) {
			srcStart++;
			dstStart++;
		}
		// ... and the common suffix
		while (srcStart < srcEnd && dstStart < dstEnd
				&& src.charAt(srcEnd - 1) == dst.charAt(dstEnd - 1)) {
			srcEnd--;
			dstEnd--;
		}

		// nothing left on either side: the two ranges were identical
		if (srcStart == srcEnd && dstStart == dstEnd) {
			return edits;
		}

		// if the prefix and suffix consumed one side entirely, whatever
		// remains on the other side is a pure insertion or deletion
		if (srcStart == srcEnd) {
			edits.add(new Edit(srcStart, DELETE, dstStart, dst.subSequence(
					dstStart, dstEnd)));
			return edits;
		}
		if (dstStart == dstEnd) {
			edits.add(new Edit(srcStart, src.subSequence(srcStart, srcEnd),
					dstStart, DELETE));
			return edits;
		}

		// divide the text into left and right parts, separated by the
		// longest common substring, and process the two parts recursively
		LCS lcs = lcs(srcStart, srcEnd, src, dstStart, dstEnd, dst);
		if (lcs.getLength() > 0) {
			edits.addAll(getEdits(srcStart, lcs.getSrcLocation(), src,
					dstStart, lcs.getDstLocation(), dst));
			srcStart = lcs.getSrcLocation() + lcs.getLength();
			dstStart = lcs.getDstLocation() + lcs.getLength();
			edits.addAll(getEdits(srcStart, srcEnd, src, dstStart, dstEnd,
					dst));
		} else {
			// nothing in common: replace the whole remaining range
			edits.add(new Edit(srcStart, src.subSequence(srcStart, srcEnd),
					dstStart, dst.subSequence(dstStart, dstEnd)));
		}
		return edits;
	}

	/**
	 * Splits <code>orig</code> into tokens, each ending with (and including)
	 * an occurrence of <code>boundary</code>. Any text after the last boundary
	 * becomes a final token without a trailing boundary. Concatenating the
	 * tokens reproduces <code>orig</code>.
	 * 
	 * @param orig
	 *            the text to split
	 * @param boundary
	 *            the character that terminates a token
	 * @return the tokens, in order; an empty array if <code>orig</code> is
	 *         empty
	 */
	public static CharSequence[] split(CharSequence orig, char boundary) {
		List<CharSequence> list = new LinkedList<CharSequence>();
		int previous = 0, index = 0;
		while (index < orig.length()) {
			if (orig.charAt(index) == boundary) {
				list.add(orig.subSequence(previous, index + 1));
				previous = index + 1;
			}
			index++;
		}
		if (index > previous) {
			list.add(orig.subSequence(previous, index));
		}
		return list.toArray(new CharSequence[list.size()]);
	}

	/**
	 * Calculates token-level edits between <code>src</code> and
	 * <code>dst</code>: both texts are split on <code>boundary</code>, the
	 * token arrays are compared, and the result is converted back to edits
	 * expressed as character offsets into the original texts.
	 * 
	 * @param src
	 *            the original text
	 * @param dst
	 *            the target text
	 * @param boundary
	 *            the character that terminates a token
	 * @return the edits, ordered by increasing location
	 */
	public static List<Edit> getEdits(CharSequence src, CharSequence dst, char boundary) {
		CharSequence[] srcArray = split(src, boundary);
		CharSequence[] dstArray = split(dst, boundary);
		List<ArrayEdit> edits = getEdits(srcArray, dstArray);
		return convertArrayToOriginal(srcArray, dstArray, edits);
	}

	/**
	 * Converts edits expressed as token index ranges into edits expressed as
	 * character offsets, where each offset is the summed length of all tokens
	 * preceding the edit.
	 * 
	 * @param src
	 *            the source tokens
	 * @param dst
	 *            the target tokens
	 * @param arrayEdits
	 *            token-level edits, ordered by increasing index
	 * @return the equivalent character-level edits, in the same order
	 */
	public static List<Edit> convertArrayToOriginal(CharSequence[] src, CharSequence[] dst, List<ArrayEdit> arrayEdits) {
		List<Edit> edits = new LinkedList<Edit>();
		int srcLast = 0, dstLast = 0;
		int srcOffset = 0, dstOffset = 0;
		for (ArrayEdit edit : arrayEdits) {
			// account for the unchanged tokens between the previous edit and
			// this one
			while (srcLast < edit.getSrcStart()) {
				srcOffset += src[srcLast++].length();
			}
			while (dstLast < edit.getDstStart()) {
				dstOffset += dst[dstLast++].length();
			}
			int srcStart = srcOffset;
			int dstStart = dstOffset;
			// join the tokens covered by the edit back into plain text
			StringBuilder srcEdit = new StringBuilder();
			StringBuilder dstEdit = new StringBuilder();
			while (srcLast < edit.getSrcEnd()) {
				srcEdit.append(src[srcLast++]);
			}
			while (dstLast < edit.getDstEnd()) {
				dstEdit.append(dst[dstLast++]);
			}
			srcOffset += srcEdit.length();
			dstOffset += dstEdit.length();
			edits.add(new Edit(srcStart, srcEdit.toString(), dstStart, dstEdit.toString()));
		}
		return edits;
	}

	/**
	 * Calculates the token-level edits that transform the <code>src</code>
	 * token array into the <code>dst</code> token array. Tokens are compared
	 * by character content, so the concrete <code>CharSequence</code>
	 * implementations of the elements do not matter.
	 * 
	 * @param src
	 *            the source tokens
	 * @param dst
	 *            the target tokens
	 * @return the edits as index ranges, ordered by increasing index; empty
	 *         if the arrays have equal content
	 */
	public static List<ArrayEdit> getEdits(CharSequence src[], CharSequence dst[]) {
		return getEdits(0, src.length, src, 0, dst.length, dst);
	}

	/*
	 * Calculates the edits that will transform the tokens [srcStart, srcEnd)
	 * of src into the tokens [dstStart, dstEnd) of dst.
	 */
	private static List<ArrayEdit> getEdits(int srcStart, int srcEnd,
			CharSequence[] src, int dstStart, int dstEnd, CharSequence[] dst) {
		List<ArrayEdit> edits = new LinkedList<ArrayEdit>();

		// strip the common prefix ...
		while (srcStart < srcEnd && dstStart < dstEnd
				&& sameContent(src[srcStart], dst[dstStart])) {
			srcStart++;
			dstStart++;
		}
		// ... and the common suffix
		while (srcStart < srcEnd && dstStart < dstEnd
				&& sameContent(src[srcEnd - 1], dst[dstEnd - 1])) {
			srcEnd--;
			dstEnd--;
		}

		// nothing left on either side: the two ranges were identical
		if (srcStart == srcEnd && dstStart == dstEnd) {
			return edits;
		}

		// if the prefix and suffix consumed one side entirely, whatever
		// remains on the other side is the difference
		if (srcStart == srcEnd || dstStart == dstEnd) {
			edits.add(new ArrayEdit(srcStart, srcEnd, dstStart, dstEnd));
			return edits;
		}

		// divide the tokens into left and right parts, separated by the
		// longest common run, and process the two parts recursively
		LCS lcs = lcs(srcStart, srcEnd, src, dstStart, dstEnd, dst);
		if (lcs.getLength() > 0) {
			edits.addAll(getEdits(srcStart, lcs.getSrcLocation(), src,
					dstStart, lcs.getDstLocation(), dst));
			srcStart = lcs.getSrcLocation() + lcs.getLength();
			dstStart = lcs.getDstLocation() + lcs.getLength();
			edits.addAll(getEdits(srcStart, srcEnd, src, dstStart, dstEnd,
					dst));
		} else {
			// nothing in common: replace the whole remaining range
			edits.add(new ArrayEdit(srcStart, srcEnd, dstStart, dstEnd));
		}
		return edits;
	}

	/*
	 * Compares two sequences by character content. CharSequence does not
	 * define a content-based equals(), so calling equals() on elements of
	 * different implementations (e.g. String vs StringBuilder) would report a
	 * difference even when the text is identical.
	 */
	private static boolean sameContent(CharSequence a, CharSequence b) {
		if (a == b) {
			return true;
		}
		if (a == null || b == null) {
			return false;
		}
		int length = a.length();
		if (length != b.length()) {
			return false;
		}
		for (int i = 0; i < length; i++) {
			if (a.charAt(i) != b.charAt(i)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Recomputes each of the supplied edits at character level, typically
	 * turning coarse token-level edits into finer-grained ones.
	 * 
	 * @param src
	 *            the original text the edits refer to
	 * @param dst
	 *            the target text the edits refer to
	 * @param edits
	 *            edits whose locations are character offsets into
	 *            <code>src</code> and <code>dst</code>
	 * @return the character-level edits found within the span of each
	 *         supplied edit, in the same order
	 */
	public static List<Edit> refine(CharSequence src, CharSequence dst, List<Edit> edits) {
		List<Edit> refined = new LinkedList<Edit>();
		for (Edit edit : edits) {
			int srcStart = edit.getSrcLocation();
			int srcEnd = srcStart + edit.getSrc().length();
			int dstStart = edit.getDstLocation();
			int dstEnd = dstStart + edit.getDst().length();
			refined.addAll(getEdits(srcStart, srcEnd, src, dstStart, dstEnd, dst));
		}
		return refined;
	}

	/**
	 * Sums the lengths of the removed and inserted text over all edits.
	 * 
	 * @param edits
	 *            the edits to measure
	 * @return the total number of characters removed plus characters inserted
	 */
	public static int getDistance(List<Edit> edits) {
		int distance = 0;
		// iterate rather than index: the lists built by this class are linked
		// lists, for which get(i) is linear
		for (Edit edit : edits) {
			distance += edit.getSrc().length() + edit.getDst().length();
		}
		return distance;
	}

	/**
	 * Applies the edits to <code>src</code>, producing the target text. This
	 * method is useful for ensuring that the edits are properly calculated.
	 * 
	 * @param src
	 *            the original text
	 * @param edits
	 *            edits ordered by increasing source location
	 * @return the text obtained by replacing each edit's source span with its
	 *         destination text
	 */
	public static String apply(CharSequence src, List<Edit> edits) {
		StringBuilder buff = new StringBuilder();
		int last = 0;
		for (Edit edit : edits) {
			if (edit.getSrcLocation() > last) {
				// copy the unchanged text between edits
				buff.append(src, last, edit.getSrcLocation());
			}
			if (edit.getDst().length() > 0) {
				buff.append(edit.getDst());
			}
			last = edit.getSrcLocation() + edit.getSrc().length();
		}
		if (last < src.length()) {
			buff.append(src, last, src.length());
		}
		return buff.toString();
	}

	/**
	 * Reverses the edits on <code>dst</code>, reproducing the original text.
	 * This method is useful for ensuring that the edits are properly
	 * calculated.
	 * 
	 * @param dst
	 *            the target text
	 * @param edits
	 *            edits ordered by increasing destination location
	 * @return the text obtained by replacing each edit's destination span
	 *         with its source text
	 */
	public static String revert(CharSequence dst, List<Edit> edits) {
		StringBuilder buff = new StringBuilder();
		int last = 0;
		for (Edit edit : edits) {
			if (edit.getDstLocation() > last) {
				// copy the unchanged text between edits
				buff.append(dst, last, edit.getDstLocation());
			}
			if (edit.getSrc().length() > 0) {
				buff.append(edit.getSrc());
			}
			last = edit.getDstLocation() + edit.getDst().length();
		}
		if (last < dst.length()) {
			buff.append(dst, last, dst.length());
		}
		return buff.toString();
	}

	/**
	 * Finds the longest common substring of the two ranges, i.e. the longest
	 * run of consecutive characters present in both. Note that this is
	 * different from the longest common subsequence, which need not be
	 * contiguous.
	 * <p>
	 * The search is a brute-force comparison from every pair of start
	 * positions. Among matches of equal length, the one with the lowest source
	 * position (then lowest destination position) is returned.
	 * 
	 * @param srcStart
	 *            start of the source range, inclusive
	 * @param srcEnd
	 *            end of the source range, exclusive
	 * @param src
	 *            the source text
	 * @param dstStart
	 *            start of the destination range, inclusive
	 * @param dstEnd
	 *            end of the destination range, exclusive
	 * @param dst
	 *            the destination text
	 * @return the location and length of the match; the length is 0 if the
	 *         ranges have nothing in common
	 */
	private static LCS lcs(int srcStart, int srcEnd, CharSequence src,
			int dstStart, int dstEnd, CharSequence dst) {
		LCS lcs = new LCS(0, 0, 0);

		for (int i = srcStart; i < srcEnd; i++) {
			// too little source text left to beat the best match
			if (srcEnd - i <= lcs.getLength()) {
				break;
			}
			for (int j = dstStart; j < dstEnd; j++) {
				int max = Math.min(srcEnd - i, dstEnd - j);
				// too little destination text left, here and for every later j
				if (max <= lcs.getLength()) {
					break;
				}

				int len = 0;
				while (len < max && src.charAt(i + len) == dst.charAt(j + len)) {
					len++;
				}
				// only a strictly longer match replaces the current best
				if (len > lcs.getLength()) {
					lcs = new LCS(i, j, len);
				}
			}
		}
		return lcs;
	}

	/**
	 * Finds the longest run of consecutive tokens common to the two ranges.
	 * Tokens are compared by character content.
	 * <p>
	 * The search is a brute-force comparison from every pair of start
	 * positions. Among matches of equal length, the one with the lowest source
	 * index (then lowest destination index) is returned.
	 * 
	 * @param srcStart
	 *            start of the source range, inclusive
	 * @param srcEnd
	 *            end of the source range, exclusive
	 * @param src
	 *            the source tokens
	 * @param dstStart
	 *            start of the destination range, inclusive
	 * @param dstEnd
	 *            end of the destination range, exclusive
	 * @param dst
	 *            the destination tokens
	 * @return the location and length (in tokens) of the match; the length is
	 *         0 if the ranges have nothing in common
	 */
	private static LCS lcs(int srcStart, int srcEnd, CharSequence[] src,
			int dstStart, int dstEnd, CharSequence[] dst) {
		LCS lcs = new LCS(0, 0, 0);

		for (int i = srcStart; i < srcEnd; i++) {
			// too few source tokens left to beat the best match
			if (srcEnd - i <= lcs.getLength()) {
				break;
			}
			for (int j = dstStart; j < dstEnd; j++) {
				int max = Math.min(srcEnd - i, dstEnd - j);
				// too few destination tokens left, here and for every later j
				if (max <= lcs.getLength()) {
					break;
				}

				int len = 0;
				while (len < max && sameContent(src[i + len], dst[j + len])) {
					len++;
				}
				// only a strictly longer match replaces the current best
				if (len > lcs.getLength()) {
					lcs = new LCS(i, j, len);
				}
			}
		}
		return lcs;
	}

	/** Location (in source and destination) and length of a common run. */
	private static class LCS {
		private final int srcLocation;

		private final int dstLocation;

		private final int length;

		public LCS(int srcLocation, int dstLocation, int len) {
			this.srcLocation = srcLocation;
			this.dstLocation = dstLocation;
			this.length = len;
		}

		public int getDstLocation() {
			return this.dstLocation;
		}

		public int getLength() {
			return this.length;
		}

		public int getSrcLocation() {
			return this.srcLocation;
		}

		@Override
		public String toString() {
			return "(" + srcLocation + "," + dstLocation + "," + length + ")";
		}
	}

	/**
	 * A single replacement: the text <code>src</code> found at
	 * <code>srcLocation</code> in the source corresponds to the text
	 * <code>dst</code> at <code>dstLocation</code> in the destination. An
	 * empty <code>src</code> denotes an insertion, an empty <code>dst</code>
	 * a deletion.
	 */
	public static class Edit {

		private final int srcLocation, dstLocation;

		private final CharSequence src, dst;

		/**
		 * @throws IllegalArgumentException
		 *             if either location is negative
		 */
		public Edit(int srcLocation, CharSequence src, int dstLocation,
				CharSequence dst) {
			if (srcLocation < 0) {
				throw new IllegalArgumentException(
						"Src Start may not be negative! " + srcLocation);
			}
			if (dstLocation < 0) {
				throw new IllegalArgumentException(
						"Dst Start may not be negative! " + dstLocation);
			}

			this.srcLocation = srcLocation;
			this.src = src;
			this.dstLocation = dstLocation;
			this.dst = dst;
		}

		public CharSequence getSrc() {
			return this.src;
		}

		public int getSrcLocation() {
			return this.srcLocation;
		}

		public CharSequence getDst() {
			return this.dst;
		}

		public int getDstLocation() {
			return this.dstLocation;
		}

		@Override
		public String toString() {
			return srcLocation + "- '" + src + "', " + dstLocation + "- '"
					+ dst + "'";
		}
	}

	/**
	 * A single replacement expressed as token index ranges: source tokens
	 * [srcStart, srcEnd) correspond to destination tokens [dstStart, dstEnd).
	 */
	public static class ArrayEdit {

		private final int srcStart, srcEnd, dstStart, dstEnd;

		/**
		 * @throws IllegalArgumentException
		 *             if either start index is negative
		 */
		public ArrayEdit(int srcStart, int srcEnd, int dstStart, int dstEnd) {
			if (srcStart < 0) {
				throw new IllegalArgumentException(
						"Src Start may not be negative! " + srcStart);
			}
			if (dstStart < 0) {
				throw new IllegalArgumentException(
						"Dst Start may not be negative! " + dstStart);
			}

			this.srcStart = srcStart;
			this.srcEnd = srcEnd;
			this.dstStart = dstStart;
			this.dstEnd = dstEnd;
		}

		public int getDstEnd() {
			return this.dstEnd;
		}

		public int getDstStart() {
			return this.dstStart;
		}

		public int getSrcEnd() {
			return this.srcEnd;
		}

		public int getSrcStart() {
			return this.srcStart;
		}

		@Override
		public String toString() {
			return srcStart + "-" + srcEnd + ", " + dstStart + "-" + dstEnd;
		}
	}

	/*
	 * Self-check: computes word-level edits and reports on stderr whether
	 * applying and reverting them reproduces dst and src respectively.
	 */
	private static void test(String src, String dst) {
		List<Edit> edits = getEdits(src, dst, ' ');
		String result = apply(src, edits);
		if (!result.equals(dst)) {
			System.err.println("Failed applying edits! '" + result + "' != '"
					+ dst + "'");
		} else {
			System.err.println("Success applying!!");
		}
		result = revert(dst, edits);
		if (!result.equals(src)) {
			System.err.println("Failed reverting edits! '" + result + "' != '"
					+ src + "'");
		} else {
			System.err.println("Success reverting!!");
		}

	}

	/** Runs a few round-trip self-checks and prints the outcome to stderr. */
	public static void main(String[] args) {
		test("cith", "ttttcithbbbb");
		test("quicklyquicish", "quincequickish");
		test("the cat in the hat box", "cat in the hat");
	}
}