package net.varkhan.data.diff; import net.varkhan.base.containers.Collection; import net.varkhan.base.containers.Container; import net.varkhan.base.containers.Iterable; import net.varkhan.base.containers.Iterator; import net.varkhan.base.containers.array.Arrays; import net.varkhan.base.containers.list.List; import net.varkhan.base.containers.list.ArrayList; import java.util.Comparator; /** * <b></b>. * <p/> * Eugene Myers: "An O(ND) Difference Algorithm and its Variations", in Algorithmica Vol. 1 No. 2, 1986, p 251. * * @author varkhan * @date 9/21/14 * @time 3:00 PM */ public class EugeneMyersDiff<T, S extends Container<T>, X> implements Diff<T,S,X> { protected final Comparator<T> comp; public EugeneMyersDiff(Comparator<T> comp) { this.comp=comp; } @Override @SuppressWarnings("unchecked") public Iterable<Diff.Block<T>> invoke(S srcL, S srcR, X ctx) { Object[] datL=getArray(srcL); Object[] datR=getArray(srcR); int max = datL.length + datR.length + 1; // edits for the begin sequence int[] begE=new int[2*max+2]; // edits for the end sequence int[] endE=new int[2*max+2]; boolean[] edtR = new boolean[datL.length+2]; boolean[] edtL = new boolean[datR.length+2]; LCS(datL, edtL, 0, datL.length, datR, edtR, 0, datR.length, begE, endE); return GDB(datL, edtL, datR, edtR); } protected Object[] getArray(S src) { Object[] dat = new Object[(int)src.size()]; int i = 0; for(Iterator<? extends T> it=src.iterator();it.hasNext();) { dat[i++] = it.next(); } return dat; } @SuppressWarnings("unchecked") protected boolean equals(Object l, Object r) { return comp.compare((T)l,(T)r)==0; } /** * An implementation of the longest common-subsequence (LCS) that looks for * optimal subsequences anchored at either end of the specified boundaries. * * @param datL the left-side data * @param edtL the left-side edit flags * @param begL the left-side start position * @param endL the left-side end position * @param datR the right-side data * @param edtR the right-side edit flags * @param begR the right-side start position * @param endR the right-side end position * @param begE start-side edit sequence * @param endE end-side edit sequence */ protected void LCS(Object[] datL, boolean[] edtL, int begL, int endL, Object[] datR, boolean[] edtR, int begR, int endR, int[] begE, int[] endE) { // skip identical beg sequences while( begL<endL && begR<endR && equals(datL[begL],datR[begR]) ) { begL++; begR++; } // skip identical end sequences while( begL<endL && begR<endR && equals(datL[endL-1],datR[endR-1]) ) { --endL; --endR; } // Insertions and deletions if(begL==endL) { while(begR<endR) edtR[begR++]=true; } else if(begR==endR) { while(begL<endL) edtL[begL++]=true; } else { // Compute the shortest middle snake (l,r), to get the optimal path int[] sms=SMS(datL, begL, endL, datR, begR, endR, begE, endE); // The path is from beg to (l,r) and from (l,r) to end LCS(datL, edtL, begL, sms[0], datR, edtR, begR, sms[1], begE, endE); LCS(datL, edtL, sms[0], endL, datR, edtR, sms[1], endR, begE, endE); } } /** * Look for the Shortest Middle Snake between the specified boundaries. * * @param datL the left-side data * @param begL the left-side start position * @param endL the left-side end position * @param datR the right-side data * @param begR the right-side start position * @param endR the right-side end position * @param begE start-side edit sequence * @param endE end-side edit sequence * @return the positions of the shortest middle snake */ protected int[] SMS(Object[] datL, int begL, int endL, Object[] datR, int begR, int endR, int[] begE, int[] endE) { int max = datL.length+datR.length+1; // Beg search starts at this Kline int begK = begL-begR; // End search starts at this K-line int endK = endL-endR; // The original algo uses arrays that accepts negative indices. // We use 0-based arrays instead, and add respective offsets: // beg0 for begE / end0 for endE int beg0 = max-begK; int end0 = max-endK; int difD=(endL-begL)-(endR-begR); boolean odd = (difD&1)!=0; int maxD= ((endL-begL+endR-begR)/2) + 1; // init vectors begE[beg0+begK+1]=begL; endE[end0+endK-1]=endL; for(int d=0; d<=maxD; d++) { // Extend the forward path. for(int k=begK-d; k<=begK+d; k+=2) { // Find the starting point int x, y; if(k==begK-d) x=begE[beg0+k+1]; // down else { x=begE[beg0+k-1]+1; // right if(k<begK+d && begE[beg0+k+1]>=x) x=begE[beg0+k+1]; // down } y=x-k; // Find the end of the furthest reaching forward d-path in diagonal k. while( x<endL && y<endR && equals(datL[x],datR[y]) ) { x++; y++; } begE[beg0+k]=x; // overlap ? if(odd && endK-d<k && k<endK+d) { if(endE[end0+k] <= begE[beg0+k]) { return new int[] { begE[beg0+k], begE[beg0+k]-k }; } } } // Extend the reverse path. for(int k=endK-d; k<=endK+d; k+=2) { // Find the starting point int x, y; if(k==endK+d) x=endE[end0+k-1]; // up else { x=endE[end0+k+1]-1; // left if(k>endK-d && endE[end0+k-1]<x) x=endE[end0+k-1]; // up } y=x-k; // Find the end of the furthest reaching backward d-path in diagonal k. while( x>begL && y>begR && equals(datL[x-1],datR[y-1]) ) { x--; y--; } endE[end0+k]=x; // overlap ? if(!odd && begK-d<=k && k<=begK+d) { if(endE[end0+k] <= begE[beg0+k]) { return new int[] { begE[beg0+k], begE[beg0+k]-k }; } } } } // We should never get there throw new RuntimeException("Ran out of possible edits!"); } /** * Scan the edit sequences on both sides, start to end, to generate the edit script. * * @param datL the left-side data * @param edtL the left-side edit flags * @param datR the right-side data * @param edtR the right-side edit flags * @return the sequence of diff blocks */ protected Collection<Diff.Block<T>> GDB(Object[] datL, boolean[] edtL, Object[] datR, boolean edtR[]) { Collection<Diff.Block<T>> c = new ArrayList<Diff.Block<T>>(); final List datLa = Arrays.asList(datL); final List datRa = Arrays.asList(datR); int lenR=datR.length; int lenL=datL.length; int endL=0; int endR=0; while(endL<lenL||endR<lenR) { // Unchanged if(endL<lenL&& !edtL[endL]&&endR<lenR&& !edtR[endR]) { endL++; endR++; } // Edited else { int begL=endL; int begR=endR; while(endL<lenL&& (endR>=lenR||edtL[endL])) endL++; while(endR<lenR&& (endL>=lenL||edtR[endR])) endR++; if( begL<endL || begR<endR ) { c.add(new DiffBlock<T>(datLa, begL, endL, datRa, begR, endR)); } } } return c; } }