/**
Copyright (C) 2012 Delcyon, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.delcyon.capo.util.diff;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.util.ArrayList;
import com.delcyon.capo.util.diff.InputStreamTokenizer.TokenList;
/**
* @author jeremiah
* Generates DiffEntrys based on inputStream or String or byte[] arrays.
* This can be used to process very long streams of data, or just simple text.
* If you use one of the getDifferences methods, the differences will be processed, and the class is finished.
* If however you use the getInputStream() method, Diff will spin off it's own thread, an continiously process the two input streams for data
* and make all of the diff entries available in the input stream for reading. In theory this should should handle a huge amount of streaming data.
*
* You can use the addCustomTokenList to add tokens on which to break up the input stream into 'lines'.
* These lines are what's compared to each other to generate the diff.
* If you do so, you will need to keep track of them so you can determine how your stream was tokenized.
* The default tokenList is NEW_LINE.
*
* The window is a sliding window, so it size will limit how many lines are compared to each other at once.
* If it's too small, you might not get 100% accuracy, and if too large will probably slow things down. Generally not a big deal though. And defaults to 256 'lines'
*/
public class Diff implements Runnable
{
public enum Side
{
BASE('-'),
MOD('+'),
BOTH('=');
private char directionChar;
Side(char directionChar)
{
this.directionChar = directionChar;
}
public char getDirectionChar()
{
return directionChar;
}
/**
* Simple convince method to always return the opposite side
* @return
*/
public Side getOppositeSide()
{
switch (this)
{
case BASE:
return MOD;
case MOD:
return BASE;
default:
return BOTH;
}
}
}
public static final int DEFAULT_WINDOW_SIZE = 256;
private int windowSize;
private InputStream baseInputStream;
private InputStream otherInputStream;
private OutputStream outputStream;
private boolean threadStarted = false;
private ArrayList<ArrayList<Integer>> tokenLists = new ArrayList<ArrayList<Integer>>();
private TokenList tokenList;
public Diff(InputStream baseInputStream, InputStream otherInputStream,int windowSize,TokenList tokenList) throws IOException
{
this.windowSize = windowSize;
this.baseInputStream = baseInputStream;
this.otherInputStream = otherInputStream;
this.tokenLists = tokenList.getTokenLists();
this.tokenList = tokenList;
}
/**
* Uses DEFAULT_WINDOW_SIZE
* Uses NEW_LINE token list
* @param baseInputStream
* @param otherInputStream
* @throws IOException
*/
public Diff(InputStream baseInputStream, InputStream otherInputStream) throws IOException
{
this(baseInputStream, otherInputStream, DEFAULT_WINDOW_SIZE,TokenList.NEW_LINE);
}
/**
* Uses DEFAULT_WINDOW_SIZE
* @param baseInputStream
* @param otherInputStream
* @param tokenList
* @throws IOException
*/
public Diff(InputStream baseInputStream, InputStream otherInputStream,TokenList tokenList) throws IOException
{
this(baseInputStream, otherInputStream, DEFAULT_WINDOW_SIZE,tokenList);
}
/**
* Uses the addition of the text lengths for the window size
* Uses NEW_LINE token list
* @param base
* @param other
* @throws IOException
*/
public Diff(String baseText, String otherText) throws IOException
{
this(new ByteArrayInputStream(baseText.getBytes()),new ByteArrayInputStream(otherText.getBytes()),baseText.length()+otherText.length(),TokenList.NEW_LINE);
}
/**
* Uses the addition of the text lengths for the window size
* Allow the specification of a CUSTOM tokenList
* @param base
* @param other
* @param tokenLists
* @throws IOException
*/
public Diff(String baseText, String otherText, ArrayList<ArrayList<Integer>> tokenLists) throws IOException
{
this(new ByteArrayInputStream(baseText.getBytes()),new ByteArrayInputStream(otherText.getBytes()),baseText.length()+otherText.length(),TokenList.CUSTOM);
this.tokenLists = tokenLists;
}
/**
* Uses the addition of the text lengths for the window size
* @param base
* @param other
* @throws IOException
*/
public Diff(String baseText, String otherText,TokenList tokenList) throws IOException
{
this(new ByteArrayInputStream(baseText.getBytes()),new ByteArrayInputStream(otherText.getBytes()),baseText.length()+otherText.length(),tokenList);
}
/**
* Uses the addition of the array lengths for the window size
* Uses NEW_LINE token list
* @param base
* @param other
* @throws IOException
*/
public Diff(byte[] base, byte[] other) throws IOException
{
this(new ByteArrayInputStream(base),new ByteArrayInputStream(other),base.length+other.length,TokenList.NEW_LINE);
}
/**
* Uses the addition of the array lengths for the window size
* @param base
* @param other
* @throws IOException
*/
public Diff(byte[] base, byte[] other,TokenList tokenList) throws IOException
{
this(new ByteArrayInputStream(base),new ByteArrayInputStream(other),base.length+other.length,tokenList);
}
public int getWindowSize()
{
return windowSize;
}
/**
* see class description
* @param windowSize
*/
public void setWindowSize(int windowSize)
{
this.windowSize = windowSize;
}
/**
* Will change the tokenList to CUSTOM, and add this list of tokens to the current tokenLists array.
* You will need to clear he token list if you want only your custom tokens used.
* Clear it BEFORE you add you custom token lists.
* @param eolMatch
*/
public void addCustomTokenList(char... eolMatch)
{
this.tokenList = TokenList.CUSTOM;
ArrayList<Integer> lineBrake = new ArrayList<Integer>();
for (char c : eolMatch)
{
lineBrake.add((int)c);
}
tokenLists.add(lineBrake);
}
/**
* Clears out all of the current token lists, and sets tokenList to CUSTOM
*/
public void clearTokenLists()
{
tokenLists.clear();
this.tokenList = TokenList.CUSTOM;
}
public TokenList getTokenList()
{
return tokenList;
}
public ArrayList<ArrayList<Integer>> getTokenLists()
{
return tokenLists;
}
/**
* @return difference entries as a string
* @throws Exception
*/
public String getDifferences() throws Exception
{
outputStream = new ByteArrayOutputStream();
processDifferences();
return outputStream.toString();
}
/**
*
* @return difference entries as a byte[]
* @throws Exception
*/
public byte[] getDifferencesAsBytes() throws Exception
{
outputStream = new ByteArrayOutputStream();
processDifferences();
return ((ByteArrayOutputStream) outputStream).toByteArray();
}
/**
*
* @return an input stream which can be used for reading DiffEntries from. This will cause a new Thread to be started, and can be used in conjunction with large data streams.
* @throws IOException
*/
public InputStream getInputStream() throws IOException
{
PipedInputStream pipedInputStream = null;
if (this.threadStarted == false)
{
this.outputStream = new PipedOutputStream();
pipedInputStream = new PipedInputStream((PipedOutputStream) outputStream, windowSize*80);
new Thread(this).start();
}
return pipedInputStream;
}
/**
* used by getInputStream method
*/
@Override
public void run()
{
try
{
this.threadStarted = true;
processDifferences();
}
catch (Exception e)
{
e.printStackTrace();
}
}
/**
* Actually creates an processes the DiffEntries this is the main method of this class.
* @throws Exception
*/
private void processDifferences() throws Exception
{
Window baseWindow = new Window(Side.BASE,windowSize);
Window otherWindow = new Window(Side.MOD,windowSize);
InputStreamTokenizer baseInputStreamBreaker = null;
InputStreamTokenizer otherInputStreamBreaker = null;
if (tokenList == TokenList.CUSTOM)
{
baseInputStreamBreaker = new InputStreamTokenizer(baseInputStream, tokenLists);
otherInputStreamBreaker = new InputStreamTokenizer(otherInputStream, tokenLists);
}
else
{
baseInputStreamBreaker = new InputStreamTokenizer(baseInputStream, tokenList);
otherInputStreamBreaker = new InputStreamTokenizer(otherInputStream, tokenList);
}
//fill base window
readIntoWindow(baseWindow, baseInputStreamBreaker, windowSize);
//fill other window
readIntoWindow(otherWindow, otherInputStreamBreaker, windowSize);
for (WindowItem otherWindowItem : otherWindow.getWindowItems())
{
if (baseWindow.hasMatch(otherWindowItem) == true)
{
otherWindowItem.addMatches(baseWindow.getMatches(otherWindowItem));
}
else
{
//skip
}
}
//printMatchTable(baseWindow, otherWindow);
/*
* Basic algorithm
* starting with base window
* get Cheapest chain from current stream position for current window (starting w/ base)
* walk to that chains start position
* while walking:
* decapitate/break any chains we intersect for that windowItem's stream position
* add windowItem to script
* increment script position
* remove windowItem from window
* when we arrive at position:
* walk chain
* while walking:
* add window item to script (equals)
* increment script position
* when done walking
* remove window items in chain
* remove chain
* refresh Window
* change direction !! apparently we shouldn't do this, or we do it implicitly.
* The examples online appears to be wacky. We are not looking for the largest common sequence, we are looking for the closest common sequence in the match grid.
* Maybe it's just my lack of understanding, but the LCS is not always the best sequence to use, since it might require a large amount changes just to get to it.
* rinse repeat
*/
long scriptPosition = 0;
long baseStreamPosition = 0l;
long otherStreamPosition = 0l;
while(true)
{
int baseWindowItemsIndex = 0;
//get list of window items to walk through
ArrayList<WindowItem> baseWindowItems = baseWindow.getWindowItems();
ArrayList<WindowItem> otherWindowItems = otherWindow.getWindowItems();
if(baseWindowItems.isEmpty() && otherWindowItems.isEmpty())
{
break;
}
//get cheapest chain for first window item
ArrayList<WindowItemLink> currentChain = baseWindow.getCheapestChain(baseWindow,otherWindow);
long baseWindowChainStartPosition = -1;
long otherWindowChainStartPosition = -1;
long otherWindowChainEndPosition = -1;
int currentChainSize = 0;
if(currentChain == null)
{
//give us a bogus end to read to
//add one to the ends to make sure we read fully when dealing with the difference below.
if (otherWindowItems.isEmpty() == false)
{
otherWindowChainStartPosition = otherWindowItems.get(otherWindowItems.size() - 1).getStreamPosition()+1l;
otherWindowChainEndPosition = otherWindowItems.get(otherWindowItems.size() - 1).getStreamPosition()+1l;
}
if (baseWindowItems.isEmpty() == false)
{
baseWindowChainStartPosition = baseWindowItems.get(baseWindowItems.size()-1).getStreamPosition()+1l;
}
}
else
{
baseWindowChainStartPosition = currentChain.get(0).getBaseWindowItem().getStreamPosition();
otherWindowChainStartPosition = currentChain.get(0).getOtherWindowItem().getStreamPosition();
otherWindowChainEndPosition = currentChain.get(currentChain.size()-1).getOtherWindowItem().getStreamPosition();
}
if (currentChain != null)
{
currentChainSize = currentChain.size();
//System.out.println(currentWindow+" chain = "+currentChain.get(0)+"["+currentChainSize+"]");
}
//figure out distance from currentWindowItem.streamPosition to start of chain
//walk to cheapest chain start position in the base window
if (baseWindowItems.isEmpty() == false)
{
int diffrence = (int) (baseWindowChainStartPosition - baseWindowItems.get(0).getStreamPosition());
for(int currentIndex = 0;currentIndex < diffrence; currentIndex++)
{
writeLine(baseWindowItems.get(currentIndex).getSide(), baseStreamPosition, otherStreamPosition, scriptPosition, baseWindowItems.get(currentIndex).getData(), outputStream, true);
baseWindowItemsIndex++;
baseStreamPosition++;
}
}
//walk to cheapest chain start position in the other window
if (otherWindowItems.isEmpty() == false)
{
int otherWindowItemsIndex = 0;
while(otherWindowItems.size() > otherWindowItemsIndex && otherWindowItems.get(otherWindowItemsIndex).getStreamPosition() < otherWindowChainStartPosition)
{
writeLine(otherWindowItems.get(otherWindowItemsIndex).getSide(), baseStreamPosition, otherStreamPosition, scriptPosition, otherWindowItems.get(otherWindowItemsIndex).getData(), outputStream, true);
otherWindowItemsIndex++;
scriptPosition++;
otherStreamPosition++;
}
}
//walk the chain
if (currentChain != null)
{
for (WindowItemLink windowItemLink : currentChain)
{
writeLine(null, baseStreamPosition, otherStreamPosition, scriptPosition, windowItemLink.getWindowItemForSide(Side.BASE).getData(), outputStream, true);
scriptPosition++;
baseStreamPosition++;
otherStreamPosition++;
}
}
//cleanup
if (baseWindowItems.isEmpty() == false)
{
long baseWindowEndStreamPosition = baseWindowItems.get(0).getStreamPosition()+baseWindowItemsIndex+(long)currentChainSize-1l;
baseWindow.removeUntil(baseWindowEndStreamPosition);
}
if (otherWindowItems.isEmpty() == false)
{
otherWindow.removeUntil(otherWindowChainEndPosition);
}
//try and fill the buffers a little more
readIntoWindow(baseWindow, baseInputStreamBreaker, windowSize - baseWindowItems.size());
//fill other window
readIntoWindow(otherWindow, otherInputStreamBreaker, windowSize - otherWindowItems.size());
//process all of the matches
for (WindowItem baseWindowItem : baseWindow.getWindowItems())
{
if (otherWindow.hasMatch(baseWindowItem) == true)
{
baseWindowItem.addMatches(otherWindow.getMatches(baseWindowItem));
}
else
{
//skip
}
}
//printMatchTable(baseWindow, otherWindow);
}
outputStream.flush();
outputStream.close();
}
/**
* Creates a DiffEntry from our current processing data
* @param side
* @param basePosition
* @param otherPosition
* @param outputPosition
* @param data
* @param outputStream
* @param addLineDelimiter
* @throws Exception
*/
private void writeLine(Side side, long basePosition,long otherPosition,long outputPosition, byte[] data, OutputStream outputStream,boolean addLineDelimiter) throws Exception
{
//this is probably a little heavy but it insures that the format of the stream is the same for everyone
DiffEntry diffEntry = new DiffEntry(side, data.length, basePosition, otherPosition, data);
outputStream.write(diffEntry.toByteArray());
}
/**
* Reads the data from the TokenizedInputStream into the window
* @param window
* @param inputStreamTokenizer to read from
* @param numberOfLinesToRead
*/
private void readIntoWindow(Window window, InputStreamTokenizer inputStreamTokenizer, int count) throws Exception
{
for(int readCount = 0; readCount < count; readCount++)
{
byte[] readLine = inputStreamTokenizer.readBytes();
if (readLine.length != 0)
{
window.addWindowItem(readLine);
}
else
{
break;
}
}
}
/**
* This is used for debugging, and kinda cool, so I left it in.
* @param baseWindow
* @param otherWindow
*/
public static void printMatchTable(Window baseWindow, Window otherWindow)
{
System.out.print("\n\t\t\t ");
for (WindowItem otherWindowItem : otherWindow.getWindowItems())
{
System.out.print(String.format(" %02d", otherWindowItem.getStreamPosition()));
}
System.out.println();
for (WindowItem baseWindowItem : baseWindow.getWindowItems())
{
if (baseWindowItem == null)
{
break;
}
System.out.print(String.format("%016x\t%02d", baseWindowItem.getDataHashCode(),baseWindowItem.getStreamPosition()));
for (WindowItem otherWindowItem : otherWindow.getWindowItems())
{
if (otherWindowItem == null)
{
break;
}
if (otherWindowItem.getDataHashCode() == baseWindowItem.getDataHashCode())
{
System.out.print("|"+baseWindowItem.getChainID(otherWindowItem));
}
else
{
System.out.print("| ");
}
}
System.out.println("|");
}
}
}