/* * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * Free SoftwareFoundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Scott Ferguson */ package com.caucho.util; import com.caucho.vfs.*; import java.io.*; import java.util.*; public class BinaryHashDiff { //private static long PRIME = (1L << 61) - 1; // private static long PRIME = (1L << 31) - 1; // Prime table at http://primes.utm.edu/lists/2small/0bit.html // 2^54 to avoid long overflow when multiplying byte data and MUL private static long PRIME = (1L << 54) - 33; private static long MUL = 251; int _oldLength; ArrayList<TempBuffer> _oldBuffers; byte[][] _oldBytes; int _copyMin = 8; int _chunkSize = 4; int _chunkCount; long _hashFactor; long []_hashArray; int []_suffixArray; int []_charMin; int []_charMax; public void delta(OutputStream out, InputStream oldFile, InputStream newFile) throws IOException { readBuffers(oldFile); processOrigFile(); processNewFile(out, newFile); } protected void add(OutputStream out, byte []buffer, int offset, int length) throws IOException { System.out.println("ADD(" + length + "): '" + new String(buffer, offset, length) + "'"); } protected void copy(OutputStream out, int offset, int length) throws IOException { System.out.println("COPY(" + offset + "," + length + ")"); } private void readBuffers(InputStream oldFile) throws IOException { _oldBuffers = new ArrayList<TempBuffer>(); _oldLength = 0; while (true) { TempBuffer buf = TempBuffer.allocate(); byte []buffer = buf.getBuffer(); int sublen = readAll(oldFile, buffer); if (sublen < 0) { TempBuffer.free(buf); break; } _oldLength += sublen; _oldBuffers.add(buf); if (sublen < buffer.length) break; } _oldBytes = new byte[_oldBuffers.size()][]; for (int i = 0; i < _oldBytes.length; i++) { _oldBytes[i] = _oldBuffers.get(i).getBuffer(); } } private void processOrigFile() { long factor = 1; for (int i = 1; i < _chunkSize; i++) { factor = (MUL * factor) % PRIME; } _hashFactor = factor; _chunkCount = _oldLength / _chunkSize; _hashArray = new long[_chunkCount]; _suffixArray = new int[_chunkCount]; int chunkSize = _chunkSize; int chunkCount = _chunkCount; long []hashArray = _hashArray; int []suffixArray = _suffixArray; byte [][]data = _oldBytes; for (int i = 0; i < chunkCount; i++) { int offset = i * chunkSize; suffixArray[i] = i; byte []buffer = data[offset / TempBuffer.SIZE]; offset = offset % TempBuffer.SIZE; long hash = 0; for (int k = 0; k < chunkSize; k++) { hash = hash(hash, buffer[offset + k], 0, factor); } hashArray[i] = hash; } sort(suffixArray, hashArray); } private static void sort(int []suffixArray, long []hashArray) { int length = suffixArray.length; int dataLength = length; sort(suffixArray, hashArray, dataLength, 0, length); } private static void sort(int []suffixArray, long []hashArray, int dataLength, int min, int max) { int delta = max - min; if (delta < 2) { } else if (delta == 2) { int aIndex = suffixArray[min]; int bIndex = suffixArray[min + 1]; long aValue = hashArray[aIndex]; long bValue = hashArray[bIndex]; if (bValue < aValue) { suffixArray[min] = bIndex; suffixArray[min + 1] = aIndex; } } else { int pivotIndex = suffixArray[min]; long pivotValue = hashArray[pivotIndex]; int pivotMax = max; int pivot = min; while (pivot + 1 < pivotMax) { long value = hashArray[suffixArray[pivot + 1]]; if (value < pivotValue) { suffixArray[pivot] = suffixArray[pivot + 1]; suffixArray[pivot + 1] = pivotIndex; pivot += 1; } else { int temp = suffixArray[pivotMax - 1]; suffixArray[pivotMax - 1] = suffixArray[pivot + 1]; suffixArray[pivot + 1] = temp; pivotMax -= 1; } } if (min < pivot) { sort(suffixArray, hashArray, dataLength, min, pivot); sort(suffixArray, hashArray, dataLength, pivot, max); } else { sort(suffixArray, hashArray, dataLength, pivot + 1, max); } } } private static int suffixCompareTo(int a, int b, long []hashArray, int length) { int sublen = length - a; if (length - b < sublen) sublen = length - b; for (int i = 0; i < sublen; i++) { long a1 = hashArray[a + i]; long b1 = hashArray[b + i]; if (a1 < b1) return -1; else if (b1 < a1) return 1; } return 0; } private void processNewFile(OutputStream out, InputStream newIn) throws IOException { TempBuffer tempBuf = TempBuffer.allocate(); byte []buffer = tempBuf.getBuffer();; int length = readAll(newIn, buffer); if (length < _chunkSize) return; int prevOffset = 0; int offset = 0; int chunkSize = _chunkSize; byte [][]data = _oldBytes; int []suffixArray = _suffixArray; long []hashArray = _hashArray; long hashFactor = _hashFactor; long hash = 0; for (int i = 0; i < chunkSize - 1; i++) { hash = hash(hash, buffer[i], 0, hashFactor); } loop: for (; offset + chunkSize < length; offset += 1) { byte oldValue = 0; if (offset > 0) oldValue = buffer[offset - 1]; hash = hash(hash, buffer[offset + chunkSize - 1], oldValue, hashFactor); int suffixIndex = findBlock(hash, suffixArray, hashArray); if (suffixIndex < 0) continue; int suffixOffset = suffixArray[suffixIndex] + 1; int dataOffset = offset + chunkSize; long prevHash = hash; loop_match: for (; dataOffset + chunkSize < length && suffixOffset < hashArray.length; dataOffset += chunkSize, suffixOffset += 1) { long hash2 = 0; for (int i = 0; i < chunkSize; i++) { hash2 = hash(hash2, buffer[dataOffset + i], 0, hashFactor); } if (hash2 == hashArray[suffixOffset]) { } else if (hash2 < hashArray[suffixOffset]) { int delta = suffixOffset - suffixArray[suffixIndex]; for (int i = suffixIndex - 1; i >= 0 && hashArray[suffixArray[i]] == hash && suffixArray[i] + delta < hashArray.length && hashArray[suffixArray[i] + delta - 1] == prevHash; i--) { if (hashArray[suffixArray[i] + delta] == hash2) { suffixIndex = i; suffixOffset = suffixArray[i] + delta; prevHash = hash2; // XXX: also need to revalidate in between continue loop_match; } } break; } else { int delta = suffixOffset - suffixArray[suffixIndex]; for (int i = suffixIndex + 1; i < suffixArray.length && hashArray[suffixArray[i]] == hash && suffixArray[i] + delta < hashArray.length && hashArray[suffixArray[i] + delta - 1] == prevHash; i++) { if (hashArray[suffixArray[i] + delta] == hash2) { suffixIndex = i; suffixOffset = suffixArray[i] + delta; prevHash = hash2; // XXX: also need to revalidate in between continue loop_match; } } break; } prevHash = hash2; } if (dataOffset - offset >= _copyMin) { if (prevOffset < offset) add(out, buffer, prevOffset, offset - prevOffset); copy(out, suffixArray[suffixIndex] * _chunkSize, dataOffset - offset); hash = 0; offset = dataOffset - 1; for (int i = 0; i < _chunkSize; i++) hash = hash(hash, buffer[offset + i], 0, hashFactor); prevOffset = offset + 1; } } if (prevOffset < length) add(out, buffer, prevOffset, length - prevOffset); TempBuffer.free(tempBuf); } private int findBlock(long hash, int []suffixArray, long []hashArray) { int min = 0; int max = suffixArray.length; while (min < max) { int pivot = (min + max) / 2; long hashValue = hashArray[suffixArray[pivot]]; if (hash == hashValue) return pivot; else if (hash < hashValue) max = pivot; else min = pivot + 1; } return -1; } private int readAll(InputStream is, byte []buffer) throws IOException { int offset = 0; while (offset < buffer.length) { int sublen = buffer.length - offset; sublen = is.read(buffer, offset, sublen); if (sublen < 0) return offset > 0 ? offset : -1; offset += sublen; } return buffer.length; } public void testHash() { Random random = new Random(); byte []data = new byte[8192]; for (int k = 0; k < 256 * 1024 * 1024; k++) { int size = random.nextInt(256); if (size < 1) size = 1; long factor = 1; for (int i = 1; i < size; i++) { factor = (MUL * factor) % PRIME; } byte d0 = (byte) random.nextInt(); for (int i = 0; i < size; i++) { data[i] = (byte) random.nextInt(); } long hash = 0; hash = hash(hash, d0, 0, factor); for (int i = 0; i < size - 1; i++) { hash = hash(hash, data[i], 0, factor); } hash = hash(hash, data[size - 1], d0, factor); long oldHash = hash; hash = 0; for (int i = 0; i < size; i++) { hash = hash(hash, data[i], 0, factor); } long newHash = hash; if (oldHash != newHash) System.out.println("OLD: " + oldHash + " " + newHash); } } // rabin-karp hash (PRIME = (1L << 54) - 33, MUL = 13); private static long hash(long hash, int newData, int oldData, long factor) { oldData = oldData & 0xff; newData = newData & 0xff; long old = ((PRIME << 8) + hash - factor * oldData) % PRIME; return (MUL * old + newData) % PRIME; } }