/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.lang.reflect.Field; import java.nio.ByteOrder; import java.security.AccessController; import java.security.PrivilegedAction; import com.google.common.primitives.Longs; import com.google.common.primitives.UnsignedBytes; import sun.misc.Unsafe; /** * Utility function that operate on a java * bytearray * @author rana * */ @SuppressWarnings("restriction") public class ByteArrayUtils { /** * Search the data byte array for the first occurrence of the byte array * pattern. */ public static final int indexOf(byte[] data,int offset,int length, byte[] pattern) { int[] failure = computeFailure(pattern); int j = 0; for (int i = 0; i < length; i++) { while (j > 0 && pattern[j] != data[offset + i]) { j = failure[j - 1]; } if (pattern[j] == data[offset + i]) { j++; } if (j == pattern.length) { return offset + (i - pattern.length + 1); } } return -1; } /** * Computes the failure function using a boot-strapping process, where the * pattern is matched against itself. */ private static final int[] computeFailure(byte[] pattern) { int[] failure = new int[pattern.length]; int j = 0; for (int i = 1; i < pattern.length; i++) { while (j > 0 && pattern[j] != pattern[i]) { j = failure[j - 1]; } if (pattern[j] == pattern[i]) { j++; } failure[i] = j; } return failure; } public static long parseLong(byte[] s,int offset,int length, int radix) throws NumberFormatException { if (s == null) { throw new NumberFormatException("null"); } if (radix < Character.MIN_RADIX) { throw new NumberFormatException("radix " + radix + " less than Character.MIN_RADIX"); } if (radix > Character.MAX_RADIX) { throw new NumberFormatException("radix " + radix + " greater than Character.MAX_RADIX"); } long result = 0; boolean negative = false; int i = 0, len = length; long limit = -Long.MAX_VALUE; long multmin; int digit; if (len > 0) { char firstChar = (char) s[offset]; if (firstChar < '0') { // Possible leading "-" if (firstChar == '-') { negative = true; limit = Long.MIN_VALUE; } else throw new NumberFormatException(); if (len == 1) // Cannot have lone "-" throw new NumberFormatException(); i++; } multmin = limit / radix; while (i < len) { // Accumulating negatively avoids surprises near MAX_VALUE digit = Character.digit((char)s[offset + i++], radix); if (digit < 0) { throw new NumberFormatException(); } if (result < multmin) { throw new NumberFormatException(); } result *= radix; if (result < limit + digit) { throw new NumberFormatException(); } result -= digit; } } else { throw new NumberFormatException(); } return negative ? result : -result; } static final Unsafe theUnsafe; /** The offset to the first element in a byte array. */ static final int BYTE_ARRAY_BASE_OFFSET; static { theUnsafe = (Unsafe) AccessController.doPrivileged( new PrivilegedAction<Object>() { @Override public Object run() { try { Field f = Unsafe.class.getDeclaredField("theUnsafe"); f.setAccessible(true); return f.get(null); } catch (NoSuchFieldException e) { // It doesn't matter what we throw; // it's swallowed in getBestComparer(). throw new Error(); } catch (IllegalAccessException e) { throw new Error(); } } }); BYTE_ARRAY_BASE_OFFSET = theUnsafe.arrayBaseOffset(byte[].class); // sanity check - this should never fail if (theUnsafe.arrayIndexScale(byte[].class) != 1) { throw new AssertionError(); } } static final boolean littleEndian = ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN); /** * Returns true if x1 is less than x2, when both values are treated as * unsigned. */ static boolean lessThanUnsigned(long x1, long x2) { return (x1 + Long.MIN_VALUE) < (x2 + Long.MIN_VALUE); } /** * Lexicographically compare two arrays. * * @param buffer1 left operand * @param buffer2 right operand * @param offset1 Where to start comparing in the left buffer * @param offset2 Where to start comparing in the right buffer * @param length1 How much to compare from the left buffer * @param length2 How much to compare from the right buffer * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareBytes(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2) { // Short circuit equal case if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) { return 0; } int minLength = Math.min(length1, length2); int minWords = minLength / Longs.BYTES; int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET; int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET; /* * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a * time is no slower than comparing 4 bytes at a time even on 32-bit. * On the other hand, it is substantially faster on 64-bit. */ for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) { long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i); long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i); long diff = lw ^ rw; if (diff != 0) { if (!littleEndian) { return lessThanUnsigned(lw, rw) ? -1 : 1; } // Use binary search int n = 0; int y; int x = (int) diff; if (x == 0) { x = (int) (diff >>> 32); n = 32; } y = x << 16; if (y == 0) { n += 16; } else { x = y; } y = x << 8; if (y == 0) { n += 8; } return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL)); } } // The epilogue to cover the last (minLength % 8) elements. for (int i = minWords * Longs.BYTES; i < minLength; i++) { int result = UnsignedBytes.compare( buffer1[offset1 + i], buffer2[offset2 + i]); if (result != 0) { return result; } } return length1 - length2; } }