/**
* Copyright 2012 Willet Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.willetinc.hadoop.mapreduce.dynamodb;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import com.amazonaws.services.dynamodb.model.AttributeValue;
import com.amazonaws.services.dynamodb.model.ComparisonOperator;
/**
 * Splits the interval between two user-provided binary range keys into
 * contiguous sub-ranges, one {@link InputSplit} per sub-range.
 *
 * <p>
 * When the two keys are the single bytes 0x00 and 0xFF this is easy: two
 * splits would be [0x00, 0x7F] and [0x80, 0xFF]. If the keys are
 * [0x0D, 0x0A, 0x0D] and [0x0D, 0x0A, 0x0B], however, the splits have to
 * differ in the third byte, so the shared prefix is removed first and only
 * the differing suffixes are split.
 *
 * <p>
 * The algorithm is as follows: since a byte has 256 possible values, bytes
 * are interpreted as unsigned digits in base 256. Given a byte array b
 * containing bytes b_0, b_1 .. b_n, the array is read as the number
 * 0.b_0 b_1 b_2 .. b_n in base 256. Having mapped the low and high suffixes
 * to fractions, the inherited BigDecimal splitting logic establishes the
 * split points, which are then mapped back into byte arrays. Bytes are
 * treated as unsigned so that the numeric order of the fractions agrees with
 * the unsigned lexicographic order DynamoDB uses for binary attributes.
 */
public class BinarySplitter extends BigDecimalSplitter {

    private static final Log LOG = LogFactory.getLog(BinarySplitter.class);

    /**
     * Maximum number of bytes (after the common prefix) that take part in the
     * numeric mapping; this bounds the precision of intermediate split points.
     */
    private final static int MAX_BYTES = 16;

    /** Radix of the positional encoding: one digit per (unsigned) byte. */
    private final static BigDecimal ONE_PLACE = new BigDecimal(256);

    /**
     * Appends to {@code splits} one BETWEEN query split per sub-range of
     * [{@code minRangeKeyValue}, {@code maxRangeKeyValue}].
     *
     * <p>
     * The first split starts at exactly the user-supplied minimum and the last
     * split ends at exactly the user-supplied maximum; only the interior
     * boundaries are computed. Each following split starts at the previous end
     * with a 0x00 byte appended, i.e. the smallest key strictly greater than
     * the previous end, so BETWEEN (which is inclusive) never covers a key
     * twice.
     *
     * @param conf job configuration (not read by this implementation)
     * @param splits list the generated splits are appended to
     * @param hashKeyType type of the hash key
     * @param hashKeyValue hash key value shared by every generated split
     * @param rangeKeyType type of the range key
     * @param minRangeKeyValue inclusive lower bound; its binary value is read
     * @param maxRangeKeyValue inclusive upper bound; its binary value is read
     * @param numRangeSplits requested number of sub-ranges
     */
    @Override
    void generateRangeKeySplits(
            Configuration conf,
            List<InputSplit> splits,
            Types hashKeyType,
            AttributeValue hashKeyValue,
            Types rangeKeyType,
            AttributeValue minRangeKeyValue,
            AttributeValue maxRangeKeyValue,
            int numRangeSplits) {

        // Keep the exact user-supplied bounds: the numeric round trip below
        // drops trailing 0x00 bytes and anything past MAX_BYTES, so it must
        // not be used for the outermost boundaries.
        byte[] lowerBound = toByteArray(minRangeKeyValue.getB());
        byte[] upperBound = toByteArray(maxRangeKeyValue.getB());

        // If the two bounds share a prefix, pull it out so that only the
        // differing suffixes are mapped to numbers.
        int maxPrefixLen = Math.min(lowerBound.length, upperBound.length);
        int sharedLen = 0;
        while (sharedLen < maxPrefixLen
                && lowerBound[sharedLen] == upperBound[sharedLen]) {
            sharedLen++;
        }

        byte[] commonPrefix = Arrays.copyOfRange(lowerBound, 0, sharedLen);
        byte[] minSuffix =
                Arrays.copyOfRange(lowerBound, sharedLen, lowerBound.length);
        byte[] maxSuffix =
                Arrays.copyOfRange(upperBound, sharedLen, upperBound.length);

        List<BigDecimal> splitValues =
                split(numRangeSplits, minSuffix, maxSuffix);

        // Convert the list of split points into an actual set of InputSplits.
        int sizeBefore = splits.size();
        int last = splitValues.size() - 1;
        byte[] start = lowerBound;
        for (int i = 1; i <= last; i++) {
            byte[] end;
            if (i == last) {
                end = upperBound;
            } else {
                end = ArrayUtils.addAll(
                        commonPrefix,
                        bigDecimalToByteArray(splitValues.get(i), MAX_BYTES));
            }

            // Degenerate split points (e.g. more splits requested than there
            // are distinct boundaries) would give an inverted, hence empty,
            // range. Skip it and keep the current start for the next one.
            // start == end is kept: it is a valid single-key range.
            if (compareBytes(start, end) > 0) {
                continue;
            }

            List<AttributeValue> rangeKeyValues =
                    new ArrayList<AttributeValue>();
            rangeKeyValues.add(
                    new AttributeValue().withB(ByteBuffer.wrap(start)));
            rangeKeyValues.add(
                    new AttributeValue().withB(ByteBuffer.wrap(end)));

            splits.add(new DynamoDBQueryInputFormat.DynamoDBQueryInputSplit(
                    hashKeyType,
                    hashKeyValue,
                    rangeKeyType,
                    rangeKeyValues,
                    ComparisonOperator.BETWEEN));

            // Smallest key strictly greater than 'end'.
            start = ArrayUtils.addAll(end, new byte[] { 0x0 });
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Generated " + (splits.size() - sizeBefore)
                    + " binary range key splits");
        }
    }

    /**
     * Copies the bytes between the buffer's position and limit without
     * changing the buffer's own position. Unlike {@link ByteBuffer#array()}
     * this respects position, limit and array offset, and also works for
     * read-only and direct buffers.
     */
    private static byte[] toByteArray(ByteBuffer buffer) {
        ByteBuffer view = buffer.duplicate();
        byte[] bytes = new byte[view.remaining()];
        view.get(bytes);
        return bytes;
    }

    /**
     * Lexicographically compares two byte arrays, treating every byte as an
     * unsigned value (0..255). When one array is a prefix of the other, the
     * shorter array orders first.
     *
     * @return -1, 0 or 1 as {@code a} is less than, equal to or greater than
     *         {@code b}
     */
    public static int compareBytes(byte[] a, byte[] b) {
        int common = Math.min(a.length, b.length);
        for (int i = 0; i < common; i++) {
            int left = a[i] & 0xFF;
            int right = b[i] & 0xFF;
            if (left != right) {
                return (left < right) ? -1 : 1;
            }
        }
        return (a.length < b.length) ? -1 : (a.length > b.length) ? 1 : 0;
    }

    /**
     * Computes the numeric split points between two byte arrays.
     *
     * @param numSplits requested number of sub-ranges
     * @param minBytes lower bound (common prefix already removed)
     * @param maxBytes upper bound (common prefix already removed)
     * @return the split points; the numeric value of {@code minBytes} is always
     *         the first entry and that of {@code maxBytes} the last, so the
     *         list has at least two entries
     */
    List<BigDecimal> split(int numSplits, byte[] minBytes, byte[] maxBytes) {
        BigDecimal minVal = byteArrayToBigDecimal(minBytes, MAX_BYTES);
        BigDecimal maxVal = byteArrayToBigDecimal(maxBytes, MAX_BYTES);

        // Work on a private copy so the list returned by the numeric
        // splitter is never modified.
        List<BigDecimal> splitValues = new ArrayList<BigDecimal>(
                split(new BigDecimal(numSplits), minVal, maxVal));

        // Make sure that the user-specified boundaries are the first and last
        // entries in the list.
        if (splitValues.isEmpty()
                || 0 != splitValues.get(0).compareTo(minVal)) {
            splitValues.add(0, minVal);
        }
        if (splitValues.size() == 1
                || 0 != splitValues.get(splitValues.size() - 1)
                        .compareTo(maxVal)) {
            splitValues.add(maxVal);
        }
        return splitValues;
    }

    /**
     * Returns a BigDecimal representation of a byte array suitable for use in
     * a numerically-sorting order: the fraction 0.b_0 b_1 .. b_n in base 256,
     * with every byte read as unsigned.
     *
     * @param array bytes to encode
     * @param maxBytes at most this many leading bytes are encoded; the rest
     *        are ignored
     * @return a value in [0, 1); zero for an empty array
     */
    static BigDecimal byteArrayToBigDecimal(byte[] array, int maxBytes) {
        BigDecimal result = BigDecimal.ZERO;
        // Start with 1/256 to compute the first digit.
        BigDecimal curPlace = ONE_PLACE;

        int len = Math.min(array.length, maxBytes);
        for (int i = 0; i < len; i++) {
            // Mask to 0..255: a Java byte is signed, the digit must not be.
            BigDecimal digit = new BigDecimal(array[i] & 0xFF);
            result = result.add(tryDivide(digit, curPlace));
            // Advance to the next less significant place, e.g. 1/(256^2) for
            // the second byte.
            curPlace = curPlace.multiply(ONE_PLACE);
        }
        return result;
    }

    /**
     * Returns the byte array encoded in a BigDecimal; the inverse of
     * {@link #byteArrayToBigDecimal(byte[], int)}. The value is repeatedly
     * multiplied by 256; the integer portion after each multiplication is one
     * byte. Conversion stops when no fractional data is left or when
     * {@code maxBytes} bytes have been produced, so interior 0x00 bytes are
     * preserved while trailing 0x00 bytes (which do not change the numeric
     * value) are not reproduced.
     *
     * @param bd value to decode; expected to lie in [0, 1)
     * @param maxBytes maximum number of bytes to produce
     * @return the decoded bytes; empty when {@code bd} is not positive
     */
    static byte[] bigDecimalToByteArray(BigDecimal bd, int maxBytes) {
        BigDecimal cur = bd.stripTrailingZeros();
        ByteArrayOutputStream out = new ByteArrayOutputStream();

        for (int numConverted = 0;
                numConverted < maxBytes && cur.signum() > 0;
                numConverted++) {
            cur = cur.multiply(ONE_PLACE);
            // Integer part is the next base-256 digit (0..255 for inputs in
            // [0, 1)); keep it as an int so values >= 0x80 stay positive.
            int digit = cur.intValue();
            cur = cur.subtract(new BigDecimal(digit));
            out.write(digit);
        }
        return out.toByteArray();
    }
}