/** * Copyright 2012 Willet Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.willetinc.hadoop.mapreduce.dynamodb; import java.math.BigDecimal; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.InputSplit; import com.amazonaws.services.dynamodb.model.AttributeValue; import com.amazonaws.services.dynamodb.model.ComparisonOperator; /** * This method needs to determine the splits between two user-provided strings. * In the case where the user's strings are 'A' and 'Z', this is not hard; we * could create two splits from ['A', 'M') and ['M', 'Z'], 26 splits for strings * beginning with each letter, etc. * * If a user has provided us with the strings "Ham" and "Haze", however, we need * to create splits that differ in the third letter. * * The algorithm used is as follows: * Since there are 2**8 UTF8 unicode characters, we interpret characters as digits in * base 265. Given a string 's' containing characters s_0, s_1 .. s_n, we interpret * the string as the number: 0.s_0 s_1 s_2.. s_n in base 256. Having mapped the * low and high strings into floating-point values, we then use the BigDecimalSplitter * to establish the even split points, then map the resulting floating point values * back into strings. */ public class TextSplitter extends BigDecimalSplitter { private final static String FRIST_PRINTABLE_CHAR = new String(new byte[] {0x20}); @Override protected void generateRangeKeySplits( Configuration conf, List<InputSplit> splits, Types hashKeyType, AttributeValue hashKeyValue, Types rangeKeyType, AttributeValue minRangeKeyValue, AttributeValue maxRangeKeyValue, int numRangeSplits) { String minString = minRangeKeyValue.getS(); String maxString = maxRangeKeyValue.getS(); // If there is a common prefix between minString and maxString, // establish it // and pull it out of minString and maxString. int maxPrefixLen = Math.min(minString.length(), maxString.length()); int sharedLen; for (sharedLen = 0; sharedLen < maxPrefixLen; sharedLen++) { char c1 = minString.charAt(sharedLen); char c2 = maxString.charAt(sharedLen); if (c1 != c2) { break; } } // The common prefix has length 'sharedLen'. Extract it from both. String commonPrefix = minString.substring(0, sharedLen); minString = minString.substring(sharedLen); maxString = maxString.substring(sharedLen); List<BigDecimal> splitStrings = split(numRangeSplits, minString, maxString); // Convert the list of split point strings into an actual set of // InputSplits. String start = commonPrefix + bigDecimalToString(splitStrings.get(0), MAX_CHARS); for (int i = 1; i < splitStrings.size(); i++) { String end = commonPrefix + bigDecimalToString(splitStrings.get(i), MAX_CHARS); if(compareStrings(start, end) >= 0) continue; List<AttributeValue> rangeKeyValues = new ArrayList<AttributeValue>(); rangeKeyValues.add(new AttributeValue().withS(start)); rangeKeyValues.add(new AttributeValue().withS(end)); splits.add(new DynamoDBQueryInputFormat.DynamoDBQueryInputSplit( hashKeyType, hashKeyValue, rangeKeyType, rangeKeyValues, ComparisonOperator.BETWEEN)); start = end + FRIST_PRINTABLE_CHAR; } } public static int compareStrings(String a, String b) { for(int i = 0; i < a.length() && i < b.length(); i++) { if(a.charAt(i) < b.charAt(i)) { return -1; } else if(a.charAt(i) > b.charAt(i)) { return 1; } } return (a.length() < b.length()) ? -1 : (a.length() > b.length()) ? 1 : 0; } List<BigDecimal> split(int numSplits, String minString, String maxString) { BigDecimal minVal = stringToBigDecimal(minString, MAX_CHARS); BigDecimal maxVal = stringToBigDecimal(maxString, MAX_CHARS); List<BigDecimal> splitPoints = split(new BigDecimal(numSplits), minVal, maxVal); List<BigDecimal> splitStrings = new ArrayList<BigDecimal>(); // Convert the BigDecimal splitPoints into their string representations. for (BigDecimal bd : splitPoints) { splitStrings.add(bd); } // Make sure that our user-specified boundaries are the first and last // entries // in the array. if (splitStrings.size() == 0 || (0 != splitStrings.get(0).compareTo(minVal))) { splitStrings.add(0, minVal); } if (splitStrings.size() == 1 || (0 != splitStrings.get(splitStrings.size() - 1).compareTo( maxVal))) { splitStrings.add(maxVal); } return splitStrings; } private final static BigDecimal ONE_PLACE = new BigDecimal(256); // Maximum number of characters to convert. This is to prevent rounding // errors // or repeating fractions near the very bottom from getting out of control. // Note // that this still gives us a huge number of possible splits. private final static int MAX_CHARS = 8; /** * Return a BigDecimal representation of string 'str' suitable for use in a * numerically-sorting order. */ BigDecimal stringToBigDecimal(String str, int maxChars) { BigDecimal result = BigDecimal.ZERO; BigDecimal curPlace = ONE_PLACE; // start with 1/256 to compute the // first digit. int len = Math.min(str.length(), maxChars); for (int i = 0; i < len; i++) { int codePoint = str.codePointAt(i); result = result.add(tryDivide(new BigDecimal(codePoint), curPlace)); // advance to the next less significant place. e.g., 1/(256^2) for // the second char. curPlace = curPlace.multiply(ONE_PLACE); } return result; } /** * Return the string encoded in a BigDecimal. Repeatedly multiply the input * value by 256; the integer portion after such a multiplication * represents a single character in base 256. Convert that back into a * char and create a string out of these until we have no data left. */ String bigDecimalToString(BigDecimal bd, int maxChars) { BigDecimal cur = bd.stripTrailingZeros(); StringBuilder sb = new StringBuilder(); for (int numConverted = 0; numConverted < maxChars; numConverted++) { cur = cur.multiply(ONE_PLACE); int curCodePoint = cur.intValue(); if (0 == curCodePoint) { break; } cur = cur.subtract(new BigDecimal(curCodePoint)); sb.append(Character.toChars(curCodePoint)); } return sb.toString(); } }