/**
 * Copyright 2012 Willet Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.willetinc.hadoop.mapreduce.dynamodb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;

import com.amazonaws.services.dynamodb.model.AttributeValue;
import com.amazonaws.services.dynamodb.model.ComparisonOperator;

/**
 * <p>
 * Base class for {@link DynamoDBSplitter} implementations. Generates one
 * input split per HashKey and, when interpolation across RangeKey values is
 * enabled, delegates the generation of RangeKey splits to subclasses via
 * {@link #generateRangeKeySplits}.
 * </p>
 */
public abstract class AbstractSplitter implements DynamoDBSplitter {

	private static final Log LOG = LogFactory.getLog(AbstractSplitter.class);

	@Override
	public List<InputSplit> split(Configuration conf) throws IOException {

		// load configuration
		boolean interpolate =
				DynamoDBQueryInputFormat.getInterpolateAcrossRangeKeyValues(conf);
		Types hashKeyType = DynamoDBQueryInputFormat.getHashKeyType(conf);
		AttributeValue hashKeyValue =
				DynamoDBQueryInputFormat.getHashKeyValue(conf);
		Types rangeKeyType = DynamoDBQueryInputFormat.getRangeKeyType(conf);
		Collection<AttributeValue> rangeKeyValues =
				DynamoDBQueryInputFormat.getRangeKeyValues(conf);
		ComparisonOperator rangeKeyOperator =
				DynamoDBQueryInputFormat.getRangeKeyComparisonOperator(conf);
		AttributeValue minRangeKeyValue =
				DynamoDBQueryInputFormat.getRangeKeyInterpolateMinValue(conf);
		AttributeValue maxRangeKeyValue =
				DynamoDBQueryInputFormat.getRangeKeyInterpolateMaxValue(conf);

		// normalize configuration: when interpolating, explicit RangeKey
		// values are ignored; otherwise the interpolation bounds are ignored
		if (interpolate) {
			rangeKeyValues = new ArrayList<AttributeValue>();
		} else {
			minRangeKeyValue = null;
			maxRangeKeyValue = null;
		}

		// compute number of input splits
		int numSplits = conf.getInt("mapred.map.tasks", 1);
		int numHashKeys = 1;
		int numRangeSplits = numSplits / numHashKeys;
		numRangeSplits = (!interpolate) ? 1 : numRangeSplits;
		numRangeSplits = (numRangeSplits <= 0) ? 1 : numRangeSplits;

		// generate input splits
		List<InputSplit> splits = new ArrayList<InputSplit>();

		// handle cases where interpolation is turned off or unnecessary
		if (!interpolate
				|| numRangeSplits <= 1
				|| minRangeKeyValue == null
				|| maxRangeKeyValue == null) {

			LOG.info("Generating 1 split for each HashKey");
			DynamoDBQueryInputFormat.DynamoDBQueryInputSplit split =
					new DynamoDBQueryInputFormat.DynamoDBQueryInputSplit(
							hashKeyType,
							hashKeyValue,
							rangeKeyType,
							rangeKeyValues,
							rangeKeyOperator);
			splits.add(split);

		} else {
			// interpolate between RangeKey values
			LOG.info(String.format(
					"Generating %d RangeKey splits for each HashKey",
					numRangeSplits));

			if (null == hashKeyValue) {
				LOG.error("Cannot create a range when the HashKey is NULL. "
						+ "Ignoring range key interpolation.");
			} else {
				generateRangeKeySplits(
						conf,
						splits,
						hashKeyType,
						hashKeyValue,
						rangeKeyType,
						minRangeKeyValue,
						maxRangeKeyValue,
						numRangeSplits);
			}
		}

		return splits;
	}

	/**
	 * Generate input splits by interpolating between the minimum and maximum
	 * RangeKey values. Subclasses implement the interpolation strategy for a
	 * specific RangeKey datatype.
	 */
	abstract void generateRangeKeySplits(
			Configuration conf,
			List<InputSplit> splits,
			Types hashKeyType,
			AttributeValue hashKeyValue,
			Types rangeKeyType,
			AttributeValue minRangeKeyValue,
			AttributeValue maxRangeKeyValue,
			int numRangeSplits);

}
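
/*
 * Illustrative sketch only (not part of the original source): one possible
 * concrete subclass showing how generateRangeKeySplits() might partition a
 * numeric RangeKey by linear interpolation between the configured min and
 * max values. The class name, the use of double arithmetic, and the BETWEEN
 * comparison are assumptions made for illustration; the splitters shipped
 * with this package may partition differently.
 */
class ExampleNumberSplitterSketch extends AbstractSplitter {

	@Override
	void generateRangeKeySplits(
			Configuration conf,
			List<InputSplit> splits,
			Types hashKeyType,
			AttributeValue hashKeyValue,
			Types rangeKeyType,
			AttributeValue minRangeKeyValue,
			AttributeValue maxRangeKeyValue,
			int numRangeSplits) {

		// Read the numeric interpolation bounds from the configured
		// AttributeValues (DynamoDB represents numbers as strings).
		double min = Double.parseDouble(minRangeKeyValue.getN());
		double max = Double.parseDouble(maxRangeKeyValue.getN());
		double width = (max - min) / numRangeSplits;

		// Emit one BETWEEN split per sub-interval [lower, upper].
		for (int i = 0; i < numRangeSplits; i++) {
			double lower = min + width * i;
			double upper =
					(i == numRangeSplits - 1) ? max : min + width * (i + 1);

			List<AttributeValue> rangeKeyValues = new ArrayList<AttributeValue>();
			rangeKeyValues.add(new AttributeValue().withN(Double.toString(lower)));
			rangeKeyValues.add(new AttributeValue().withN(Double.toString(upper)));

			splits.add(new DynamoDBQueryInputFormat.DynamoDBQueryInputSplit(
					hashKeyType,
					hashKeyValue,
					rangeKeyType,
					rangeKeyValues,
					ComparisonOperator.BETWEEN));
		}
	}
}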