/**
 * Copyright 2010 TransPac Software, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bixolabs.simpledb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import cascading.tuple.Tuple;

import com.bixolabs.aws.AWSException;
import com.bixolabs.aws.BackoffHttpHandler;
import com.bixolabs.aws.IHttpHandler;
import com.bixolabs.aws.SimpleDB;

public class SimpleDBInputFormat implements InputFormat<NullWritable, Tuple>, JobConfigurable {

    @Override
    public RecordReader<NullWritable, Tuple> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
        return new SimpleDBRecordReader(split, new SimpleDBConfiguration(conf));
    }

    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        SimpleDBConfiguration sdbConf = new SimpleDBConfiguration(conf);

        String domainName = sdbConf.getDomainName();
        int numShards = sdbConf.getNumShards();
        String query = sdbConf.getQuery();
        int selectLimit = sdbConf.getSelectLimit();
        int remainingLimit = selectLimit;

        IHttpHandler httpHandler = new BackoffHttpHandler(numShards);
        SimpleDB sdb = new SimpleDB(sdbConf.getSdbHost(), sdbConf.getAccessKeyId(), sdbConf.getSecretAccessKey(), httpHandler);

        // We want one split per shard.
        List<String> shardNames = SimpleDBUtils.getShardNames(domainName, numShards);
        List<SimpleDBInputSplit> splits = new ArrayList<SimpleDBInputSplit>(numShards);

        // FUTURE KKr - parallelize this, by submitting N tasks, one per shard. We'd have a minor issue
        // with wanting to round up the split limit, and then having to constrain down to not exceed the limit.
        for (int i = 0; i < numShards; i++) {
            // Make a select call to get the count of items.

            // Silly code to ensure that even for test cases, the combined shard limits will sum
            // to the actual selectLimit, even with integer division rounding errors.
            int shardLimit = SimpleDBUtils.NO_SELECT_LIMIT;
            if (selectLimit != SimpleDBUtils.NO_SELECT_LIMIT) {
                shardLimit = remainingLimit / (numShards - i);

                // During testing, for example, we can wind up with 0 items in a split.
                if (shardLimit == 0) {
                    continue;
                }
            }

            String shardName = shardNames.get(i);

            try {
                int numItems = SimpleDBUtils.getItemCount(sdb, shardName, query, shardLimit);

                // If we actually have any matches in this shard, generate a split.
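                // Clamping shardLimit to numItems, and deducting the clamped value from
                // remainingLimit, guarantees the combined shard limits never exceed selectLimit,
                // and sum to it exactly when every shard has enough items. For example,
                // selectLimit = 10 over 3 shards yields 10/3 = 3, then 7/2 = 3, then 4/1 = 4.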
                if (numItems > 0) {
                    shardLimit = Math.min(shardLimit, numItems);
                    remainingLimit -= shardLimit;
                    splits.add(new SimpleDBInputSplit(numItems, shardLimit, shardName));
                }
            } catch (AWSException e) {
                throw new IOException("Error getting item count from domain " + shardName, e);
            } catch (InterruptedException e) {
                // Preserve the interrupt status before converting to an IOException.
                Thread.currentThread().interrupt();
                throw new IOException("Interruption while getting item count from domain " + shardName, e);
            }
        }

        return splits.toArray(new SimpleDBInputSplit[splits.size()]);
    }

    @Override
    public void configure(JobConf conf) {
        // TODO KKr - what should I be doing here?
    }
}
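/*
 * A minimal usage sketch showing how this InputFormat might be wired into an
 * old-style (org.apache.hadoop.mapred) job. The property keys below are
 * hypothetical placeholders; the actual keys, and any setter helpers, are
 * owned by SimpleDBConfiguration, so consult that class before relying on
 * these names.
 */
class SimpleDBInputFormatUsageSketch {

    static JobConf newJobConf() {
        JobConf conf = new JobConf();

        // Hypothetical keys -- SimpleDBConfiguration defines the real ones.
        conf.set("simpledb.accessKeyId", "YOUR_ACCESS_KEY_ID");
        conf.set("simpledb.secretAccessKey", "YOUR_SECRET_ACCESS_KEY");
        conf.set("simpledb.domainName", "mydomain");
        conf.setInt("simpledb.numShards", 4);

        // Plug in the input format; getSplits() will then create one split per shard.
        conf.setInputFormat(SimpleDBInputFormat.class);
        return conf;
    }
}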