/** * Copyright 2010 TransPac Software, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bixolabs.simpledb; import java.io.IOException; import java.util.List; import java.util.Map; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.RecordReader; import org.apache.log4j.Logger; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import com.bixolabs.aws.BackoffHttpHandler; import com.bixolabs.aws.IHttpHandler; import com.bixolabs.aws.SimpleDB; public class SimpleDBRecordReader implements RecordReader<NullWritable, Tuple> { private static final Logger LOGGER = Logger.getLogger(SimpleDBRecordReader.class); private String _shardName; private Fields _schemeFields; private String _itemFieldName; private String _query; private int _selectLimit; private SimpleDB _sdb; private long _pos; private long _length; private String _nextToken; private List<Map<String, String[]>> _curItems; private int _curItemIndex; public SimpleDBRecordReader(InputSplit split, SimpleDBConfiguration sdbConf) throws IOException { SimpleDBInputSplit sdbSplit = (SimpleDBInputSplit)split; // FUTURE KKr - if we've got more than a threshold number of items (split.getLength()), then // we want to parallelize here by sub-selecting with the item hash _shardName = sdbSplit.getLocations()[0]; _schemeFields = sdbConf.getSchemeFields(); _itemFieldName = sdbConf.getItemFieldName(); _query = sdbConf.getQuery(); _selectLimit = sdbSplit.getSelectLimit(); IHttpHandler httpHandler = new BackoffHttpHandler(sdbConf.getMaxThreads()); _sdb = new SimpleDB(sdbConf.getSdbHost(), sdbConf.getAccessKeyId(), sdbConf.getSecretAccessKey(), httpHandler); _nextToken = null; _curItems = null; _pos = 0; _length = split.getLength(); } @Override public void close() throws IOException { } @Override public NullWritable createKey() { return NullWritable.get(); } @Override public Tuple createValue() { // TODO KKr - this feels wrong return new Tuple(new Object[_schemeFields.size()]); } @Override public long getPos() throws IOException { return _pos; } @Override public float getProgress() throws IOException { return (float)_pos/(float)_length; } @Override public boolean next(NullWritable key, Tuple value) throws IOException { if ((_curItems == null) || (_curItemIndex >= _curItems.size())) { // Short-circuit for case where there will be no more items. if ((_curItems != null) && (_nextToken == null)) { _curItems = null; return false; } try { String selectStr = String.format("select * from `%s`", _shardName); if (_query.length() > 0) { selectStr += String.format(" where %s", _query); } if (_selectLimit != SimpleDBUtils.NO_SELECT_LIMIT) { selectStr += String.format(" limit %d", _selectLimit); } LOGGER.trace(String.format("Making select request: %s", selectStr)); _curItems = _sdb.select(selectStr, _nextToken); _curItemIndex = 0; // If we're looping, we need to reduce our limit each time. if (_selectLimit != SimpleDBUtils.NO_SELECT_LIMIT) { // Just for safety, trim what we get back to be no more than our limit. if (_curItems.size() > _selectLimit) { _curItems.subList(_selectLimit, _curItems.size()).clear(); } _selectLimit -= _curItems.size(); if (_selectLimit > 0) { _nextToken = _sdb.getLastToken(); } else { _nextToken = null; } } else { _nextToken = _sdb.getLastToken(); } } catch (Exception e) { throw new IOException("Error selecting from " + _shardName, e); } if (_curItems.size() == 0) { _curItems = null; return false; } } // FUTURE KKr - return a linked list from sdb.select, and then remove items from // from to back as we process them, to save on memory usage. Map<String, String[]> values = _curItems.get(_curItemIndex++); // Pick off the actual item name, which is baked into the response by the SimpleDB code. String itemValue = values.get("ItemName")[0]; TupleEntry entry = new TupleEntry(_schemeFields, value); entry.set(_itemFieldName, itemValue); for (int i = 0; i < _schemeFields.size(); i++) { String attrName = _schemeFields.get(i).toString(); String[] attrValues = values.get(attrName); if ((attrValues != null) && (attrValues.length > 0)) { entry.set(attrName, attrValues[0]); _pos += attrValues[0].length(); } } return true; } }