/**
* Copyright 2010 TransPac Software, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Based on cascading.jdbc code released into the public domain by
* Concurrent, Inc.
*/
package com.bixolabs.simpledb;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.hadoop.TapCollector;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import com.bixolabs.aws.AWSException;
import com.bixolabs.aws.BackoffHttpHandler;
import com.bixolabs.aws.SimpleDB;
/**
* The SimpleDB class is a {@link Tap} subclass. It is used in conjunction with the {@SimpleDBScheme}
* to allow for the reading and writing of data to and from Amazon's SimpleDB service.
*/
@SuppressWarnings("serial")
public class SimpleDBTap extends Tap {
private static final Logger LOGGER = Logger.getLogger(SimpleDBTap.class);
public static final String SCHEME = "simpledb";
private String _accessKeyId;
private String _secretAccessKey;
private String _baseDomainName;
private int _numShards;
private SinkMode _sinkMode;
private int _maxThreads;
private String _sdbHost = SimpleDB.DEFAULT_HOST;
private long _closeTimeout = SimpleDBConfiguration.DEFAULT_CLOSE_TIMEOUT;
private transient SimpleDB _sdb;
public SimpleDBTap(SimpleDBScheme scheme, String accessKeyId, String secretAccessKey, String baseDomainName, int numShards) {
this(scheme, accessKeyId, secretAccessKey, baseDomainName, numShards, SinkMode.UPDATE);
}
public SimpleDBTap(SimpleDBScheme scheme, String accessKeyId, String secretAccessKey, String baseDomainName, int numShards, SinkMode sinkMode) {
super(scheme, sinkMode);
_accessKeyId = accessKeyId;
_secretAccessKey = secretAccessKey;
_baseDomainName = baseDomainName;
_numShards = numShards;
_sinkMode = sinkMode;
_maxThreads = numShards;
}
/**
* Set the max number of threads per record writer.
*
* This is an advanced feature that lets users constrain the number of threads hitting
* SimpleDB, in order to minimize 503/service unavailable errors. For this to have any
* real value, the caller should set this based on the max number of simultaneous tasks
* that use SimpleDB for input or output. Also, the max value for reading is much higher
* than the max for writing.
*
* @param maxThreads maximum number of simultaneous HTTP requests to SimpleDB, per record writer
*/
public void setMaxThreads(int maxThreads) {
_maxThreads = maxThreads;
}
public int getMaxThreads() {
return _maxThreads;
}
public void setCloseTimeout(long closeTimeout) {
_closeTimeout = closeTimeout;
}
public long getCloseTimeout() {
return _closeTimeout;
}
public void setSdbHost(String sdbHost) {
_sdbHost = sdbHost;
_sdb = null;
}
public String getSdbHost() {
return _sdbHost;
}
public Path getPath() {
return new Path(getURI().toString());
}
public TupleEntryIterator openForRead(JobConf conf) throws IOException {
return new TupleEntryIterator(getSourceFields(), new TapIterator(this, conf));
}
public TupleEntryCollector openForWrite(JobConf conf) throws IOException {
return new TapCollector(this, conf);
}
public boolean makeDirs(JobConf conf) throws IOException {
boolean cleanup = true;
try {
// FUTURE KKr - multi-thread this code.
List<String> shardNames = getShardNames();
for (String shardName : shardNames) {
getSimpleDB().createDomain(shardName);
}
// We created all of them, so we're all set.
cleanup = false;
} catch (AWSException e) {
throw new IOException("Error creating domain(s)", e);
} catch (InterruptedException e) {
throw new IOException("Interrupted while creating domain(s)");
} finally {
// TODO KKr - when does makeDirs get called? Do I need to delete
// all of the tables.
// delete these tables?
if (cleanup) {
try {
// deletePath(conf);
} catch (Exception e) {
// ignore
}
}
}
return true;
}
public boolean deletePath(JobConf conf) throws IOException {
try {
// FUTURE KKr - multi-thread this code.
List<String> domainNames = getSimpleDB().listDomains();
for (String domainName : domainNames) {
if (domainName.startsWith(_baseDomainName)) {
getSimpleDB().deleteDomain(domainName);
}
}
} catch (AWSException e) {
throw new IOException("Error deleting domain(s)", e);
} catch (InterruptedException e) {
throw new IOException("Interrupted while deleting domain(s)");
}
return true;
}
public boolean pathExists(JobConf conf) throws IOException {
List<String> existingDomains;
try {
existingDomains = getSimpleDB().listDomains();
} catch (AWSException e) {
throw new IOException("Error listing domains", e);
} catch (InterruptedException e) {
throw new IOException("Interrupted while listing domains");
}
Set<String> domainSet = new HashSet<String>(existingDomains);
List<String> shardNames = getShardNames();
for (String shardName : shardNames) {
if (!domainSet.contains(shardName)) {
return false;
}
}
return true;
}
public long getPathModified(JobConf conf) throws IOException {
long mostRecentTime = 0;
List<String> shards = getShardNames();
for (String shard : shards) {
// FUTURE KKr - multithread this.
long lastModTime = getLastModified(shard);
if (lastModTime > mostRecentTime) {
mostRecentTime = lastModTime;
}
}
return mostRecentTime;
}
@Override
public void sinkInit(JobConf conf) throws IOException {
LOGGER.debug("sinking to domain: " + _baseDomainName);
// do not delete if initialized from within a task
if (isReplace() && (conf.get("mapred.task.partition") == null)) {
deletePath(conf);
}
makeDirs(conf);
setConf(conf);
super.sinkInit(conf);
}
@Override
public void sourceInit(JobConf conf) throws IOException {
LOGGER.debug("sourcing from domain: " + _baseDomainName);
FileInputFormat.setInputPaths(conf, _baseDomainName);
setConf(conf);
super.sourceInit(conf);
}
private SimpleDB getSimpleDB() {
if (_sdb == null) {
// Use the number of shards as the count for number of threads, since at the tap level
// we can't parallelize more than that.
_sdb = new SimpleDB(_sdbHost, _accessKeyId, _secretAccessKey, new BackoffHttpHandler(_numShards));
}
return _sdb;
}
private void setConf(JobConf conf) {
SimpleDBConfiguration sdbConf = new SimpleDBConfiguration(conf);
sdbConf.setAccessKeyId(_accessKeyId);
sdbConf.setSecretAccessKey(_secretAccessKey);
sdbConf.setDomainName(_baseDomainName);
sdbConf.setNumShards(_numShards);
sdbConf.setMaxThreads(_maxThreads);
sdbConf.setSdbHost(_sdbHost);
sdbConf.setCloseTimeout(_closeTimeout);
}
private URI getURI() {
try {
return new URI(SCHEME, "//" + _accessKeyId + "/" + _baseDomainName, null);
} catch (URISyntaxException exception) {
throw new TapException("unable to create uri", exception);
}
}
private List<String> getShardNames() {
return SimpleDBUtils.getShardNames(_baseDomainName, _numShards);
}
private long getLastModified(String domainName) throws IOException {
Map<String, String> metadata;
try {
metadata = getSimpleDB().domainMetaData(domainName);
} catch (AWSException e) {
throw new IOException("Exception getting domain metadata", e);
} catch (InterruptedException e) {
throw new IOException("Interrupted while getting domain metadata");
}
String timestamp = metadata.get(SimpleDB.TIMESTAMP_METADATA);
if (timestamp != null) {
try {
return Long.parseLong(timestamp);
} catch (NumberFormatException e) {
LOGGER.error("SimpleDB metadata returned invalid timestamp for " + _baseDomainName + ": " + timestamp);
}
} else {
LOGGER.error("SimpleDB metadata doesn't contain timestamp " + _baseDomainName);
}
return System.currentTimeMillis();
}
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + ((_accessKeyId == null) ? 0 : _accessKeyId.hashCode());
result = prime * result + ((_baseDomainName == null) ? 0 : _baseDomainName.hashCode());
result = prime * result + ((_secretAccessKey == null) ? 0 : _secretAccessKey.hashCode());
result = prime * result + ((getScheme() == null) ? 0 : getScheme().hashCode());
result = prime * result + _sinkMode.hashCode();
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (!super.equals(obj))
return false;
if (getClass() != obj.getClass())
return false;
SimpleDBTap other = (SimpleDBTap) obj;
if (_accessKeyId == null) {
if (other._accessKeyId != null)
return false;
} else if (!_accessKeyId.equals(other._accessKeyId))
return false;
if (_baseDomainName == null) {
if (other._baseDomainName != null)
return false;
} else if (!_baseDomainName.equals(other._baseDomainName))
return false;
if (_secretAccessKey == null) {
if (other._secretAccessKey != null)
return false;
} else if (!_secretAccessKey.equals(other._secretAccessKey))
return false;
if (_sinkMode != other._sinkMode) {
return false;
}
if (getScheme() == null) {
if (other.getScheme() != null) {
return false;
}
} else if (!getScheme().equals(other.getScheme())) {
return false;
}
return true;
}
}