package com.scaleunlimited.cascading.hadoop;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobPriority;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.SequenceFile;
import cascading.scheme.hadoop.TextLine;
import cascading.scheme.hadoop.TextLine.Compress;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.PartitionTap;
import cascading.tap.hadoop.TemplateTap;
import cascading.tap.partition.Partition;
import cascading.tuple.Fields;
import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;
import com.scaleunlimited.cascading.Level;
/**
 * Hadoop implementation of {@link BasePlatform}.
 *
 * <p>Keeps a Hadoop {@link JobConf} next to the inherited <code>_props</code>
 * and uses both to create Hfs-based taps, Hadoop schemes, flow connectors and
 * flow processes. Because {@link JobConf} is not Java-serializable, the class
 * writes and reads it explicitly in {@link #writeObject(ObjectOutputStream)} and
 * {@link #readObject(ObjectInputStream)}.</p>
 */
@SuppressWarnings({ "unchecked", "rawtypes", "serial" })
public class HadoopPlatform extends BasePlatform {

    private static final Logger LOGGER = LoggerFactory.getLogger(HadoopPlatform.class);

    /** Value returned by {@link #getPlatformType()}. */
    public static final String PLATFORM_TYPE = "hadoop";

    /** Hadoop configuration that setters on this class write into. */
    protected JobConf _conf;

    /**
     * Create a platform backed by a freshly constructed {@link JobConf}.
     *
     * @param applicationJarClass class passed through to the {@link BasePlatform} constructor
     */
    public HadoopPlatform(Class applicationJarClass) {
        this(applicationJarClass, new JobConf());
    }

    /**
     * Create a platform backed by the caller's {@link JobConf}.
     *
     * @param applicationJarClass class passed through to the {@link BasePlatform} constructor
     * @param jobConf configuration stored (not copied) as this platform's conf
     */
    public HadoopPlatform(Class applicationJarClass, JobConf jobConf) {
        super(applicationJarClass);
        _conf = jobConf;
    }

    /** @return {@link #PLATFORM_TYPE} */
    @Override
    public String getPlatformType() {
        return PLATFORM_TYPE;
    }

    /**
     * Turn speculative execution of map tasks on or off in the stored conf.
     *
     * @param speculativeExecution value forwarded to {@link JobConf#setMapSpeculativeExecution(boolean)}
     */
    public void setMapSpeculativeExecution(boolean speculativeExecution) {
        _conf.setMapSpeculativeExecution(speculativeExecution);
    }

    /**
     * Turn speculative execution of reduce tasks on or off in the stored conf.
     *
     * @param speculativeExecution value forwarded to {@link JobConf#setReduceSpeculativeExecution(boolean)}
     */
    public void setReduceSpeculativeExecution(boolean speculativeExecution) {
        _conf.setReduceSpeculativeExecution(speculativeExecution);
    }

    /**
     * Build a {@link JobConf} from the inherited <code>_props</code> and the
     * stored conf via {@link HadoopUtil#createJobConf}.
     *
     * @return the JobConf produced by that call (evaluated on every invocation)
     */
    public JobConf getJobConf() {
        return HadoopUtil.createJobConf(_props, _conf);
    }

    /** @return the result of <code>HadoopUtils.isJobLocal</code> for {@link #getJobConf()} */
    @Override
    public boolean isLocal() {
        return HadoopUtils.isJobLocal(getJobConf());
    }

    /** @return the log directory reported by the superclass helper */
    @Override
    public File getLogDir() {
        return super.getLogDirHelper();
    }

    /**
     * Create an {@link Hfs} tap using {@link SinkMode#KEEP}.
     *
     * @param scheme scheme for the tap
     * @param path location of the tap
     * @return the new tap
     */
    @Override
    public Tap makeTap(Scheme scheme, BasePath path) throws Exception {
        return makeTap(scheme, path, SinkMode.KEEP);
    }

    /** Delegates to the superclass helper. */
    @Override
    public void setJobPollingInterval(long interval) {
        super.setJobPollingIntervalHelper(interval);
    }

    /** Delegates to the superclass helper. */
    @Override
    public void setLogDir(File logDir) {
        // Was spelled "setLogDirHealer"; every other delegate in this class
        // targets a "...Helper" method on the superclass.
        super.setLogDirHelper(logDir);
    }

    /**
     * Work out a default log directory from JVM system properties.
     *
     * <p>Checked in order: <code>HADOOP_LOG_DIR</code>, <code>hadoop.log.dir</code>,
     * then <code>HADOOP_HOME</code> with <code>/logs</code> appended. If none is
     * set, the result is <code>./</code> for a local job and
     * <code>/mnt/hadoop/logs/</code> otherwise.</p>
     *
     * @return the chosen directory (not checked for existence)
     */
    @Override
    public File getDefaultLogDir() {
        String hadoopLogDir = System.getProperty("HADOOP_LOG_DIR");
        if (hadoopLogDir == null) {
            hadoopLogDir = System.getProperty("hadoop.log.dir");
        }

        if (hadoopLogDir == null) {
            String hadoopHomeDir = System.getProperty("HADOOP_HOME");
            if (hadoopHomeDir != null) {
                // Append to the home directory; the previous chained assignment
                // ("hadoopHomeDir = "/logs"") discarded it and always produced "/logs".
                hadoopLogDir = hadoopHomeDir + "/logs";
            }
        }

        if (hadoopLogDir == null) {
            hadoopLogDir = isLocal() ? "./" : "/mnt/hadoop/logs/";
        }

        return new File(hadoopLogDir);
    }

    /**
     * Return a temporary directory for this platform.
     *
     * @return for a local job, a <code>file://</code> path to the directory
     *         reported by commons-io; otherwise a path derived from
     *         {@link Hfs#getTempPath}
     */
    @Override
    public BasePath getTempDir() throws Exception {
        if (isLocal()) {
            return new HadoopPath("file://" + FileUtils.getTempDirectoryPath());
        }

        // NOTE(review): Path.getName() keeps only the final path component, so the
        // parent of the Hadoop temp path is dropped here, and the raw _conf is used
        // rather than getJobConf() as in makePath(). Confirm both are intentional.
        return new HadoopPath(Hfs.getTempPath(getJobConf()).getName(), _conf);
    }

    /** @return always true */
    @Override
    public boolean isTextSchemeCompressable() {
        // TextLine for Hadoop can read/write compressed files.
        return true;
    }

    /**
     * Store the reducer count in the conf.
     *
     * @param numReduceTasks explicit count, or <code>CLUSTER_REDUCER_COUNT</code>
     *        to use the value returned by <code>HadoopUtils.getNumReducers</code>
     */
    @Override
    public void setNumReduceTasks(int numReduceTasks) throws Exception {
        int reducers = numReduceTasks;
        if (reducers == CLUSTER_REDUCER_COUNT) {
            reducers = HadoopUtils.getNumReducers(getJobConf());
        }

        _conf.setNumReduceTasks(reducers);
    }

    /** @return the value of <code>HadoopUtils.getNumReducers</code> for {@link #getJobConf()} */
    @Override
    public int getNumReduceTasks() throws Exception {
        return HadoopUtils.getNumReducers(getJobConf());
    }

    /**
     * Look up a property, preferring the superclass helper over the conf.
     *
     * @param name property name
     * @return the helper's value, or the conf's value when the helper returns null
     */
    @Override
    public String getProperty(String name) {
        String value = super.getPropertyHelper(name);
        return (value != null) ? value : _conf.get(name);
    }

    /** Delegates to the superclass helper. */
    @Override
    public boolean getBooleanProperty(String name) {
        return super.getBooleanPropertyHelper(name);
    }

    /** Delegates to the superclass helper. */
    @Override
    public int getIntProperty(String name) {
        return super.getIntPropertyHelper(name);
    }

    /**
     * Create a partition tap using {@link SinkMode#KEEP}.
     *
     * @param parentTap tap to partition; see the three-argument overload
     * @param partition partitioning to apply
     * @return the new tap
     */
    @Override
    public Tap makePartitionTap(Tap parentTap, Partition partition) throws Exception {
        return makePartitionTap(parentTap, partition, SinkMode.KEEP);
    }

    /**
     * Translate a flow priority into a Hadoop {@link JobPriority} on the conf:
     * HIGH to HIGH, MEDIUM to NORMAL, LOW to LOW.
     *
     * @param priority priority to apply
     * @throws RuntimeException for any other priority value
     */
    @Override
    public void setFlowPriority(FlowPriority priority) throws Exception {
        switch (priority) {
            case HIGH:
                _conf.setJobPriority(JobPriority.HIGH);
                break;

            case MEDIUM:
                _conf.setJobPriority(JobPriority.NORMAL);
                break;

            case LOW:
                _conf.setJobPriority(JobPriority.LOW);
                break;

            default:
                throw new RuntimeException("Unknown flow priority: " + priority);
        }
    }

    /**
     * Set the log level for the given packages.
     *
     * <p>For each empty package name, the map and reduce task log levels are
     * also written to the conf. The superclass helper is then always invoked
     * with the original arguments.</p>
     *
     * @param level level to apply
     * @param packageNames package names; an empty string triggers the conf update
     */
    @Override
    public void setLogLevel(Level level, String... packageNames) {
        for (String packageName : packageNames) {
            if (!packageName.isEmpty()) {
                continue;
            }

            // TODO set the logging level for the current (main) code that's calling
            // us too - but we can't do that (???) using slf4j, so this would only work
            // if we assume log4j is being used.

            // Set the logging level for map & reduce jobs, using both old and new conf names.
            String levelName = level.toString();
            _conf.set("mapred.map.child.log.level", levelName);
            _conf.set("mapreduce.map.log.level", levelName);
            _conf.set("mapred.reduce.child.log.level", levelName);
            _conf.set("mapreduce.reduce.log.level", levelName);
        }

        super.setLogLevelHelper(level, packageNames);
    }

    /** Delegates to the superclass helper. */
    @Override
    public void setProperty(String name, String value) {
        super.setPropertyHelper(name, value);
    }

    /** Delegates to the superclass helper. */
    @Override
    public void setProperty(String name, int value) {
        super.setPropertyHelper(name, value);
    }

    /** Delegates to the superclass helper. */
    @Override
    public void setProperty(String name, boolean value) {
        super.setPropertyHelper(name, value);
    }

    /**
     * Create a {@link HadoopFlowConnector} from the merged properties.
     *
     * <p>The merge starts from a copy of the inherited <code>_props</code>, then
     * adds each non-null property from the conf whose key has no value there,
     * so explicitly set properties win.</p>
     *
     * @return the new flow connector
     */
    @Override
    public FlowConnector makeFlowConnector() {
        Map<Object, Object> merged = new HashMap<Object, Object>(_props);
        Map<Object, Object> fromConf = HadoopUtil.createProperties(_conf);

        for (Map.Entry<Object, Object> confEntry : fromConf.entrySet()) {
            Object key = confEntry.getKey();
            Object confValue = confEntry.getValue();

            // Only fill gaps; never overwrite something set explicitly in _props.
            if ((confValue != null) && (merged.get(key) == null)) {
                merged.put(key, confValue);
            }
        }

        return new HadoopFlowConnector(merged);
    }

    /** @return a new {@link HadoopFlowProcess} built from {@link #getJobConf()} */
    @Override
    public FlowProcess makeFlowProcess() throws Exception {
        return new HadoopFlowProcess(getJobConf());
    }

    /**
     * @param path path string
     * @return a <code>HadoopPath</code> for <code>path</code> using {@link #getJobConf()}
     */
    @Override
    public BasePath makePath(String path) throws IOException {
        return new HadoopPath(path, getJobConf());
    }

    /**
     * @param parent parent path
     * @param subdir name below <code>parent</code>
     * @return a <code>HadoopPath</code> for <code>subdir</code> under <code>parent</code>
     */
    @Override
    public BasePath makePath(BasePath parent, String subdir) throws IOException {
        return new HadoopPath(parent, subdir, getJobConf());
    }

    /**
     * @param scheme scheme for the tap
     * @param path location; its absolute path is handed to {@link Hfs}
     * @param mode sink mode
     * @return a new {@link Hfs} tap
     */
    @Override
    public Tap makeTap(Scheme scheme, BasePath path, SinkMode mode) {
        return new Hfs(scheme, path.getAbsolutePath(), mode);
    }

    /**
     * @param tap parent tap; cast to {@link Hfs} without a type check
     * @param pattern template pattern
     * @param fields fields passed to the template tap
     * @return a new {@link TemplateTap}
     */
    @Override
    public Tap makeTemplateTap(Tap tap, String pattern, Fields fields) throws Exception {
        return new TemplateTap((Hfs) tap, pattern, fields);
    }

    /**
     * @param parentTap parent tap, which must be an {@link Hfs}
     * @param partition partitioning to apply
     * @param mode sink mode
     * @return a new {@link PartitionTap}
     * @throws RuntimeException if <code>parentTap</code> is not an {@link Hfs}
     */
    @Override
    public Tap makePartitionTap(Tap parentTap, Partition partition, SinkMode mode) throws Exception {
        if (!(parentTap instanceof Hfs)) {
            throw new RuntimeException("parentTap needs to an instance of Hfs - instead got: " + parentTap.getClass().getName());
        }

        return new PartitionTap((Hfs) parentTap, partition, mode);
    }

    /** @return a new {@link SequenceFile} scheme for <code>fields</code> */
    @Override
    public Scheme makeBinaryScheme(Fields fields) {
        return new SequenceFile(fields);
    }

    /**
     * @param enableCompression selects {@link Compress#ENABLE} or {@link Compress#DISABLE}
     * @return a new {@link TextLine} scheme with that compression setting
     */
    @Override
    public Scheme makeTextScheme(boolean enableCompression) {
        return new TextLine(enableCompression ? Compress.ENABLE : Compress.DISABLE);
    }

    /** @return a new {@link TextLine} scheme created with its no-argument constructor */
    @Override
    public Scheme makeTextScheme() {
        return new TextLine();
    }

    /**
     * Rename <code>src</code> to <code>dst</code> using the file system that
     * owns <code>src</code>.
     *
     * @return the value returned by {@link FileSystem#rename(Path, Path)}
     */
    @Override
    public boolean rename(BasePath src, BasePath dst) throws Exception {
        Path srcPath = new Path(src.getAbsolutePath());
        Path dstPath = new Path(dst.getAbsolutePath());
        FileSystem fs = srcPath.getFileSystem(getJobConf());
        return fs.rename(srcPath, dstPath);
    }

    /**
     * Copy a local directory to a uniquely named directory under the Hadoop
     * temporary location.
     *
     * <p>The base location is the first non-null of: the job-local dir, the
     * <code>hadoop.tmp.dir</code> property, the conf's working directory. The
     * target name is the local directory's name plus a random UUID.</p>
     *
     * @param localDirName local directory to copy
     * @return the shared directory's path as a string
     * @throws RuntimeException wrapping any failure, with the original exception as its cause
     */
    @Override
    public String shareLocalDir(String localDirName) {
        String sharedDirName = null;

        try {
            Path localPath = new Path(localDirName);
            JobConf conf = (JobConf) (makeFlowProcess().getConfigCopy());

            String hadoopTmpDirName = conf.getJobLocalDir();
            if (hadoopTmpDirName == null) {
                hadoopTmpDirName = conf.get("hadoop.tmp.dir");
            }

            if (hadoopTmpDirName == null) {
                hadoopTmpDirName = conf.getWorkingDirectory().toString();
            }

            // NOTE(review): toString() above cannot return null, so this looks
            // unreachable; kept as a defensive check.
            if (hadoopTmpDirName == null) {
                throw new IOException("Can't get Hadoop temporary directory");
            }

            HadoopPath hadoopHdfsTmpPath = (HadoopPath) (makePath(hadoopTmpDirName));
            Path hdfsTmpPath = hadoopHdfsTmpPath.getHadoopPath();

            String uniqueFolderName = String.format("%s-%s", localPath.getName(), UUID.randomUUID());
            Path sharedPath = new Path(hdfsTmpPath, uniqueFolderName);
            sharedDirName = sharedPath.toString();

            // Copy directory to (shared) HDFS location.
            FileSystem targetFs = sharedPath.getFileSystem(conf);
            targetFs.copyFromLocalFile(localPath, sharedPath);

            String message = String.format("Successfully copied shared directory from %s to %s", localDirName, sharedDirName);
            LOGGER.info(message);
        } catch (Exception e) {
            String message = String.format("Exception sharing directory from %s to %s: %s", localDirName, sharedDirName, e);
            LOGGER.error(message, e);
            // Keep the original exception as the cause so the stack trace survives.
            throw new RuntimeException(message, e);
        }

        return sharedDirName;
    }

    /**
     * Copy a shared directory to a uniquely named directory under
     * <code>java.io.tmpdir</code>.
     *
     * @param flowProcess supplies the configuration used to resolve the source file system
     * @param sharedDirName shared directory to copy from
     * @return absolute path of the local copy
     * @throws RuntimeException wrapping any {@link IOException}, with it as the cause
     */
    @Override
    public String copySharedDirToLocal(FlowProcess flowProcess, String sharedDirName) {
        String localDirName = null;

        try {
            // Main program on Hadoop master has written the directory to HDFS,
            // so copy it to the slave's local hard drive.
            Path sourcePath = new Path(sharedDirName);
            JobConf conf = (JobConf) (flowProcess.getConfigCopy());
            FileSystem sourceFs = sourcePath.getFileSystem(conf);

            String tmpDirName = System.getProperty("java.io.tmpdir");
            String uniqueFolderName = String.format("%s-%s", sourcePath.getName(), UUID.randomUUID());
            File localDir = new File(tmpDirName, uniqueFolderName);
            localDirName = localDir.getAbsolutePath();

            // Copy directory from (shared) HDFS location.
            sourceFs.copyToLocalFile(sourcePath, new Path(localDirName));

            String message = String.format("Successfully copied shared directory from %s to %s", sharedDirName, localDirName);
            LOGGER.info(message);
        } catch (IOException e) {
            String message = String.format("Exception copying shared directory from %s to %s: %s", sharedDirName, localDirName, e);
            LOGGER.error(message, e);
            // Keep the original exception as the cause so the stack trace survives.
            throw new RuntimeException(message, e);
        }

        return localDirName;
    }

    /**
     * JobConf isn't serializable, so we handle that ourselves: the conf is
     * written in its Writable form to a byte array, which is emitted as a
     * length followed by the bytes.
     *
     * @param out stream to write to
     * @throws IOException if writing fails
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
        DataOutputStream writeableOut = new DataOutputStream(byteStream);
        _conf.write(writeableOut);
        writeableOut.close();

        // Now write out the byte array, length first so readObject knows how much to read.
        byte[] confBytes = byteStream.toByteArray();
        out.writeInt(confBytes.length);
        out.write(confBytes, 0, confBytes.length);
    }

    /**
     * Counterpart of {@link #writeObject(ObjectOutputStream)}: reads the
     * length-prefixed bytes and rebuilds the conf from them.
     *
     * @param in stream to read from
     * @throws IOException if reading fails
     * @throws ClassNotFoundException declared for the serialization signature
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        int numBytes = in.readInt();
        byte[] confBytes = new byte[numBytes];
        in.readFully(confBytes);

        DataInput writeableIn = new DataInputStream(new ByteArrayInputStream(confBytes));
        _conf = new JobConf(false);
        _conf.readFields(writeableIn);
    }

    /**
     * Delegates to the superclass hash.
     *
     * <p>{@link #equals(Object)} only returns true when <code>super.equals</code>
     * does, and its conf comparison is deliberately one-way, so the conf must
     * not contribute to the hash.</p>
     */
    @Override
    public int hashCode() {
        return super.hashCode();
    }

    /**
     * Two platforms are equal when the superclass considers them equal, they
     * are of the same class, and every conf entry of this platform exists with
     * an equal value in the other platform's conf.
     *
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }

        if (!super.equals(obj)) {
            return false;
        }

        if (getClass() != obj.getClass()) {
            return false;
        }

        HadoopPlatform other = (HadoopPlatform) obj;
        if (_conf == null) {
            return other._conf == null;
        }

        if (other._conf == null) {
            // Previously dereferenced unconditionally, which would throw.
            return false;
        }

        // Make sure every value we've got exists and is equal to the other
        // value. We can't do a two-way comparison, because when JobConf
        // deserializes, it adds additional properties (aliases) for what we
        // set.
        Map<String, String> otherValues = new HashMap<String, String>();
        for (Entry<String, String> otherEntry : other._conf) {
            otherValues.put(otherEntry.getKey(), otherEntry.getValue());
        }

        for (Entry<String, String> entry : _conf) {
            String key = entry.getKey();
            if (!otherValues.containsKey(key)) {
                return false;
            }

            if (!entry.getValue().equals(otherValues.get(key))) {
                return false;
            }
        }

        return true;
    }
}