/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.plan.physical;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.Index;
import com.linkedin.cubert.io.*;
import com.linkedin.cubert.utils.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.ObjectNode;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
import java.util.Map.Entry;
/**
* Parses the JSON physical plan of a single Map-Reduce job, configures the
* corresponding Hadoop job, and executes it.
*
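* <p>A minimal usage sketch (assuming {@code planJson} holds the JSON physical
* plan of one job):
*
* <pre>
* JobExecutor executor = new JobExecutor(planJson, false);
* boolean success = executor.run(true); // true = verbose console output
* </pre>
*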
* @author Maneesh Varshney
*/
public class JobExecutor
{
protected final JsonNode root;
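// Shared scratch array reused by asArray(); it is cleared on every call and
// is therefore not safe for concurrent use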
protected static final ArrayNode singletonArray =
new ObjectMapper().createArrayNode();
protected final Job job;
protected final Configuration conf;
protected final ConfigurationDiff confDiff;
protected final FileSystem fs;
private final ObjectMapper mapper;
private final Path tmpDir;
// A map from tee output directory to the file name prefixes written into it
private final Map<String, List<String>> teeFiles =
new HashMap<String, List<String>>();
private int teeFilePrefixCounter = 0;
private boolean profileMode;
public JobExecutor(String json, boolean profileMode) throws IOException,
ClassNotFoundException,
InstantiationException,
IllegalAccessException
{
this.job = new Job();
this.conf = job.getConfiguration();
this.confDiff = new ConfigurationDiff(conf);
this.fs = FileSystem.get(conf);
this.profileMode = profileMode;
// Turn on the symlink feature
DistributedCache.createSymlink(conf);
job.setJarByClass(JobExecutor.class);
if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null)
{
conf.set("mapreduce.job.credentials.binary",
System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
}
mapper = new ObjectMapper();
this.root = mapper.readValue(json, JsonNode.class);
if (root.has("tmpDir"))
{
tmpDir = new Path(getText(root, "tmpDir"));
}
else
{
tmpDir =
new Path(fs.getHomeDirectory(), "tmp/" + UUID.randomUUID().toString());
}
try
{
configureJob();
}
catch (URISyntaxException e)
{
throw new RuntimeException(e);
}
}
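/**
* Prints every configuration entry whose key starts with "cubert".
*/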
public void printCubertConfProperties()
{
Iterator<Entry<String, String>> it = conf.iterator();
while (it.hasNext())
{
Entry<String, String> entry = it.next();
if (entry.getKey().startsWith("cubert"))
{
print.f("%s => %s", entry.getKey(), entry.getValue());
}
}
}
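/**
* Submits the job and waits for completion. The temporary directory is
* deleted regardless of the outcome; on success, tee files are moved into
* place and the post-job hooks and completion tasks are run.
*
* @throws InterruptedException if the job fails
*/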
public boolean run(boolean verbose) throws IOException,
InterruptedException,
ClassNotFoundException
{
boolean retval = false;
try
{
retval = job.waitForCompletion(verbose);
}
finally
{
fs.delete(tmpDir, true);
}
if (!retval)
{
throw new InterruptedException("Job " + getText(root, "name") + " failed!");
}
moveTeeFiles();
postJobHooks();
doCompletionTasks();
return retval;
}
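/**
* Translates the JSON plan into job configuration: names the job, ships
* libjars and cached files, caches indexes, assigns tee file prefixes, runs
* the pre-job hooks, and wires up the mapper, the reducer (when a shuffle is
* present), and the input/output storage.
*/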
protected void configureJob() throws IOException,
ClassNotFoundException,
URISyntaxException,
InstantiationException,
IllegalAccessException
{
setJobName();
setLibjars();
setHadoopConf();
setPerfProfile();
serializeExecutionConfig();
cacheFiles();
cacheIndex();
prepareTeePaths();
preJobHooks();
int numReducers = root.get("reducers").getIntValue();
job.setNumReduceTasks(numReducers);
boolean foundPaths = false;
for (JsonNode map : root.path("map"))
{
foundPaths |= setInput(((ObjectNode) map).get("input"));
}
if (!foundPaths)
throw new IOException("Cannot find any input paths for job");
setOutput();
conf.set(CubertStrings.JSON_OUTPUT, root.get("output").toString());
if (root.has("metadata"))
conf.set(CubertStrings.JSON_METADATA, root.get("metadata").toString());
conf.set(CubertStrings.JSON_MAP_OPERATOR_LIST, root.get("map").toString());
job.setMapperClass(CubertMapper.class);
if (hasReducePhase())
{
setShuffle();
conf.set(CubertStrings.JSON_SHUFFLE, root.get("shuffle").toString());
conf.set(CubertStrings.JSON_REDUCE_OPERATOR_LIST, root.get("reduce")
.toString());
job.setReducerClass(CubertReducer.class);
}
if (conf.get("mapreduce.map.output.compress") == null)
conf.set("mapreduce.map.output.compress", "true");
if (conf.get("mapreduce.output.fileoutputformat.compress") == null)
conf.set("mapreduce.output.fileoutputformat.compress", "true");
}
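/**
* Returns true when the plan carries a non-null "shuffle" node, i.e. the job
* has a reduce phase.
*/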
public boolean hasReducePhase()
{
return root.has("shuffle") && !root.get("shuffle").isNull();
}
private void serializeExecutionConfig() throws IOException
{
ExecutionConfig.writeConf(getConf());
}
private void preJobHooks()
{
ArrayNode preHooks = (ArrayNode) root.get("preJobHooks");
if (preHooks != null)
processJobCommands(preHooks);
}
private void postJobHooks()
{
ArrayNode postHooks = (ArrayNode) root.get("postJobHooks");
if (postHooks != null)
processJobCommands(postHooks);
}
private void doCompletionTasks() throws IOException
{
if (root.has("onCompletion") && !root.get("onCompletion").isNull())
CompletionTasks.doCompletionTasks(root.get("onCompletion"));
}
private void processJobCommands(ArrayNode commands)
{
for (int i = 0; i < commands.size(); i++)
{
execJobCommand(commands.get(i));
}
}
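/**
* Executes one whitespace-separated job command. The supported forms are
* "METAFILE args..." (dispatched to CubertMD) and "HDFS RENAME src dst" or
* "HDFS DELETE path".
*/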
private void execJobCommand(JsonNode jsonNode)
{
String[] commandSplits = jsonNode.getTextValue().split("\\s+");
String command = commandSplits[0];
try
{
if (command.equalsIgnoreCase("METAFILE"))
{
String[] metadataArgs = Arrays.copyOfRange(commandSplits, 1, commandSplits.length);
CubertMD.execCommand(metadataArgs);
}
else if (command.equalsIgnoreCase("HDFS"))
{
execHdfsCommand(commandSplits[1], Arrays.copyOfRange(commandSplits, 2, commandSplits.length));
}
}
catch (IOException e)
{
throw new RuntimeException("Job command failed: " + e.getMessage(), e);
}
}
private void execHdfsCommand(String cmd, String[] args) throws IOException
{
FileSystem fs = FileSystem.get(conf);
if (cmd.equalsIgnoreCase("RENAME"))
fs.rename(new Path(args[0]), new Path(args[1]));
else if (cmd.equalsIgnoreCase("DELETE"))
fs.delete(new Path(args[0]), true);
}
protected void setJobName()
{
job.setJobName(getText(root, "name"));
}
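/**
* Resolves each "libjars" entry against the local file system first and HDFS
* second, globs *.jar when the entry is a directory, copies local jars into
* the job's temporary directory, and adds everything to the distributed
* cache classpath. Duplicate jars are rejected.
*/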
protected void setLibjars() throws IOException
{
if (!root.has("libjars"))
return;
FileSystem localFs = FileSystem.getLocal(conf);
HashSet<Path> jarsToCache = new HashSet<Path>();
for (JsonNode node : asArray(root, "libjars"))
{
Path path = new Path(node.getTextValue());
// Is path in local fs or HDFS
boolean fileIsLocal = localFs.exists(path);
FileSystem thisFs = fileIsLocal ? localFs : fs;
// If path is a directory, glob all jar files.
List<Path> sources = new LinkedList<Path>();
if (thisFs.isDirectory(path))
{
Path dirPath = new Path(path.toString() + "/*.jar");
FileStatus[] jars = thisFs.globStatus(dirPath);
for (FileStatus jar : jars)
{
Path filePath = jar.getPath();
sources.add(filePath);
}
}
else
{
sources.add(path);
}
// For all source jars corresponding to this path,
// copy to HDFS first when the path is local
for (Path srcPath : sources)
{
if (fileIsLocal)
{
Path dstPath = new Path(tmpDir, srcPath.getName());
fs.copyFromLocalFile(srcPath, dstPath);
srcPath = dstPath;
}
if (jarsToCache.contains(srcPath))
{
throw new RuntimeException("Duplicate jar specified: '" + srcPath.getName() + "'");
}
jarsToCache.add(srcPath);
}
}
// Add jars to distributed cache
for (Path path : jarsToCache)
{
DistributedCache.addFileToClassPath(path, conf, fs);
}
}
protected void setHadoopConf()
{
if (!root.has("hadoopConf"))
return;
JsonNode node = get(root, "hadoopConf");
Iterator<String> it = node.getFieldNames();
while (it.hasNext())
{
String name = it.next();
String value = getText(node, name);
conf.set(name, value);
}
}
protected void setPerfProfile()
{
conf.set(CubertStrings.PROFILE_MODE, profileMode ? "true" : "false");
}
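/**
* Adds each URI listed under "cachedFiles" to the distributed cache.
*/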
protected void cacheFiles() throws URISyntaxException,
IOException
{
if (!root.has("cachedFiles") || root.get("cachedFiles").isNull()
|| root.get("cachedFiles").size() == 0)
return;
for (JsonNode cachedFile : root.path("cachedFiles"))
{
URI uri = new URI(cachedFile.getTextValue());
print.f("CACHING file %s", uri);
DistributedCache.addCacheFile(uri, conf);
}
}
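/**
* Extracts the index of each relation named under "cacheIndex", serializes
* it into the temporary directory (reusing the file when several entries
* share a path), and ships it through the distributed cache under a
* symlinked name.
*/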
protected void cacheIndex() throws IOException,
InstantiationException,
IllegalAccessException,
ClassNotFoundException,
URISyntaxException
{
if (!root.has("cacheIndex"))
return;
HashMap<String, Path> cachedIndexFiles = new HashMap<String, Path>();
for (JsonNode indexNode : root.path("cacheIndex"))
{
final String origPathName = getText(indexNode, "path");
final String indexName = JsonUtils.getText(indexNode, "name");
// Reuse index (to be put into distributed cache) if already created.
Path indexPath = cachedIndexFiles.get(origPathName);
if (indexPath == null)
{
// Extract the index from the relation at "path" and serialize it for caching
Index indexToCache = Index.extractFromRelation(conf, origPathName);
indexPath = new Path(tmpDir, UUID.randomUUID().toString());
SerializerUtils.serializeToFile(conf, indexPath, indexToCache);
cachedIndexFiles.put(origPathName, indexPath);
}
DistributedCache.addCacheFile(new URI(indexPath.toString() + "#" + indexName), conf);
conf.set(CubertStrings.JSON_CACHE_INDEX_PREFIX + indexName, indexPath.getName());
print.f("Caching index at path [%s] as [%s]", origPathName, indexPath.toString());
}
}
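/**
* Configures the input paths and storage for one map pipeline.
*
* @return false when the "path" specification expands to no concrete paths
*/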
protected boolean setInput(JsonNode input) throws IOException,
ClassNotFoundException
{
JsonNode params = input.get("params");
if (params == null)
params = mapper.createObjectNode();
List<Path> paths = FileSystemUtils.getPaths(fs, input.get("path"), params);
if (paths.isEmpty())
{
return false;
}
job.setInputFormatClass(CubertInputFormat.class);
// storage specific configuration
confDiff.startDiff();
Storage storage = StorageFactory.get(getText(input, "type"));
storage.prepareInput(job, conf, params, paths);
if (params.has("combined") && Boolean.parseBoolean(getText(params, "combined")))
{
conf.setBoolean(CubertStrings.COMBINED_INPUT, true);
long originalMaxCombinedSplitSize =
conf.getLong("mapreduce.input.fileinputformat.split.maxsize", -1);
if (originalMaxCombinedSplitSize == -1)
{
throw new IllegalStateException("CONFIG ERROR: property mapreduce.input.fileinputformat.split.maxsize is not set when using combined input format");
}
}
// add input paths to the job
FileInputFormat.setInputPaths(job, paths.toArray(new Path[] {}));
confDiff.endDiff();
return true;
}
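/**
* Configures the job output path, schema and storage; deletes a pre-existing
* output directory when the "overwrite" param is set.
*/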
protected void setOutput() throws IOException
{
JsonNode output = get(root, "output");
JsonNode params = output.get("params");
if (params == null)
params = mapper.createObjectNode();
Path outputPath = new Path(getText(output, "path"));
FileOutputFormat.setOutputPath(job, outputPath);
if (params.has("overwrite") && Boolean.parseBoolean(getText(params, "overwrite")))
{
fs.delete(outputPath, true);
}
BlockSchema schema = new BlockSchema(output.get("schema"));
Storage storage = StorageFactory.get(getText(output, "type"));
storage.prepareOutput(job, conf, params, schema, outputPath);
}
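/**
* Configures the shuffle: the partitioner, the intermediate storage, and a
* combiner when the shuffle node declares "aggregates".
*/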
protected void setShuffle()
{
JsonNode shuffle = get(root, "shuffle");
setPartitioner(shuffle);
Storage storage = StorageFactory.get(getText(shuffle, "type"));
storage.prepareOutput(job, conf, null, null, null);
if (shuffle.has("aggregates"))
{
job.setCombinerClass(CubertCombiner.class);
}
}
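/**
* Picks the partitioner: an explicit "mapreduce.partitioner.class" conf
* entry wins, then the shuffle's "partitionerClass", with CubertPartitioner
* as the default.
*/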
private void setPartitioner(JsonNode shuffle)
{
Class<? extends Partitioner> partitionerClass;
String mrPartitioner = getConf().get("mapreduce.partitioner.class");
if (mrPartitioner != null)
{
try
{
partitionerClass = Class.forName(mrPartitioner).asSubclass(Partitioner.class);
}
catch (ClassNotFoundException e)
{
throw new RuntimeException(e);
}
}
else if (shuffle.has("partitionerClass"))
{
try
{
partitionerClass = Class.forName(getText(shuffle, "partitionerClass")).asSubclass(Partitioner.class);
}
catch (ClassNotFoundException e)
{
throw new RuntimeException(e);
}
}
else
{
partitionerClass = CubertPartitioner.class;
}
print.f("Setting partitioner: %s", partitionerClass.getName());
job.setPartitionerClass(partitionerClass);
}
protected void setNumReducers(int numReducers)
{
job.setNumReduceTasks(numReducers);
}
protected void setCompression(Class<? extends CompressionCodec> codecClass)
{
if (codecClass != null)
{
conf.setBoolean("mapred.output.compress", true);
conf.setClass("mapred.output.compression.codec",
codecClass,
CompressionCodec.class);
}
}
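/**
* Assigns a unique file name prefix to every TEE operator in the plan and
* records which prefixes belong to which tee directory, so that the files
* can be moved out of the job output after completion.
*/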
private void prepareTeePaths()
{
for (JsonNode mapNode : root.path("map"))
{
prepareTeePaths(mapNode.get("operators"));
}
if (hasReducePhase())
{
prepareTeePaths(root.get("reduce"));
}
}
private void prepareTeePaths(JsonNode operators)
{
for (JsonNode operatorNode : operators)
{
String name = operatorNode.get("operator").getTextValue();
if (name.equals("TEE"))
{
String path = operatorNode.get("path").getTextValue();
String teePrefix = String.format("tee-%04d", teeFilePrefixCounter++);
((ObjectNode) operatorNode).put("prefix", teePrefix);
List<String> prefixList = teeFiles.get(path);
if (prefixList == null)
{
prefixList = new ArrayList<String>();
teeFiles.put(path, prefixList);
}
prefixList.add(teePrefix);
}
}
}
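/**
* Moves the tee files (matched by their assigned prefixes) from the job
* output directory into their target directories, recreating each target.
*/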
private void moveTeeFiles() throws IOException
{
if (teeFiles.isEmpty())
return;
Path outputDir = new Path(root.get("output").get("path").getTextValue());
for (String dir : teeFiles.keySet())
{
// delete the old directory
Path teeDir = new Path(dir);
if (fs.exists(teeDir))
fs.delete(teeDir, true);
fs.mkdirs(teeDir);
for (String prefix : teeFiles.get(dir))
{
Path globPath = new Path(outputDir, prefix + "*");
FileStatus[] fileStatusList = fs.globStatus(globPath);
for (FileStatus fileStatus : fileStatusList)
{
fs.rename(fileStatus.getPath(), teeDir);
}
}
}
}
protected Job getJob()
{
return job;
}
protected Configuration getConf()
{
return conf;
}
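/**
* Returns the named property of a JSON node, failing fast with an
* IllegalArgumentException when the property is absent.
*/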
public static JsonNode get(JsonNode node, String property)
{
JsonNode val = node.get(property);
if (val == null)
{
throw new IllegalArgumentException("Property " + property
+ " is not defined in " + node);
}
return val;
}
public static String getText(JsonNode node, String property, String defaultValue)
{
if (!node.has(property))
return defaultValue;
return get(node, property).getTextValue();
}
public static String getText(JsonNode node, String property)
{
return get(node, property).getTextValue();
}
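/**
* Returns the named property as an array, wrapping a scalar value in the
* shared singleton array. The wrapper is reused across calls, so the result
* must be consumed before the next invocation.
*/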
public static JsonNode asArray(JsonNode node, String property)
{
JsonNode n = node.get(property);
if (n.isArray())
return node.path(property);
else
{
singletonArray.removeAll();
singletonArray.add(n);
return singletonArray;
}
}
}