package com.mongodb.hadoop.testutils;
import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.HadoopVersionFilter;
import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MongoTool;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeroturnaround.exec.ProcessExecutor;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.TimeoutException;
import static com.mongodb.hadoop.testutils.BaseHadoopTest.HADOOP_HOME;
import static com.mongodb.hadoop.testutils.BaseHadoopTest.HADOOP_VERSION;
import static com.mongodb.hadoop.testutils.BaseHadoopTest.PROJECT_HOME;
import static com.mongodb.hadoop.testutils.BaseHadoopTest.isHadoopV1;
import static com.mongodb.hadoop.util.MongoConfigUtil.INPUT_URI;
import static com.mongodb.hadoop.util.MongoConfigUtil.JOB_INPUT_FORMAT;
import static com.mongodb.hadoop.util.MongoConfigUtil.JOB_OUTPUT_FORMAT;
import static com.mongodb.hadoop.util.MongoConfigUtil.OUTPUT_URI;
import static java.lang.String.format;
/**
 * Fluent test helper that assembles the settings of a mongo-hadoop map/reduce job and runs it,
 * either inside the current JVM through {@link ToolRunner} or as an external {@code hadoop jar}
 * process.
 *
 * <p>Settings are turned into {@code -Dkey=value} arguments. Explicit {@link #param(String, String)}
 * entries are emitted first (in insertion order), followed by the input/output URIs, the input and
 * output formats and, if set, the output committer.</p>
 *
 * <p>Instances are mutable builders; nothing in this class synchronizes access to them.</p>
 */
public class MapReduceJob {
    private static final Logger LOG = LoggerFactory.getLogger(MapReduceJob.class);

    /** Substring identifying the MongoDB Java driver jar on the class path. */
    private static final String DRIVER_JAR_MARKER = "mongo-java-driver";

    /** Value used for {@code MAPRED_DIR} when the Hadoop version string starts with "cdh". */
    private static final String CDH_MAPRED_DIR = "share/hadoop/mapreduce2";

    private final Map<String, String> params = new LinkedHashMap<String, String>();
    private final String className;
    private final List<String> inputUris = new ArrayList<String>();
    private MongoClientURI outputUri;
    private File jarPath;
    private Class<? extends InputFormat> inputFormat;
    private Class<? extends OutputFormat> outputFormat;
    private Class<? extends org.apache.hadoop.mapred.InputFormat> mapredInputFormat;
    private Class<? extends org.apache.hadoop.mapred.OutputFormat> mapredOutputFormat;
    private Class<? extends OutputCommitter> outputCommitter;

    /**
     * @param className fully qualified name of the job class; it is passed to {@code hadoop jar}
     *                  for external runs and loaded reflectively as a {@link MongoTool} for in-VM runs
     */
    public MapReduceJob(final String className) {
        this.className = className;
    }

    /**
     * Adds (or replaces) an arbitrary job setting.
     *
     * @param key   the configuration key
     * @param value the configuration value
     * @return this job, for chaining
     */
    public MapReduceJob param(final String key, final String value) {
        params.put(key, value);
        return this;
    }

    /**
     * Appends input URIs; repeated calls accumulate.
     *
     * @param inputUris the URIs to read from
     * @return this job, for chaining
     */
    public MapReduceJob inputUris(final MongoClientURI... inputUris) {
        for (MongoClientURI inputUri : inputUris) {
            this.inputUris.add(inputUri.getURI());
        }
        return this;
    }

    /**
     * Sets the output URI, replacing any previous value.
     *
     * @param uri the URI to write to
     * @return this job, for chaining
     */
    public MapReduceJob outputUri(final MongoClientURI uri) {
        this.outputUri = uri;
        return this;
    }

    /**
     * Appends input URIs; repeated calls accumulate.
     *
     * @param inputUris the URIs to read from
     * @return this job, for chaining
     */
    public MapReduceJob inputUris(final URI... inputUris) {
        for (URI inputUri : inputUris) {
            this.inputUris.add(inputUri.toString());
        }
        return this;
    }

    /**
     * Sets the output committer, emitted as {@code mapred.output.committer.class}.
     *
     * @param outputCommitter the committer class
     * @return this job, for chaining
     */
    public MapReduceJob outputCommitter(
        final Class<? extends OutputCommitter> outputCommitter) {
        this.outputCommitter = outputCommitter;
        return this;
    }

    /**
     * Sets the job jar handed to {@code hadoop jar} by {@link #executeExternal()}.
     *
     * @param path the job jar
     * @return this job, for chaining
     */
    public MapReduceJob jar(final File path) {
        jarPath = path;
        return this;
    }

    /**
     * Copies the required jars into the Hadoop installation and then runs the job.
     *
     * @param inVM {@code true} to run inside this JVM, {@code false} to launch an external process
     * @throws RuntimeException wrapping any failure; the original exception is kept as the cause
     */
    public void execute(final boolean inVM) {
        try {
            copyJars();
            if (inVM) {
                executeInVM();
            } else {
                executeExternal();
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e.getMessage(), e);
        } catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Runs the job by launching {@code $HADOOP_HOME/bin/hadoop jar <jar> <class> -D...} as a child
     * process. The child's stderr is redirected to {@code System.out}.
     *
     * @throws IllegalStateException if no jar was configured through {@link #jar(File)}
     * @throws IOException           if the hadoop script path cannot be resolved or the process fails to start
     * @throws TimeoutException      propagated from the process executor
     * @throws InterruptedException  if interrupted while waiting for the process
     */
    public void executeExternal() throws IOException, TimeoutException, InterruptedException {
        if (jarPath == null) {
            throw new IllegalStateException("No job jar configured; call jar(File) before executing externally");
        }

        final List<String> cmd = new ArrayList<String>();
        cmd.add(new File(HADOOP_HOME, "bin/hadoop").getCanonicalPath());
        cmd.add("jar");
        cmd.add(jarPath.getAbsolutePath());
        cmd.add(className);
        cmd.addAll(buildDefines());

        final Map<String, String> env = new TreeMap<String, String>(System.getenv());
        if (HADOOP_VERSION.startsWith("cdh")) {
            env.put("MAPRED_DIR", CDH_MAPRED_DIR);
        }

        LOG.info("Executing hadoop job:");
        LOG.info(describeCommand(cmd));

        new ProcessExecutor().command(cmd)
                             .environment(env)
                             .redirectError(System.out)
                             .execute();
    }

    /**
     * Runs the job inside the current JVM: loads {@code className}, instantiates it through its
     * {@code (Configuration)} constructor and hands it to {@link ToolRunner}.
     *
     * @throws ClassCastException if {@code className} does not name a {@link MongoTool} subclass
     * @throws Exception          anything thrown while loading, constructing or running the tool
     */
    public void executeInVM() throws Exception {
        final List<String> args = buildDefines();

        if (HADOOP_VERSION.startsWith("cdh")) {
            // The process environment cannot be changed for the running JVM, so only the system
            // property is set here.
            System.setProperty("MAPRED_DIR", CDH_MAPRED_DIR);
        }

        LOG.info("Executing hadoop job");

        // asSubclass performs a checked narrowing, so no unchecked-cast suppression is needed and a
        // wrong class name is rejected before anything is instantiated.
        final Class<? extends MongoTool> jobClass = Class.forName(className).asSubclass(MongoTool.class);
        final Configuration conf = new Configuration();
        final MongoTool app = jobClass.getConstructor(Configuration.class).newInstance(conf);

        ToolRunner.run(conf, app, args.toArray(new String[args.size()]));
    }

    /**
     * Renders every job setting as a {@code -Dkey=value} argument.
     */
    private List<String> buildDefines() {
        final List<String> defines = new ArrayList<String>();
        for (Pair<String, String> entry : processSettings()) {
            defines.add(format("-D%s=%s", entry.getKey(), entry.getValue()));
        }
        return defines;
    }

    /**
     * Formats the command for logging: one argument per line, continuation lines indented with a
     * tab, and a trailing backslash on every line but the last.
     */
    private static String describeCommand(final List<String> cmd) {
        final StringBuilder output = new StringBuilder();
        final Iterator<String> iterator = cmd.iterator();
        while (iterator.hasNext()) {
            final String s = iterator.next();
            output.append(output.length() == 0 ? "\n" : "\t");
            output.append(s);
            if (iterator.hasNext()) {
                output.append(" \\");
            }
            output.append("\n");
        }
        return output.toString();
    }

    /**
     * Collects all settings as ordered key/value pairs: explicit params, input URI list
     * (comma-joined), output URI, input format, output format and output committer. When no
     * input/output format was configured, the Mongo format matching the Hadoop API version
     * (as reported by {@code isHadoopV1()}) is used.
     */
    private List<Pair<String, String>> processSettings() {
        final List<Pair<String, String>> entries = new ArrayList<Pair<String, String>>();
        for (Entry<String, String> entry : params.entrySet()) {
            entries.add(new Pair<String, String>(entry.getKey(), entry.getValue()));
        }

        if (!inputUris.isEmpty()) {
            final StringBuilder joined = new StringBuilder();
            for (String uri : inputUris) {
                if (joined.length() != 0) {
                    joined.append(",");
                }
                joined.append(uri);
            }
            entries.add(new Pair<String, String>(INPUT_URI, joined.toString()));
        }

        if (outputUri != null) {
            // Use the same accessor as for the input URIs instead of relying on toString().
            entries.add(new Pair<String, String>(OUTPUT_URI, outputUri.getURI()));
        }

        if (inputFormat != null) {
            entries.add(new Pair<String, String>(JOB_INPUT_FORMAT, inputFormat.getName()));
        } else if (mapredInputFormat != null) {
            entries.add(new Pair<String, String>(JOB_INPUT_FORMAT, mapredInputFormat.getName()));
        } else {
            final String name = isHadoopV1()
                                ? com.mongodb.hadoop.mapred.MongoInputFormat.class.getName()
                                : MongoInputFormat.class.getName();
            entries.add(new Pair<String, String>(JOB_INPUT_FORMAT, name));
            // SLF4J substitutes "{}" placeholders; printf-style "%s" would be logged verbatim.
            LOG.info("No input format defined. Defaulting to '{}'", name);
        }

        if (outputFormat != null) {
            LOG.info("Adding output format '{}'", outputFormat.getName());
            entries.add(new Pair<String, String>(JOB_OUTPUT_FORMAT, outputFormat.getName()));
        } else if (mapredOutputFormat != null) {
            LOG.info("Adding output format '{}'", mapredOutputFormat.getName());
            entries.add(new Pair<String, String>(JOB_OUTPUT_FORMAT, mapredOutputFormat.getName()));
        } else {
            final String name = isHadoopV1()
                                ? com.mongodb.hadoop.mapred.MongoOutputFormat.class.getName()
                                : MongoOutputFormat.class.getName();
            entries.add(new Pair<String, String>(JOB_OUTPUT_FORMAT, name));
            LOG.info("No output format defined. Defaulting to '{}'", name);
        }

        if (outputCommitter != null) {
            entries.add(
                new Pair<String, String>(
                    "mapred.output.committer.class", outputCommitter.getName()));
        }
        return entries;
    }

    /**
     * Copies the MongoDB Java driver jar(s) found on the class path and the first core jar accepted
     * by {@link HadoopVersionFilter} under {@code core/build/libs} into the Hadoop library
     * directory, under the fixed names {@code mongo-java-driver.jar} and
     * {@code mongo-hadoop-core.jar}.
     *
     * @throws IllegalStateException if no matching core jar exists
     * @throws RuntimeException      wrapping I/O or URI conversion failures
     */
    private void copyJars() {
        // Plain concatenation: the path must not be run through String.format, where a '%' in
        // HADOOP_HOME would be interpreted as a format specifier.
        final String hadoopLib = HADOOP_HOME + (isHadoopV1() ? "/lib" : "/share/hadoop/common");
        try {
            for (File driverJar : findDriverJars()) {
                FileUtils.copyFile(driverJar, new File(hadoopLib, "mongo-java-driver.jar"));
            }

            final File libsDir = new File(PROJECT_HOME, "core/build/libs");
            // listFiles returns null when the directory is missing or unreadable.
            final File[] coreJars = libsDir.listFiles(new HadoopVersionFilter());
            if (coreJars == null || coreJars.length == 0) {
                throw new IllegalStateException(
                    format("No mongo-hadoop core jar found in '%s'; has the core module been built?", libsDir));
            }
            FileUtils.copyFile(coreJars[0], new File(hadoopLib, "mongo-hadoop-core.jar"));
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Locates class path entries whose path contains {@code mongo-java-driver}. The class loader's
     * URLs are used when it is a {@link URLClassLoader}; otherwise the {@code java.class.path}
     * system property is scanned, since the class loader cannot be cast in that case.
     */
    private List<File> findDriverJars() throws URISyntaxException {
        final List<File> jars = new ArrayList<File>();
        final ClassLoader classLoader = getClass().getClassLoader();
        if (classLoader instanceof URLClassLoader) {
            for (URL url : ((URLClassLoader) classLoader).getURLs()) {
                if (url.getPath().contains(DRIVER_JAR_MARKER)) {
                    jars.add(new File(url.toURI()));
                }
            }
        } else {
            final String classPath = System.getProperty("java.class.path", "");
            final String separator = java.util.regex.Pattern.quote(File.pathSeparator);
            for (String entry : classPath.split(separator)) {
                if (entry.contains(DRIVER_JAR_MARKER)) {
                    jars.add(new File(entry));
                }
            }
        }
        return jars;
    }

    /**
     * Sets the {@code mapreduce} API input format; takes precedence over
     * {@link #mapredInputFormat(Class)} when both are set.
     *
     * @param inputFormat the input format class
     * @return this job, for chaining
     */
    public MapReduceJob inputFormat(final Class<? extends InputFormat> inputFormat) {
        this.inputFormat = inputFormat;
        return this;
    }

    /**
     * Sets the {@code mapreduce} API output format; takes precedence over
     * {@link #mapredOutputFormat(Class)} when both are set.
     *
     * @param outputFormat the output format class
     * @return this job, for chaining
     */
    public MapReduceJob outputFormat(final Class<? extends OutputFormat> outputFormat) {
        this.outputFormat = outputFormat;
        return this;
    }

    /**
     * Sets the {@code mapred} API input format, used only when no {@code mapreduce} input format
     * was configured.
     *
     * @param inputFormat the input format class
     * @return this job, for chaining
     */
    public MapReduceJob mapredInputFormat(final Class<? extends org.apache.hadoop.mapred.InputFormat> inputFormat) {
        this.mapredInputFormat = inputFormat;
        return this;
    }

    /**
     * Sets the {@code mapred} API output format, used only when no {@code mapreduce} output format
     * was configured.
     *
     * @param outputFormat the output format class
     * @return this job, for chaining
     */
    public MapReduceJob mapredOutputFormat(final Class<? extends org.apache.hadoop.mapred.OutputFormat> outputFormat) {
        this.mapredOutputFormat = outputFormat;
        return this;
    }

    /**
     * Immutable key/value holder used to keep settings in emission order.
     */
    private static final class Pair<T, U> {
        private final T key;
        private final U value;

        private Pair(final T key, final U value) {
            this.key = key;
            this.value = value;
        }

        public T getKey() {
            return key;
        }

        public U getValue() {
            return value;
        }

        @Override
        public String toString() {
            return String.format("Pair{key=%s, value=%s}", key, value);
        }
    }
}