package com.mongodb.hadoop;

import com.mongodb.MongoClientURI;
import com.mongodb.hadoop.mapred.MongoInputFormat;
import com.mongodb.hadoop.mapred.MongoOutputFormat;
import com.mongodb.hadoop.streaming.io.MongoIdentifierResolver;
import com.mongodb.hadoop.testutils.BaseHadoopTest;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.zeroturnaround.exec.ProcessExecutor;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.TimeoutException;

import static com.mongodb.hadoop.testutils.BaseHadoopTest.HADOOP_HOME;
import static com.mongodb.hadoop.testutils.BaseHadoopTest.HADOOP_VERSION;
import static com.mongodb.hadoop.util.MongoConfigUtil.INPUT_URI;
import static com.mongodb.hadoop.util.MongoConfigUtil.OUTPUT_URI;
import static java.lang.String.format;

/**
 * Builds and runs a Hadoop Streaming job against MongoDB: it locates the
 * mongo-hadoop streaming jar and the treasury-yield example mapper/reducer,
 * assembles the {@code hadoop jar} command line, and executes it.
 */
public class StreamingJob {
    private static final Log LOG = LogFactory.getLog(StreamingJob.class);

    private static final String STREAMING_JAR;
    private static final String STREAMING_MAPPER;
    private static final String STREAMING_REDUCER;
    private static final File STREAMING_HOME;

    static {
        try {
            // Walk up from the working directory until a "streaming" module directory
            // is found (or the filesystem root is reached).
            File current = new File(".").getCanonicalFile();
            File home = new File(current, "streaming");
            while (!home.exists() && current.getParentFile() != null && current.getParentFile().exists()) {
                current = current.getParentFile();
                home = new File(current, "streaming");
            }
            STREAMING_HOME = home;
            // Assumes the streaming module has been built for the current Hadoop version.
            STREAMING_JAR = new File(STREAMING_HOME, "build/libs")
                                .listFiles(new HadoopVersionFilter())[0].getAbsolutePath();
            STREAMING_MAPPER = new File(STREAMING_HOME, "examples/treasury/mapper.py").getAbsolutePath();
            STREAMING_REDUCER = new File(STREAMING_HOME, "examples/treasury/reducer.py").getAbsolutePath();
        } catch (final IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    private final List<String> cmd = new ArrayList<String>();
    private final List<MongoClientURI> inputUris = new ArrayList<MongoClientURI>();
    private final List<MongoClientURI> outputUris = new ArrayList<MongoClientURI>();
    private String inputFormat = MongoInputFormat.class.getName();
    private String inputPath = format("file://%s/in", System.getProperty("java.io.tmpdir"));
    private String outputFormat = MongoOutputFormat.class.getName();
    private String outputPath = format("file://%s/out", System.getProperty("java.io.tmpdir"));
    private Map<String, String> params;

    public StreamingJob() {
        cmd.add(HADOOP_HOME + "/bin/hadoop");
        cmd.add("jar");
        // The streaming jar moved between the Hadoop 1.x and 2.x directory layouts.
        if (HADOOP_VERSION.startsWith("1.1")) {
            cmd.add(format("%s/contrib/streaming/hadoop-streaming-%s.jar", HADOOP_HOME, HADOOP_VERSION));
        } else {
            cmd.add(format("%s/share/hadoop/tools/lib/hadoop-streaming-%s.jar", HADOOP_HOME, HADOOP_VERSION));
        }
        // add("-libjars", STREAMING_JAR);
        add("-io", "mongodb");
        add("-jobconf", "stream.io.identifier.resolver.class=" + MongoIdentifierResolver.class.getName());
        add("-mapper", STREAMING_MAPPER);
        add("-reducer", STREAMING_REDUCER);
    }

    public StreamingJob inputUris(final MongoClientURI... inputUris) {
        this.inputUris.addAll(Arrays.asList(inputUris));
        return this;
    }

    public StreamingJob outputUris(final MongoClientURI... outputUris) {
        this.outputUris.addAll(Arrays.asList(outputUris));
        return this;
    }

    public StreamingJob inputFormat(final String format) {
        inputFormat = format;
        return this;
    }

    public StreamingJob inputPath(final String path) {
        inputPath = path;
        return this;
    }

    public StreamingJob outputFormat(final String format) {
        outputFormat = format;
        return this;
    }

    public StreamingJob outputPath(final String path) {
        outputPath = path;
        return this;
    }

    public StreamingJob params(final Map<String, String> params) {
        this.params = params;
        return this;
    }

    public void execute() {
        try {
            copyJars();
            add("-input", inputPath);
            add("-output", outputPath);
            add("-inputformat", inputFormat);
            add("-outputformat", outputFormat);
            add("-jobconf", format("%s=%s", INPUT_URI, inputUris.get(0)));
            add("-jobconf", format("%s=%s", OUTPUT_URI, outputUris.get(0)));
            // params() is optional; guard against an NPE when it was never called.
            if (params != null) {
                for (final Entry<String, String> entry : params.entrySet()) {
                    add("-jobconf", entry.getKey() + "=" + entry.getValue());
                }
            }

            final Map<String, String> env = new TreeMap<String, String>(System.getenv());
            if (HADOOP_VERSION.startsWith("cdh")) {
                env.put("MAPRED_DIR", "share/hadoop/mapreduce2");
            }

            // Log the full command, one argument per line, with shell-style "\" continuations.
            LOG.info("Executing hadoop job:");
            final StringBuilder output = new StringBuilder();
            final Iterator<String> iterator = cmd.iterator();
            while (iterator.hasNext()) {
                final String s = iterator.next();
                if (output.length() != 0) {
                    output.append("\t");
                } else {
                    output.append("\n");
                }
                output.append(s);
                if (iterator.hasNext()) {
                    output.append(" \\");
                }
                output.append("\n");
            }
            LOG.info(output);

            new ProcessExecutor().command(cmd)
                                 .environment(env)
                                 .redirectError(System.out)
                                 .execute();
        } catch (final IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        } catch (final InterruptedException e) {
            throw new RuntimeException(e.getMessage(), e);
        } catch (final TimeoutException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    private void add(final String flag, final String value) {
        cmd.add(flag);
        cmd.add(value);
    }

    /**
     * Copies the MongoDB Java driver, the mongo-hadoop core and streaming jars, and the
     * Hadoop streaming jar into Hadoop's lib directory so the job can load them at runtime.
     */
    private void copyJars() {
        final String hadoopLib = HADOOP_VERSION.startsWith("1")
                                 ? HADOOP_HOME + "/lib"
                                 : HADOOP_HOME + "/share/hadoop/common";
        try {
            // Find the mongo-java-driver jar on the test classpath and copy it over.
            final URLClassLoader classLoader = (URLClassLoader) getClass().getClassLoader();
            for (final URL url : classLoader.getURLs()) {
                if (url.getPath().contains("mongo-java-driver")) {
                    final File file = new File(url.toURI());
                    FileUtils.copyFile(file, new File(hadoopLib, "mongo-java-driver.jar"));
                }
            }

            final File coreJar = new File(BaseHadoopTest.PROJECT_HOME, "core/build/libs")
                                     .listFiles(new HadoopVersionFilter())[0];
            FileUtils.copyFile(coreJar, new File(hadoopLib, "mongo-hadoop-core.jar"));

            final File mongoStreamingJar = new File(BaseHadoopTest.PROJECT_HOME, "streaming/build/libs")
                                               .listFiles(new HadoopVersionFilter())[0];
            FileUtils.copyFile(mongoStreamingJar, new File(hadoopLib, "mongo-hadoop-streaming.jar"));

            // The Hadoop streaming jar lives in a version-dependent directory relative to hadoopLib.
            final String streamingLibRoot = HADOOP_VERSION.startsWith("1")
                                            ? "/../contrib/streaming"
                                            : "/../tools/lib";
            final File hadoopStreamingJar = new File(hadoopLib + streamingLibRoot).listFiles(new FilenameFilter() {
                @Override
                public boolean accept(final File dir, final String name) {
                    return name.startsWith("hadoop-streaming-");
                }
            })[0];
            FileUtils.copyFile(hadoopStreamingJar, new File(hadoopLib, hadoopStreamingJar.getName()));
        } catch (final IOException e) {
            throw new RuntimeException(e);
        } catch (final URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }
}
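
// A minimal usage sketch, not part of the original class: it shows how a caller might
// drive the builder above. The host, database/collection names, and the extra -jobconf
// entry are illustrative assumptions, not values taken from this project.
final class StreamingJobUsageExample {
    public static void main(final String[] args) {
        // Hypothetical extra job parameter passed through as a -jobconf flag.
        final Map<String, String> params = new TreeMap<String, String>();
        params.put("mongo.input.query", "{}");

        new StreamingJob()
            .inputUris(new MongoClientURI("mongodb://localhost:27017/mongo_hadoop.yield.in"))
            .outputUris(new MongoClientURI("mongodb://localhost:27017/mongo_hadoop.yield.out"))
            .params(params)
            .execute();
    }
}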