/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.streaming;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.cli2.*;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.PropertyOption;
import org.apache.commons.cli2.resource.ResourceConstants;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.commons.cli2.validation.InvalidArgumentException;
import org.apache.commons.cli2.validation.Validator;
import org.apache.commons.logging.*;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorCombiner;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.filecache.*;
import org.apache.hadoop.util.*;
/** All the client-side work happens here.
* (Jar packaging, MapRed job submission and monitoring)
*/
public class StreamJob {
protected static final Log LOG = LogFactory.getLog(StreamJob.class.getName());
final static String REDUCE_NONE = "NONE";
// ----------- Streaming CLI Implementation -----------
private DefaultOptionBuilder builder =
new DefaultOptionBuilder("-","-", false);
private ArgumentBuilder argBuilder = new ArgumentBuilder();
private Parser parser = new Parser();
private Group allOptions;
HelpFormatter helpFormatter = new HelpFormatter(" ", " ", " ", 900);
// need these two at class level to extract values later from
// commons-cli command line
private MultiPropertyOption jobconf = new MultiPropertyOption(
"-jobconf", "(n=v) Optional. Add or override a JobConf property.", 'D');
private MultiPropertyOption cmdenv = new MultiPropertyOption(
"-cmdenv", "(n=v) Pass env.var to streaming commands.", 'E');
public StreamJob(String[] argv, boolean mayExit) {
setupOptions();
argv_ = argv;
mayExit_ = mayExit;
}
/**
* This is the method that actually
* initializes the job conf and submits the job
* to the JobTracker.
* @throws IOException
*/
public int go() throws IOException {
init();
preProcessArgs();
parseArgv();
postProcessArgs();
setJobConf();
return submitAndMonitorJob();
}
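/** Captures the process environment (e.g. HADOOP_HOME) for later lookups. */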
protected void init() {
try {
env_ = new Environment();
} catch (IOException io) {
throw new RuntimeException(io);
}
}
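/** Resets per-run state before the command line is parsed. */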
void preProcessArgs() {
verbose_ = false;
addTaskEnvironment_ = "";
}
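/** Checks required arguments, records the canonical paths of shipped files and unqualifies the streaming commands. */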
void postProcessArgs() throws IOException {
if (cluster_ == null) {
// hadoop-default.xml is standard, hadoop-local.xml is not.
cluster_ = "default";
}
hadoopAliasConf_ = "hadoop-" + getClusterNick() + ".xml";
if (inputSpecs_.size() == 0) {
fail("Required argument: -input <name>");
}
if (output_ == null) {
fail("Required argument: -output ");
}
msg("addTaskEnvironment=" + addTaskEnvironment_);
Iterator it = packageFiles_.iterator();
while (it.hasNext()) {
File f = new File((String) it.next());
if (f.isFile()) {
shippedCanonFiles_.add(f.getCanonicalPath());
}
}
msg("shippedCanonFiles_=" + shippedCanonFiles_);
// careful with class names..
mapCmd_ = unqualifyIfLocalPath(mapCmd_);
comCmd_ = unqualifyIfLocalPath(comCmd_);
redCmd_ = unqualifyIfLocalPath(redCmd_);
}
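/**
* If the command's executable is one of the shipped files, strip its path so that
* PipeMapRed resolves it in the task's working directory (where the job jar is unjarred).
*/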
String unqualifyIfLocalPath(String cmd) throws IOException {
if (cmd == null) {
//
} else {
String prog = cmd;
String args = "";
int s = cmd.indexOf(" ");
if (s != -1) {
prog = cmd.substring(0, s);
args = cmd.substring(s + 1);
}
String progCanon;
try {
progCanon = new File(prog).getCanonicalPath();
} catch (IOException io) {
progCanon = prog;
}
boolean shipped = shippedCanonFiles_.contains(progCanon);
msg("shipped: " + shipped + " " + progCanon);
if (shipped) {
// Change path to simple filename.
// That way when PipeMapRed calls Runtime.exec(),
// it will look for the executable in the Task's working dir.
// And this is where TaskRunner unjars our job jar.
prog = new File(prog).getName();
if (args.length() > 0) {
cmd = prog + " " + args;
} else {
cmd = prog;
}
}
}
msg("cmd=" + cmd);
return cmd;
}
String getHadoopAliasConfFile() {
return new File(getHadoopClientHome() + "/conf", hadoopAliasConf_).getAbsolutePath();
}
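/** Parses argv_ with the commons-cli2 parser and copies the recognized option values into the member fields. */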
void parseArgv(){
CommandLine cmdLine = null;
try{
cmdLine = parser.parse(argv_);
}catch(Exception oe){
LOG.error(oe.getMessage());
if (detailedUsage_) {
exitUsage(true);
} else {
exitUsage(false);
}
}
if (cmdLine != null){
verbose_ = cmdLine.hasOption("-verbose");
detailedUsage_ = cmdLine.hasOption("-info");
debug_ = cmdLine.hasOption("-debug")? debug_ + 1 : debug_;
inputSpecs_.addAll(cmdLine.getValues("-input"));
output_ = (String) cmdLine.getValue("-output");
mapCmd_ = (String)cmdLine.getValue("-mapper");
comCmd_ = (String)cmdLine.getValue("-combiner");
redCmd_ = (String)cmdLine.getValue("-reducer");
postMapCmd_ = (String)cmdLine.getValue("-postmapper");
postRedCmd_ = (String)cmdLine.getValue("-postreducer");
preRedCmd_ = (String)cmdLine.getValue("-prereducer");
jobName_ = (String)cmdLine.getValue("-jobname");
packageFiles_.addAll(cmdLine.getValues("-file"));
cluster_ = (String)cmdLine.getValue("-cluster");
configPath_.addAll(cmdLine.getValues("-config"));
String fsName = (String)cmdLine.getValue("-dfs");
if (null != fsName){
userJobConfProps_.put("fs.default.name", fsName);
}
String jt = (String)cmdLine.getValue("mapred.job.tracker");
if (null != jt){
userJobConfProps_.put("fs.default.name", jt);
}
additionalConfSpec_ = (String)cmdLine.getValue("-additionalconfspec");
inputFormatSpec_ = (String)cmdLine.getValue("-inputformat");
outputFormatSpec_ = (String)cmdLine.getValue("-outputformat");
numReduceTasksSpec_ = (String)cmdLine.getValue("-numReduceTasks");
partitionerSpec_ = (String)cmdLine.getValue("-partitioner");
inReaderSpec_ = (String)cmdLine.getValue("-inputreader");
List<String> car = cmdLine.getValues("-cacheArchive");
if (null != car){
for(String s : car){
cacheArchives = (cacheArchives == null)?s :cacheArchives + "," + s;
}
}
List<String> caf = cmdLine.getValues("-cacheFile");
if (null != caf){
for(String s : caf){
cacheFiles = (cacheFiles == null)?s :cacheFiles + "," + s;
}
}
List<String> jobConfArgs = (List<String>)cmdLine.getValue(jobconf);
List<String> envArgs = (List<String>)cmdLine.getValue(cmdenv);
if (null != jobConfArgs){
for(String s : jobConfArgs){
String []parts = s.split("=", 2);
userJobConfProps_.put(parts[0], parts[1]);
}
}
if (null != envArgs){
for(String s : envArgs){
if (addTaskEnvironment_.length() > 0) {
addTaskEnvironment_ += " ";
}
addTaskEnvironment_ += s;
}
}
}else if (detailedUsage_) {
exitUsage(true);
}
}
protected void msg(String msg) {
if (verbose_) {
System.out.println("STREAM: " + msg);
}
}
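/** Builds an option that takes between one and max string arguments. */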
private Option createOption(String name, String desc,
String argName, int max, boolean required){
Argument argument = argBuilder.
withName(argName).
withMinimum(1).
withMaximum(max).
create();
return builder.
withLongName(name).
withArgument(argument).
withDescription(desc).
withRequired(required).
create();
}
private Option createOption(String name, String desc,
String argName, int max, boolean required, Validator validator){
Argument argument = argBuilder.
withName(argName).
withMinimum(1).
withMaximum(max).
withValidator(validator).
create();
return builder.
withLongName(name).
withArgument(argument).
withDescription(desc).
withRequired(required).
create();
}
private Option createBoolOption(String name, String desc){
return builder.withLongName(name).withDescription(desc).create();
}
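/** Declares all streaming command-line options and registers them with the parser. */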
private void setupOptions(){
final Validator fileValidator = new Validator(){
public void validate(final List values) throws InvalidArgumentException {
// Note: this code doesn't really belong here; it should be changed to
// a File.canExecute() check once Java 6 is available
for (String file : (List<String>)values) {
File f = new File(file);
if (!f.exists()) {
throw new InvalidArgumentException("Argument : " +
f.getAbsolutePath() + " doesn't exist.");
}
if (!f.isFile()) {
throw new InvalidArgumentException("Argument : " +
f.getAbsolutePath() + " is not a file.");
}
if (!f.canRead()) {
throw new InvalidArgumentException("Argument : " +
f.getAbsolutePath() + " is not accessible");
}
}
}
};
// Note: we do not extend CLI2's FileValidator because it replaces
// the String argument with a File and causes a ClassCastException
// in the inheritance tree.
final Validator execValidator = new Validator(){
public void validate(final List values) throws InvalidArgumentException {
// Note: this code doesn't really belong here; it should be changed to
// a File.canExecute() check once Java 6 is available
for (String file : (List<String>)values) {
try{
Runtime.getRuntime().exec("chmod 0777 " + (new File(file)).getAbsolutePath());
}catch(IOException ioe){
// ignore
}
}
fileValidator.validate(values);
}
};
Option input = createOption("input",
"DFS input file(s) for the Map step",
"path",
Integer.MAX_VALUE,
true);
Option output = createOption("output",
"DFS output directory for the Reduce step",
"path", 1, true);
Option mapper = createOption("mapper",
"The streaming command to run", "cmd", 1, false);
Option combiner = createOption("combiner",
"The streaming command to run", "cmd", 1, false);
// reducer could be NONE
Option reducer = createOption("reducer",
"The streaming command to run", "cmd", 1, false);
Option file = createOption("file",
"File/dir to be shipped in the Job jar file",
"file", Integer.MAX_VALUE, false, execValidator);
Option dfs = createOption("dfs",
"Optional. Override DFS configuration", "<h:p>|local", 1, false);
Option jt = createOption("jt",
"Optional. Override JobTracker configuration", "<h:p>|local", 1, false);
Option additionalconfspec = createOption("additionalconfspec",
"Optional.", "spec", 1, false);
Option inputformat = createOption("inputformat",
"Optional.", "spec", 1, false);
Option outputformat = createOption("outputformat",
"Optional.", "spec", 1, false);
Option partitioner = createOption("partitioner",
"Optional.", "spec", 1, false);
Option numReduceTasks = createOption("numReduceTasks",
"Optional.", "spec",1, false );
Option inputreader = createOption("inputreader",
"Optional.", "spec", 1, false);
Option cacheFile = createOption("cacheFile",
"File name URI", "fileNameURI", Integer.MAX_VALUE, false);
Option cacheArchive = createOption("cacheArchive",
"File name URI", "fileNameURI", 1, false);
Option jobname = createOption("jobname",
"Optional", "spec", 1, false);
Option postmapper = createOption("postmapper",
"Optional", "spec", 1, false);
Option postreducer = createOption("postreducer",
"Optional", "spec", 1, false);
Option prereducer = createOption("prereducer",
"Optional", "spec", 1, false);
// boolean properties
Option verbose = createBoolOption("verbose", "print verbose output");
Option info = createBoolOption("info", "print verbose output");
Option help = createBoolOption("help", "print this help message");
Option debug = createBoolOption("debug", "print debug output");
Option inputtagged = createBoolOption("inputtagged", "inputtagged");
allOptions = new GroupBuilder().
withOption(input).
withOption(output).
withOption(mapper).
withOption(postmapper).
withOption(postreducer).
withOption(prereducer).
withOption(combiner).
withOption(reducer).
withOption(file).
withOption(dfs).
withOption(jt).
withOption(additionalconfspec).
withOption(inputformat).
withOption(outputformat).
withOption(partitioner).
withOption(numReduceTasks).
withOption(inputreader).
withOption(jobconf).
withOption(cmdenv).
withOption(jobname).
withOption(cacheFile).
withOption(cacheArchive).
withOption(verbose).
withOption(info).
withOption(debug).
withOption(inputtagged).
withOption(help).
create();
parser.setGroup(allOptions);
}
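/** Prints the brief usage text, or the detailed help when requested, and then fails. */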
public void exitUsage(boolean detailed) {
// 1 2 3 4 5 6 7
//1234567890123456789012345678901234567890123456789012345678901234567890123456789
if (!detailed) {
System.out.println("Usage: $HADOOP_HOME/bin/hadoop [--config dir] jar \\");
System.out.println(" $HADOOP_HOME/hadoop-streaming.jar [options]");
System.out.println("Options:");
System.out.println(" -input <path> DFS input file(s) for the Map step");
System.out.println(" -output <path> DFS output directory for the Reduce step");
System.out.println(" -mapper <cmd|JavaClassName> The streaming command to run");
System.out.println(" -postmapper <JavaClassName> Map Class to post process streaming mapper");
System.out.println(" -postreducer <JavaClassName> Map Class to post process streaming reducer");
System.out.println(" -prereducer <JavaClassName> Reduce Class to process reduce input before streaming");
System.out.println(" -combiner <JavaClassName> Combiner has to be a Java class");
System.out.println(" -reducer <cmd|JavaClassName> The streaming command to run");
System.out.println(" -file <file> File/dir to be shipped in the Job jar file");
System.out.println(" -dfs <h:p>|local Optional. Override DFS configuration");
System.out.println(" -jt <h:p>|local Optional. Override JobTracker configuration");
System.out.println(" -additionalconfspec specfile Optional.");
System.out.println(" -inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName Optional.");
System.out.println(" -outputformat TextOutputFormat(default)|JavaClassName Optional.");
System.out.println(" -partitioner JavaClassName Optional.");
System.out.println(" -numReduceTasks <num> Optional.");
System.out.println(" -inputreader <spec> Optional.");
System.out.println(" -jobconf <n>=<v> Optional. Add or override a JobConf property");
System.out.println(" -cmdenv <n>=<v> Optional. Pass env.var to streaming commands");
System.out.println(" -jobname <name> Optional. Set the name of the job");
System.out.println(" -cacheFile fileNameURI");
System.out.println(" -cacheArchive fileNameURI");
System.out.println(" -verbose");
System.out.println();
System.out.println("For more details about these options:");
System.out.println("Use $HADOOP_HOME/bin/hadoop jar build/hadoop-streaming.jar -info");
fail("");
}
System.out.println("In -input: globbing on <path> is supported and can have multiple -input");
System.out.println("Default Map input format: a line is a record in UTF-8");
System.out.println(" the key part ends at first TAB, the rest of the line is the value");
System.out.println("Custom input format: -inputformat package.MyInputFormat ");
System.out.println("Map output format, reduce input/output format:");
System.out.println(" Format defined by what the mapper command outputs. Line-oriented");
System.out.println();
System.out.println("The files or directories named in the -file argument[s] end up in the");
System.out.println(" working directory when the mapper and reducer are run.");
System.out.println(" The location of this working directory is unspecified.");
System.out.println();
System.out.println("To set the number of reduce tasks (num. of output files):");
System.out.println(" -jobconf mapred.reduce.tasks=10");
System.out.println("To skip the sort/combine/shuffle/sort/reduce step:");
System.out.println(" Use -numReduceTasks 0");
System.out.println(" A Task's Map output then becomes a 'side-effect output' rather than a reduce input");
System.out.println(" This speeds up processing. This also feels more like \"in-place\" processing");
System.out.println(" because the input filename and the map input order are preserved");
System.out.println(" This is equivalent to -reducer NONE");
System.out.println();
System.out.println("To speed up the last reduces:");
System.out.println(" -jobconf mapred.speculative.execution=true");
System.out.println("To name the job (appears in the JobTracker Web UI):");
System.out.println(" -jobconf mapred.job.name='My Job' ");
System.out.println("To change the local temp directory:");
System.out.println(" -jobconf dfs.data.dir=/tmp/dfs");
System.out.println(" -jobconf stream.tmpdir=/tmp/streaming");
System.out.println("Additional local temp directories with -cluster local:");
System.out.println(" -jobconf mapred.local.dir=/tmp/local");
System.out.println(" -jobconf mapred.system.dir=/tmp/system");
System.out.println(" -jobconf mapred.temp.dir=/tmp/temp");
System.out.println("Use a custom hadoopStreaming build along a standard hadoop install:");
System.out.println(" $HADOOP_HOME/bin/hadoop jar /path/my-hadoop-streaming.jar [...]\\");
System.out
.println(" [...] -jobconf stream.shipped.hadoopstreaming=/path/my-hadoop-streaming.jar");
System.out.println("For more details about jobconf parameters see:");
System.out.println(" http://wiki.apache.org/lucene-hadoop/JobConfFile");
System.out.println("To set an environement variable in a streaming command:");
System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
System.out.println();
System.out.println("Shortcut:");
System.out
.println(" setenv HSTREAMING \"$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar\"");
System.out.println();
System.out.println("Example: $HSTREAMING -mapper \"/usr/local/bin/perl5 filter.pl\"");
System.out.println(" -file /local/filter.pl -input \"/logs/0604*/*\" [...]");
System.out.println(" Ships a script, invokes the non-shipped perl interpreter");
System.out.println(" Shipped files go to the working directory so filter.pl is found by perl");
System.out.println(" Input files are all the daily logs for days in month 2006-04");
fail("");
}
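/** Reports a fatal usage error; always throws, either a RuntimeException (mayExit_) or an IllegalArgumentException. */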
public void fail(String message) {
if (mayExit_) {
System.err.println(message);
throw new RuntimeException(message);
} else {
throw new IllegalArgumentException(message);
}
}
// --------------------------------------------
protected String getHadoopClientHome() {
String h = env_.getProperty("HADOOP_HOME"); // standard Hadoop
if (h == null) {
//fail("Missing required environment variable: HADOOP_HOME");
h = "UNDEF";
}
return h;
}
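/** @return true when the job runs against the local job tracker rather than a real cluster. */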
protected boolean isLocalHadoop() {
boolean local;
if (jobConf_ == null) {
local = getClusterNick().equals("local");
} else {
local = StreamUtil.isLocalJobTracker(jobConf_);
}
return local;
}
protected String getClusterNick() {
return cluster_;
}
/** @return path to the created Jar file or null if no files are necessary.
*/
protected String packageJobJar() throws IOException {
ArrayList unjarFiles = new ArrayList();
// Runtime code: ship same version of code as self (job submitter code)
// usually found in: build/contrib or build/hadoop-<version>-dev-streaming.jar
// First try an explicit spec: it's too hard to find our own location in this case:
// $HADOOP_HOME/bin/hadoop jar /not/first/on/classpath/custom-hadoop-streaming.jar
// where findInClasspath() would find the version of hadoop-streaming.jar in $HADOOP_HOME
String runtimeClasses = userJobConfProps_.get("stream.shipped.hadoopstreaming"); // jar or class dir
System.out.println(runtimeClasses + "=@@@userJobConfProps_.get(stream.shipped.hadoopstreaming");
if (runtimeClasses == null) {
runtimeClasses = StreamUtil.findInClasspath(StreamJob.class.getName());
}
if (runtimeClasses == null) {
throw new IOException("runtime classes not found: " + getClass().getPackage());
} else {
msg("Found runtime classes in: " + runtimeClasses);
}
if (isLocalHadoop()) {
// don't package class files (they might get unpackaged in "." and then
// hide the intended CLASSPATH entry)
// we still package everything else (so that scripts and executable are found in
// Task workdir like distributed Hadoop)
} else {
if (new File(runtimeClasses).isDirectory()) {
packageFiles_.add(runtimeClasses);
} else {
unjarFiles.add(runtimeClasses);
}
}
if (packageFiles_.size() + unjarFiles.size() == 0) {
return null;
}
String tmp = jobConf_.get("stream.tmpdir"); //, "/tmp/${user.name}/"
File tmpDir = (tmp == null) ? null : new File(tmp);
// tmpDir=null means OS default tmp dir
File jobJar = File.createTempFile("hive-streamjob", ".jar", tmpDir);
System.out.println("packageJobJar: " + packageFiles_ + " " + unjarFiles + " " + jobJar
+ " tmpDir=" + tmpDir);
if (debug_ == 0) {
jobJar.deleteOnExit();
}
JarBuilder builder = new JarBuilder();
if (verbose_) {
builder.setVerbose(true);
}
String jobJarName = jobJar.getAbsolutePath();
builder.merge(packageFiles_, unjarFiles, jobJarName);
return jobJarName;
}
/**
* Copies the user-supplied -jobconf key=value properties into the job
* configuration.
* @param doEarlyProps when true, only set the properties that must be applied
* early (fs.default.name, stream.shipped.hadoopstreaming); when false, set all
* the others
*/
protected void setUserJobConfProps(boolean doEarlyProps) {
Iterator it = userJobConfProps_.keySet().iterator();
while (it.hasNext()) {
String key = (String) it.next();
String val = (String)userJobConfProps_.get(key);
boolean earlyName = key.equals("fs.default.name");
earlyName |= key.equals("stream.shipped.hadoopstreaming");
if (doEarlyProps == earlyName) {
msg("xxxJobConf: set(" + key + ", " + val + ") early=" + doEarlyProps);
jobConf_.set(key, val);
}
}
}
/**
* Converts the comma-separated cache archive and cache file specs into URI arrays.
*/
protected void getURIs(String lcacheArchives, String lcacheFiles) {
String archives[] = StringUtils.getStrings(lcacheArchives);
String files[] = StringUtils.getStrings(lcacheFiles);
fileURIs = StringUtils.stringToURI(files);
archiveURIs = StringUtils.stringToURI(archives);
}
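/**
* Builds the JobConf from the parsed arguments: loads extra configuration resources,
* sets the input/output paths and formats, wires the mapper/combiner/reducer
* (streaming command or Java class), packages the job jar and registers any
* DistributedCache files and archives.
*/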
protected void setJobConf() throws IOException {
msg("hadoopAliasConf_ = " + hadoopAliasConf_);
config_ = new Configuration();
if (!cluster_.equals("default")) {
config_.addResource(new Path(getHadoopAliasConfFile()));
} else {
// use only defaults: hadoop-default.xml and hadoop-site.xml
}
System.out.println("additionalConfSpec_:" + additionalConfSpec_);
if (additionalConfSpec_ != null) {
config_.addResource(new Path(additionalConfSpec_));
}
Iterator it = configPath_.iterator();
while (it.hasNext()) {
String pathName = (String) it.next();
config_.addResource(new Path(pathName));
}
// general MapRed job properties
jobConf_ = new JobConf(config_);
// All streaming jobs get the task timeout value
// from the configuration settings.
setUserJobConfProps(true);
// The correct FS must be set before this is called!
// (to resolve local vs. dfs drive letter differences)
// (mapred.working.dir will be lazily initialized ONCE and depends on FS)
for (int i = 0; i < inputSpecs_.size(); i++) {
FileInputFormat.addInputPaths(jobConf_,
(String) inputSpecs_.get(i));
}
jobConf_.set("stream.numinputspecs", "" + inputSpecs_.size());
String defaultPackage = this.getClass().getPackage().getName();
Class c;
Class fmt = null;
if (inReaderSpec_ == null && inputFormatSpec_ == null) {
fmt = TextInputFormat.class;
} else if (inputFormatSpec_ != null) {
if (inputFormatSpec_.equals(TextInputFormat.class.getName())
|| inputFormatSpec_.equals(TextInputFormat.class.getCanonicalName())) {
fmt = TextInputFormat.class;
} else if (inputFormatSpec_.equals(KeyValueTextInputFormat.class
.getName())
|| inputFormatSpec_.equals(KeyValueTextInputFormat.class
.getCanonicalName())) {
fmt = KeyValueTextInputFormat.class;
} else if (inputFormatSpec_.equals(SequenceFileInputFormat.class
.getName())
|| inputFormatSpec_
.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class
.getCanonicalName())) {
fmt = SequenceFileInputFormat.class;
} else if (inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class
.getName())
|| inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class
.getCanonicalName())) {
fmt = SequenceFileAsTextInputFormat.class;
} else {
c = StreamUtil.goodClassOrNull(inputFormatSpec_, defaultPackage);
if (c != null) {
fmt = c;
} else {
fail("-inputformat: class not found: " + inputFormatSpec_);
}
}
}
if (fmt == null) {
fmt = StreamInputFormat.class;
}
jobConf_.setInputFormat(fmt);
jobConf_.setOutputKeyClass(Text.class);
jobConf_.setOutputValueClass(Text.class);
if (jobName_ != null) {
jobConf_.setJobName(jobName_);
}
jobConf_.set("stream.addenvironment", addTaskEnvironment_);
if (mapCmd_ != null) {
c = StreamUtil.goodClassOrNull(mapCmd_, defaultPackage);
if (c != null) {
if(postMapCmd_ != null) {
fail("-postmapper cannot be combined with a Java map class");
}
jobConf_.setMapperClass(c);
} else {
jobConf_.setMapperClass(PipeMapper.class);
jobConf_.set("stream.map.streamprocessor",
URLEncoder.encode(mapCmd_, "UTF-8"));
}
}
if (postMapCmd_ != null) {
c = StreamUtil.goodClassOrNull(postMapCmd_, defaultPackage);
if (c != null) {
jobConf_.setClass("stream.map.posthook", c, Mapper.class);
} else {
fail("postmapper: "+postMapCmd_+" is not a valid Java Class");
}
}
if (comCmd_ != null) {
c = StreamUtil.goodClassOrNull(comCmd_, defaultPackage);
if (c != null) {
jobConf_.setCombinerClass(c);
}
}
boolean reducerNone_ = false;
if (redCmd_ != null) {
reducerNone_ = redCmd_.equals(REDUCE_NONE);
if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
if(postRedCmd_ != null) {
fail("-postreducer cannot be combined with a Java reduce class");
}
if(preRedCmd_ != null) {
fail("-prereducer cannot be combined with a Java reduce class");
}
jobConf_.setReducerClass(ValueAggregatorReducer.class);
jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
} else {
c = StreamUtil.goodClassOrNull(redCmd_, defaultPackage);
if (c != null) {
if(postRedCmd_ != null) {
fail("-postreducer cannot be combined with a Java reduce class "+c.getName());
}
if(preRedCmd_ != null) {
fail("-prereducer cannot be combined with a Java reduce class "+c.getName());
}
jobConf_.setReducerClass(c);
} else {
jobConf_.setReducerClass(PipeReducer.class);
jobConf_.set("stream.reduce.streamprocessor", URLEncoder.encode(
redCmd_, "UTF-8"));
}
}
}
if (postRedCmd_ != null) {
c = StreamUtil.goodClassOrNull(postRedCmd_, defaultPackage);
if (c != null) {
jobConf_.setClass("stream.reduce.posthook", c, Mapper.class);
} else {
fail("postreducer: "+postRedCmd_+" is not a valid Java Class");
}
}
if (preRedCmd_ != null) {
c = StreamUtil.goodClassOrNull(preRedCmd_, defaultPackage);
if (c != null) {
jobConf_.setClass("stream.reduce.prehook", c, Reducer.class);
} else {
fail("postreducer: "+preRedCmd_+" is not a valid Java Class");
}
}
if (inReaderSpec_ != null) {
String[] args = inReaderSpec_.split(",");
String readerClass = args[0];
// this argument can only be a Java class
c = StreamUtil.goodClassOrNull(readerClass, defaultPackage);
if (c != null) {
jobConf_.set("stream.recordreader.class", c.getName());
} else {
fail("-inputreader: class not found: " + readerClass);
}
for (int i = 1; i < args.length; i++) {
String[] nv = args[i].split("=", 2);
String k = "stream.recordreader." + nv[0];
String v = (nv.length > 1) ? nv[1] : "";
jobConf_.set(k, v);
}
}
setUserJobConfProps(false);
FileOutputFormat.setOutputPath(jobConf_,new Path(output_));
fmt = null;
if (outputFormatSpec_!= null) {
c = StreamUtil.goodClassOrNull(outputFormatSpec_, defaultPackage);
if (c != null) {
fmt = c;
}
}
if (fmt == null) {
fmt = TextOutputFormat.class;
}
jobConf_.setOutputFormat(fmt);
if (partitionerSpec_!= null) {
c = StreamUtil.goodClassOrNull(partitionerSpec_, defaultPackage);
if (c != null) {
jobConf_.setPartitionerClass(c);
}
}
if (numReduceTasksSpec_!= null) {
int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
jobConf_.setNumReduceTasks(numReduceTasks);
}
if (reducerNone_) {
jobConf_.setNumReduceTasks(0);
}
// last, allow user to override anything
// (although typically used with properties we didn't touch)
jar_ = packageJobJar();
if (jar_ != null) {
jobConf_.setJar(jar_);
}
if ((cacheArchives != null) || (cacheFiles != null)){
getURIs(cacheArchives, cacheFiles);
boolean b = DistributedCache.checkURIs(fileURIs, archiveURIs);
if (!b)
fail(LINK_URI);
}
DistributedCache.createSymlink(jobConf_);
// set the jobconf for the caching parameters
if (cacheArchives != null)
DistributedCache.setCacheArchives(archiveURIs, jobConf_);
if (cacheFiles != null)
DistributedCache.setCacheFiles(fileURIs, jobConf_);
if (verbose_) {
listJobConfProperties();
}
msg("submitting to jobconf: " + getJobTrackerHostPort());
}
/**
* Prints out the jobconf properties on stdout
* when verbose is specified.
*/
protected void listJobConfProperties()
{
msg("==== JobConf properties:");
Iterator it = jobConf_.iterator();
TreeMap sorted = new TreeMap();
while(it.hasNext()) {
Map.Entry en = (Map.Entry)it.next();
sorted.put(en.getKey(), en.getValue());
}
it = sorted.entrySet().iterator();
while(it.hasNext()) {
Map.Entry en = (Map.Entry)it.next();
msg(en.getKey() + "=" + en.getValue());
}
msg("====");
}
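/** @return the JobTracker host:port, or "local" for the local runner. */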
protected String getJobTrackerHostPort() {
return jobConf_.get("mapred.job.tracker");
}
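/** Logs how the submitted job can be tracked and killed. */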
protected void jobInfo() {
if (isLocalHadoop()) {
LOG.info("Job running in-process (local Hadoop)");
} else {
String hp = getJobTrackerHostPort();
LOG.info("To kill this job, run:");
LOG.info(getHadoopClientHome() + "/bin/hadoop job -Dmapred.job.tracker=" + hp + " -kill "
+ jobId_);
//LOG.info("Job file: " + running_.getJobFile());
LOG.info("Tracking URL: " + StreamUtil.qualifyHost(running_.getTrackingURL()));
}
}
// Based on JobClient
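// Submits the job and polls its progress once a second until completion;
// returns 0 on success and a distinct non-zero code for each failure mode.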
public int submitAndMonitorJob() throws IOException {
if (jar_ != null && isLocalHadoop()) {
// getAbs became required when shell and subvm have different working dirs...
File wd = new File(".").getAbsoluteFile();
StreamUtil.unJar(new File(jar_), wd);
}
// if jobConf_ changes must recreate a JobClient
jc_ = new JobClient(jobConf_);
boolean error = true;
running_ = null;
String lastReport = null;
try {
running_ = jc_.submitJob(jobConf_);
jobId_ = running_.getJobID();
LOG.info("getLocalDirs(): " + Arrays.asList(jobConf_.getLocalDirs()));
LOG.info("Running job: " + jobId_);
jobInfo();
while (!running_.isComplete()) {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
}
running_ = jc_.getJob(jobId_);
String report = null;
report = " map " + Math.round(running_.mapProgress() * 100) + "% reduce "
+ Math.round(running_.reduceProgress() * 100) + "%";
if (!report.equals(lastReport)) {
LOG.info(report);
lastReport = report;
}
}
if (!running_.isSuccessful()) {
jobInfo();
LOG.error("Job not Successful!");
return 1;
}
LOG.info("Job complete: " + jobId_);
LOG.info("Output: " + output_);
error = false;
} catch(FileNotFoundException fe) {
LOG.error("Error launching job , bad input path : " + fe.getMessage());
return 2;
} catch(InvalidJobConfException je) {
LOG.error("Error launching job , Invalid job conf : " + je.getMessage());
return 3;
} catch(FileAlreadyExistsException fae) {
LOG.error("Error launching job , Output path already exists : "
+ fae.getMessage());
return 4;
} catch(IOException ioe) {
LOG.error("Error Launching job : " + ioe.getMessage());
return 5;
} finally {
if (error && (running_ != null)) {
LOG.info("killJob...");
running_.killJob();
}
jc_.close();
}
return 0;
}
/** Supports -jobconf x=y x1=y1 style options (multiple key=value pairs after one flag). */
class MultiPropertyOption extends PropertyOption{
private String optionString;
MultiPropertyOption(){
super();
}
MultiPropertyOption(final String optionString,
final String description,
final int id){
super(optionString, description, id);
this.optionString = optionString;
}
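// An argument belongs to this option if it starts with the option string (e.g. "-jobconf").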
public boolean canProcess(final WriteableCommandLine commandLine,
final String argument) {
boolean ret = (argument != null) && argument.startsWith(optionString);
return ret;
}
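// Consumes every following token up to the next "-" option and appends them to this option's value list.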
public void process(final WriteableCommandLine commandLine,
final ListIterator arguments) throws OptionException {
final String arg = (String) arguments.next();
if (!canProcess(commandLine, arg)) {
throw new OptionException(this,
ResourceConstants.UNEXPECTED_TOKEN, arg);
}
ArrayList properties = new ArrayList();
String next = "";
while(arguments.hasNext()){
next = (String) arguments.next();
if (!next.startsWith("-")){
properties.add(next);
}else{
arguments.previous();
break;
}
}
// add to any existing values (support specifying args multiple times)
List<String> oldVal = (List<String>)commandLine.getValue(this);
if (oldVal == null){
commandLine.addValue(this, properties);
}else{
oldVal.addAll(properties);
}
}
}
protected boolean mayExit_;
protected String[] argv_;
protected boolean verbose_;
protected boolean detailedUsage_;
protected int debug_;
protected Environment env_;
protected String jar_;
protected boolean localHadoop_;
protected Configuration config_;
protected JobConf jobConf_;
protected JobClient jc_;
// command-line arguments
protected ArrayList inputSpecs_ = new ArrayList(); // <String>
protected TreeSet seenPrimary_ = new TreeSet(); // <String>
protected boolean hasSimpleInputSpecs_;
protected ArrayList packageFiles_ = new ArrayList(); // <String>
protected ArrayList shippedCanonFiles_ = new ArrayList(); // <String>
protected TreeMap<String, String> userJobConfProps_ = new TreeMap<String, String>();
protected String output_;
protected String mapCmd_;
protected String comCmd_;
protected String redCmd_;
protected String jobName_;
protected String cluster_;
protected String cacheFiles;
protected String cacheArchives;
protected URI[] fileURIs;
protected URI[] archiveURIs;
protected ArrayList configPath_ = new ArrayList(); // <String>
protected String hadoopAliasConf_;
protected String inReaderSpec_;
protected String inputFormatSpec_;
protected String outputFormatSpec_;
protected String partitionerSpec_;
protected String numReduceTasksSpec_;
protected String additionalConfSpec_;
protected String postMapCmd_;
protected String postRedCmd_;
protected String preRedCmd_;
// Used to communicate config to the external processes (e.g. env var HADOOP_USER),
// encoded as "a=b c=d"
protected String addTaskEnvironment_;
protected boolean outputSingleNode_;
protected long minRecWrittenToEnableSkip_;
protected RunningJob running_;
protected String jobId_;
protected static String LINK_URI = "You need to specify the URIs as hdfs://host:port/#linkname. " +
"Please specify a different link name for each of your caching URIs";
}