/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.streaming; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; import org.apache.commons.cli2.*; import org.apache.commons.cli2.builder.ArgumentBuilder; import org.apache.commons.cli2.builder.DefaultOptionBuilder; import org.apache.commons.cli2.builder.GroupBuilder; import org.apache.commons.cli2.commandline.Parser; import org.apache.commons.cli2.option.PropertyOption; import org.apache.commons.cli2.resource.ResourceConstants; import org.apache.commons.cli2.util.HelpFormatter; import org.apache.commons.cli2.validation.InvalidArgumentException; import org.apache.commons.cli2.validation.Validator; import org.apache.commons.logging.*; import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorCombiner; import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorReducer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.InvalidJobConfException; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.KeyValueTextInputFormat; import org.apache.hadoop.mapred.RunningJob; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileAsTextInputFormat; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.filecache.*; import org.apache.hadoop.util.*; /** All the client-side work happens here. * (Jar packaging, MapRed job submission and monitoring) */ public class StreamJob { protected static final Log LOG = LogFactory.getLog(StreamJob.class.getName()); final static String REDUCE_NONE = "NONE"; /** -----------Streaming CLI Implementation **/ private DefaultOptionBuilder builder = new DefaultOptionBuilder("-","-", false); private ArgumentBuilder argBuilder = new ArgumentBuilder(); private Parser parser = new Parser(); private Group allOptions; HelpFormatter helpFormatter = new HelpFormatter(" ", " ", " ", 900); // need these two at class level to extract values later from // commons-cli command line private MultiPropertyOption jobconf = new MultiPropertyOption( "-jobconf", "(n=v) Optional. 
Add or override a JobConf property.", 'D'); private MultiPropertyOption cmdenv = new MultiPropertyOption( "-cmdenv", "(n=v) Pass env.var to streaming commands.", 'E'); public StreamJob(String[] argv, boolean mayExit) { setupOptions(); argv_ = argv; mayExit_ = mayExit; } /** * This is the method that actually * intializes the job conf and submits the job * to the jobtracker * @throws IOException */ public int go() throws IOException { init(); preProcessArgs(); parseArgv(); postProcessArgs(); setJobConf(); return submitAndMonitorJob(); } protected void init() { try { env_ = new Environment(); } catch (IOException io) { throw new RuntimeException(io); } } void preProcessArgs() { verbose_ = false; addTaskEnvironment_ = ""; } void postProcessArgs() throws IOException { if (cluster_ == null) { // hadoop-default.xml is standard, hadoop-local.xml is not. cluster_ = "default"; } hadoopAliasConf_ = "hadoop-" + getClusterNick() + ".xml"; if (inputSpecs_.size() == 0) { fail("Required argument: -input <name>"); } if (output_ == null) { fail("Required argument: -output "); } msg("addTaskEnvironment=" + addTaskEnvironment_); Iterator it = packageFiles_.iterator(); while (it.hasNext()) { File f = new File((String) it.next()); if (f.isFile()) { shippedCanonFiles_.add(f.getCanonicalPath()); } } msg("shippedCanonFiles_=" + shippedCanonFiles_); // careful with class names.. mapCmd_ = unqualifyIfLocalPath(mapCmd_); comCmd_ = unqualifyIfLocalPath(comCmd_); redCmd_ = unqualifyIfLocalPath(redCmd_); } String unqualifyIfLocalPath(String cmd) throws IOException { if (cmd == null) { // } else { String prog = cmd; String args = ""; int s = cmd.indexOf(" "); if (s != -1) { prog = cmd.substring(0, s); args = cmd.substring(s + 1); } String progCanon; try { progCanon = new File(prog).getCanonicalPath(); } catch (IOException io) { progCanon = prog; } boolean shipped = shippedCanonFiles_.contains(progCanon); msg("shipped: " + shipped + " " + progCanon); if (shipped) { // Change path to simple filename. // That way when PipeMapRed calls Runtime.exec(), // it will look for the excutable in Task's working dir. // And this is where TaskRunner unjars our job jar. prog = new File(prog).getName(); if (args.length() > 0) { cmd = prog + " " + args; } else { cmd = prog; } } } msg("cmd=" + cmd); return cmd; } String getHadoopAliasConfFile() { return new File(getHadoopClientHome() + "/conf", hadoopAliasConf_).getAbsolutePath(); } void parseArgv(){ CommandLine cmdLine = null; try{ cmdLine = parser.parse(argv_); }catch(Exception oe){ LOG.error(oe.getMessage()); if (detailedUsage_) { exitUsage(true); } else { exitUsage(false); } } if (cmdLine != null){ verbose_ = cmdLine.hasOption("-verbose"); detailedUsage_ = cmdLine.hasOption("-info"); debug_ = cmdLine.hasOption("-debug")? 
debug_ + 1 : debug_;

      inputSpecs_.addAll(cmdLine.getValues("-input"));
      output_ = (String) cmdLine.getValue("-output");
      mapCmd_ = (String) cmdLine.getValue("-mapper");
      comCmd_ = (String) cmdLine.getValue("-combiner");
      redCmd_ = (String) cmdLine.getValue("-reducer");
      postMapCmd_ = (String) cmdLine.getValue("-postmapper");
      postRedCmd_ = (String) cmdLine.getValue("-postreducer");
      preRedCmd_ = (String) cmdLine.getValue("-prereducer");
      jobName_ = (String) cmdLine.getValue("-jobname");
      packageFiles_.addAll(cmdLine.getValues("-file"));
      cluster_ = (String) cmdLine.getValue("-cluster");
      configPath_.addAll(cmdLine.getValues("-config"));

      // -dfs overrides the namenode address, -jt overrides the jobtracker address.
      String fsName = (String) cmdLine.getValue("-dfs");
      if (null != fsName) {
        userJobConfProps_.put("fs.default.name", fsName);
      }
      String jt = (String) cmdLine.getValue("-jt");
      if (null != jt) {
        userJobConfProps_.put("mapred.job.tracker", jt);
      }

      additionalConfSpec_ = (String) cmdLine.getValue("-additionalconfspec");
      inputFormatSpec_ = (String) cmdLine.getValue("-inputformat");
      outputFormatSpec_ = (String) cmdLine.getValue("-outputformat");
      numReduceTasksSpec_ = (String) cmdLine.getValue("-numReduceTasks");
      partitionerSpec_ = (String) cmdLine.getValue("-partitioner");
      inReaderSpec_ = (String) cmdLine.getValue("-inputreader");

      List<String> car = cmdLine.getValues("-cacheArchive");
      if (null != car) {
        for (String s : car) {
          cacheArchives = (cacheArchives == null) ? s : cacheArchives + "," + s;
        }
      }
      List<String> caf = cmdLine.getValues("-cacheFile");
      if (null != caf) {
        for (String s : caf) {
          cacheFiles = (cacheFiles == null) ? s : cacheFiles + "," + s;
        }
      }

      List<String> jobConfArgs = (List<String>) cmdLine.getValue(jobconf);
      List<String> envArgs = (List<String>) cmdLine.getValue(cmdenv);
      if (null != jobConfArgs) {
        for (String s : jobConfArgs) {
          String[] parts = s.split("=", 2);
          userJobConfProps_.put(parts[0], parts[1]);
        }
      }
      if (null != envArgs) {
        for (String s : envArgs) {
          if (addTaskEnvironment_.length() > 0) {
            addTaskEnvironment_ += " ";
          }
          addTaskEnvironment_ += s;
        }
      }
    } else if (detailedUsage_) {
      exitUsage(true);
    }
  }

  protected void msg(String msg) {
    if (verbose_) {
      System.out.println("STREAM: " + msg);
    }
  }

  private Option createOption(String name, String desc, String argName,
                              int max, boolean required) {
    Argument argument = argBuilder.
      withName(argName).
      withMinimum(1).
      withMaximum(max).
      create();
    return builder.
      withLongName(name).
      withArgument(argument).
      withDescription(desc).
      withRequired(required).
      create();
  }

  private Option createOption(String name, String desc, String argName,
                              int max, boolean required, Validator validator) {
    Argument argument = argBuilder.
      withName(argName).
      withMinimum(1).
      withMaximum(max).
      withValidator(validator).
      create();
    return builder.
      withLongName(name).
      withArgument(argument).
      withDescription(desc).
      withRequired(required).
create(); } private Option createBoolOption(String name, String desc){ return builder.withLongName(name).withDescription(desc).create(); } private void setupOptions(){ final Validator fileValidator = new Validator(){ public void validate(final List values) throws InvalidArgumentException { // Note : This code doesnt belong here, it should be changed to // an can exec check in java 6 for (String file : (List<String>)values) { File f = new File(file); if (!f.exists()) { throw new InvalidArgumentException("Argument : " + f.getAbsolutePath() + " doesn't exist."); } if (!f.isFile()) { throw new InvalidArgumentException("Argument : " + f.getAbsolutePath() + " is not a file."); } if (!f.canRead()) { throw new InvalidArgumentException("Argument : " + f.getAbsolutePath() + " is not accessible"); } } } }; // Note: not extending CLI2's FileValidator, that overwrites // the String arg into File and causes ClassCastException // in inheritance tree. final Validator execValidator = new Validator(){ public void validate(final List values) throws InvalidArgumentException { // Note : This code doesnt belong here, it should be changed to // an can exec check in java 6 for (String file : (List<String>)values) { try{ Runtime.getRuntime().exec("chmod 0777 " + (new File(file)).getAbsolutePath()); }catch(IOException ioe){ // ignore } } fileValidator.validate(values); } }; Option input = createOption("input", "DFS input file(s) for the Map step", "path", Integer.MAX_VALUE, true); Option output = createOption("output", "DFS output directory for the Reduce step", "path", 1, true); Option mapper = createOption("mapper", "The streaming command to run", "cmd", 1, false); Option combiner = createOption("combiner", "The streaming command to run", "cmd", 1, false); // reducer could be NONE Option reducer = createOption("reducer", "The streaming command to run", "cmd", 1, false); Option file = createOption("file", "File/dir to be shipped in the Job jar file", "file", Integer.MAX_VALUE, false, execValidator); Option dfs = createOption("dfs", "Optional. Override DFS configuration", "<h:p>|local", 1, false); Option jt = createOption("jt", "Optional. 
Override JobTracker configuration", "<h:p>|local", 1, false); Option additionalconfspec = createOption("additionalconfspec", "Optional.", "spec", 1, false); Option inputformat = createOption("inputformat", "Optional.", "spec", 1, false); Option outputformat = createOption("outputformat", "Optional.", "spec", 1, false); Option partitioner = createOption("partitioner", "Optional.", "spec", 1, false); Option numReduceTasks = createOption("numReduceTasks", "Optional.", "spec",1, false ); Option inputreader = createOption("inputreader", "Optional.", "spec", 1, false); Option cacheFile = createOption("cacheFile", "File name URI", "fileNameURI", Integer.MAX_VALUE, false); Option cacheArchive = createOption("cacheArchive", "File name URI", "fileNameURI", 1, false); Option jobname = createOption("jobname", "Optional", "spec", 1, false); Option postmapper = createOption("postmapper", "Optional", "spec", 1, false); Option postreducer = createOption("postreducer", "Optional", "spec", 1, false); Option prereducer = createOption("prereducer", "Optional", "spec", 1, false); // boolean properties Option verbose = createBoolOption("verbose", "print verbose output"); Option info = createBoolOption("info", "print verbose output"); Option help = createBoolOption("help", "print this help message"); Option debug = createBoolOption("debug", "print debug output"); Option inputtagged = createBoolOption("inputtagged", "inputtagged"); allOptions = new GroupBuilder(). withOption(input). withOption(output). withOption(mapper). withOption(postmapper). withOption(postreducer). withOption(prereducer). withOption(combiner). withOption(reducer). withOption(file). withOption(dfs). withOption(jt). withOption(additionalconfspec). withOption(inputformat). withOption(outputformat). withOption(partitioner). withOption(numReduceTasks). withOption(inputreader). withOption(jobconf). withOption(cmdenv). withOption(jobname). withOption(cacheFile). withOption(cacheArchive). withOption(verbose). withOption(info). withOption(debug). withOption(inputtagged). withOption(help). create(); parser.setGroup(allOptions); } public void exitUsage(boolean detailed) { // 1 2 3 4 5 6 7 //1234567890123456789012345678901234567890123456789012345678901234567890123456789 if (!detailed) { System.out.println("Usage: $HADOOP_HOME/bin/hadoop [--config dir] jar \\"); System.out.println(" $HADOOP_HOME/hadoop-streaming.jar [options]"); System.out.println("Options:"); System.out.println(" -input <path> DFS input file(s) for the Map step"); System.out.println(" -output <path> DFS output directory for the Reduce step"); System.out.println(" -mapper <cmd|JavaClassName> The streaming command to run"); System.out.println(" -postmapper <JavaClassName> Map Class to post process streaming mapper"); System.out.println(" -postreducer <JavaClassName> Map Class to post process streaming reducer"); System.out.println(" -prereducer <JavaClassName> Reduce Class to process reduce input before streaming"); System.out.println(" -combiner <JavaClassName> Combiner has to be a Java class"); System.out.println(" -reducer <cmd|JavaClassName> The streaming command to run"); System.out.println(" -file <file> File/dir to be shipped in the Job jar file"); System.out.println(" -dfs <h:p>|local Optional. Override DFS configuration"); System.out.println(" -jt <h:p>|local Optional. 
Override JobTracker configuration"); System.out.println(" -additionalconfspec specfile Optional."); System.out.println(" -inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName Optional."); System.out.println(" -outputformat TextOutputFormat(default)|JavaClassName Optional."); System.out.println(" -partitioner JavaClassName Optional."); System.out.println(" -numReduceTasks <num> Optional."); System.out.println(" -inputreader <spec> Optional."); System.out.println(" -jobconf <n>=<v> Optional. Add or override a JobConf property"); System.out.println(" -cmdenv <n>=<v> Optional. Pass env.var to streaming commands"); System.out.println(" -jobname <name> Optional. Set the name of the job"); System.out.println(" -cacheFile fileNameURI"); System.out.println(" -cacheArchive fileNameURI"); System.out.println(" -verbose"); System.out.println(); System.out.println("For more details about these options:"); System.out.println("Use $HADOOP_HOME/bin/hadoop jar build/hadoop-streaming.jar -info"); fail(""); } System.out.println("In -input: globbing on <path> is supported and can have multiple -input"); System.out.println("Default Map input format: a line is a record in UTF-8"); System.out.println(" the key part ends at first TAB, the rest of the line is the value"); System.out.println("Custom input format: -inputformat package.MyInputFormat "); System.out.println("Map output format, reduce input/output format:"); System.out.println(" Format defined by what the mapper command outputs. Line-oriented"); System.out.println(); System.out.println("The files or directories named in the -file argument[s] end up in the"); System.out.println(" working directory when the mapper and reducer are run."); System.out.println(" The location of this working directory is unspecified."); System.out.println(); System.out.println("To set the number of reduce tasks (num. of output files):"); System.out.println(" -jobconf mapred.reduce.tasks=10"); System.out.println("To skip the sort/combine/shuffle/sort/reduce step:"); System.out.println(" Use -numReduceTasks 0"); System.out .println(" A Task's Map output then becomes a 'side-effect output' rather than a reduce input"); System.out .println(" This speeds up processing, This also feels more like \"in-place\" processing"); System.out.println(" because the input filename and the map input order are preserved"); System.out.println(" This equivalent -reducer NONE"); System.out.println(); System.out.println("To speed up the last reduces:"); System.out.println(" -jobconf mapred.speculative.execution=true"); System.out.println("To name the job (appears in the JobTracker Web UI):"); System.out.println(" -jobconf mapred.job.name='My Job' "); System.out.println("To change the local temp directory:"); System.out.println(" -jobconf dfs.data.dir=/tmp/dfs"); System.out.println(" -jobconf stream.tmpdir=/tmp/streaming"); System.out.println("Additional local temp directories with -cluster local:"); System.out.println(" -jobconf mapred.local.dir=/tmp/local"); System.out.println(" -jobconf mapred.system.dir=/tmp/system"); System.out.println(" -jobconf mapred.temp.dir=/tmp/temp"); System.out.println("Use a custom hadoopStreaming build along a standard hadoop install:"); System.out.println(" $HADOOP_HOME/bin/hadoop jar /path/my-hadoop-streaming.jar [...]\\"); System.out .println(" [...] 
-jobconf stream.shipped.hadoopstreaming=/path/my-hadoop-streaming.jar"); System.out.println("For more details about jobconf parameters see:"); System.out.println(" http://wiki.apache.org/lucene-hadoop/JobConfFile"); System.out.println("To set an environement variable in a streaming command:"); System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/"); System.out.println(); System.out.println("Shortcut:"); System.out .println(" setenv HSTREAMING \"$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar\""); System.out.println(); System.out.println("Example: $HSTREAMING -mapper \"/usr/local/bin/perl5 filter.pl\""); System.out.println(" -file /local/filter.pl -input \"/logs/0604*/*\" [...]"); System.out.println(" Ships a script, invokes the non-shipped perl interpreter"); System.out.println(" Shipped files go to the working directory so filter.pl is found by perl"); System.out.println(" Input files are all the daily logs for days in month 2006-04"); fail(""); } public void fail(String message) { if (mayExit_) { System.err.println(message); throw new RuntimeException(message); } else { throw new IllegalArgumentException(message); } } // -------------------------------------------- protected String getHadoopClientHome() { String h = env_.getProperty("HADOOP_HOME"); // standard Hadoop if (h == null) { //fail("Missing required environment variable: HADOOP_HOME"); h = "UNDEF"; } return h; } protected boolean isLocalHadoop() { boolean local; if (jobConf_ == null) { local = getClusterNick().equals("local"); } else { local = StreamUtil.isLocalJobTracker(jobConf_); } return local; } protected String getClusterNick() { return cluster_; } /** @return path to the created Jar file or null if no files are necessary. */ protected String packageJobJar() throws IOException { ArrayList unjarFiles = new ArrayList(); // Runtime code: ship same version of code as self (job submitter code) // usually found in: build/contrib or build/hadoop-<version>-dev-streaming.jar // First try an explicit spec: it's too hard to find our own location in this case: // $HADOOP_HOME/bin/hadoop jar /not/first/on/classpath/custom-hadoop-streaming.jar // where findInClasspath() would find the version of hadoop-streaming.jar in $HADOOP_HOME String runtimeClasses = userJobConfProps_.get("stream.shipped.hadoopstreaming"); // jar or class dir System.out.println(runtimeClasses + "=@@@userJobConfProps_.get(stream.shipped.hadoopstreaming"); if (runtimeClasses == null) { runtimeClasses = StreamUtil.findInClasspath(StreamJob.class.getName()); } if (runtimeClasses == null) { throw new IOException("runtime classes not found: " + getClass().getPackage()); } else { msg("Found runtime classes in: " + runtimeClasses); } if (isLocalHadoop()) { // don't package class files (they might get unpackaged in "." and then // hide the intended CLASSPATH entry) // we still package everything else (so that scripts and executable are found in // Task workdir like distributed Hadoop) } else { if (new File(runtimeClasses).isDirectory()) { packageFiles_.add(runtimeClasses); } else { unjarFiles.add(runtimeClasses); } } if (packageFiles_.size() + unjarFiles.size() == 0) { return null; } String tmp = jobConf_.get("stream.tmpdir"); //, "/tmp/${user.name}/" File tmpDir = (tmp == null) ? 
null : new File(tmp); // tmpDir=null means OS default tmp dir File jobJar = File.createTempFile("hive-streamjob", ".jar", tmpDir); System.out.println("packageJobJar: " + packageFiles_ + " " + unjarFiles + " " + jobJar + " tmpDir=" + tmpDir); if (debug_ == 0) { jobJar.deleteOnExit(); } JarBuilder builder = new JarBuilder(); if (verbose_) { builder.setVerbose(true); } String jobJarName = jobJar.getAbsolutePath(); builder.merge(packageFiles_, unjarFiles, jobJarName); return jobJarName; } /** * This method sets the user jobconf variable specified * by user using -jobconf key=value * @param doEarlyProps */ protected void setUserJobConfProps(boolean doEarlyProps) { Iterator it = userJobConfProps_.keySet().iterator(); while (it.hasNext()) { String key = (String) it.next(); String val = (String)userJobConfProps_.get(key); boolean earlyName = key.equals("fs.default.name"); earlyName |= key.equals("stream.shipped.hadoopstreaming"); if (doEarlyProps == earlyName) { msg("xxxJobConf: set(" + key + ", " + val + ") early=" + doEarlyProps); jobConf_.set(key, val); } } } /** * get the uris of all the files/caches */ protected void getURIs(String lcacheArchives, String lcacheFiles) { String archives[] = StringUtils.getStrings(lcacheArchives); String files[] = StringUtils.getStrings(lcacheFiles); fileURIs = StringUtils.stringToURI(files); archiveURIs = StringUtils.stringToURI(archives); } protected void setJobConf() throws IOException { msg("hadoopAliasConf_ = " + hadoopAliasConf_); config_ = new Configuration(); if (!cluster_.equals("default")) { config_.addResource(new Path(getHadoopAliasConfFile())); } else { // use only defaults: hadoop-default.xml and hadoop-site.xml } System.out.println("additionalConfSpec_:" + additionalConfSpec_); if (additionalConfSpec_ != null) { config_.addResource(new Path(additionalConfSpec_)); } Iterator it = configPath_.iterator(); while (it.hasNext()) { String pathName = (String) it.next(); config_.addResource(new Path(pathName)); } // general MapRed job properties jobConf_ = new JobConf(config_); // All streaming jobs get the task timeout value // from the configuration settings. setUserJobConfProps(true); // The correct FS must be set before this is called! // (to resolve local vs. 
// dfs drive letter differences)
    // (mapred.working.dir will be lazily initialized ONCE and depends on FS)
    for (int i = 0; i < inputSpecs_.size(); i++) {
      FileInputFormat.addInputPaths(jobConf_, (String) inputSpecs_.get(i));
    }
    jobConf_.set("stream.numinputspecs", "" + inputSpecs_.size());

    String defaultPackage = this.getClass().getPackage().getName();

    Class c;
    Class fmt = null;
    if (inReaderSpec_ == null && inputFormatSpec_ == null) {
      fmt = TextInputFormat.class;
    } else if (inputFormatSpec_ != null) {
      if (inputFormatSpec_.equals(TextInputFormat.class.getName())
          || inputFormatSpec_.equals(TextInputFormat.class.getCanonicalName())) {
        fmt = TextInputFormat.class;
      } else if (inputFormatSpec_.equals(KeyValueTextInputFormat.class.getName())
          || inputFormatSpec_.equals(KeyValueTextInputFormat.class.getCanonicalName())) {
        // No format is assigned here: fmt stays null, so this spec falls
        // through to the StreamInputFormat default below.
      } else if (inputFormatSpec_.equals(SequenceFileInputFormat.class.getName())
          || inputFormatSpec_.equals(SequenceFileInputFormat.class.getCanonicalName())) {
        fmt = SequenceFileInputFormat.class;
      } else if (inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getName())
          || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getCanonicalName())) {
        fmt = SequenceFileAsTextInputFormat.class;
      } else {
        c = StreamUtil.goodClassOrNull(inputFormatSpec_, defaultPackage);
        if (c != null) {
          fmt = c;
        }
      }
    }
    if (fmt == null) {
      fmt = StreamInputFormat.class;
    }

    jobConf_.setInputFormat(fmt);

    jobConf_.setOutputKeyClass(Text.class);
    jobConf_.setOutputValueClass(Text.class);

    if (jobName_ != null) {
      jobConf_.setJobName(jobName_);
    }

    jobConf_.set("stream.addenvironment", addTaskEnvironment_);

    if (mapCmd_ != null) {
      c = StreamUtil.goodClassOrNull(mapCmd_, defaultPackage);
      if (c != null) {
        if (postMapCmd_ != null) {
          fail("-postmapper cannot be combined with a Java map class");
        }
        jobConf_.setMapperClass(c);
      } else {
        jobConf_.setMapperClass(PipeMapper.class);
        jobConf_.set("stream.map.streamprocessor",
                     URLEncoder.encode(mapCmd_, "UTF-8"));
      }
    }

    if (postMapCmd_ != null) {
      c = StreamUtil.goodClassOrNull(postMapCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setClass("stream.map.posthook", c, Mapper.class);
      } else {
        fail("postmapper: " + postMapCmd_ + " is not a valid Java Class");
      }
    }

    if (comCmd_ != null) {
      c = StreamUtil.goodClassOrNull(comCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setCombinerClass(c);
      }
    }

    boolean reducerNone_ = false;
    if (redCmd_ != null) {
      reducerNone_ = redCmd_.equals(REDUCE_NONE);
      if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
        if (postRedCmd_ != null) {
          fail("-postreducer cannot be combined with a Java reduce class");
        }
        if (preRedCmd_ != null) {
          fail("-prereducer cannot be combined with a Java reduce class");
        }
        jobConf_.setReducerClass(ValueAggregatorReducer.class);
        jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
      } else {
        c = StreamUtil.goodClassOrNull(redCmd_, defaultPackage);
        if (c != null) {
          if (postRedCmd_ != null) {
            fail("-postreducer cannot be combined with a Java reduce class " + c.getName());
          }
          if (preRedCmd_ != null) {
            fail("-prereducer cannot be combined with a Java reduce class " + c.getName());
          }
          jobConf_.setReducerClass(c);
        } else {
          jobConf_.setReducerClass(PipeReducer.class);
          jobConf_.set("stream.reduce.streamprocessor",
                       URLEncoder.encode(redCmd_, "UTF-8"));
        }
      }
    }

    if (postRedCmd_ != null) {
      c = StreamUtil.goodClassOrNull(postRedCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setClass("stream.reduce.posthook", c, Mapper.class);
      } else {
        fail("postreducer: " + postRedCmd_ + " is not a valid Java Class");
      }
    }

    if (preRedCmd_
!= null) {
      c = StreamUtil.goodClassOrNull(preRedCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setClass("stream.reduce.prehook", c, Reducer.class);
      } else {
        fail("prereducer: " + preRedCmd_ + " is not a valid Java Class");
      }
    }

    if (inReaderSpec_ != null) {
      String[] args = inReaderSpec_.split(",");
      String readerClass = args[0];
      // this argument can only be a Java class
      c = StreamUtil.goodClassOrNull(readerClass, defaultPackage);
      if (c != null) {
        jobConf_.set("stream.recordreader.class", c.getName());
      } else {
        fail("-inputreader: class not found: " + readerClass);
      }
      for (int i = 1; i < args.length; i++) {
        String[] nv = args[i].split("=", 2);
        String k = "stream.recordreader." + nv[0];
        String v = (nv.length > 1) ? nv[1] : "";
        jobConf_.set(k, v);
      }
    }

    setUserJobConfProps(false);
    FileOutputFormat.setOutputPath(jobConf_, new Path(output_));
    fmt = null;
    if (outputFormatSpec_ != null) {
      c = StreamUtil.goodClassOrNull(outputFormatSpec_, defaultPackage);
      if (c != null) {
        fmt = c;
      }
    }
    if (fmt == null) {
      fmt = TextOutputFormat.class;
    }
    jobConf_.setOutputFormat(fmt);

    if (partitionerSpec_ != null) {
      c = StreamUtil.goodClassOrNull(partitionerSpec_, defaultPackage);
      if (c != null) {
        jobConf_.setPartitionerClass(c);
      }
    }

    if (numReduceTasksSpec_ != null) {
      int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
      jobConf_.setNumReduceTasks(numReduceTasks);
    }
    if (reducerNone_) {
      jobConf_.setNumReduceTasks(0);
    }

    // last, allow user to override anything
    // (although typically used with properties we didn't touch)

    jar_ = packageJobJar();
    if (jar_ != null) {
      jobConf_.setJar(jar_);
    }

    if ((cacheArchives != null) || (cacheFiles != null)) {
      getURIs(cacheArchives, cacheFiles);
      boolean b = DistributedCache.checkURIs(fileURIs, archiveURIs);
      if (!b) {
        fail(LINK_URI);
      }
    }
    DistributedCache.createSymlink(jobConf_);
    // set the jobconf for the caching parameters
    if (cacheArchives != null) {
      DistributedCache.setCacheArchives(archiveURIs, jobConf_);
    }
    if (cacheFiles != null) {
      DistributedCache.setCacheFiles(fileURIs, jobConf_);
    }

    if (verbose_) {
      listJobConfProperties();
    }

    msg("submitting to jobconf: " + getJobTrackerHostPort());
  }

  /**
   * Prints out the jobconf properties on stdout
   * when verbose is specified.
   */
  protected void listJobConfProperties() {
    msg("==== JobConf properties:");
    Iterator it = jobConf_.iterator();
    TreeMap sorted = new TreeMap();
    while (it.hasNext()) {
      Map.Entry en = (Map.Entry) it.next();
      sorted.put(en.getKey(), en.getValue());
    }
    it = sorted.entrySet().iterator();
    while (it.hasNext()) {
      Map.Entry en = (Map.Entry) it.next();
      msg(en.getKey() + "=" + en.getValue());
    }
    msg("====");
  }

  protected String getJobTrackerHostPort() {
    return jobConf_.get("mapred.job.tracker");
  }

  protected void jobInfo() {
    if (isLocalHadoop()) {
      LOG.info("Job running in-process (local Hadoop)");
    } else {
      String hp = getJobTrackerHostPort();
      LOG.info("To kill this job, run:");
      LOG.info(getHadoopClientHome() + "/bin/hadoop job -Dmapred.job.tracker=" + hp
               + " -kill " + jobId_);
      //LOG.info("Job file: " + running_.getJobFile());
      LOG.info("Tracking URL: " + StreamUtil.qualifyHost(running_.getTrackingURL()));
    }
  }

  // Based on JobClient. Returns 0 on success, 1 if the job runs but fails,
  // and 2-5 for submission errors (bad input path, invalid job conf,
  // output path already exists, other IO errors).
  public int submitAndMonitorJob() throws IOException {

    if (jar_ != null && isLocalHadoop()) {
      // getAbs became required when shell and subvm have different working dirs...
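      // Explanatory note (an inference, not part of the original comments):
      // with local Hadoop there is presumably no TaskRunner to unpack the job
      // jar into a per-task working directory, so the jar built by
      // packageJobJar() is unpacked here into the submitter's current
      // directory, letting shipped files be found by Runtime.exec() the same
      // way unqualifyIfLocalPath() expects on a real cluster.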
File wd = new File(".").getAbsoluteFile(); StreamUtil.unJar(new File(jar_), wd); } // if jobConf_ changes must recreate a JobClient jc_ = new JobClient(jobConf_); boolean error = true; running_ = null; String lastReport = null; try { running_ = jc_.submitJob(jobConf_); jobId_ = running_.getJobID(); LOG.info("getLocalDirs(): " + Arrays.asList(jobConf_.getLocalDirs())); LOG.info("Running job: " + jobId_); jobInfo(); while (!running_.isComplete()) { try { Thread.sleep(1000); } catch (InterruptedException e) { } running_ = jc_.getJob(jobId_); String report = null; report = " map " + Math.round(running_.mapProgress() * 100) + "% reduce " + Math.round(running_.reduceProgress() * 100) + "%"; if (!report.equals(lastReport)) { LOG.info(report); lastReport = report; } } if (!running_.isSuccessful()) { jobInfo(); LOG.error("Job not Successful!"); return 1; } LOG.info("Job complete: " + jobId_); LOG.info("Output: " + output_); error = false; } catch(FileNotFoundException fe) { LOG.error("Error launching job , bad input path : " + fe.getMessage()); return 2; } catch(InvalidJobConfException je) { LOG.error("Error launching job , Invalid job conf : " + je.getMessage()); return 3; } catch(FileAlreadyExistsException fae) { LOG.error("Error launching job , Output path already exists : " + fae.getMessage()); return 4; } catch(IOException ioe) { LOG.error("Error Launching job : " + ioe.getMessage()); return 5; } finally { if (error && (running_ != null)) { LOG.info("killJob..."); running_.killJob(); } jc_.close(); } return 0; } /** Support -jobconf x=y x1=y1 type options **/ class MultiPropertyOption extends PropertyOption{ private String optionString; MultiPropertyOption(){ super(); } MultiPropertyOption(final String optionString, final String description, final int id){ super(optionString, description, id); this.optionString = optionString; } public boolean canProcess(final WriteableCommandLine commandLine, final String argument) { boolean ret = (argument != null) && argument.startsWith(optionString); return ret; } public void process(final WriteableCommandLine commandLine, final ListIterator arguments) throws OptionException { final String arg = (String) arguments.next(); if (!canProcess(commandLine, arg)) { throw new OptionException(this, ResourceConstants.UNEXPECTED_TOKEN, arg); } ArrayList properties = new ArrayList(); String next = ""; while(arguments.hasNext()){ next = (String) arguments.next(); if (!next.startsWith("-")){ properties.add(next); }else{ arguments.previous(); break; } } // add to any existing values (support specifying args multiple times) List<String> oldVal = (List<String>)commandLine.getValue(this); if (oldVal == null){ commandLine.addValue(this, properties); }else{ oldVal.addAll(properties); } } } protected boolean mayExit_; protected String[] argv_; protected boolean verbose_; protected boolean detailedUsage_; protected int debug_; protected Environment env_; protected String jar_; protected boolean localHadoop_; protected Configuration config_; protected JobConf jobConf_; protected JobClient jc_; // command-line arguments protected ArrayList inputSpecs_ = new ArrayList(); // <String> protected TreeSet seenPrimary_ = new TreeSet(); // <String> protected boolean hasSimpleInputSpecs_; protected ArrayList packageFiles_ = new ArrayList(); // <String> protected ArrayList shippedCanonFiles_ = new ArrayList(); // <String> protected TreeMap<String, String> userJobConfProps_ = new TreeMap<String, String>(); protected String output_; protected String mapCmd_; protected String comCmd_; 
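  // The *Cmd_ fields (mapCmd_, comCmd_, redCmd_, postMapCmd_, postRedCmd_,
  // preRedCmd_) hold either an external streaming command line or a Java
  // class name; setJobConf() decides which via StreamUtil.goodClassOrNull().
  // Illustrative mappings (values are examples taken from the usage text):
  //   -mapper "/usr/local/bin/perl5 filter.pl" -> mapCmd_, run through PipeMapper
  //   -reducer aggregate                       -> redCmd_, mapped to ValueAggregatorReducer
  //   -reducer NONE                            -> redCmd_, reduce phase disabled (0 reduce tasks)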
protected String redCmd_;
  protected String jobName_;
  protected String cluster_;
  protected String cacheFiles;
  protected String cacheArchives;
  protected URI[] fileURIs;
  protected URI[] archiveURIs;
  protected ArrayList configPath_ = new ArrayList(); // <String>
  protected String hadoopAliasConf_;

  protected String inReaderSpec_;
  protected String inputFormatSpec_;
  protected String outputFormatSpec_;
  protected String partitionerSpec_;
  protected String numReduceTasksSpec_;
  protected String additionalConfSpec_;
  protected String postMapCmd_;
  protected String postRedCmd_;
  protected String preRedCmd_;

  // Used to communicate config to the external processes (ex env.var.HADOOP_USER),
  // encoded as "a=b c=d"
  protected String addTaskEnvironment_;

  protected boolean outputSingleNode_;
  protected long minRecWrittenToEnableSkip_;

  protected RunningJob running_;
  protected String jobId_;
  protected static String LINK_URI = "You need to specify the uris as hdfs://host:port/#linkname. "
      + "Please specify a different link name for all of your caching URIs";
}
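
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of StreamJob itself). The class
// name, paths, and commands below are made-up examples; only the
// StreamJob(String[], boolean) constructor and go() come from the class above.
// ---------------------------------------------------------------------------
class StreamJobUsageExample {
  public static void main(String[] args) throws IOException {
    String[] streamArgs = new String[] {
      "-input", "/logs/0604*/*",            // globbing on -input is supported
      "-output", "/logs/out",               // required output directory
      "-mapper", "/bin/cat",                // external command, run via PipeMapper
      "-reducer", "NONE",                   // skip the reduce phase entirely
      "-jobconf", "mapred.job.name=My Job"  // same form as the -jobconf CLI option
    };
    // mayExit=true makes argument errors print to stderr before throwing,
    // mirroring command-line behavior.
    StreamJob job = new StreamJob(streamArgs, true);
    int exitCode = job.go();                // submit and monitor; 0 on success
    System.exit(exitCode);
  }
}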