/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunner;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
/**
*
* @author rana
*
*/
public class JobBuilder {

  /** The job configuration being assembled; handed to callers via {@link #build()}. */
  JobConf _jobConf;

  /**
   * Creates a builder around a fresh {@link JobConf} copied from the given configuration.
   * Also loads the legacy nutch-default.xml / nutch-site.xml resources (marked in the
   * original code as "legacy crap") and defaults to aggressive JVM reuse
   * (1000 tasks per JVM — override via {@link #reuseJVM(int)}).
   *
   * @param jobName human-readable name assigned to the job
   * @param conf base Hadoop configuration to copy settings from
   */
  public JobBuilder(String jobName,Configuration conf) {
    _jobConf = new JobConf(conf);
    // legacy crap
    _jobConf.addResource("nutch-default.xml");
    _jobConf.addResource("nutch-site.xml");
    //defaults
    _jobConf.setNumTasksToExecutePerJvm(1000);
    _jobConf.setJobName(jobName);
  }

  /**
   * Adds every path in the list as a job input.
   *
   * @param inputs input paths to append
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry with the other builder methods
   */
  public JobBuilder inputs(List<Path> inputs)throws IOException {
    for (Path input : inputs) {
      FileInputFormat.addInputPath(_jobConf, input);
    }
    return this;
  }

  /**
   * Adds a single input path to the job config.
   *
   * @param input the input path to append
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder input(Path input)throws IOException {
    FileInputFormat.addInputPath(_jobConf, input);
    return this;
  }

  /**
   * Sets the job's single output path.
   *
   * @param outputPath directory the job writes its output to
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder output(Path outputPath)throws IOException {
    FileOutputFormat.setOutputPath(_jobConf,outputPath);
    return this;
  }

  /**
   * Sets the job's input format class.
   *
   * @param inputFormat the {@link InputFormat} implementation to use
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder inputFormat(Class<? extends InputFormat> inputFormat)throws IOException {
    _jobConf.setInputFormat(inputFormat);
    return this;
  }

  /**
   * Convenience: sets the input format to {@link SequenceFileInputFormat}.
   *
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder inputIsSeqFile()throws IOException {
    _jobConf.setInputFormat(SequenceFileInputFormat.class);
    return this;
  }

  /**
   * Sets the job's output format class.
   *
   * @param outputFormat the {@link OutputFormat} implementation to use
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder outputFormat(Class<? extends OutputFormat> outputFormat)throws IOException {
    _jobConf.setOutputFormat(outputFormat);
    return this;
  }

  /**
   * Convenience: sets the output format to {@link SequenceFileOutputFormat}.
   *
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder outputIsSeqFile()throws IOException {
    _jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    return this;
  }

  /**
   * Sets the job jar by locating the jar containing the given class.
   *
   * @param theClass any class packaged inside the job jar
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder jarByClass(Class<?> theClass)throws IOException {
    _jobConf.setJarByClass(theClass);
    return this;
  }

  /**
   * Sets the mapper class and makes its jar the job jar. Skips the
   * {@code setMapperClass} call for {@link IdentityMapper} (the framework default),
   * but still records the jar.
   *
   * @param mapper the {@link Mapper} implementation to use
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder mapper(Class<? extends Mapper> mapper)throws IOException {
    if (mapper != IdentityMapper.class)
      _jobConf.setMapperClass(mapper);
    _jobConf.setJarByClass(mapper);
    return this;
  }

  /**
   * Sets a custom {@link MapRunner} and makes its jar the job jar.
   *
   * @param mapRunner the map-runner implementation to use
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder mapRunner(Class<? extends MapRunner> mapRunner)throws IOException {
    _jobConf.setMapRunnerClass(mapRunner);
    _jobConf.setJarByClass(mapRunner);
    return this;
  }

  /**
   * Sets the map-output key/value classes AND the job-output key/value classes
   * to the same pair (note: this also overwrites any previously set output
   * key/value classes).
   *
   * @param key key class used for both map output and job output
   * @param value value class used for both map output and job output
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder mapperKeyValue(Class<? extends WritableComparable> key,Class<? extends Writable> value)throws IOException {
    _jobConf.setMapOutputKeyClass(key);
    _jobConf.setMapOutputValueClass(value);
    _jobConf.setOutputKeyClass(key);
    _jobConf.setOutputValueClass(value);
    return this;
  }

  /**
   * Sets the reducer class (and optionally the same class as combiner), and
   * makes its jar the job jar. Skips {@code setReducerClass} for
   * {@link IdentityReducer} (the framework default).
   *
   * @param reducer the {@link Reducer} implementation to use
   * @param hasCombiner when true, also registers {@code reducer} as the combiner
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder reducer(Class<? extends Reducer> reducer,boolean hasCombiner)throws IOException {
    if (reducer != IdentityReducer.class)
      _jobConf.setReducerClass(reducer);
    if (hasCombiner)
      _jobConf.setCombinerClass(reducer);
    _jobConf.setJarByClass(reducer);
    return this;
  }

  /**
   * Sets only the job-output key/value classes (map output classes untouched).
   *
   * @param key job output key class
   * @param value job output value class
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder outputKeyValue(Class<? extends WritableComparable> key,Class<? extends Writable> value)throws IOException {
    _jobConf.setOutputKeyClass(key);
    _jobConf.setOutputValueClass(value);
    return this;
  }

  /**
   * Sets both the map-output and the job-output key/value classes to the same
   * pair (equivalent to {@link #mapperKeyValue}).
   *
   * @param key key class for map output and job output
   * @param value value class for map output and job output
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder keyValue(Class<? extends WritableComparable> key,Class<? extends Writable> value)throws IOException {
    _jobConf.setMapOutputKeyClass(key);
    _jobConf.setMapOutputValueClass(value);
    _jobConf.setOutputKeyClass(key);
    _jobConf.setOutputValueClass(value);
    return this;
  }

  /**
   * Hints the number of map tasks (the framework may adjust based on splits).
   *
   * @param mappers requested number of map tasks
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder numMappers(int mappers)throws IOException {
    _jobConf.setNumMapTasks(mappers);
    return this;
  }

  /**
   * Sets the number of reduce tasks.
   *
   * @param reducers number of reduce tasks (0 for a map-only job)
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder numReducers(int reducers)throws IOException {
    _jobConf.setNumReduceTasks(reducers);
    return this;
  }

  /**
   * Enables/disables compression of intermediate map output.
   *
   * @param compress true to compress map output
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder compressMapOutput(boolean compress)throws IOException {
    _jobConf.setCompressMapOutput(compress);
    return this;
  }

  /**
   * Enables job-output compression with the given type and codec.
   *
   * @param type sequence-file compression type (RECORD/BLOCK/etc.)
   * @param codec the {@link CompressionCodec} class to use
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder compressor(CompressionType type,Class<? extends CompressionCodec> codec)throws IOException {
    _jobConf.setBoolean("mapred.output.compress", true);
    _jobConf.set("mapred.output.compression.type", type.toString());
    _jobConf.setClass("mapred.output.compression.codec", codec,CompressionCodec.class);
    return this;
  }

  /**
   * Sets the output compression type; {@link CompressionType#NONE} disables
   * output compression entirely, any other value enables it.
   *
   * @param type the compression type to record
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder compressType(CompressionType type)throws IOException {
    if (type == CompressionType.NONE) {
      _jobConf.setBoolean("mapred.output.compress", false);
    }
    else {
      _jobConf.setBoolean("mapred.output.compress", true);
    }
    _jobConf.set("mapred.output.compression.type", type.toString());
    return this;
  }

  /**
   * Sets the comparator used to sort keys before the reduce phase.
   *
   * @param comparator the key sort comparator
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder sort(Class<? extends RawComparator> comparator)throws IOException {
    _jobConf.setOutputKeyComparatorClass(comparator);
    return this;
  }

  /**
   * Sets the comparator used to group values for a single reduce call.
   *
   * @param comparator the grouping comparator
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder group(Class<? extends RawComparator> comparator)throws IOException {
    _jobConf.setOutputValueGroupingComparator(comparator);
    return this;
  }

  /**
   * Sets the partitioner that routes map output keys to reducers.
   *
   * @param partitioner the {@link Partitioner} implementation to use
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder partition(Class<? extends Partitioner> partitioner)throws IOException {
    _jobConf.setPartitionerClass(partitioner);
    return this;
  }

  /**
   * Enables/disables speculative execution for both map and reduce tasks.
   *
   * @param enabled true to enable speculative execution
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder speculativeExecution(boolean enabled)throws IOException {
    _jobConf.setSpeculativeExecution(enabled);
    return this;
  }

  /**
   * Enables speculative execution for map tasks only.
   *
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder speculativeMapExecution() throws IOException {
    _jobConf.setMapSpeculativeExecution(true);
    return this;
  }

  /**
   * Enables/disables speculative execution for map tasks only.
   *
   * @param enable true to enable map-side speculative execution
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder speculativeMapExecution(boolean enable) throws IOException {
    _jobConf.setMapSpeculativeExecution(enable);
    return this;
  }

  /**
   * Enables speculative execution for reduce tasks only.
   *
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder speculativeReducerExecution() throws IOException {
    _jobConf.setReduceSpeculativeExecution(true);
    return this;
  }

  /**
   * Enables/disables speculative execution for reduce tasks only.
   *
   * @param enable true to enable reduce-side speculative execution
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder speculativeReducerExecution(boolean enable) throws IOException {
    _jobConf.setReduceSpeculativeExecution(enable);
    return this;
  }

  /**
   * Sets the maximum retry attempts per map task before the job fails.
   *
   * @param maxAttempts maximum attempts per map task
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder maxMapAttempts(int maxAttempts)throws IOException {
    _jobConf.setMaxMapAttempts(maxAttempts);
    return this;
  }

  /**
   * Sets the maximum retry attempts per reduce task before the job fails.
   *
   * @param maxAttempts maximum attempts per reduce task
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder maxReduceAttempts(int maxAttempts)throws IOException {
    _jobConf.setMaxReduceAttempts(maxAttempts);
    return this;
  }

  /**
   * Delays reducer startup until the given fraction of mappers has completed.
   *
   * @param pctMappersComplete fraction of completed maps (0.0 - 1.0) before reducers start
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder delayReducersUntil(float pctMappersComplete)throws IOException {
    _jobConf.setFloat("mapred.reduce.slowstart.completed.maps", pctMappersComplete);
    return this;
  }

  /**
   * Sets the percentage of map tasks allowed to fail without failing the job.
   *
   * @param percent tolerated map task failure percentage (0-100)
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder maxMapTaskFailures(int percent)throws IOException {
    _jobConf.setMaxMapTaskFailuresPercent(percent);
    return this;
  }

  /**
   * Sets the percentage of reduce tasks allowed to fail without failing the job.
   *
   * @param percent tolerated reduce task failure percentage (0-100)
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder maxReduceTaskFailures(int percent)throws IOException {
    _jobConf.setMaxReduceTaskFailuresPercent(percent);
    return this;
  }

  /**
   * Builds and applies a node-affinity mask from the block locations of the
   * given path; a null path is a no-op.
   *
   * @param affinityPath path whose block placement drives task affinity, or null to skip
   * @return this builder, for chaining
   * @throws IOException if the filesystem or mask construction fails
   */
  public JobBuilder setAffinity(Path affinityPath)throws IOException {
    if (affinityPath != null) {
      // set node affinity ...
      String affinityMask = NodeAffinityMaskBuilder
          .buildNodeAffinityMask(FileSystem.get(_jobConf), affinityPath,null);
      NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);
    }
    return this;
  }

  /**
   * Overrides the default JVM reuse count set by the constructor.
   *
   * @param numberOfTimes tasks to run per JVM (-1 for unlimited, per Hadoop semantics)
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder reuseJVM(int numberOfTimes)throws IOException {
    _jobConf.setNumTasksToExecutePerJvm(numberOfTimes);
    return this;
  }

  /**
   * Builds and applies a node-affinity mask, excluding the given node set.
   * Unlike {@link #setAffinity(Path)}, a null path is NOT checked here.
   *
   * @param affinityPath path whose block placement drives task affinity
   * @param exclusionSet node names to exclude from the mask
   * @return this builder, for chaining
   * @throws IOException if the filesystem or mask construction fails
   */
  public JobBuilder setAffinity(Path affinityPath,Set<String> exclusionSet)throws IOException {
    // set node affinity ...
    String affinityMask = NodeAffinityMaskBuilder
        .buildNodeAffinityMask(FileSystem.get(_jobConf), affinityPath,null,exclusionSet);
    NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);
    return this;
  }

  /**
   * Builds and applies a node-affinity mask without load balancing, capped by
   * the tracker's configured maximum reduce tasks.
   *
   * @param affinityPath path whose block placement drives task affinity
   * @param exclusionSet node names to exclude from the mask
   * @return this builder, for chaining
   * @throws IOException if the filesystem or mask construction fails
   */
  public JobBuilder setAffinityNoBalancing(Path affinityPath,Set<String> exclusionSet)throws IOException {
    // set node affinity ...
    String affinityMask = NodeAffinityMaskBuilder
        .buildNodeAffinityMask(FileSystem.get(_jobConf), affinityPath,null,exclusionSet,_jobConf.getInt("mapred.tasktracker.reduce.tasks.maximum", -1),true);
    NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);
    return this;
  }

  /**
   * Sets the minimum input split size in bytes.
   *
   * @param minSplitSize minimum split size in bytes
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder minSplitSize(long minSplitSize)throws IOException {
    _jobConf.setLong("mapred.min.split.size",minSplitSize);
    return this;
  }

  /**
   * Returns the assembled job configuration. Note this hands out the live
   * internal {@link JobConf}, not a copy.
   *
   * @return the configured {@link JobConf}
   * @throws IOException declared for API symmetry
   */
  public JobConf build()throws IOException {
    return _jobConf;
  }

  /**
   * Constructs a unique temp-dir path under "mapred.temp.dir" (default ".")
   * using the current wall clock for uniqueness.
   *
   * @param conf configuration to read "mapred.temp.dir" from
   * @param baseName prefix for the generated directory name
   * @return a timestamped path under the configured temp dir
   * @throws IOException declared for API symmetry
   */
  public static Path tempDir(Configuration conf,String baseName) throws IOException {
    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", ".")
        + "/"
        + baseName
        + "-"
        + System.currentTimeMillis());
    return tempOutputDir;
  }

  /**
   * Sets an arbitrary string property on the job config.
   *
   * @param key property name
   * @param value property value
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder set(String key,String value)throws IOException {
    _jobConf.set(key, value);
    return this;
  }

  /**
   * Sets an arbitrary long property on the job config.
   *
   * @param key property name
   * @param value property value
   * @return this builder, for chaining
   * @throws IOException declared for API symmetry
   */
  public JobBuilder set(String key,long value)throws IOException {
    _jobConf.setLong(key, value);
    return this;
  }
}