package ldbc.snb.datagen.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
/**
 * Sorts a Hadoop sequence file by key using a MapReduce job partitioned with
 * a {@link TotalOrderPartitioner}.
 *
 * Created by aprat on 10/14/14.
 */
public class HadoopFileSorter {

    private final Configuration conf;
    private final Class<?> keyClass;
    private final Class<?> valueClass;

    /**
     * Creates a sorter for sequence files with the given key and value types.
     *
     * @param conf The configuration object. It is copied, so the sorter keeps
     *             its own {@link Configuration} instance.
     * @param K The Key class of the hadoop sequence file.
     * @param V The Value class of the hadoop sequence file.
     */
    public HadoopFileSorter(Configuration conf, Class<?> K, Class<?> V) {
        this.conf = new Configuration(conf);
        this.keyClass = K;
        this.valueClass = V;
    }

    /**
     * Sorts a hadoop sequence file.
     *
     * The number of reduce tasks is read from the
     * {@code ldbc.snb.datagen.generator.numThreads} configuration entry
     * (default 1). The partition boundaries are written to a file named
     * {@code inputFileName + "_partition.lst"} before the job is started.
     *
     * @param inputFileName The name of the file to sort.
     * @param outputFileName The name of the sorted file.
     * @throws Exception if the job cannot be set up or run, or if it
     *                   completes unsuccessfully.
     */
    public void run(String inputFileName, String outputFileName) throws Exception {
        int numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads", 1);

        Job job = Job.getInstance(conf, "Sorting " + inputFileName);
        FileInputFormat.setInputPaths(job, new Path(inputFileName));
        FileOutputFormat.setOutputPath(job, new Path(outputFileName));

        // No mapper/reducer classes are set on the job; records keep their
        // key and value types from input to output.
        job.setMapOutputKeyClass(keyClass);
        job.setMapOutputValueClass(valueClass);
        job.setOutputKeyClass(keyClass);
        job.setOutputValueClass(valueClass);
        job.setNumReduceTasks(numThreads);

        job.setJarByClass(valueClass);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // The partition file location must be registered in the job
        // configuration before the sampler writes the partition file.
        InputSampler.Sampler<Object, Object> sampler =
                new InputSampler.RandomSampler<Object, Object>(0.1, 1000);
        Path partitionFile = new Path(inputFileName + "_partition.lst");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.writePartitionFile(job, sampler);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        if (!job.waitForCompletion(true)) {
            throw new Exception("Sorting job failed for input file " + inputFileName);
        }
    }
}