package com.manning.hip.ch4.sampler;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public final class SamplerJob {
public static void main(String... args) throws Exception {
runSortJob(args[0], args[1]);
}
public static void runSortJob(String input, String output)
throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(SamplerJob.class);
ReservoirSamplerInputFormat.setInputFormat(job,
TextInputFormat.class);
ReservoirSamplerInputFormat.setNumSamples(job, 10);
ReservoirSamplerInputFormat.setMaxRecordsToRead(job, 10000);
ReservoirSamplerInputFormat.
setUseSamplesNumberPerInputSplit(job, true);
Path outputPath = new Path(output);
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, outputPath);
outputPath.getFileSystem(conf).delete(outputPath, true);
job.waitForCompletion(true);
}
}