package com.github.projectflink.hadoop;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Driver that runs one map-only Hadoop job per regular expression, writing every
 * input line that contains a match for that expression.
 *
 * <p>Usage: {@code GrepDriver <input> <output> [pattern ...]}. For each pattern a
 * separate job is submitted, reading from {@code <input>} and writing to
 * {@code <output>_<pattern>}. Jobs run sequentially; the first failing job aborts
 * the run.
 */
public class GrepDriver {

    /**
     * Mapper that emits each input line in which the configured regular expression
     * is found. The expression is read from the {@code "pattern"} configuration key.
     */
    public static class Grep extends Mapper<LongWritable, Text, Text, Text> {

        /** Reused output key, avoids allocating a new {@link Text} per matching line. */
        private final Text out = new Text();

        /** Compiled expression; initialised once per task in {@link #setup}. */
        private Pattern p;

        /**
         * Reads the {@code "pattern"} key from the job configuration and compiles it.
         *
         * @throws IllegalArgumentException if the {@code "pattern"} key is not set
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            String pattern = context.getConfiguration().get("pattern");
            Preconditions.checkArgument(pattern != null,
                    "Configuration key 'pattern' must be set for the Grep mapper");
            p = Pattern.compile(pattern);
        }

        /**
         * Writes the line as the output key if the pattern is found anywhere in it.
         * Empty lines are skipped without being matched.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String val = value.toString();
            if (val.isEmpty()) {
                return;
            }
            final Matcher m = p.matcher(val);
            if (m.find()) {
                out.set(val);
                // The value is intentionally null: only the matching line is of interest.
                // NOTE(review): this relies on the configured output format accepting a
                // null value (TextOutputFormat is expected to emit just the key) - confirm.
                context.write(out, null);
            }
        }
    }

    /**
     * Entry point. Submits one grep job per pattern and waits for each to finish.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     *             prefix, and every further element is a regular expression
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws RuntimeException if any job reports failure
     * @throws Exception on any error raised while configuring or running a job
     */
    public static void main(String[] args) throws Exception {
        // Fail early with a readable message instead of an index/array-size exception.
        Preconditions.checkArgument(args != null && args.length >= 2,
                "Usage: GrepDriver <input> <output> [pattern ...]");

        String in = args[0];
        String out = args[1];
        System.err.println("Using input=" + in);
        System.err.println("Using output=" + out);

        // Everything after the two paths is treated as a pattern.
        String[] patterns = Arrays.copyOfRange(args, 2, args.length);
        System.err.println("Using patterns: " + Arrays.toString(patterns));

        for (int i = 0; i < patterns.length; i++) {
            String pattern = patterns[i];

            // A fresh configuration per job so each mapper sees only its own pattern.
            Configuration conf = new Configuration();
            conf.set("pattern", pattern);

            Job job = Job.getInstance(conf, "Grep for " + pattern);
            job.setMapperClass(Grep.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            // Map-only job: mapper output goes straight to the output format.
            job.setNumReduceTasks(0);
            job.setJarByClass(Grep.class);

            FileInputFormat.addInputPath(job, new Path(in));
            // NOTE(review): the pattern is appended verbatim to the output path; patterns
            // containing path separators or other characters that are special in paths
            // may produce unexpected output locations - confirm this is acceptable.
            FileOutputFormat.setOutputPath(job, new Path(out + "_" + pattern));

            if (!job.waitForCompletion(true)) {
                throw new RuntimeException("Grep job " + i + " failed");
            }
        }
    }
}