package com.github.projectflink.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
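
/**
 * Distributed grep on Spark: reads a text file and, for each regular
 * expression passed on the command line, writes the matching lines to a
 * separate output directory named "<output prefix>_<pattern>" (the pattern
 * string is used verbatim in the path).
 *
 * Usage: Grep <master> <input path> <output prefix> <pattern> [<pattern> ...]
 *
 * A hypothetical invocation (jar name, master URL, and paths are
 * placeholders, not part of this repository):
 *
 *   spark-submit --class com.github.projectflink.spark.Grep grep.jar \
 *       spark://host:7077 hdfs:///data/in hdfs:///data/out ERROR WARN
 */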
public class Grep {
    public static void main(String[] args) {
        if (args.length < 4) {
            System.err.println("Usage: Grep <master> <input path> <output prefix> <pattern> [<pattern> ...]");
            System.exit(1);
        }
        String master = args[0];
        String inFile = args[1];
        String outFile = args[2];
        String[] patterns = new String[args.length - 3];
        System.arraycopy(args, 3, patterns, 0, args.length - 3);
        System.err.println("Starting Spark with master=" + master + " in=" + inFile);
        System.err.println("Using patterns: " + Arrays.toString(patterns));
        SparkConf conf = new SparkConf()
                .setAppName("Grep job")
                .setMaster(master)
                .set("spark.hadoop.validateOutputSpecs", "false");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> file = sc.textFile(inFile);
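        // One filter-and-save pass per pattern; each result set goes to its
        // own output directory, suffixed with the pattern string.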
        for (final String pattern : patterns) {
            JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
                private static final long serialVersionUID = 1L;
                // Compiled once on the driver and shipped with the closure;
                // java.util.regex.Pattern is Serializable.
                private final Pattern compiled = Pattern.compile(pattern);

                @Override
                public Boolean call(String value) throws Exception {
                    if (value == null || value.isEmpty()) {
                        return false;
                    }
                    return compiled.matcher(value).find();
                }
            });
            res.saveAsTextFile(outFile + "_" + pattern);
        }
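
        // Stop the context so the application releases its cluster resources.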
        sc.stop();
    }
}