package com.github.projectflink.spark;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.storage.StorageLevel;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Spark driver that runs one regular-expression "grep" pass per pattern over
 * a single text input, persisting the input RDD with a caller-chosen storage
 * level so every pass after the first can reuse it.
 *
 * <p>Arguments: {@code <master> <inFile> <outFile> <storageLevel> [pattern ...]}.
 * For each pattern, the matching lines are saved to {@code <outFile>_<pattern>}.
 */
public class GrepCaching {

    /** Number of positional arguments that precede the list of patterns. */
    private static final int FIXED_ARG_COUNT = 4;

    /**
     * Entry point.
     *
     * @param args {@code master}, {@code inFile}, {@code outFile},
     *             {@code storageLevel}, followed by zero or more regular expressions
     * @throws IllegalArgumentException if fewer than four arguments are given or
     *                                  the storage level name is not recognized
     */
    public static void main(String[] args) {
        if (args == null || args.length < FIXED_ARG_COUNT) {
            throw new IllegalArgumentException(
                    "Usage: GrepCaching <master> <inFile> <outFile> <storageLevel> [pattern ...]");
        }
        final String master = args[0];
        final String inFile = args[1];
        final String outFile = args[2];
        final String storageLevel = args[3];
        final String[] patterns = Arrays.copyOfRange(args, FIXED_ARG_COUNT, args.length);

        System.err.println("Starting spark with master="+master+" in="+inFile);
        System.err.println("Using patterns: "+ Arrays.toString(patterns));

        // Resolve the storage level before the context exists, so a typo on the
        // command line fails fast instead of leaving a started context behind.
        final StorageLevel level = parseStorageLevel(storageLevel);

        // NOTE(review): per Spark's configuration docs, disabling
        // spark.hadoop.validateOutputSpecs lets saveAsTextFile write into an
        // already existing output directory - confirm this is still intended.
        SparkConf conf = new SparkConf()
                .setAppName("Grep job")
                .setMaster(master)
                .set("spark.hadoop.validateOutputSpecs", "false");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            JavaRDD<String> file = sc.textFile(inFile).persist(level);
            for (final String pattern : patterns) {
                JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
                    private static final long serialVersionUID = 1L;

                    // Compiled once when the function object is created,
                    // not once per record.
                    private final Pattern regex = Pattern.compile(pattern);

                    @Override
                    public Boolean call(String value) throws Exception {
                        if (value == null || value.length() == 0) {
                            return false;
                        }
                        // find(): the pattern may match anywhere in the line.
                        return regex.matcher(value).find();
                    }
                });
                res.saveAsTextFile(outFile+"_"+pattern);
            }
        } finally {
            // Always shut the context down, even when a job fails.
            sc.stop();
        }
    }

    /**
     * Maps a storage level name to the corresponding Spark {@link StorageLevel}.
     *
     * @param storageLevel one of {@code MEMORY_ONLY}, {@code MEMORY_AND_DISK},
     *                     {@code MEMORY_ONLY_SER}, {@code MEMORY_AND_DISK_SER}
     *                     or {@code NONE}
     * @return the matching storage level
     * @throws IllegalArgumentException if the name is not one of the supported values
     */
    private static StorageLevel parseStorageLevel(String storageLevel) {
        switch (storageLevel) {
            case "MEMORY_ONLY":
                return StorageLevel.MEMORY_ONLY();
            case "MEMORY_AND_DISK":
                return StorageLevel.MEMORY_AND_DISK();
            case "MEMORY_ONLY_SER":
                return StorageLevel.MEMORY_ONLY_SER();
            case "MEMORY_AND_DISK_SER":
                return StorageLevel.MEMORY_AND_DISK_SER();
            case "NONE":
                return StorageLevel.NONE();
            default:
                throw new IllegalArgumentException("Unknown storage level "+storageLevel);
        }
    }
}