package edu.isi.karma.spark; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Properties; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; import edu.isi.karma.util.JSONLDUtilSimple; public class JSONReducerDriver { private static Logger logger = LoggerFactory.getLogger(JSONReducerDriver.class); private JSONReducerDriver() { } public static void main(String[] args) throws ParseException, IOException, ClassNotFoundException { int defaultPartitions = 100; Options options = createCommandLineOptions(); CommandLineParser parser = new BasicParser(); CommandLine cl = null; cl = parser.parse(options, args); if (cl == null || cl.getOptions().length == 0 || cl.hasOption("help")) { HelpFormatter hf = new HelpFormatter(); hf.printHelp(KarmaDriver.class.getSimpleName(), options); } String filePath = cl.getOptionValue("filepath"); String outputPath = cl.getOptionValue("outputpath"); String inputFormat = cl.getOptionValue("inputformat"); if (filePath == null || outputPath == null) { logger.error("No file path provided!"); return; } int partitions = defaultPartitions; try { partitions = Integer.parseInt(cl.getOptionValue("partitions")); } catch (Exception e) { } final SparkConf conf = new SparkConf().setAppName("Karma"); conf.set("spark.executor.userClassPathFirst", "true"); conf.set("spark.driver.userClassPathFirst", "true"); conf.set("spark.files.userClassPathFirst", "true"); conf.set("spark.io.compression.codec", "lz4"); final JavaSparkContext sc = new JavaSparkContext(conf); JavaPairRDD<String, String> pairs; if (inputFormat.equals("text")) { JavaRDD<String> input = sc.textFile(filePath, partitions); pairs = input.mapToPair(new PairFunction<String, String, String>() { private static final long serialVersionUID = 4170227232300260255L; @Override public Tuple2<String, String> call(String s) throws Exception { int tabIndex = s.indexOf("\t"); return new Tuple2<>(s.substring(0, tabIndex), s.substring(tabIndex + 1)); } }); } else { JavaPairRDD<Writable, Text> input = sc.sequenceFile(filePath, Writable.class, Text.class, partitions); pairs = input.mapToPair(new PairFunction<Tuple2<Writable, Text>, String, String>() { private static final long serialVersionUID = -9042224661662821670L; @Override public Tuple2<String, String> call(Tuple2<Writable, Text> textTextTuple2) throws Exception { return new Tuple2<>(textTextTuple2._1.toString(), textTextTuple2._2.toString()); } }); } Properties karmaSettings = new Properties(); reduceJSON(sc, pairs, karmaSettings) .saveAsNewAPIHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); } public static JavaPairRDD<String, String> reduceJSON(JavaSparkContext sc, JavaPairRDD<String, String> input, final Properties karmaSettings) { return reduceJSON(sc, input, sc.getConf().getInt("spark.default.parallelism", 1), karmaSettings); } public static JavaPairRDD<String, String> reduceJSON(JavaSparkContext sc, JavaPairRDD<String, String> input, int numPartitions, final Properties karmaSettings) { JavaPairRDD<String, JSONObject> pairs = input.mapToPair(new PairFunction<Tuple2<String, String>, String, JSONObject>() { private static final long serialVersionUID = 8884768697918036449L; @Override public Tuple2<String, JSONObject> call(Tuple2<String, String> tuple) throws Exception { JSONParser parser = new JSONParser(); String key = tuple._1(); String value = tuple._2(); JSONObject obj = (JSONObject)parser.parse(value); if (obj.containsKey("uri")) { key = (String)obj.get("uri"); } else if (obj.containsKey("@id")) { key = (String)obj.get("@id"); } return new Tuple2<>(key, obj); } }); return reduceJSON(numPartitions, pairs, karmaSettings); } public static JavaPairRDD<String, String> reduceJSON(int numPartitions, JavaPairRDD<String, JSONObject> pairs, final Properties karmaSettings) { String provenancePropertiesStr = karmaSettings.getProperty("karma.provenance.properties"); HashMap<String, String> provenaceProperties = new HashMap<>(); if(provenancePropertiesStr != null) { String[] provProps = provenancePropertiesStr.split(","); for(String provProp : provProps) { String[] provType = provProp.split(":"); String property = provType[0]; String type = "string"; if(provType.length > 1) type = provType[1].toLowerCase(); provenaceProperties.put(property, type); } } return reduceJSON(numPartitions, pairs, provenaceProperties); } public static JavaPairRDD<String, String> reduceJSON(int numPartitions, JavaPairRDD<String, JSONObject> pairs, final HashMap<String, String> provenaceProperties) { JavaPairRDD<String, JSONObject> reducedPairs = pairs .reduceByKey(new Function2<JSONObject, JSONObject, JSONObject>() { private static final long serialVersionUID = -3238789305990222436L; @Override public JSONObject call(JSONObject left, JSONObject right) throws Exception { return JSONLDUtilSimple.mergeJSONObjects(left, right, provenaceProperties); } }, numPartitions); return reducedPairs .mapValues(new Function<JSONObject, String>() { private static final long serialVersionUID = -1945629738808728265L; @Override public String call(JSONObject object) throws Exception { return object.toJSONString(); } }); } public static JavaRDD<String> reduceJSON(JavaSparkContext jsc, JavaRDD<String> input, final Properties karmaSettings) { return reduceJSON(jsc, input, jsc.getConf().getInt("spark.default.parallelism", 1), karmaSettings); } public static JavaRDD<String> reduceJSON(JavaSparkContext sc, JavaRDD<String> input, int numPartitions) throws org.json.simple.parser.ParseException { return reduceJSON(sc, input, numPartitions, "{}"); } public static JavaRDD<String> reduceJSON(JavaSparkContext sc, JavaRDD<String> input, int numPartitions, String propertiesStr) throws org.json.simple.parser.ParseException { JSONParser parser = new JSONParser(); JSONObject properties = (JSONObject) parser.parse(propertiesStr); Properties prop = new Properties(); for (@SuppressWarnings("unchecked") Iterator<String> keysIterator = properties.keySet().iterator(); keysIterator.hasNext(); ) { String objPropertyName = keysIterator.next(); String propertyName = objPropertyName; String value = ((String)properties.get(propertyName)); logger.info("Set " + propertyName + "=" + value); prop.setProperty(propertyName, value); } return reduceJSON(sc, input, numPartitions, prop); } public static JavaRDD<String> reduceJSON(JavaSparkContext jsc, JavaRDD<String> input, int numPartitions, final Properties karmaSettings) { JavaPairRDD<String, String> inputPair = input.mapToPair(new PairFunction<String, String, String>() { private static final long serialVersionUID = -4153068088292891034L; public Tuple2<String, String> call(String s) throws Exception { int tabIndex = s.indexOf("\t"); return new Tuple2<>(s.substring(0, tabIndex), s.substring(tabIndex + 1)); } }); JavaPairRDD<String, String> pairs = reduceJSON(jsc, inputPair, numPartitions, karmaSettings); return pairs.map(new Function<Tuple2<String, String>, String>() { private static final long serialVersionUID = 5833358013516510838L; @Override public String call(Tuple2<String, String> arg0) throws Exception { return (arg0._1() + "\t" + arg0._2()); } }); } private static Options createCommandLineOptions() { Options options = new Options(); options.addOption(new Option("filepath", "filepath", true, "Path to coordinate sequence file")); options.addOption(new Option("outputpath", "outputpath", true, "Path to output directory")); options.addOption(new Option("inputformat", "inputformat", true, "Path to output directory")); options.addOption(new Option("partitions", "partitions", true, "Number of partitions")); return options; } }