package edu.isi.karma.spark;

import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Tuple2;

import edu.isi.karma.rdf.JSONImpl;
import edu.isi.karma.rdf.N3Impl;

/**
 * Created by chengyey on 12/6/15.
 */
public class KarmaDriver {
    private static final Logger logger = LoggerFactory.getLogger(KarmaDriver.class);

    private KarmaDriver() {
    }

    public static void main(String[] args) throws ParseException, IOException, ClassNotFoundException {
        int defaultPartitions = 100;
        final int batchSize = 200;

        Options options = createCommandLineOptions();
        CommandLineParser parser = new BasicParser();
        CommandLine cl = parser.parse(options, args);
        if (cl == null || cl.getOptions().length == 0 || cl.hasOption("help")) {
            HelpFormatter hf = new HelpFormatter();
            hf.printHelp(KarmaDriver.class.getSimpleName(), options);
            return;
        }

        String filePath = cl.getOptionValue("filepath");
        String outputPath = cl.getOptionValue("outputpath");
        String inputFormat = cl.getOptionValue("inputformat");
        if (filePath == null || outputPath == null) {
            logger.error("Input and output paths are required!");
            return;
        }

        int partitions = defaultPartitions;
        try {
            partitions = Integer.parseInt(cl.getOptionValue("partitions"));
        } catch (Exception e) {
            // Keep the default partition count if the option is missing or malformed.
        }

        final SparkConf conf = new SparkConf().setAppName("Karma");
        conf.set("spark.master", "local[*]");
        conf.set("spark.executor.userClassPathFirst", "true");
        conf.set("spark.driver.userClassPathFirst", "true");
        conf.set("spark.files.userClassPathFirst", "true");
        conf.set("spark.io.compression.codec", "lz4");
        conf.set("spark.yarn.dist.archives", "karma.zip");
        conf.set("spark.yarn.dist.files", "job.properties");
        final JavaSparkContext sc = new JavaSparkContext(conf);

        // Normalize the input into (key, value) pairs: "text" input is split on the
        // first tab of each line; anything else is read as a Hadoop sequence file.
        JavaPairRDD<String, String> pairs;
        if ("text".equals(inputFormat)) {
            JavaRDD<String> input = sc.textFile(filePath, partitions);
            pairs = input.mapToPair(new PairFunction<String, String, String>() {
                private static final long serialVersionUID = 4170227232300260255L;

                @Override
                public Tuple2<String, String> call(String s) throws Exception {
                    int tabIndex = s.indexOf("\t");
                    return new Tuple2<>(s.substring(0, tabIndex), s.substring(tabIndex + 1));
                }
            });
        } else {
            JavaPairRDD<Writable, Text> input = sc.sequenceFile(filePath, Writable.class, Text.class, partitions);
            pairs = input.mapToPair(new PairFunction<Tuple2<Writable, Text>, String, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, String> call(Tuple2<Writable, Text> textTextTuple2) throws Exception {
                    return new Tuple2<>(textTextTuple2._1.toString(), textTextTuple2._2.toString());
                }
            });
        }

        Properties properties = new Properties();
        properties.load(new FileInputStream("job.properties"));
        applyModel(sc, pairs, properties, batchSize)
                .saveAsNewAPIHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
    }
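    /**
     * Convenience overload that reads the partition count from
     * spark.default.parallelism (falling back to 1).
     */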
    public static JavaPairRDD<Text, Text> applyModel(JavaSparkContext sc,
            JavaPairRDD<String, String> input,
            final Properties karmaSettings,
            final int batchSize) throws IOException {
        return applyModel(sc, input, karmaSettings, batchSize,
                sc.getConf().getInt("spark.default.parallelism", 1));
    }

    public static JavaPairRDD<Text, Text> applyModel(JavaSparkContext sc,
            JavaPairRDD<String, String> input,
            final Properties karmaSettings,
            final int batchSize,
            int numPartitions) throws IOException {
        String inputType = karmaSettings.getProperty("karma.input.type");
        String modelUrl = karmaSettings.getProperty("model.uri");
        String contextUrl = karmaSettings.getProperty("context.uri");

        // Fetch the model and context once on the driver and broadcast them so
        // that every executor reuses the same copies.
        String modelTxt = IOUtils.toString(new URL(modelUrl));
        final Broadcast<String> model = sc.broadcast(modelTxt);
        String contextTxt = IOUtils.toString(new URL(contextUrl));
        final Broadcast<String> context = sc.broadcast(contextTxt);
        final String outputFormat = karmaSettings.getProperty("karma.output.format");
        logger.info("Load model: " + modelUrl);
        logger.info("Load context: " + contextUrl);

        if ("JSON".equalsIgnoreCase(inputType)) {
            // Concatenate JSON records into arrays of batchSize elements so that
            // Karma is invoked once per batch instead of once per record.
            input = input.values().glom().flatMapToPair(
                    new PairFlatMapFunction<List<String>, String, String>() {
                        private static final long serialVersionUID = 7257511573596956635L;

                        @Override
                        public Iterable<Tuple2<String, String>> call(List<String> t) throws Exception {
                            List<Tuple2<String, String>> results = new LinkedList<>();
                            String key = "";
                            int count = 0;
                            StringBuilder builder = new StringBuilder();
                            builder.append("[");
                            boolean isFirst = true;
                            for (String value : t) {
                                if (isFirst) {
                                    builder.append(value);
                                    isFirst = false;
                                } else {
                                    builder.append(",").append(value);
                                }
                                count++;
                                if (count == batchSize) {
                                    builder.append("]");
                                    results.add(new Tuple2<>(key, builder.toString()));
                                    builder = new StringBuilder();
                                    builder.append("[");
                                    isFirst = true;
                                    count = 0;
                                }
                            }
                            String last = builder.append("]").toString();
                            results.add(new Tuple2<>(key, last));
                            return results;
                        }
                    });
        }

        if (outputFormat != null && outputFormat.equals("n3")) {
            return applyModelToGetN3(input, karmaSettings, model, context, outputFormat, numPartitions);
        } else {
            return applyModelToGetJSON(input, karmaSettings, model, context, outputFormat, numPartitions);
        }
    }
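    /**
     * Variant that accepts the Karma settings as a JSON object string and
     * flattens the generated pairs back into tab-separated lines.
     */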
    public static JavaRDD<String> applyModel(JavaSparkContext jsc,
            JavaRDD<String> input,
            String propertiesStr,
            final int batchSize,
            int numPartitions) throws IOException, org.json.simple.parser.ParseException {
        JSONParser parser = new JSONParser();
        JSONObject properties = (JSONObject) parser.parse(propertiesStr);
        Properties prop = new Properties();
        for (Object objPropertyName : properties.keySet()) {
            String propertyName = (String) objPropertyName;
            String value = (String) properties.get(propertyName);
            logger.info("Set " + propertyName + "=" + value);
            prop.setProperty(propertyName, value);
        }
        JavaPairRDD<Text, Text> pairs = applyModel(jsc, input, prop, batchSize, numPartitions);
        return pairs.map(new Function<Tuple2<Text, Text>, String>() {
            private static final long serialVersionUID = 5833358013516510838L;

            @Override
            public String call(Tuple2<Text, Text> arg0) throws Exception {
                return (arg0._1() + "\t" + arg0._2());
            }
        });
    }

    public static JavaPairRDD<Text, Text> applyModelToGetJSON(JavaPairRDD<String, String> input,
            final Properties karmaSettings,
            final Broadcast<String> model,
            final Broadcast<String> context,
            final String outputFormat,
            int numPartitions) {
        JavaPairRDD<String, JSONObject> pairs = input
                .flatMapToPair(new PairFlatMapFunction<Tuple2<String, String>, String, JSONObject>() {
                    private static final long serialVersionUID = -3533063264900721773L;

                    @Override
                    public Iterable<Tuple2<String, JSONObject>> call(
                            Tuple2<String, String> writableIterableTuple2) throws Exception {
                        List<Tuple2<String, JSONObject>> results = new LinkedList<>();
                        Properties karmaContentSettings = new Properties();
                        for (Map.Entry<Object, Object> objectObjectEntry : karmaSettings.entrySet())
                            karmaContentSettings.put(objectObjectEntry.getKey(), objectObjectEntry.getValue());
                        karmaContentSettings.put("model.content", model.value());
                        karmaContentSettings.put("context.content", context.value());

                        final JSONImpl mapper = new JSONImpl(karmaContentSettings);
                        String result = mapper.mapResult(writableIterableTuple2._1, writableIterableTuple2._2);
                        JSONParser parser = new JSONParser();
                        JSONArray generatedObjects = (JSONArray) parser.parse(result);
                        // Key each generated object by its @id when present, falling
                        // back to the serialized object itself.
                        for (int i = 0; i < generatedObjects.size(); i++) {
                            try {
                                JSONObject value = (JSONObject) generatedObjects.get(i);
                                String key;
                                if (value.containsKey(mapper.getAtId())) {
                                    key = (String) value.get(mapper.getAtId());
                                } else {
                                    key = value.toString();
                                }
                                results.add(new Tuple2<>(key, value));
                            } catch (ArrayIndexOutOfBoundsException ae) {
                                logger.error("Array index out of bounds while keying generated objects: "
                                        + ae.getMessage() + ", source: " + generatedObjects.get(i).toString());
                            }
                        }
                        return results;
                    }
                });

        boolean runReducer = true;
        if (karmaSettings.containsKey("karma.reducer.run")) {
            runReducer = Boolean.parseBoolean(karmaSettings.getProperty("karma.reducer.run"));
        }
        if (runReducer) {
            // Merge generated objects that share a key before serializing.
            JavaPairRDD<String, String> reducedSerializedPairs =
                    JSONReducerDriver.reduceJSON(numPartitions, pairs, karmaSettings);
            return reducedSerializedPairs
                    .mapToPair(new PairFunction<Tuple2<String, String>, Text, Text>() {
                        private static final long serialVersionUID = 2787821808872176951L;

                        @Override
                        public Tuple2<Text, Text> call(Tuple2<String, String> stringStringTuple2) throws Exception {
                            return new Tuple2<>(new Text(stringStringTuple2._1), new Text(stringStringTuple2._2));
                        }
                    });
        } else {
            // Return without running the reducer.
            return pairs.mapToPair(new PairFunction<Tuple2<String, JSONObject>, Text, Text>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<Text, Text> call(Tuple2<String, JSONObject> arg0) throws Exception {
                    return new Tuple2<>(new Text(arg0._1), new Text(arg0._2.toJSONString()));
                }
            });
        }
    }
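    /**
     * Applies the broadcast model with {@link N3Impl} and emits one
     * (subject, triple) pair per non-empty line of the generated N3.
     */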
    public static JavaPairRDD<Text, Text> applyModelToGetN3(JavaPairRDD<String, String> input,
            final Properties karmaSettings,
            final Broadcast<String> model,
            final Broadcast<String> context,
            final String outputFormat,
            int numPartitions) {
        JavaPairRDD<String, String> pairs = input.flatMapToPair(
                new PairFlatMapFunction<Tuple2<String, String>, String, String>() {
                    private static final long serialVersionUID = -3533063264900721773L;

                    @Override
                    public Iterable<Tuple2<String, String>> call(
                            Tuple2<String, String> writableIterableTuple2) throws Exception {
                        List<Tuple2<String, String>> results = new LinkedList<>();
                        Properties karmaContentSettings = new Properties();
                        for (Map.Entry<Object, Object> objectObjectEntry : karmaSettings.entrySet())
                            karmaContentSettings.put(objectObjectEntry.getKey(), objectObjectEntry.getValue());
                        karmaContentSettings.put("model.content", model.value());
                        karmaContentSettings.put("context.content", context.value());

                        if (outputFormat != null && outputFormat.equals("n3")) {
                            final N3Impl mapper = new N3Impl(karmaContentSettings);
                            String result = mapper.mapResult(writableIterableTuple2._1, writableIterableTuple2._2);
                            // Key every triple by its subject: the token before the first space.
                            String[] lines = result.split("(\r\n|\n)");
                            for (String line : lines) {
                                line = line.trim();
                                if (line.isEmpty()) {
                                    continue;
                                }
                                int splitBetweenSubjectAndPredicate = line.indexOf(' ');
                                String key = line.substring(0, splitBetweenSubjectAndPredicate);
                                results.add(new Tuple2<>(key, line));
                            }
                        }
                        return results;
                    }
                });

        return pairs.mapToPair(new PairFunction<Tuple2<String, String>, Text, Text>() {
            private static final long serialVersionUID = 2787821808872176951L;

            @Override
            public Tuple2<Text, Text> call(Tuple2<String, String> stringStringTuple2) throws Exception {
                return new Tuple2<>(new Text(stringStringTuple2._1), new Text(stringStringTuple2._2));
            }
        });
    }

    /**
     * Splits each tab-separated line into a (key, value) pair and delegates to
     * {@link #applyModel(JavaSparkContext, JavaPairRDD, Properties, int, int)}.
     */
    public static JavaPairRDD<Text, Text> applyModel(JavaSparkContext jsc,
            JavaRDD<String> input,
            Properties properties,
            final int batchSize,
            int numPartitions) throws IOException {
        JavaPairRDD<String, String> pairRDD = input.mapToPair(new PairFunction<String, String, String>() {
            private static final long serialVersionUID = -4153068088292891034L;

            @Override
            public Tuple2<String, String> call(String s) throws Exception {
                int tabIndex = s.indexOf("\t");
                return new Tuple2<>(s.substring(0, tabIndex), s.substring(tabIndex + 1));
            }
        });
        return applyModel(jsc, pairRDD, properties, batchSize, numPartitions);
    }

    private static Options createCommandLineOptions() {
        Options options = new Options();
        options.addOption(new Option("filepath", "filepath", true, "Path to the input file"));
        options.addOption(new Option("outputpath", "outputpath", true, "Path to the output directory"));
        options.addOption(new Option("inputformat", "inputformat", true, "Input format: text or sequence"));
        options.addOption(new Option("partitions", "partitions", true, "Number of partitions"));
        options.addOption(new Option("help", "help", false, "Print this help message"));
        return options;
    }
}
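// Example invocation (jar and file names are illustrative, not fixed by this
// class; job.properties must sit in the working directory, and the master is
// hardcoded above to local[*]):
//
//   spark-submit --class edu.isi.karma.spark.KarmaDriver karma-spark.jar \
//       -filepath input.seq -outputpath out -inputformat sequence -partitions 100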