package edu.isi.karma.spark;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Map;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import com.github.jsonldjava.core.JsonLdOptions;
import com.github.jsonldjava.core.JsonLdProcessor;
import com.github.jsonldjava.utils.JsonUtils;
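/**
 * Spark driver that compacts JSON-LD records against a shared context and
 * writes the results as a sequence file of (key, value) Text pairs.
 *
 * A minimal invocation sketch; the jar name and paths below are illustrative
 * assumptions, not taken from the project's build:
 *
 * <pre>
 * spark-submit --class edu.isi.karma.spark.JSONContextDriver karma-spark.jar \
 *   -filepath hdfs:///input/records.seq -inputformat sequence \
 *   -contextUrl http://example.org/context.json \
 *   -outputpath hdfs:///output/compacted
 * </pre>
 */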
public class JSONContextDriver {
private static Logger logger = LoggerFactory.getLogger(JSONContextDriver.class);
private JSONContextDriver() {
}
public static void main(String[] args) throws ParseException, IOException, ClassNotFoundException {
int defaultPartitions = 100;
Options options = createCommandLineOptions();
CommandLineParser parser = new BasicParser();
CommandLine cl = parser.parse(options, args);
if (cl == null || cl.getOptions().length == 0 || cl.hasOption("help")) {
HelpFormatter hf = new HelpFormatter();
hf.printHelp(KarmaDriver.class.getSimpleName(), options);
}
String filePath = cl.getOptionValue("filepath");
String contextUrl = cl.getOptionValue("contextUrl");
String outputPath = cl.getOptionValue("outputpath");
String inputFormat = cl.getOptionValue("inputformat");
if (filePath == null || outputPath == null) {
logger.error("No file path provided!");
return;
}
int partitions = defaultPartitions;
try {
partitions = Integer.parseInt(cl.getOptionValue("partitions"));
} catch (NumberFormatException e) {
// Fall back to the default partition count if the option is missing or malformed.
}
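// Prefer jars shipped with the application over the cluster's copies, to avoid
// classpath conflicts with older JSON-LD or Jackson versions on the cluster.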
final SparkConf conf = new SparkConf().setAppName("Karma");
conf.set("spark.executor.userClassPathFirst", "true");
conf.set("spark.driver.userClassPathFirst", "true");
conf.set("spark.files.userClassPathFirst", "true");
conf.set("spark.io.compression.codec", "lz4");
final JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<String, String> pairs;
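// Normalize the input into (key, value) string pairs; text input is expected
// as tab-separated key/value lines.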
if (inputFormat.equals("text")) {
JavaRDD<String> input = sc.textFile(filePath, partitions);
pairs = input.mapToPair(new PairFunction<String, String, String>() {
private static final long serialVersionUID = 4170227232300260255L;
@Override
public Tuple2<String, String> call(String s) throws Exception {
int tabIndex = s.indexOf("\t");
return new Tuple2<>(s.substring(0, tabIndex), s.substring(tabIndex + 1));
}
});
}
else {
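// Sequence file input: convert the Writable key and Text value to plain strings.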
JavaPairRDD<Writable, Text> input = sc.sequenceFile(filePath, Writable.class, Text.class, partitions);
pairs = input.mapToPair(new PairFunction<Tuple2<Writable, Text>, String, String>() {
private static final long serialVersionUID = -9042224661662821670L;
@Override
public Tuple2<String, String> call(Tuple2<Writable, Text> textTextTuple2) throws Exception {
return new Tuple2<>(textTextTuple2._1.toString(), textTextTuple2._2.toString());
}
});
}
applyContext(sc, pairs, contextUrl)
.saveAsNewAPIHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
sc.stop();
}
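/**
 * Compacts each JSON value in the input pair RDD against the JSON-LD context
 * fetched from contextUrl. The parsed context is broadcast once so executors
 * reuse it instead of re-fetching the URL for every record.
 */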
public static JavaPairRDD<String, String> applyContext(JavaSparkContext jsc,
JavaPairRDD<String, String> input,
final String contextUrl) throws IOException {
final Object contextObject;
// try-with-resources ensures the stream is closed even if parsing fails.
try (InputStream in = new URL(contextUrl).openStream()) {
contextObject = JsonUtils.fromInputStream(in);
}
final Broadcast<Object> context = jsc.broadcast(contextObject);
return input.mapToPair(new PairFunction<Tuple2<String,String>, String, String>() {
private static final long serialVersionUID = 2878941073410454935L;
@SuppressWarnings("unchecked")
@Override
public Tuple2<String, String> call(Tuple2<String, String> t)
throws Exception {
String key = t._1();
String value = t._2();
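// Parse the record so the output key can be derived from its 'uri' or '@id' field below.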
JSONObject obj = new JSONObject(value);
Object outobj = JsonLdProcessor.compact(JsonUtils.fromString(value),
context.getValue(),
new JsonLdOptions(""));
if (outobj instanceof Map) {
@SuppressWarnings("rawtypes")
Map outjsonobj = (Map) outobj;
outjsonobj.put("@context", contextUrl);
}
value = JsonUtils.toString(outobj);
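// Prefer the record's 'uri', then '@id', as the output key; fall back to the whole record.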
if (obj.has("uri")) {
key = obj.getString("uri");
}
else if (obj.has("@id")) {
key = obj.getString("@id");
}
else {
key = obj.toString();
}
return new Tuple2<>(key, value);
}
});
}
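/**
 * Convenience overload for tab-separated "key\tvalue" lines: splits each line
 * into a pair, applies the context, and re-joins the result with a tab.
 */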
public static JavaRDD<String> applyContext(JavaSparkContext jsc,
JavaRDD<String> input, String contextUrl) throws IOException {
JavaPairRDD<String, String> inputPair = input.mapToPair(new PairFunction<String, String, String>() {
private static final long serialVersionUID = -4153068088292891034L;
@Override
public Tuple2<String, String> call(String s) throws Exception {
int tabIndex = s.indexOf("\t");
return new Tuple2<>(s.substring(0, tabIndex), s.substring(tabIndex + 1));
}
});
JavaPairRDD<String, String> pairs = applyContext(jsc, inputPair, contextUrl);
return pairs.map(new Function<Tuple2<String, String>, String>() {
private static final long serialVersionUID = 5833358013516510838L;
@Override
public String call(Tuple2<String, String> arg0) throws Exception {
return (arg0._1() + "\t" + arg0._2());
}
});
}
private static Options createCommandLineOptions() {
Options options = new Options();
options.addOption(new Option("filepath", "filepath", true, "Path to coordinate sequence file"));
options.addOption(new Option("contextUrl", "contextUrl", true, "Path to context url"));
options.addOption(new Option("outputpath", "outputpath", true, "Path to output directory"));
options.addOption(new Option("inputformat", "inputformat", true, "Path to output directory"));
options.addOption(new Option("partitions", "partitions", true, "Number of partitions"));
return options;
}
}