package edu.isi.karma.jsonld.spark;
import com.jayway.jsonpath.JsonPath;
import edu.isi.karma.jsonld.helper.JSONLDConverter;
import org.apache.commons.cli.*;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.text.SimpleDateFormat;
import java.util.List;
/**
 * Spark driver that reads a Hadoop sequence file of (Text, Text) pairs, keeps the
 * records whose {@code dateCreated} values match an optional date filter, converts
 * each value with {@link JSONLDConverter#convertJSONLD}, merges the values that share
 * a key with {@link JSONLDConverter#deduplicateTriples}, and writes the resulting
 * values as text files.
 *
 * <p>Command line options: {@code filepath} (required), {@code outputpath} (required),
 * {@code numpartition}, {@code outputpartition}, {@code datefilter}/{@code date} and
 * {@code help}.
 *
 * Created by chengyey on 10/27/15.
 */
public class ConvertJSONLD {
    private static final Logger logger = LoggerFactory.getLogger(ConvertJSONLD.class);

    /** Pattern used to parse the {@code dateCreated} values found in the input JSON. */
    private static final String INPUT_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ss";
    /** Pattern the parsed dates are rendered in before being compared to the filter. */
    private static final String OUTPUT_DATE_PATTERN = "yyyy-MMM";

    // SimpleDateFormat is not thread-safe, and the filter closure below may be executed
    // by several task threads inside the same JVM, so each thread gets its own instance.
    // NOTE(review): the formatters use the JVM default locale, so the month abbreviation
    // produced by "MMM" depends on the locale of the machine running the task.
    private static final ThreadLocal<SimpleDateFormat> inputFormat = new ThreadLocal<SimpleDateFormat>() {
        @Override
        protected SimpleDateFormat initialValue() {
            return new SimpleDateFormat(INPUT_DATE_PATTERN);
        }
    };
    private static final ThreadLocal<SimpleDateFormat> outputFormat = new ThreadLocal<SimpleDateFormat>() {
        @Override
        protected SimpleDateFormat initialValue() {
            return new SimpleDateFormat(OUTPUT_DATE_PATTERN);
        }
    };

    private static final int defaultNumPartition = 100;
    private static final int defaultOutputPartition = 10;

    /**
     * Parses the command line, then runs the filter / convert / deduplicate job.
     *
     * @param args command line arguments, see {@link #createCommandLineOptions()}
     * @throws ParseException if the command line cannot be parsed
     */
    public static void main(String[] args) throws ParseException {
        Options options = createCommandLineOptions();
        CommandLineParser parser = new BasicParser();
        CommandLine cl = parser.parse(options, args);
        if (cl.getOptions().length == 0 || cl.hasOption("help")) {
            HelpFormatter hf = new HelpFormatter();
            hf.printHelp(ConvertJSONLD.class.getSimpleName(), options);
            // Nothing to run once the usage text has been shown.
            return;
        }

        String filePath = cl.getOptionValue("filepath");
        String outputPath = cl.getOptionValue("outputpath");
        final String dateFilter = cl.getOptionValue("datefilter", "");
        System.out.println("dateFilter: " + dateFilter);
        if (filePath == null || outputPath == null) {
            logger.error("No file path or output path provided!");
            return;
        }

        // Invalid numbers are logged and replaced by the defaults rather than aborting.
        int numPartition = parseIntOption("numpartition", cl.getOptionValue("numpartition"), defaultNumPartition);
        final int outputPartition =
                parseIntOption("outputpartition", cl.getOptionValue("outputpartition"), defaultOutputPartition);

        SparkConf conf = new SparkConf().setAppName("ConvertJSONLD");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            JavaPairRDD<Text, Text> input = sc.sequenceFile(filePath, Text.class, Text.class, numPartition);
            input.filter(new Function<Tuple2<Text, Text>, Boolean>() {
                @Override
                public Boolean call(Tuple2<Text, Text> record) throws Exception {
                    List<String> results = JsonPath.read(record._2().toString(), "$..dateCreated");
                    if (results.isEmpty()) {
                        // Records without any date are kept only when no filter was requested.
                        return dateFilter.isEmpty();
                    }
                    SimpleDateFormat parser = inputFormat.get();
                    SimpleDateFormat formatter = outputFormat.get();
                    for (String s : results) {
                        // A date that does not match the input pattern makes parse() throw,
                        // which fails the task (unchanged from the previous behavior).
                        String date = formatter.format(parser.parse(s));
                        if (date.contains(dateFilter)) {
                            return true;
                        }
                    }
                    return false;
                }
            }).mapToPair(new PairFunction<Tuple2<Text, Text>, String, String>() {
                @Override
                public Tuple2<String, String> call(Tuple2<Text, Text> record) throws Exception {
                    return new Tuple2<>(record._1().toString(),
                            new JSONLDConverter().convertJSONLD(record._2().toString()));
                }
            }).reduceByKey(new Function2<String, String, String>() {
                @Override
                public String call(String text, String text2) throws Exception {
                    return new JSONLDConverter().deduplicateTriples(text, text2);
                }
            }).values().coalesce(outputPartition).saveAsTextFile(outputPath);
        } finally {
            // Always release the Spark context, even when the job fails.
            sc.stop();
        }
    }

    /**
     * Parses an optional integer command line value.
     *
     * @param optionName name of the option, used only in the error message
     * @param rawValue   value given on the command line, or {@code null} when absent
     * @param fallback   value returned when {@code rawValue} is absent or not an integer
     * @return the parsed integer, or {@code fallback}
     */
    private static int parseIntOption(String optionName, String rawValue, int fallback) {
        if (rawValue == null) {
            return fallback;
        }
        try {
            return Integer.parseInt(rawValue);
        } catch (NumberFormatException e) {
            logger.error("Invalid number '" + rawValue + "' for option " + optionName
                    + ", using default " + fallback, e);
            return fallback;
        }
    }

    /**
     * Builds the set of command line options understood by {@link #main(String[])}.
     *
     * @return the option definitions
     */
    private static Options createCommandLineOptions() {
        Options options = new Options();
        options.addOption(new Option("filepath", "filepath", true, "Path to coordinate sequence file"));
        options.addOption(new Option("numpartition", "numpartition", true, "Minimum number of partitions"));
        options.addOption(new Option("outputpartition", "outputpartition", true, "Number of partitions for output"));
        options.addOption(new Option("outputpath", "outputpath", true, "Path to output directory"));
        options.addOption(new Option("datefilter", "date", true, "Date to filter in yyyy-MMM or MMM format"));
        // main() checks for "help"; it has to be registered or the parser rejects it.
        options.addOption(new Option("help", "help", false, "Print this help message"));
        return options;
    }
}