package com.caseystella.cli;
import com.caseystella.input.Mode;
import com.caseystella.output.CursesVisualize;
import com.caseystella.summarize.Summarizer;
import com.caseystella.summarize.Summary;
import com.caseystella.summarize.TotalSummary;
import com.caseystella.util.JSONUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import org.apache.commons.cli.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import javax.annotation.Nullable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
public class SummarizerCLI {
/**
 * Command line options understood by the summarizer.
 *
 * <p>Every constant couples a short option code with the commons-cli {@link Option}
 * built for that code, and exposes small helpers for reading the option back out of a
 * parsed {@link CommandLine}.
 */
public enum SummarizerOptions {
    HELP("h", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "help", false, "This screen", null);
        }
    }),
    LOAD("l", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "load", true, "Load an existing summary", "JSON");
        }
    }),
    INPUT("i", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "input", true, "Input source", "SOURCE");
        }
    }),
    MODE("m", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            String description = "Type of mode. One of " + Joiner.on(",").join(Mode.values());
            return optional(code, "mode", true, description, "MODE");
        }
    }),
    INPUT_PROPERTIES("D", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            // Two-argument option split on the default value separator (property=value).
            return OptionBuilder.withArgName("property=value")
                                .hasArgs(2)
                                .withValueSeparator()
                                .withDescription("Input properties")
                                .create(code);
        }
    }),
    NUMERIC_SAMPLE_SIZE("ns", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "numeric_sample_size", true, "Sample size for numeric data.", "NUM");
        }
    }),
    NON_NUMERIC_SAMPLE_SIZE("nns", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "non_numeric_sample_size", true, "Sample size for non-numeric data.", "NUM");
        }
    }),
    PERCENTILES("pct", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "percentiles", true, "A comma separated list of percentiles in (0, 100].", "PCTILE1[,PCTILE2]*");
        }
    }),
    OUTPUT("o", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return optional(code, "output", true, "output location", "SOURCE");
        }
    }),
    SIMILARITY_SCORE_CUTOFF("ssc", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return typed(code, "similarity_score_cutoff", "Similarity score cutoff. Scores are cosine sim., so they range from [0,1], closer to 1 is more similar", "SCORE_CUTOFF", Double.class);
        }
    }),
    SIMILARITY_MIN_OCCURRANCE("smo", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return typed(code, "similarity_min_occurrance", "Min Occurrances to be considered for synonyms", "NUM_OCCURANCES", Integer.class);
        }
    }),
    SIMILARITY_VEC_SIZE("svs", new Function<String, Option>() {
        @Override
        public Option apply(String code) {
            return typed(code, "similarity_vec_size", "Vector Size", "DIM", Integer.class);
        }
    });

    /** The commons-cli option built for this constant. */
    Option option;
    /** The short code the option is registered and looked up under. */
    String shortCode;

    /**
     * @param shortCode     short option code
     * @param optionHandler factory invoked once, with {@code shortCode}, to build the option
     */
    SummarizerOptions(String shortCode, Function<String, Option> optionHandler) {
        this.shortCode = shortCode;
        this.option = optionHandler.apply(shortCode);
    }

    /**
     * Builds a non-required option.
     *
     * @param argName display name of the argument, or {@code null} to leave it unset
     */
    private static Option optional(String code, String longName, boolean takesArg,
                                   String description, String argName) {
        Option built = new Option(code, longName, takesArg, description);
        built.setRequired(false);
        if (argName != null) {
            built.setArgName(argName);
        }
        return built;
    }

    /** Builds a non-required, single-argument option that also records a value type. */
    private static Option typed(String code, String longName, String description,
                                String argName, Class<?> valueType) {
        Option built = optional(code, longName, true, description, argName);
        built.setType(valueType);
        return built;
    }

    /** @return whether this option is present on the parsed command line */
    public boolean has(CommandLine cli) {
        return cli.hasOption(shortCode);
    }

    /** @return the option's value as reported by {@link CommandLine#getOptionValue(String)} */
    public String get(CommandLine cli) {
        return cli.getOptionValue(shortCode);
    }

    /** @return the option's value when the option is present, otherwise {@code def} */
    public String get(CommandLine cli, String def) {
        if (!has(cli)) {
            return def;
        }
        return cli.getOptionValue(shortCode);
    }

    /**
     * Copies the option's key/value properties into a string map.
     *
     * @return a new mutable map holding the {@code toString()} form of every key and value
     */
    public Map<String, String> getProperties(CommandLine cli) {
        Properties raw = cli.getOptionProperties(shortCode);
        Map<String, String> asStrings = new HashMap<>();
        for (Object key : raw.keySet()) {
            asStrings.put(key.toString(), raw.get(key).toString());
        }
        return asStrings;
    }

    /**
     * Parses {@code args} against all declared options.
     *
     * <p>When the help option is present, usage is printed and the JVM exits with status 0.
     * On a parse failure the offending arguments, the stack trace and the usage are written
     * out before the exception is rethrown.
     *
     * @throws ParseException if the parser rejects the arguments
     */
    public static CommandLine parse(CommandLineParser parser, String[] args) throws ParseException {
        final CommandLine parsed;
        try {
            parsed = parser.parse(getOptions(), args);
        } catch (ParseException failure) {
            System.err.println("Unable to parse args: " + Joiner.on(' ').join(args));
            failure.printStackTrace(System.err);
            printHelp();
            throw failure;
        }
        if (HELP.has(parsed)) {
            printHelp();
            System.exit(0);
        }
        return parsed;
    }

    /** Prints the usage screen for all declared options. */
    public static void printHelp() {
        new HelpFormatter().printHelp("SummarizerCLI", getOptions());
    }

    /** @return a fresh {@link Options} holding the option of every enum constant */
    public static Options getOptions() {
        Options all = new Options();
        for (SummarizerOptions declared : values()) {
            all.addOption(declared.option);
        }
        return all;
    }
}
/**
 * Parses a comma separated list of percentiles.
 *
 * <p>Each token is trimmed before parsing. Blank tokens (for example the one produced by a
 * trailing comma, or by {@code "25,,50"}) are skipped instead of failing the whole parse.
 * No range check is applied to the parsed values.
 *
 * @param pctiles comma separated numbers, e.g. {@code "25,50,75,95,99"}; must not be null
 * @return a new mutable list of the parsed values, in input order
 * @throws NumberFormatException if a non-blank token is not a valid double
 */
private static List<Double> getPercentiles(String pctiles) {
    List<Double> ret = new ArrayList<>();
    for (String token : pctiles.split(",")) {
        String trimmed = token.trim();
        if (trimmed.isEmpty()) {
            // An empty token carries no value; parsing it would throw.
            continue;
        }
        ret.add(Double.parseDouble(trimmed));
    }
    return ret;
}
/**
 * Command line entry point.
 *
 * <p>Either loads a previously saved summary (load option) or opens the input with the
 * requested mode on a Spark context and summarizes it. The resulting summary is then
 * written as JSON to the output file when the output option is given, and shown through
 * {@link CursesVisualize} otherwise.
 *
 * @param argv raw command line arguments
 * @throws ParseException if the arguments cannot be parsed, or if no summary is loaded and
 *                        no mode was supplied
 * @throws IOException    if loading the saved summary or writing the output fails
 */
public static void main(String... argv) throws ParseException, IOException {
    Parser parser = new PosixParser();
    CommandLine cli = SummarizerOptions.parse(parser, argv);
    TotalSummary output;
    if (SummarizerOptions.LOAD.has(cli)) {
        output = JSONUtils.INSTANCE.load(new File(SummarizerOptions.LOAD.get(cli)), TotalSummary.class);
    } else {
        // The mode option is not marked required, so its value may be absent; report that
        // as a usage error rather than letting a NullPointerException escape.
        if (!SummarizerOptions.MODE.has(cli)) {
            SummarizerOptions.printHelp();
            throw new ParseException("Option -m/--mode is required unless -l/--load is given. One of "
                    + Joiner.on(",").join(Mode.values()));
        }
        // Locale.ROOT keeps the upper-casing independent of the JVM's default locale.
        Mode mode = Mode.valueOf(SummarizerOptions.MODE.get(cli).toUpperCase(Locale.ROOT));
        String input = SummarizerOptions.INPUT.get(cli);
        Map<String, String> inputOptions = SummarizerOptions.INPUT_PROPERTIES.getProperties(cli);

        // Parse every tunable before starting Spark so a malformed value fails immediately.
        int numericSampleSize = Integer.parseInt(SummarizerOptions.NUMERIC_SAMPLE_SIZE.get(cli, "1500"));
        int nonNumericSampleSize = Integer.parseInt(SummarizerOptions.NON_NUMERIC_SAMPLE_SIZE.get(cli, "20"));
        List<Double> percentiles = getPercentiles(SummarizerOptions.PERCENTILES.get(cli, "25,50,75,95,99"));
        double similarityScoreCutoff = Double.parseDouble(SummarizerOptions.SIMILARITY_SCORE_CUTOFF.get(cli, "0.8"));
        int similarityDim = Integer.parseInt(SummarizerOptions.SIMILARITY_VEC_SIZE.get(cli, "100"));
        int similarityMinOccurrance = Integer.parseInt(SummarizerOptions.SIMILARITY_MIN_OCCURRANCE.get(cli, "10"));

        SparkConf conf = new SparkConf().setAppName("Summarizer");
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            DataFrame df = mode.open(input, sc, inputOptions);
            // NOTE(review): the literal 100 is passed positionally; its meaning is defined
            // by Summarizer.summarize, which is not visible here.
            output = Summarizer.summarize(df, numericSampleSize
                                         , nonNumericSampleSize, percentiles
                                         , 100, similarityScoreCutoff
                                         , similarityMinOccurrance
                                         , similarityDim
                                         );
        } finally {
            // Release the Spark context whether or not summarization succeeded.
            sc.stop();
        }
    }
    if (SummarizerOptions.OUTPUT.has(cli)) {
        File out = new File(SummarizerOptions.OUTPUT.get(cli));
        // Write with an explicit charset instead of the platform default so the file does
        // not depend on the machine it was produced on.
        try (PrintWriter pw = new PrintWriter(out, "UTF-8")) {
            IOUtils.write(JSONUtils.INSTANCE.toJSON(output, true), pw);
        }
    } else {
        new CursesVisualize().display(output);
    }
}
}