package com.caseystella.summarize;
import com.caseystella.type.TypeInference;
import com.caseystella.type.ValueSummary;
import com.caseystella.util.LogLikelihood;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import scala.Tuple2;
import java.io.Serializable;
import java.util.*;
public class Summarizer implements Serializable {
/**
 * Builds a {@link TotalSummary} for every column of the DataFrame using a
 * handful of Spark passes over the data:
 * <ol>
 *   <li>per-column synonym candidates (delegated to SynonymHandler)</li>
 *   <li>counts and approximate distinct counts per (column, inferred type, modifier)</li>
 *   <li>top-10 unusually co-occurring categorical value pairs, ranked by G-score</li>
 *   <li>percentile/kurtosis statistics over a sample of the numeric values</li>
 *   <li>top-k most frequent non-numeric values per column</li>
 * </ol>
 *
 * @param df                      the input data; every cell's type is inferred per row
 * @param numericSampleSize       target per-key sample size for the numeric statistics pass
 * @param nonNumericSampleSize    how many of the most frequent non-numeric values to keep per column
 * @param percentiles             percentiles (0-100] to compute for numeric columns
 * @param numDistinctValuesCutoff a (column, type) is treated as categorical when its approximate
 *                                distinct count is strictly below this cutoff
 * @param similarityCutoff        forwarded to SynonymHandler
 * @param similarityMinOccurrances forwarded to SynonymHandler
 * @param vectorSize              forwarded to SynonymHandler
 * @return the populated summary, one Summary per column plus cross-column connections
 */
public static TotalSummary summarize( DataFrame df
, int numericSampleSize
, final int nonNumericSampleSize
, final List<Double> percentiles
, final int numDistinctValuesCutoff
, final double similarityCutoff
, final int similarityMinOccurrances
, final int vectorSize
)
{
TotalSummary totalSummary = new TotalSummary();
Map<String, Summary> columnSummaries = totalSummary.getColumnSummaries();
//Capture the column ordering once on the driver. The list is final because it is
//referenced from the anonymous Spark closures below (rows are indexed positionally).
final List<String> columns = new ArrayList<>();
for(String s : df.columns()) {
columns.add(s);
columnSummaries.put(s, new Summary());
}
//Synonym detection is delegated wholesale to SynonymHandler; we only record its output.
SynonymHandler handler = new SynonymHandler(df, columns, similarityMinOccurrances, vectorSize, similarityCutoff);
Map<String, Map<String, String>> synonyms = handler.findSynonymsByColumn();
for(Map.Entry<String, Map<String, String>> kv : synonyms.entrySet()) {
columnSummaries.get(kv.getKey()).setSynonyms(kv.getValue());
}
//Explode every cell into ((column, inferred type, modifier) -> ValueSummary) pairs.
//Cached because four separate aggregations below consume it.
//NOTE(review): this RDD is never unpersisted before returning -- confirm callers
//expect it to stay cached.
JavaPairRDD<TypedColumnWithModifier, ValueSummary> summarize =
df.javaRDD().flatMapToPair(new PairFlatMapFunction<Row, TypedColumnWithModifier, ValueSummary>() {
@Override
public Iterable<Tuple2<TypedColumnWithModifier, ValueSummary>> call(Row row) throws Exception {
List<Tuple2<TypedColumnWithModifier, ValueSummary>> ret = new ArrayList<>();
for (int i = 0; i < row.size(); ++i) {
String column = columns.get(i);
Object o = row.get(i);
ValueSummary summary = TypeInference.Type.infer(o);
ret.add(new Tuple2(new TypedColumnWithModifier(column, summary.getType(), summary.getModifier()), summary));
if (summary.getType() == TypeInference.Type.INTEGRAL) {
//we want to treat integers independently as well as part of floats because Z \subset R
ValueSummary valueSummaryFloat = ValueSummary.of(((Number) summary.getValue()).doubleValue()
, TypeInference.Type.FLOAT
, summary.getModifier()
);
ret.add(new Tuple2(new TypedColumnWithModifier(column, valueSummaryFloat.getType(), valueSummaryFloat.getModifier())
, valueSummaryFloat
)
);
}
}
return ret;
}
}).cache();
//NOTE(review): this is the number of emitted (cell, type) pairs -- NOT the number
//of rows, since integral cells are emitted twice (once as INTEGRAL, once as FLOAT).
//It is later passed to get_gscore as the contingency-table total; confirm that
//inflation is intended rather than using df row count.
final long totalCount = summarize.count();
//count by type
Map<TypedColumnWithModifier, Long> countByType = null;
{
//Classic map-to-(key, 1) / reduceByKey word-count pattern over the typed cells.
JavaPairRDD<TypedColumnWithModifier, Long> pair = summarize.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifier, ValueSummary>, TypedColumnWithModifier, Long>() {
@Override
public Tuple2<TypedColumnWithModifier, Long> call(Tuple2<TypedColumnWithModifier, ValueSummary> t) throws Exception {
return new Tuple2(t._1, 1L);
}
});
countByType = pair.reduceByKey(new Function2<Long, Long, Long>() {
@Override
public Long call(Long x, Long y) throws Exception {
return x + y;
}
}).collectAsMap();
for(Map.Entry<String, List<Map<String, Object>>> kv : Summary.countByColumn(countByType).entrySet()) {
columnSummaries.get(kv.getKey()).getCountByType().addAll(kv.getValue());
}
}
//Fold the per-(type, modifier) counts into each column's invalid and total tallies.
//NOTE(review): totals include the duplicated FLOAT entries for integral cells -- verify
//downstream consumers expect that.
for(Map.Entry<TypedColumnWithModifier, Long> col : countByType.entrySet()) {
Summary summary = totalSummary.getColumnSummaries().get(col.getKey().column);
if(col.getKey().modifier == TypeInference.Modifier.MISSING) {
summary.setNumInvalid(col.getValue());
}
long total = summary.getTotalCount() + col.getValue();
summary.setTotalCount(total);
}
//count approximate distinct values by type
final Map<TypedColumnWithModifier, Long> countDistinctByType = new HashMap<>();
{
Map<TypedColumnWithModifier, Object> tmp = summarize.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifier, ValueSummary>,TypedColumnWithModifier, Long>() {
@Override
public Tuple2<TypedColumnWithModifier, Long> call(Tuple2<TypedColumnWithModifier, ValueSummary> t) throws Exception {
return new Tuple2(t._1, t._2.getValue());
}
})
.countApproxDistinctByKey(0.001).collectAsMap();
//The Java API hands the approximate counts back as Object; round-trip through a
//String to coerce each value into a Long.
for(Map.Entry<TypedColumnWithModifier, Object> kv : tmp.entrySet()) {
countDistinctByType.put(kv.getKey(), Long.parseLong(kv.getValue() + ""));
}
for(Map.Entry<String, List<Map<String, Object>>> kv : Summary.countByColumn(countDistinctByType).entrySet()) {
columnSummaries.get(kv.getKey()).getCountDistinctByType().addAll(kv.getValue());
}
}
{
//A (column, type, modifier) is treated as categorical when its approximate distinct
//count falls below numDistinctValuesCutoff. First pass: marginal count per
//categorical (column, value).
final Map<TypedColumnWithModifierAndValue, Long> categoricalCounts =
df.javaRDD().flatMapToPair(new PairFlatMapFunction<Row, TypedColumnWithModifierAndValue, Long>() {
@Override
public Iterable<Tuple2<TypedColumnWithModifierAndValue, Long>> call(Row row) throws Exception {
List<Tuple2<TypedColumnWithModifierAndValue, Long>> categoricalVariables = new ArrayList<>();
for (int i = 0; i < row.size(); ++i) {
String column = columns.get(i);
Object o = row.get(i);
ValueSummary summary = TypeInference.Type.infer(o);
TypedColumnWithModifier columnWithModifier = new TypedColumnWithModifier(column, summary.getType(), summary.getModifier());
Long l = countDistinctByType.get(columnWithModifier) ;
if(l != null && l < numDistinctValuesCutoff) {
categoricalVariables.add(new Tuple2<>(columnWithModifier.withValue(summary.getValue(), false), 1L));
}
}
return categoricalVariables;
}
}).reduceByKey(
new Function2<Long, Long, Long>() {
@Override
public Long call(Long x, Long y) throws Exception {
return x + y;
}
}
).collectAsMap();
//Second pass: joint counts for every ordered-by-column pair of categorical values
//that co-occur in a row, then score each pair against its marginals with the
//log-likelihood ratio (G-score) and keep the 10 strongest associations.
final List<Tuple2<Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Double>> g_score_outliers=
df.javaRDD().flatMapToPair(new PairFlatMapFunction<Row, Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Long>() {
@Override
public Iterable<Tuple2<Tuple2<TypedColumnWithModifierAndValue,TypedColumnWithModifierAndValue>, Long>> call(Row row) throws Exception {
List<TypedColumnWithModifierAndValue> categoricalVariables = new ArrayList<>();
for (int i = 0; i < row.size(); ++i) {
String column = columns.get(i);
Object o = row.get(i);
ValueSummary summary = TypeInference.Type.infer(o);
TypedColumnWithModifier columnWithModifier = new TypedColumnWithModifier(column, summary.getType(), summary.getModifier());
Long l = countDistinctByType.get(columnWithModifier) ;
if(l != null && l < numDistinctValuesCutoff) {
categoricalVariables.add(columnWithModifier.withValue(summary.getValue(), false));
}
}
//Emit every unordered pair within the row exactly once (i < j).
List<Tuple2<Tuple2<TypedColumnWithModifierAndValue,TypedColumnWithModifierAndValue>, Long> > ret = new ArrayList<>();
for(int i = 0;i < categoricalVariables.size();++i) {
for(int j = i+1;j < categoricalVariables.size();++j) {
ret.add(new Tuple2<>(new Tuple2<>(categoricalVariables.get(i), categoricalVariables.get(j)), 1L));
}
}
return ret;
}
}).reduceByKey(
new Function2<Long, Long, Long>() {
@Override
public Long call(Long x, Long y) throws Exception {
return x + y;
}
}
).mapToPair(new PairFunction<Tuple2<Tuple2<TypedColumnWithModifierAndValue,TypedColumnWithModifierAndValue>,Long>, Tuple2<TypedColumnWithModifierAndValue,TypedColumnWithModifierAndValue>, Double>() {
@Override
public Tuple2<Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Double> call(Tuple2<Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Long> t) throws Exception {
long p_xy = t._2;
//Marginals can be absent if the categoricalCounts pass saw nothing; default to 0.
Long p_x = categoricalCounts.get(t._1._1);
if(p_x == null) {
p_x = 0L;
}
Long p_y = categoricalCounts.get(t._1._2);
if(p_y == null) {
p_y = 0L;
}
return new Tuple2<>(t._1, get_gscore(p_xy, p_x , p_y, totalCount ));
}
}).takeOrdered(10, new Comp());
totalSummary.setConnectedColumns(TotalSummary.toConnectedColumns(g_score_outliers));
}
//numeric summarize
Map<TypedColumnWithModifier, Map<String, Double>> numericValueSummary = null;
{
//Keep only numeric, non-missing cells, down-sample each key exactly to roughly
//numericSampleSize values, then gather the sample per key for statistics.
JavaPairRDD<TypedColumnWithModifier, Iterable<Double>> groupedSample =
summarize.filter(new Function<Tuple2<TypedColumnWithModifier, ValueSummary>, Boolean>() {
@Override
public Boolean call(Tuple2<TypedColumnWithModifier, ValueSummary> t) throws Exception {
return doSampling(t._1.type, t._1.modifier);
}
})
.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifier, ValueSummary>, TypedColumnWithModifier, Double>() {
@Override
public Tuple2<TypedColumnWithModifier, Double> call(Tuple2<TypedColumnWithModifier, ValueSummary> t) throws Exception {
return new Tuple2<>(t._1, ((Number) t._2.getValue()).doubleValue());
}
})
.sampleByKeyExact(false, getSamplePct(countByType, numericSampleSize))
.groupByKey()
;
numericValueSummary = groupedSample.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifier, Iterable<Double>>, TypedColumnWithModifier, Map<String, Double>>() {
@Override
public Tuple2<TypedColumnWithModifier, Map<String, Double>> call(Tuple2<TypedColumnWithModifier, Iterable<Double>> t) throws Exception {
return new Tuple2(t._1, statisticallySummarize(t._2, percentiles));
}
})
.collectAsMap();
for(Map.Entry<String, List<Map<String, Object>>> kv : Summary.summaryToList(numericValueSummary).entrySet()) {
columnSummaries.get(kv.getKey()).getNumericValueSummary().addAll(kv.getValue());
}
}
//non-numeric summarize
Map<TypedColumnWithModifier, Map<String, Double>> nonNumericValueSummary = null;
{
//For non-numeric (or missing) cells: count occurrences per (column, type, modifier,
//value), regroup by (column, type, modifier), and keep the top-k values by frequency.
List<Tuple2<TypedColumnWithModifier, Map<String, Double>>> sampleAndCount =
summarize.filter(new Function<Tuple2<TypedColumnWithModifier, ValueSummary>, Boolean>() {
@Override
public Boolean call(Tuple2<TypedColumnWithModifier, ValueSummary> t) throws Exception {
return !doSampling(t._1.type, t._1.modifier);
}
})
.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifier, ValueSummary>, TypedColumnWithModifierAndValue, Long>() {
@Override
public Tuple2<TypedColumnWithModifierAndValue, Long> call(Tuple2<TypedColumnWithModifier, ValueSummary> t) throws Exception {
return new Tuple2<>(t._1.withValue(t._2.getValue()), 1L);
}
})
.reduceByKey(new Function2<Long, Long, Long>() {
@Override
public Long call(Long x, Long y) throws Exception {
return x + y;
}
})
.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifierAndValue, Long>, TypedColumnWithModifier, Tuple2<String, Long>>() {
@Override
public Tuple2<TypedColumnWithModifier, Tuple2<String, Long>> call(Tuple2<TypedColumnWithModifierAndValue, Long> t) throws Exception {
return new Tuple2<>(new TypedColumnWithModifier(t._1.column, t._1.type, t._1.modifier)
, new Tuple2<>(t._1.value, t._2)
);
}
}
)
.groupByKey()
.mapToPair(new PairFunction<Tuple2<TypedColumnWithModifier,Iterable<Tuple2<String,Long>>>, TypedColumnWithModifier, Map<String, Double>>() {
@Override
public Tuple2<TypedColumnWithModifier, Map<String, Double>> call(Tuple2<TypedColumnWithModifier, Iterable<Tuple2<String, Long>>> t) throws Exception {
return new Tuple2<>(t._1, getTopK(t._2, nonNumericSampleSize));
}
}).collect();
nonNumericValueSummary= new HashMap<>();
for(Tuple2<TypedColumnWithModifier, Map<String, Double>> t : sampleAndCount) {
nonNumericValueSummary.put(t._1, t._2);
}
for(Map.Entry<String, List<Map<String, Object>>> kv : Summary.summaryToList(nonNumericValueSummary).entrySet()) {
columnSummaries.get(kv.getKey()).getNonNumericValueSummary().addAll(kv.getValue());
}
}
return totalSummary;
}
/**
 * Picks the {@code k} most frequent entries from a collection of (value, count)
 * pairs and returns them as an insertion-ordered map of value to count, most
 * frequent first.
 */
public static Map<String, Double> getTopK(Iterable<Tuple2<String, Long>> values, int k) {
    // Order pairs by descending count; under this ordering the k "least" elements
    // are exactly the k highest counts, returned highest-count first.
    Comparator<Tuple2<String, Long>> byCountDescending = new Comparator<Tuple2<String, Long>>() {
        @Override
        public int compare(Tuple2<String, Long> left, Tuple2<String, Long> right) {
            return Long.compare(right._2, left._2);
        }
    };
    Iterable<Tuple2<String, Long>> winners = Ordering.from(byCountDescending).leastOf(values, k);
    // LinkedHashMap preserves the descending-frequency iteration order.
    Map<String, Double> topCounts = new LinkedHashMap<>();
    for (Tuple2<String, Long> winner : winners) {
        topCounts.put(winner._1, winner._2.doubleValue());
    }
    return topCounts;
}
/**
 * Orders (column-pair, G-score) tuples by descending score so that
 * {@code takeOrdered} surfaces the strongest associations first. Implements
 * {@link Serializable} because Spark ships the comparator to executors.
 */
public static class Comp implements Comparator<Tuple2<Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Double>>, Serializable
{
@Override
public int compare(Tuple2<Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Double> o1, Tuple2<Tuple2<TypedColumnWithModifierAndValue, TypedColumnWithModifierAndValue>, Double> o2) {
// Reverse the argument order instead of negating the result: negating a
// comparator's return value is the classic overflow anti-pattern (a result of
// Integer.MIN_VALUE has no positive counterpart), so flipped arguments are the
// idiomatic descending form.
return Double.compare(o2._2, o1._2);
}
}
/**
 * Computes descriptive statistics over a collection of numeric samples: one
 * entry per requested percentile (keyed "{pct} %ile") plus the kurtosis.
 */
public static Map<String, Double> statisticallySummarize(Iterable<Double> values, List<Double> percentiles) {
    // Feed every sample into commons-math's accumulator first.
    DescriptiveStatistics stats = new DescriptiveStatistics();
    for (Double sample : values) {
        stats.addValue(sample);
    }
    Map<String, Double> summary = new HashMap<>();
    for (Double percentile : percentiles) {
        summary.put(percentile + " %ile", stats.getPercentile(percentile));
    }
    summary.put("kurtosis", stats.getKurtosis());
    return summary;
}
/**
 * A cell participates in the numeric sampling pass iff its inferred type is
 * numeric AND the value is not flagged as missing; everything else falls
 * through to the non-numeric top-k pass.
 */
public static boolean doSampling(TypeInference.Type type, TypeInference.Modifier modifier) {
    // Guard clause: non-numeric types are never sampled.
    if (!type.isNumeric()) {
        return false;
    }
    return modifier != TypeInference.Modifier.MISSING;
}
/**
 * Derives per-key sampling fractions for {@code sampleByKeyExact}: keys whose
 * population already fits within {@code targetSize} keep everything (1.0),
 * larger keys are down-sampled proportionally. Only keys eligible for numeric
 * sampling (see {@link #doSampling}) receive a fraction.
 * <p>
 * The value type is {@code Object} because that is what the Java Spark
 * sampleByKeyExact signature expects for the fractions map.
 */
public static Map<TypedColumnWithModifier, Object> getSamplePct(Map<TypedColumnWithModifier, Long> countByType
, int targetSize
)
{
    Map<TypedColumnWithModifier, Object> fractions = new HashMap<>();
    for (Map.Entry<TypedColumnWithModifier, Long> entry : countByType.entrySet()) {
        TypedColumnWithModifier key = entry.getKey();
        // Skip keys that the numeric pass will never sample.
        if (!doSampling(key.type, key.modifier)) {
            continue;
        }
        double fraction = targetSize < entry.getValue()
            ? targetSize / entry.getValue().doubleValue()
            : 1.0;
        fractions.put(key, fraction);
    }
    return fractions;
}
/**
 * Computes the log-likelihood ratio (G^2) association score for two events x
 * and y from their joint count, marginal counts, and the total number of
 * observations, via the standard 2x2 contingency table.
 * <p>
 * NOTE(review): k12/k21 hold "y without x" and "x without y" respectively,
 * which is the transpose of the usual (row = x) layout; the G^2 statistic is
 * invariant under transposing the table, so the score is unaffected.
 *
 * @param p_xy  count of rows containing both x and y
 * @param p_x   marginal count of x
 * @param p_y   marginal count of y
 * @param total total number of observations
 * @return the log-likelihood ratio of the 2x2 table
 */
public static double get_gscore(long p_xy, long p_x, long p_y, long total) {
long k11 = p_xy; //count of x and y together
long k12= (p_y - p_xy); //count of y without x
long k21= (p_x - p_xy); // count of x without y
long k22= total - (p_x + p_y - p_xy); //count of neither x nor y
return LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22);
}
}