package com.linkedin.thirdeye.dataframe;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.commons.lang.ArrayUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DataFrameBenchmark {
// TODO: validate benchmarking method - Dead Code Elimination, etc. may be playing tricks on us.
private static final Logger LOG = LoggerFactory.getLogger(DataFrameBenchmark.class);
private static final int N_ROUNDS = 15;
private static final int N_ROUNDS_SLOW = 3;
private static final int N_ELEMENTS = 10_000_000;
private static final String[] SERIES_NAMES = new String[] { "task", "min", "mid", "max", "outer", "checksum", "samples" };
private static final long SEED = System.nanoTime();
long tStart;
long tStartOuter;
List<Long> times = new ArrayList<>();
long timeOuter;
DataFrame.Builder results = DataFrame.builder(SERIES_NAMES);
void benchmarkMapDoubleSeries() {
startTimerOuter();
long checksum = 0;
for (int r = 0; r < N_ROUNDS; r++) {
double[] doubleValues = generateDoubleData(N_ELEMENTS);
final double delta = r;
startTimer();
DoubleSeries s = DoubleSeries.buildFrom(doubleValues);
DoubleSeries sResult = s.map(new Series.DoubleFunction() {
@Override
public double apply(double... values) {
return values[0] + delta;
}
});
stopTimer();
checksum ^= checksum(sResult.values());
}
logResults("benchmarkMapDoubleSeries", checksum);
}
void benchmarkMapDoubleSeriesOperation() {
startTimerOuter();
long checksum = 0;
for (int r = 0; r < N_ROUNDS; r++) {
double[] doubleValues = generateDoubleData(N_ELEMENTS);
final double delta = r;
startTimer();
DoubleSeries s = DoubleSeries.buildFrom(doubleValues);
DoubleSeries sResult = s.add(delta);
stopTimer();
checksum ^= checksum(sResult.values());
}
logResults("benchmarkMapDoubleSeriesOperation", checksum);
}
void benchmarkMapDoubleArray() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
double[] doubleValues = generateDoubleData(N_ELEMENTS);
final double delta = r;
startTimer();
double[] results = new double[doubleValues.length];
for (int i = 0; i < doubleValues.length; i++) {
results[i] = doubleValues[i] + delta;
}
stopTimer();
checksum ^= checksum(results);
}
logResults("benchmarkMapDoubleArray", checksum);
}
void benchmarkMapLongSeries() {
startTimerOuter();
long checksum = 0;
for (int r = 0; r < N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
final long delta = r;
startTimer();
LongSeries s = LongSeries.buildFrom(longValues);
LongSeries sResult = s.map(new Series.LongFunction() {
@Override
public long apply(long... values) {
return values[0] + delta;
}
});
stopTimer();
checksum ^= checksum(sResult.values());
}
logResults("benchmarkMapLongSeries", checksum);
}
void benchmarkMapLongSeriesOperation() {
startTimerOuter();
long checksum = 0;
for (int r = 0; r < N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
final long delta = r;
startTimer();
LongSeries s = LongSeries.buildFrom(longValues);
LongSeries sResult = s.add(delta);
stopTimer();
checksum ^= checksum(sResult.values());
}
logResults("benchmarkMapLongSeriesOperation", checksum);
}
void benchmarkMapLongArray() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
final long delta = r;
startTimer();
long[] results = new long[longValues.length];
for (int i = 0; i < longValues.length; i++) {
results[i] = longValues[i] + delta;
}
stopTimer();
checksum ^= checksum(results);
}
logResults("benchmarkMapLongArray", checksum);
}
void benchmarkMapTwoSeriesExpression() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS_SLOW; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
DataFrame df = new DataFrame();
df.addSeries("long", longValues);
df.addSeries("double", doubleValues);
startTimer();
DoubleSeries res = df.map("long * double");
stopTimer();
checksum ^= checksum(res.values());
}
logResults("benchmarkMapTwoSeriesExpression", checksum);
}
void benchmarkMapTwoSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
DataFrame df = new DataFrame();
df.addSeries("long", longValues);
df.addSeries("double", doubleValues);
startTimer();
DoubleSeries res = df.map(new Series.DoubleFunction() {
@Override
public double apply(double... values) {
return values[0] * values[1];
}
}, "long", "double");
stopTimer();
checksum ^= checksum(res.values());
}
logResults("benchmarkMapTwoSeries", checksum);
}
void benchmarkMapTwoSeriesOperation() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
DataFrame df = new DataFrame();
df.addSeries("long", longValues);
df.addSeries("double", doubleValues);
startTimer();
LongSeries l = df.getLongs("long");
DoubleSeries d = df.getDoubles("double");
DoubleSeries res = d.multiply(l);
stopTimer();
checksum ^= checksum(res.values());
}
logResults("benchmarkMapTwoSeriesOperation", checksum);
}
void benchmarkMapTwoArrays() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
startTimer();
double[] results = new double[N_ELEMENTS];
for(int i=0; i<N_ELEMENTS; i++) {
results[i] = longValues[i] * doubleValues[i];
}
stopTimer();
checksum ^= checksum(results);
}
logResults("benchmarkMapTwoArrays", checksum);
}
void benchmarkMapThreeSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
long[] otherValues = generateLongData(N_ELEMENTS);
DataFrame df = new DataFrame();
df.addSeries("long", longValues);
df.addSeries("double", doubleValues);
df.addSeries("other", otherValues);
startTimer();
DoubleSeries res = df.map(new Series.DoubleFunction() {
@Override
public double apply(double... values) {
return values[0] * values[1] + values[2];
}
}, "long", "double", "other");
stopTimer();
checksum ^= checksum(res.values());
}
logResults("benchmarkMapThreeSeries", checksum);
}
void benchmarkMapThreeArrays() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
long[] otherValues = generateLongData(N_ELEMENTS);
startTimer();
double[] results = new double[N_ELEMENTS];
for(int i=0; i<N_ELEMENTS; i++) {
results[i] = longValues[i] * doubleValues[i] + otherValues[i];
}
stopTimer();
checksum ^= checksum(results);
}
logResults("benchmarkMapThreeArrays", checksum);
}
void benchmarkMapFourSeriesGeneric() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
long[] otherValues = generateLongData(N_ELEMENTS);
double[] anotherValues = generateDoubleData(N_ELEMENTS);
DataFrame df = new DataFrame();
df.addSeries("long", longValues);
df.addSeries("double", doubleValues);
df.addSeries("other", otherValues);
df.addSeries("another", anotherValues);
startTimer();
DoubleSeries res = df.map(new Series.DoubleFunction() {
@Override
public double apply(double... values) {
return values[0] * values[1] + values[2] / values[3];
}
}, "long", "double", "other", "another");
stopTimer();
checksum ^= checksum(res.values());
}
logResults("benchmarkMapFourSeriesGeneric", checksum);
}
void benchmarkMapFourArrays() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
double[] doubleValues = generateDoubleData(N_ELEMENTS);
long[] otherValues = generateLongData(N_ELEMENTS);
double[] anotherValues = generateDoubleData(N_ELEMENTS);
startTimer();
double[] results = new double[N_ELEMENTS];
for(int i=0; i<N_ELEMENTS; i++) {
results[i] = longValues[i] * doubleValues[i] + otherValues[i] / anotherValues[i];
}
stopTimer();
checksum ^= checksum(results);
}
logResults("benchmarkMapFourArrays", checksum);
}
void benchmarkMinMaxLongSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
LongSeries s = LongSeries.buildFrom(longValues);
startTimer();
long min = s.min();
long max = s.max();
stopTimer();
checksum ^= checksum(min, max);
}
logResults("benchmarkMinMaxLongSeries", checksum);
}
void benchmarkMinMaxLongArray() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
startTimer();
long min = longValues[0];
long max = longValues[0];
for (long v : longValues) {
if (min > v) min = v;
if (max < v) max = v;
}
stopTimer();
checksum ^= checksum(min, max);
}
logResults("benchmarkMinMaxLongArray", checksum);
}
void benchmarkEqualsLongArray() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
long[] otherValues = Arrays.copyOf(longValues, longValues.length);
startTimer();
if(!Arrays.equals(longValues, otherValues))
throw new IllegalStateException("Arrays must be equal");
stopTimer();
checksum ^= checksum(longValues);
checksum ^= checksum(otherValues);
}
logResults("benchmarkEqualsLongArray", checksum);
}
void benchmarkEqualsLongSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
long[] otherValues = Arrays.copyOf(longValues, longValues.length);
LongSeries series = LongSeries.buildFrom(longValues);
LongSeries other = LongSeries.buildFrom(otherValues);
startTimer();
if(!series.equals(other))
throw new IllegalStateException("Series must be equal");
stopTimer();
checksum ^= checksum(series.values());
checksum ^= checksum(other.values());
}
logResults("benchmarkEqualsLongSeries", checksum);
}
void benchmarkEqualsLongSeriesOperation() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
long[] otherValues = Arrays.copyOf(longValues, longValues.length);
LongSeries series = LongSeries.buildFrom(longValues);
LongSeries other = LongSeries.buildFrom(otherValues);
startTimer();
BooleanSeries res = series.eq(other);
stopTimer();
if(res.hasFalse())
throw new IllegalStateException("Series must be equal");
checksum ^= checksum(series.values());
checksum ^= checksum(other.values());
}
logResults("benchmarkEqualsLongSeriesOperation", checksum);
}
void benchmarkSortLongArray() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS_SLOW; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
startTimer();
Arrays.sort(longValues);
stopTimer();
checksum ^= checksum(longValues);
}
logResults("benchmarkSortLongArray", checksum);
}
void benchmarkSortLongSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS_SLOW; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
LongSeries series = LongSeries.buildFrom(longValues);
startTimer();
LongSeries out = series.sorted();
stopTimer();
checksum ^= checksum(out.values());
}
logResults("benchmarkSortLongSeries", checksum);
}
void benchmarkUniqueLongArrayWithObjects() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS_SLOW; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
startTimer();
Set<Long> set = new HashSet<>();
for(long l : longValues)
set.add(l);
long[] out = ArrayUtils.toPrimitive(set.toArray(new Long[set.size()]));
stopTimer();
checksum ^= checksum(out);
}
logResults("benchmarkUniqueLongArrayWithObjects", checksum);
}
void benchmarkUniqueLongSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS_SLOW; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
LongSeries series = LongSeries.buildFrom(longValues);
startTimer();
LongSeries out = series.unique();
stopTimer();
checksum ^= checksum(out.values());
}
logResults("benchmarkUniqueLongSeries", checksum);
}
void benchmarkShiftLongArray() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
startTimer();
long[] values = new long[N_ELEMENTS];
System.arraycopy(longValues, 0, values, N_ELEMENTS / 2, N_ELEMENTS / 2);
Arrays.fill(values, 0, N_ELEMENTS / 2, Long.MIN_VALUE);
stopTimer();
checksum ^= checksum(values);
}
logResults("benchmarkShiftLongArray", checksum);
}
void benchmarkShiftLongSeries() {
startTimerOuter();
long checksum = 0;
for(int r=0; r<N_ROUNDS; r++) {
long[] longValues = generateLongData(N_ELEMENTS);
LongSeries series = LongSeries.buildFrom(longValues);
startTimer();
LongSeries out = series.shift(N_ELEMENTS / 2);
stopTimer();
checksum ^= checksum(out.values());
}
logResults("benchmarkShiftLongSeries", checksum);
}
void benchmarkAll() {
benchmarkMinMaxLongSeries();
benchmarkMinMaxLongArray();
benchmarkEqualsLongSeries();
benchmarkEqualsLongSeriesOperation();
benchmarkEqualsLongArray();
benchmarkShiftLongSeries();
benchmarkShiftLongArray();
benchmarkSortLongSeries();
benchmarkSortLongArray();
benchmarkUniqueLongSeries();
benchmarkUniqueLongArrayWithObjects();
benchmarkMapDoubleSeries();
benchmarkMapDoubleSeriesOperation();
benchmarkMapDoubleArray();
benchmarkMapLongSeries();
benchmarkMapLongSeriesOperation();
benchmarkMapLongArray();
benchmarkMapTwoSeries();
benchmarkMapTwoSeriesOperation();
benchmarkMapTwoArrays();
benchmarkMapThreeSeries();
benchmarkMapThreeArrays();
benchmarkMapFourSeriesGeneric();
benchmarkMapFourArrays();
benchmarkMapTwoSeriesExpression();
}
void startTimer() {
this.tStart = System.nanoTime();
}
void stopTimer() {
long tDelta = System.nanoTime() - this.tStart;
this.times.add(tDelta);
}
void startTimerOuter() {
this.tStartOuter = System.nanoTime();
}
void stopTimerOuter() {
this.timeOuter = System.nanoTime() - this.tStartOuter;
}
void logResults(String name, long checksum) {
stopTimerOuter();
Collections.sort(this.times);
long tMid = this.times.get(this.times.size() / 2);
long tMin = Collections.min(this.times);
long tMax = Collections.max(this.times);
LOG.info("{}: min/mid/max = {}ms {}ms {}ms [all={}ms, chk={}, cnt={}]", name, tMin / 1000000, tMid / 1000000, tMax / 1000000, timeOuter / 1000000, checksum % 1000, this.times.size());
this.results.append(name, tMin, tMid, tMax, this.timeOuter, checksum, this.times.size());
// reset timer stats
this.times = new ArrayList<>();
}
public static void main(String[] args) throws Exception {
LOG.info("Press Enter key to start.");
System.in.read();
LOG.info("Running DataFrame benchmark ...");
DataFrameBenchmark b = new DataFrameBenchmark();
b.benchmarkAll();
Series.LongFunction toMillis = new Series.LongFunction() {
@Override
public long apply(long... values) {
return values[0] / 1000000;
}
};
DataFrame df = b.results.build();
df.mapInPlace(toMillis, "min");
df.mapInPlace(toMillis, "mid");
df.mapInPlace(toMillis, "max");
df.mapInPlace(toMillis, "outer");
df.mapInPlace(new Series.LongFunction() {
@Override
public long apply(long... values) {
return values[0] % 1000;
}
}, "checksum");
LOG.info("Summary:\n{}", df.toString(40, SERIES_NAMES));
LOG.info("done.");
}
static double[] generateDoubleData(int n) {
Random r = new Random();
r.setSeed(SEED);
double[] values = new double[n];
for(int i=0; i<n; i++) {
values[i] = r.nextDouble();
}
return values;
}
static long[] generateLongData(int n) {
Random r = new Random();
r.setSeed(SEED);
long[] values = new long[n];
for(int i=0; i<n; i++) {
values[i] = r.nextLong();
}
return values;
}
static long checksum(long... values) {
long bits = 0;
for(long v : values) {
bits ^= v;
}
return bits;
}
static long checksum(double... values) {
long bits = 0;
for(double v : values) {
bits ^= Double.doubleToLongBits(v);
}
return bits;
}
}