// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.numeric.histogram;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.BiConsumer;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.statistics.type.DataTypeEnum;
/**
 * Unit tests for {@link HistogramAnalyzer}.
 * <p>
 * Each test feeds string values to an analyzer configured with explicit min / max / bin-count parameters and then
 * checks the bins exposed by {@link HistogramStatistics#getHistogram()}. Bins are checked positionally, after first
 * asserting the number of bins, so a missing or empty histogram makes the test fail instead of silently passing.
 */
public class HistogramAnalyzerTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(HistogramAnalyzerTest.class);

    /** Tolerance for bin bounds that are derived from non-exact double arithmetic. */
    private static final double APPROXIMATE_BOUND_DELTA = 0.001;

    private HistogramAnalyzer createAnalyzer(DataTypeEnum[] types, HistogramParameter histogramParameter) {
        return new HistogramAnalyzer(types, histogramParameter);
    }

    /**
     * Builds a one-column analyzer, feeds it every value (one value per record) and returns the statistics of that
     * column.
     *
     * @param type the declared type of the single column
     * @param histogramParameter the histogram configuration to use
     * @param values the raw values to analyze, in order
     * @return the statistics of column 0
     */
    private HistogramStatistics analyzeSingleColumn(DataTypeEnum type, HistogramParameter histogramParameter,
            String... values) {
        HistogramAnalyzer analyzer = createAnalyzer(new DataTypeEnum[] { type }, histogramParameter);
        for (String value : values) {
            analyzer.analyze(value);
        }
        return analyzer.getResult().get(0);
    }

    /**
     * Asserts the number of bins and returns them as a list, in the iteration order of the histogram map, so that
     * tests can address bins by index.
     *
     * @param histogram the histogram returned by the analyzer
     * @param expectedBinCount the number of bins the histogram must contain
     * @return the histogram entries, in map iteration order
     */
    private static List<Entry<Range, Long>> binsOf(Map<Range, Long> histogram, int expectedBinCount) {
        Assert.assertEquals("number of bins", expectedBinCount, histogram.size());
        return new ArrayList<>(histogram.entrySet());
    }

    /**
     * Asserts bounds (exactly) and count of one bin.
     *
     * @param bin the bin under test
     * @param expectedLower expected lower bound
     * @param expectedUpper expected upper bound
     * @param expectedCount expected number of values in the bin
     */
    private static void assertBin(Entry<Range, Long> bin, double expectedLower, double expectedUpper,
            long expectedCount) {
        assertBin("bin", bin, expectedLower, expectedUpper, expectedCount, 0.0);
    }

    /**
     * Asserts bounds (within {@code delta}) and count of one bin, logging the bin at debug level first.
     *
     * @param context prefix for the assertion messages, used to identify the failing bin
     * @param bin the bin under test
     * @param expectedLower expected lower bound
     * @param expectedUpper expected upper bound
     * @param expectedCount expected number of values in the bin
     * @param delta tolerance applied to both bounds
     */
    private static void assertBin(String context, Entry<Range, Long> bin, double expectedLower, double expectedUpper,
            long expectedCount, double delta) {
        Range range = bin.getKey();
        LOGGER.debug("{} to {}, count:{}", range.getLower(), range.getUpper(), bin.getValue());
        Assert.assertEquals(context + " lower bound", expectedLower, range.getLower(), delta);
        Assert.assertEquals(context + " upper bound", expectedUpper, range.getUpper(), delta);
        // Counts are integral: compare them as longs rather than through a floating-point delta.
        Assert.assertEquals(context + " count", expectedCount, bin.getValue().longValue());
    }

    /**
     * A non-numeric value ("aaaa") analyzed before a valid one must leave the configured [0, 5] single-bin range
     * untouched.
     */
    @Test
    public void testResizeWithInvalidValues() throws Exception {
        HistogramParameter histogramParameter = new HistogramParameter();
        histogramParameter.setDefaultParameters(0, 5, 1);

        Map<Range, Long> histogram = analyzeSingleColumn(DataTypeEnum.DOUBLE, histogramParameter, "aaaa", "5")
                .getHistogram();

        Range onlyRange = binsOf(histogram, 1).get(0).getKey();
        Assert.assertEquals(0, onlyRange.getLower(), 0.00);
        Assert.assertEquals(5, onlyRange.getUpper(), 0.00);
    }

    /** Doubles in [0, 10] split into 4 bins of width 2.5; the maximum (10) is expected in the last bin. */
    @Test
    public void testAnalyzeStringArray() {
        HistogramParameter histogramParameter = new HistogramParameter();
        histogramParameter.setDefaultParameters(0, 10, 4);

        Map<Range, Long> histogram = analyzeSingleColumn(DataTypeEnum.DOUBLE, histogramParameter, "0", "2", "2.5", "4",
                "6", "7", "8", "9", "10").getHistogram();

        List<Entry<Range, Long>> bins = binsOf(histogram, 4);
        assertBin(bins.get(0), 0, 2.5, 2);
        assertBin(bins.get(1), 2.5, 5, 2);
        assertBin(bins.get(2), 5, 7.5, 2);
        assertBin(bins.get(3), 7.5, 10, 3);
    }

    /**
     * Integers 1..10 against a [2, 8] range with 3 bins: expects one value below the minimum, two above the maximum,
     * and the statistics to be reported as incomplete.
     */
    @Test
    public void testAnalyzeExtended() {
        HistogramParameter histogramParameter = new HistogramParameter();
        histogramParameter.setDefaultParameters(2, 8, 3);

        HistogramStatistics histogramStatistics = analyzeSingleColumn(DataTypeEnum.INTEGER, histogramParameter, "1",
                "2", "3", "4", "5", "6", "7", "8", "9", "10");

        List<Entry<Range, Long>> bins = binsOf(histogramStatistics.getHistogram(), 3);
        assertBin(bins.get(0), 2, 4, 2);
        assertBin(bins.get(1), 4, 6, 2);
        assertBin(bins.get(2), 6, 8, 3);

        // Assert the values out of range
        Assert.assertFalse(histogramStatistics.isComplete());
        Assert.assertEquals(1, histogramStatistics.getCountBelowMin(), 0);
        Assert.assertEquals(2, histogramStatistics.getCountAboveMax(), 0);
    }

    /** Range starting at a negative minimum: [-4, 8] in 3 bins of width 4. */
    @Test
    public void testAnalyzeNegative() {
        HistogramParameter histogramParameter = new HistogramParameter();
        histogramParameter.setDefaultParameters(-4, 8, 3);

        Map<Range, Long> histogram = analyzeSingleColumn(DataTypeEnum.INTEGER, histogramParameter, "-2", "-4", "-6",
                "-7", "8", "9", "5", "1").getHistogram();

        List<Entry<Range, Long>> bins = binsOf(histogram, 3);
        assertBin(bins.get(0), -4, 0, 2);
        assertBin(bins.get(1), 0, 4, 1);
        assertBin(bins.get(2), 4, 8, 2);
    }

    /**
     * Fractional minimum: [-0.004, 9] in 3 bins, so the inner bounds are not exactly representable and are compared
     * with a tolerance.
     * <p>
     * The method name is a typo for "Fraction"; it is kept as-is so the test identifier does not change.
     */
    @Test
    public void testAnalyzeFranction() {
        final double min = -0.004;
        final double max = 9;
        final int numBins = 3;
        HistogramParameter histogramParameter = new HistogramParameter();
        histogramParameter.setDefaultParameters(-0.004, 9, 3);

        Map<Range, Long> histogram = analyzeSingleColumn(DataTypeEnum.DOUBLE, histogramParameter, "-0.0001",
                "-0.00004", "-0.00006", "-0.00007", "8", "7", "9", "5", "1").getHistogram();

        // Inner bounds are approximately 2.997 and 5.999.
        double step = (max - min) / numBins;
        double firstInnerBound = min + step;
        double secondInnerBound = firstInnerBound + step;

        List<Entry<Range, Long>> bins = binsOf(histogram, numBins);
        assertBin("bin #0", bins.get(0), min, firstInnerBound, 5, APPROXIMATE_BOUND_DELTA);
        assertBin("bin #1", bins.get(1), firstInnerBound, secondInnerBound, 1, APPROXIMATE_BOUND_DELTA);
        assertBin("bin #2", bins.get(2), secondInnerBound, max, 3, APPROXIMATE_BOUND_DELTA);
    }

    /**
     * Per-column parameters: column 0 uses [2, 8] with 3 bins, column 1 uses [0, 9] with 4 bins. The third column is
     * declared as STRING, has no column parameter, and is not asserted on.
     */
    @Test
    public void testHistogramWithColumnParameters() {
        String[][] data = new String[][] { { "1", "1", "one" }, { "2", "2", "2" }, { "3", "3", "3" },
                { "4", "4", "4" }, { "5", "5", "5" }, { "6", "6", "6" }, { "7", "7", "7" }, { "8", "8", "8" },
                { "9", "9", "9" }, { "10", "10", "10" } };

        HistogramParameter histogramParameter = new HistogramParameter();
        HistogramColumnParameter column1Param = new HistogramColumnParameter();
        column1Param.setParameters(2, 8, 3);
        histogramParameter.putColumnParameter(0, column1Param);
        HistogramColumnParameter column2Param = new HistogramColumnParameter();
        column2Param.setParameters(0, 9, 4);
        histogramParameter.putColumnParameter(1, column2Param);

        HistogramAnalyzer analyzer = createAnalyzer(
                new DataTypeEnum[] { DataTypeEnum.INTEGER, DataTypeEnum.INTEGER, DataTypeEnum.STRING },
                histogramParameter);
        for (String[] row : data) {
            analyzer.analyze(row);
        }

        List<Entry<Range, Long>> col1Bins = binsOf(analyzer.getResult().get(0).getHistogram(), 3);
        assertBin(col1Bins.get(0), 2, 4, 2);
        assertBin(col1Bins.get(1), 4, 6, 2);
        assertBin(col1Bins.get(2), 6, 8, 3);

        List<Entry<Range, Long>> col2Bins = binsOf(analyzer.getResult().get(1).getHistogram(), 4);
        assertBin(col2Bins.get(0), 0, 2.25, 2);
        assertBin(col2Bins.get(1), 2.25, 4.5, 2);
        assertBin(col2Bins.get(2), 4.5, 6.75, 2);
        assertBin(col2Bins.get(3), 6.75, 9, 3);
    }

    /** Runs the randomized histogram check several times with shifting value ranges and sample sizes. */
    @Test
    public void testMultipleRandomHistograms() {
        final int nbLoop = 10;
        for (int i = 0; i < nbLoop; i++) {
            testHistogramWithRandom(3, 60, -1300 + 10 * i, 196 + 11 * i, 3435 + 10 * i, i * 37, 3 + i * 73);
        }
    }

    /**
     * Generates a random histogram layout plus random values that fall into each of its bins, runs the analyzer on
     * those values and checks that it reports the same bins and counts.
     * <p>
     * The generator is not seeded, so the drawn parameters are included in every assertion message to make a failure
     * diagnosable.
     *
     * @param minNbBins lower bound (inclusive) of the number of bins
     * @param maxNbBins upper bound (exclusive) of the number of bins
     * @param minValue lower bound (inclusive) of the histogram minimum
     * @param maxMinValue upper bound (exclusive) of the histogram minimum; the maximum is drawn above this value
     * @param maxValue upper bound (exclusive) of the histogram maximum
     * @param minNbValue lower bound (inclusive) of the number of random values generated per bin
     * @param maxNbValue upper bound (exclusive) of the number of random values generated per bin
     */
    private void testHistogramWithRandom(int minNbBins, int maxNbBins, int minValue, int maxMinValue, int maxValue,
            int minNbValue, int maxNbValue) {
        ThreadLocalRandom random = ThreadLocalRandom.current();
        int numBins = random.nextInt(minNbBins, maxNbBins);
        double min = random.nextDouble(minValue, maxMinValue);
        double max = random.nextDouble(maxMinValue + 1, maxValue);
        double step = (max - min) / numBins;

        List<String> values = new ArrayList<>();
        Map<Range, Long> expectedHistogram = new TreeMap<>();
        double current = min;
        for (int i = 1; i <= numBins; i++) {
            // number of random values generated inside this bin, drawn from [minNbValue, maxNbValue)
            long countInBin = random.nextLong(minNbValue, maxNbValue);
            // the last bin ends exactly at max, whatever rounding error accumulated in "current"
            double next = (i == numBins) ? max : current + step;
            for (long j = 0; j < countInBin; j++) {
                values.add(Double.toString(random.nextDouble(current, next)));
            }
            // min and max themselves are added to the value list below: min is counted in the first bin and max in
            // the last one. The two checks are independent so that a single-bin histogram accounts for both.
            if (i == 1) {
                countInBin++;
            }
            if (i == numBins) {
                countInBin++;
            }
            expectedHistogram.put(new Range(current, next), countInBin);
            current = next;
        }
        // Add min and max into value list
        values.add(Double.toString(min));
        values.add(Double.toString(max));
        LOGGER.debug("numBins: {}, min: {}, max:{}", numBins, min, max);

        // analyze histogram
        HistogramParameter histogramParameter = new HistogramParameter();
        HistogramColumnParameter columnParam = new HistogramColumnParameter();
        columnParam.setParameters(min, max, numBins);
        histogramParameter.putColumnParameter(0, columnParam);
        Map<Range, Long> actualHistogram = analyzeSingleColumn(DataTypeEnum.DOUBLE, histogramParameter,
                values.toArray(new String[0])).getHistogram();

        // do assertions: walk expected and actual bins in lockstep
        String context = "numBins=" + numBins + ", min=" + min + ", max=" + max;
        Assert.assertEquals(context + " number of bins", expectedHistogram.size(), actualHistogram.size());
        Iterator<Entry<Range, Long>> actualBins = actualHistogram.entrySet().iterator();
        int binIdx = 0;
        for (Entry<Range, Long> expected : expectedHistogram.entrySet()) {
            Range expectedRange = expected.getKey();
            assertBin(context + ", bin #" + binIdx, actualBins.next(), expectedRange.getLower(),
                    expectedRange.getUpper(), expected.getValue(), APPROXIMATE_BOUND_DELTA);
            binIdx++;
        }
    }
}