// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.numeric.histogram;
import java.util.List;
import org.apache.commons.lang.NotImplementedException;
import org.talend.daikon.number.BigDecimalParser;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.ResizableList;
import org.talend.dataquality.statistics.numeric.NumericalStatisticsAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
import org.talend.dataquality.statistics.type.TypeInferenceUtils;
/**
* Analyzer the histograms.<br>
* Note that the min,max and number of bins parameters are needed.<br>
* If min max and number of bins are not set, then default values are used. (As defined in {@link HistogramParameter})
*
*
* @author zhao
*
*/
public class HistogramAnalyzer extends NumericalStatisticsAnalyzer<HistogramStatistics> {
private static final long serialVersionUID = -3756520692420812485L;
private ResizableList<HistogramStatistics> stats = new ResizableList<>(HistogramStatistics.class);
private HistogramParameter histogramParameter = null;
/**
*
* @param types data types
* @param histogramParameter Histogram analzyer's parameter
*/
public HistogramAnalyzer(DataTypeEnum[] types, HistogramParameter histogramParameter) {
super(types);
if (histogramParameter == null) {
throw new IllegalArgumentException("Histogram analyzer's parameter should is null.");
}
setParameters(histogramParameter);
}
/**
* Set histogram analyzer's parameters
*
* @param histogramParameter Histogram analzyer's parameter
*/
private void setParameters(HistogramParameter histogramParameter) {
this.histogramParameter = histogramParameter;
}
@Override
public boolean analyze(String... record) {
DataTypeEnum[] types = getTypes();
if (record.length != types.length)
throw new IllegalArgumentException("Each column of the record should be declared a DataType.Type corresponding! \n"
+ types.length + " type(s) declared in this histogram analyzer but " + record.length
+ " column(s) was found in this record. \n"
+ "Using method: setTypes(DataType.Type[] types) to set the types. ");
if (stats.resize(record.length)) {
int colIdx = 0;
for (HistogramStatistics stat : stats) {
HistogramColumnParameter columnParameter = histogramParameter.getColumnParameter(colIdx);
// Set column parameters to histogram statistics.
double max = histogramParameter.getDefaultMax();
double min = histogramParameter.getDefaultMin();
int numBins = histogramParameter.getDefaultNumBins();
if (columnParameter != null) {
min = columnParameter.getMin();
max = columnParameter.getMax();
numBins = columnParameter.getNumBins();
}
stat.setParameters(max, min, numBins);
colIdx++;
}
}
for (int idx : this.getStatColIdx()) { // analysis each numerical column in the record
if (!TypeInferenceUtils.isValid(types[idx], record[idx])) {
continue;
}
analyzerHistogram(idx, record);
}
return true;
}
private void analyzerHistogram(int index, String... record) {
HistogramStatistics histStats = stats.get(index);
try {
histStats.add(BigDecimalParser.toBigDecimal(record[index]).doubleValue());
} catch (NumberFormatException e) {
// skip
}
}
@Override
public Analyzer<HistogramStatistics> merge(Analyzer<HistogramStatistics> another) {
throw new NotImplementedException();
}
@Override
public void end() {
}
@Override
public List<HistogramStatistics> getResult() {
return stats;
}
}