// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.numeric.quantile;
import java.util.List;
import org.apache.commons.lang.NotImplementedException;
import org.talend.daikon.number.BigDecimalParser;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.ResizableList;
import org.talend.dataquality.statistics.numeric.NumericalStatisticsAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
import org.talend.dataquality.statistics.type.TypeInferenceUtils;
/**
 * Quantile analyzer based on the t-digest algorithm as implemented by clearspring's "stream-lib" package. See more at
 * <a href=
 * "https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/quantile/TDigest.java"
 * >TDigest</a>.
 * <p>
 * One {@link TDigestQuantileStatistics} is kept per column of the analyzed records. An optional per-column
 * compression can be supplied through {@link #init(Integer[])}; it is applied to a column's statistics the first
 * time a valid value is seen for that column.
 *
 * @author zhao
 *
 */
public class TDigestAnalyzer extends NumericalStatisticsAnalyzer<TDigestQuantileStatistics> {

    private static final long serialVersionUID = -9176043422228459277L;

    /** Per-column statistics, indexed by column position in the record. */
    private final ResizableList<TDigestQuantileStatistics> stats = new ResizableList<>(TDigestQuantileStatistics.class);

    /** Optional per-column compression values; {@code null} means none was configured. */
    private Integer[] compression = null;

    /**
     * Per-column flags, {@code true} once the compression step has been handled for that column. Tracking this per
     * column (rather than once for the whole analyzer) ensures a column whose value is invalid in the first record
     * still gets its configured compression when its first valid value arrives.
     */
    private boolean[] digestInited = new boolean[0];

    /**
     * Creates an analyzer for records whose columns have the given types.
     *
     * @param types the declared type of each column
     */
    public TDigestAnalyzer(DataTypeEnum[] types) {
        super(types);
    }

    /**
     * Resets the analyzer: clears the collected statistics and forgets which columns already had their compression
     * applied, so that a configured compression is applied again to the statistics used after the reset.
     */
    @Override
    public void init() {
        super.init();
        stats.clear();
        digestInited = new boolean[0];
    }

    /**
     * Configures the per-column compression and schedules it to be applied on subsequent calls to
     * {@link #analyze(String...)}.
     *
     * @param compression compression value for each column, indexed like the record columns; may be {@code null}
     */
    public void init(Integer[] compression) {
        this.compression = compression;
        digestInited = new boolean[0];
    }

    /**
     * Feeds every valid numerical column value of the record into the statistics of that column. Values that are
     * not valid for the declared column type, or that cannot be parsed as a number, are skipped.
     *
     * @param record the column values of one row
     * @return always {@code true}
     * @throws IllegalArgumentException if the number of columns differs from the number of declared types
     */
    @Override
    public boolean analyze(String... record) {
        DataTypeEnum[] types = this.getTypes();
        if (record.length != types.length) {
            throw new IllegalArgumentException("Each column of the record should be declared a DataType.Type corresponding! \n"
                    + types.length + " type(s) declared in this T-Digest analyzer but " + record.length
                    + " column(s) was found in this record. \n"
                    + "Using method: setTypes(DataType.Type[] types) to set the types.");
        }
        stats.resize(record.length);
        ensureFlagCapacity(record.length);
        for (int idx : this.getStatColIdx()) { // analyze each numerical column in the record
            if (!TypeInferenceUtils.isValid(types[idx], record[idx])) {
                continue;
            }
            TDigestQuantileStatistics stat = stats.get(idx);
            if (!digestInited[idx]) {
                applyCompression(stat, idx);
                digestInited[idx] = true;
            }
            try {
                stat.add(BigDecimalParser.toBigDecimal(record[idx]).doubleValue());
            } catch (NumberFormatException ignored) {
                // best effort: a value that passed type validation but still cannot be parsed is skipped
            }
        }
        return true;
    }

    /** Makes sure {@link #digestInited} has a slot for every column, keeping the flags that are already set. */
    private void ensureFlagCapacity(int columns) {
        if (digestInited == null) {
            // can be null on an instance deserialized from a form that predates this field
            digestInited = new boolean[columns];
        } else if (digestInited.length < columns) {
            boolean[] grown = new boolean[columns];
            System.arraycopy(digestInited, 0, grown, 0, digestInited.length);
            digestInited = grown;
        }
    }

    /**
     * Applies the configured compression of column {@code idx} to {@code stat}. When no compression array was
     * configured, or it has no non-null entry for this column, {@code initTDigest} is not called.
     */
    private void applyCompression(TDigestQuantileStatistics stat, int idx) {
        if (compression != null && idx < compression.length && compression[idx] != null) {
            stat.initTDigest(compression[idx]);
        }
    }

    /** Nothing to finalize for this analyzer. */
    @Override
    public void end() {
    }

    /**
     * @return the per-column statistics collected so far (the analyzer's own list, not a copy)
     */
    @Override
    public List<TDigestQuantileStatistics> getResult() {
        return stats;
    }

    /**
     * Merging is not supported by this analyzer.
     *
     * @throws NotImplementedException always
     */
    @Override
    public Analyzer<TDigestQuantileStatistics> merge(Analyzer<TDigestQuantileStatistics> another) {
        throw new NotImplementedException();
    }
}