/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.beans;
import java.util.HashMap;
import java.util.Map;
import javax.inject.Inject;
import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
import org.apache.commons.math.stat.descriptive.StatisticalSummary;
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
import org.apache.commons.math.stat.descriptive.moment.SecondMoment;
import org.eobjects.analyzer.beans.api.Analyzer;
import org.eobjects.analyzer.beans.api.AnalyzerBean;
import org.eobjects.analyzer.beans.api.Concurrent;
import org.eobjects.analyzer.beans.api.Configured;
import org.eobjects.analyzer.beans.api.Description;
import org.eobjects.analyzer.beans.api.Initialize;
import org.eobjects.analyzer.beans.api.Provided;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.result.AnnotatedRowsResult;
import org.eobjects.analyzer.result.Crosstab;
import org.eobjects.analyzer.result.CrosstabDimension;
import org.eobjects.analyzer.result.CrosstabNavigator;
import org.eobjects.analyzer.storage.InMemoryRowAnnotationFactory;
import org.eobjects.analyzer.storage.RowAnnotation;
import org.eobjects.analyzer.storage.RowAnnotationFactory;
/**
 * Number analyzer, which provides statistical information for number values.
 *
 * <p>
 * The following measures are always reported for every configured column:
 * </p>
 * <ul>
 * <li>Row count</li>
 * <li>Null count</li>
 * <li>Highest value</li>
 * <li>Lowest value</li>
 * <li>Sum</li>
 * <li>Mean</li>
 * <li>Geometric mean</li>
 * <li>Standard deviation</li>
 * <li>Variance</li>
 * <li>Second moment</li>
 * <li>Sum of squares</li>
 * </ul>
 *
 * <p>
 * When the {@code descriptiveStatistics} property is enabled, these measures
 * are reported in addition:
 * </p>
 * <ul>
 * <li>Median</li>
 * <li>25th percentile</li>
 * <li>75th percentile</li>
 * <li>Skewness</li>
 * <li>Kurtosis</li>
 * </ul>
 *
 * <p>
 * Apart from the row count and null count, measures are only filled in for
 * columns that have seen at least one non-null value.
 * </p>
 */
@AnalyzerBean("Number analyzer")
@Description("Provides insight into number-column values.")
@Concurrent(true)
public class NumberAnalyzer implements Analyzer<NumberAnalyzerResult> {

    /** Name of the crosstab dimension holding one category per analyzed column. */
    public static final String DIMENSION_COLUMN = "Column";

    /** Name of the crosstab dimension holding one category per measure. */
    public static final String DIMENSION_MEASURE = "Measure";

    public static final String MEASURE_ROW_COUNT = "Row count";
    public static final String MEASURE_NULL_COUNT = "Null count";
    public static final String MEASURE_HIGHEST_VALUE = "Highest value";
    public static final String MEASURE_LOWEST_VALUE = "Lowest value";
    public static final String MEASURE_SUM = "Sum";
    public static final String MEASURE_MEAN = "Mean";
    public static final String MEASURE_GEOMETRIC_MEAN = "Geometric mean";
    public static final String MEASURE_STANDARD_DEVIATION = "Standard deviation";
    public static final String MEASURE_VARIANCE = "Variance";
    public static final String MEASURE_SUM_OF_SQUARES = "Sum of squares";
    public static final String MEASURE_SECOND_MOMENT = "Second moment";
    public static final String MEASURE_MEDIAN = "Median";
    public static final String MEASURE_PERCENTILE25 = "25th percentile";
    public static final String MEASURE_PERCENTILE75 = "75th percentile";
    public static final String MEASURE_KURTOSIS = "Kurtosis";
    public static final String MEASURE_SKEWNESS = "Skewness";

    /**
     * One delegate per configured column, populated by {@link #init()}. The
     * map is only read (never structurally modified) by {@link #run} and
     * {@link #getResult()}.
     */
    private final Map<InputColumn<? extends Number>, NumberAnalyzerColumnDelegate> _columnDelegates = new HashMap<>();

    /** The numeric columns to analyze. */
    @Inject
    @Configured
    InputColumn<? extends Number>[] _columns;

    /**
     * Whether to gather the extended set of measures (median, percentiles,
     * skewness, kurtosis).
     */
    @Inject
    @Configured
    @Description("Gather so-called descriptive statistics, including median, skewness, kurtosis and percentiles, which have a larger memory-footprint.")
    boolean descriptiveStatistics = false;

    /** Factory handed to the column delegates and to the attached row results. */
    @Inject
    @Provided
    RowAnnotationFactory _annotationFactory;

    /**
     * Default constructor; the {@code @Inject} fields are left unset and
     * {@link #init()} is not invoked.
     */
    public NumberAnalyzer() {
    }

    /**
     * Convenience constructor that wires the analyzer up with the given
     * columns and an {@link InMemoryRowAnnotationFactory}, and initializes it
     * immediately.
     *
     * @param columns
     *            the numeric columns to analyze
     */
    @SafeVarargs
    public NumberAnalyzer(InputColumn<? extends Number>... columns) {
        this();
        _columns = columns;
        _annotationFactory = new InMemoryRowAnnotationFactory();
        init();
    }

    /**
     * Creates a fresh {@link NumberAnalyzerColumnDelegate} for every
     * configured column. Any delegate previously registered for the same
     * column is replaced.
     */
    @Initialize
    public void init() {
        for (InputColumn<? extends Number> column : _columns) {
            _columnDelegates.put(column, new NumberAnalyzerColumnDelegate(descriptiveStatistics, _annotationFactory));
        }
    }

    /**
     * Feeds the value of every configured column in the given row to that
     * column's delegate.
     *
     * @param row
     *            the row to read values from
     * @param distinctCount
     *            passed through unchanged to each delegate (presumably the
     *            number of times this row occurs - confirm against the
     *            {@code Analyzer} contract)
     */
    @Override
    public void run(InputRow row, int distinctCount) {
        for (InputColumn<? extends Number> column : _columns) {
            final NumberAnalyzerColumnDelegate delegate = _columnDelegates.get(column);
            final Number value = row.getValue(column);
            delegate.run(row, value, distinctCount);
        }
    }

    /**
     * Builds a crosstab with a column dimension and a measure dimension and
     * fills it with the statistics gathered by the column delegates.
     *
     * @return the result wrapping the configured columns and the crosstab
     */
    @Override
    public NumberAnalyzerResult getResult() {
        final CrosstabDimension measureDimension = createMeasureDimension();
        final CrosstabDimension columnDimension = createColumnDimension();

        final Crosstab<Number> crosstab = new Crosstab<Number>(Number.class, columnDimension, measureDimension);

        for (InputColumn<? extends Number> column : _columns) {
            final CrosstabNavigator<Number> nav = crosstab.navigate().where(columnDimension, column.getName());
            populateColumn(nav, measureDimension, column);
        }

        return new NumberAnalyzerResult(_columns, crosstab);
    }

    /**
     * Creates the measure dimension, including the extended measures when
     * descriptive statistics are enabled.
     */
    private CrosstabDimension createMeasureDimension() {
        final CrosstabDimension measureDimension = new CrosstabDimension(DIMENSION_MEASURE);
        measureDimension.addCategory(MEASURE_ROW_COUNT);
        measureDimension.addCategory(MEASURE_NULL_COUNT);
        measureDimension.addCategory(MEASURE_HIGHEST_VALUE);
        measureDimension.addCategory(MEASURE_LOWEST_VALUE);
        measureDimension.addCategory(MEASURE_SUM);
        measureDimension.addCategory(MEASURE_MEAN);
        measureDimension.addCategory(MEASURE_GEOMETRIC_MEAN);
        measureDimension.addCategory(MEASURE_STANDARD_DEVIATION);
        measureDimension.addCategory(MEASURE_VARIANCE);
        measureDimension.addCategory(MEASURE_SECOND_MOMENT);
        measureDimension.addCategory(MEASURE_SUM_OF_SQUARES);
        if (descriptiveStatistics) {
            measureDimension.addCategory(MEASURE_MEDIAN);
            measureDimension.addCategory(MEASURE_PERCENTILE25);
            measureDimension.addCategory(MEASURE_PERCENTILE75);
            measureDimension.addCategory(MEASURE_SKEWNESS);
            measureDimension.addCategory(MEASURE_KURTOSIS);
        }
        return measureDimension;
    }

    /** Creates the column dimension with one category per configured column name. */
    private CrosstabDimension createColumnDimension() {
        final CrosstabDimension columnDimension = new CrosstabDimension(DIMENSION_COLUMN);
        for (InputColumn<? extends Number> column : _columns) {
            columnDimension.addCategory(column.getName());
        }
        return columnDimension;
    }

    /**
     * Writes all measures of a single column into the crosstab.
     *
     * @param nav
     *            navigator already positioned on the column's category
     * @param measureDimension
     *            the measure dimension of the crosstab
     * @param column
     *            the column whose delegate supplies the statistics
     */
    private void populateColumn(CrosstabNavigator<Number> nav, CrosstabDimension measureDimension,
            InputColumn<? extends Number> column) {
        final NumberAnalyzerColumnDelegate delegate = _columnDelegates.get(column);
        final StatisticalSummary stats = delegate.getStatistics();

        final int nullCount = delegate.getNullCount();
        nav.where(measureDimension, MEASURE_NULL_COUNT).put(nullCount);
        if (nullCount > 0) {
            // the navigator is still positioned on the null count cell here
            addAttachment(nav, delegate.getNullAnnotation(), column);
        }

        final int numRows = delegate.getNumRows();
        nav.where(measureDimension, MEASURE_ROW_COUNT).put(numRows);

        final long nonNullCount = stats.getN();
        if (nonNullCount <= 0) {
            // no non-null values seen: the remaining measures stay empty
            return;
        }

        final double highestValue = stats.getMax();
        final double lowestValue = stats.getMin();
        final double sum = stats.getSum();
        final double mean = stats.getMean();
        final double standardDeviation = stats.getStandardDeviation();
        final double variance = stats.getVariance();

        // The concrete statistics type is expected to follow the
        // descriptiveStatistics flag that was handed to the delegate in
        // init(); it is cast exactly once and reused below.
        final DescriptiveStatistics descriptiveStats;
        final double geometricMean;
        final double secondMoment;
        final double sumOfSquares;
        if (descriptiveStatistics) {
            descriptiveStats = (DescriptiveStatistics) stats;
            geometricMean = descriptiveStats.getGeometricMean();
            sumOfSquares = descriptiveStats.getSumsq();
            // DescriptiveStatistics is not asked for the second moment
            // directly; it is evaluated from the stored values instead
            secondMoment = new SecondMoment().evaluate(descriptiveStats.getValues());
        } else {
            descriptiveStats = null;
            final SummaryStatistics summaryStats = (SummaryStatistics) stats;
            geometricMean = summaryStats.getGeometricMean();
            secondMoment = summaryStats.getSecondMoment();
            sumOfSquares = summaryStats.getSumsq();
        }

        nav.where(measureDimension, MEASURE_HIGHEST_VALUE).put(highestValue);
        addAttachment(nav, delegate.getMaxAnnotation(), column);

        nav.where(measureDimension, MEASURE_LOWEST_VALUE).put(lowestValue);
        addAttachment(nav, delegate.getMinAnnotation(), column);

        nav.where(measureDimension, MEASURE_SUM).put(sum);
        nav.where(measureDimension, MEASURE_MEAN).put(mean);
        nav.where(measureDimension, MEASURE_GEOMETRIC_MEAN).put(geometricMean);
        nav.where(measureDimension, MEASURE_STANDARD_DEVIATION).put(standardDeviation);
        nav.where(measureDimension, MEASURE_VARIANCE).put(variance);
        nav.where(measureDimension, MEASURE_SUM_OF_SQUARES).put(sumOfSquares);
        nav.where(measureDimension, MEASURE_SECOND_MOMENT).put(secondMoment);

        if (descriptiveStats != null) {
            putDescriptiveMeasures(nav, measureDimension, descriptiveStats);
        }
    }

    /**
     * Writes the extended measures (median, percentiles, skewness, kurtosis)
     * of a single column into the crosstab.
     */
    private void putDescriptiveMeasures(CrosstabNavigator<Number> nav, CrosstabDimension measureDimension,
            DescriptiveStatistics descriptiveStats) {
        final double kurtosis = descriptiveStats.getKurtosis();
        final double skewness = descriptiveStats.getSkewness();
        final double median = descriptiveStats.getPercentile(50.0);
        final double percentile25 = descriptiveStats.getPercentile(25.0);
        final double percentile75 = descriptiveStats.getPercentile(75.0);

        nav.where(measureDimension, MEASURE_MEDIAN).put(median);
        nav.where(measureDimension, MEASURE_PERCENTILE25).put(percentile25);
        nav.where(measureDimension, MEASURE_PERCENTILE75).put(percentile75);
        nav.where(measureDimension, MEASURE_SKEWNESS).put(skewness);
        nav.where(measureDimension, MEASURE_KURTOSIS).put(kurtosis);
    }

    /**
     * Attaches an {@link AnnotatedRowsResult} for the given annotation and
     * column to the navigator.
     */
    private void addAttachment(CrosstabNavigator<Number> nav, RowAnnotation annotation, InputColumn<?> column) {
        nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
    }
}