/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.beans;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.TreeSet;
import org.eobjects.analyzer.beans.api.Analyzer;
import org.eobjects.analyzer.beans.api.AnalyzerBean;
import org.eobjects.analyzer.beans.api.Configured;
import org.eobjects.analyzer.beans.api.Description;
import org.eobjects.analyzer.beans.api.Initialize;
import org.eobjects.analyzer.beans.api.Provided;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.result.AnnotatedRowsResult;
import org.eobjects.analyzer.result.Crosstab;
import org.eobjects.analyzer.result.CrosstabDimension;
import org.eobjects.analyzer.result.CrosstabNavigator;
import org.eobjects.analyzer.storage.InMemoryRowAnnotationFactory;
import org.eobjects.analyzer.storage.RowAnnotation;
import org.eobjects.analyzer.storage.RowAnnotationFactory;
import org.eobjects.analyzer.util.ValueCombination;
/**
 * Analyzer that profiles boolean columns.
 *
 * For every configured column the result reports a row count, null count, true
 * count and false count. When more than one column is configured, the analyzer
 * additionally tracks every combination of values seen across the columns and
 * reports them ordered by frequency.
 */
@AnalyzerBean("Boolean analyzer")
@Description("Inspect your boolean values. How is the distribution of true/false? Are there null values?")
public class BooleanAnalyzer implements Analyzer<BooleanAnalyzerResult> {

    // measure names used in the value combination crosstab
    public static final String MEASURE_LEAST_FREQUENT = "Least frequent";
    public static final String MEASURE_MOST_FREQUENT = "Most frequent";

    // name of the extra column category that holds a combination's row count
    public static final String VALUE_COMBINATION_COLUMN_FREQUENCY = "Frequency";

    // measure names used in the per-column crosstab
    public static final String MEASURE_FALSE_COUNT = "False count";
    public static final String MEASURE_TRUE_COUNT = "True count";
    public static final String MEASURE_NULL_COUNT = "Null count";
    public static final String MEASURE_ROW_COUNT = "Row count";

    // dimension names shared by both crosstabs
    public static final String DIMENSION_COLUMN = "Column";
    public static final String DIMENSION_MEASURE = "Measure";

    // comparator used to sort entries, getting the most frequent value
    // combinations to the top. Entries with equal row counts are ordered by
    // the reverse of the value combinations' own ordering.
    private static final Comparator<Map.Entry<ValueCombination<Boolean>, RowAnnotation>> frequentValueCombinationComparator = new Comparator<Map.Entry<ValueCombination<Boolean>, RowAnnotation>>() {
        @Override
        public int compare(Entry<ValueCombination<Boolean>, RowAnnotation> o1,
                Entry<ValueCombination<Boolean>, RowAnnotation> o2) {
            final int count1 = o1.getValue().getRowCount();
            final int count2 = o2.getValue().getRowCount();
            if (count1 != count2) {
                // compare explicitly instead of subtracting, so that the sign
                // of the result can never be corrupted by int overflow
                return (count2 < count1) ? -1 : 1;
            }
            return o2.getKey().compareTo(o1.getKey());
        }
    };

    // one delegate per configured column, populated by init()
    private final Map<InputColumn<Boolean>, BooleanAnalyzerColumnDelegate> _columnDelegates = new HashMap<InputColumn<Boolean>, BooleanAnalyzerColumnDelegate>();

    // annotation of the rows observed for each distinct combination of values
    private final Map<ValueCombination<Boolean>, RowAnnotation> _valueCombinations = new HashMap<ValueCombination<Boolean>, RowAnnotation>();

    @Configured
    InputColumn<Boolean>[] _columns;

    @Provided
    RowAnnotationFactory _annotationFactory;

    /**
     * Creates an analyzer for the given columns, backed by an
     * {@link InMemoryRowAnnotationFactory}. Note that this constructor does
     * not invoke {@link #init()}; it has to be called before rows are
     * processed.
     *
     * @param columns
     *            the boolean columns to analyze
     */
    public BooleanAnalyzer(InputColumn<Boolean>[] columns) {
        _columns = columns;
        _annotationFactory = new InMemoryRowAnnotationFactory();
    }

    /**
     * Creates an analyzer without columns or annotation factory. The
     * {@link Configured} and {@link Provided} fields are left unset here and
     * have to be populated externally (presumably by the framework) before
     * use.
     */
    public BooleanAnalyzer() {
    }

    /**
     * Creates a {@link BooleanAnalyzerColumnDelegate} for every configured
     * column. Both {@link #run(InputRow, int)} and {@link #getResult()} look
     * delegates up by column, so this method has to run first.
     */
    @Initialize
    public void init() {
        for (InputColumn<Boolean> column : _columns) {
            _columnDelegates.put(column, new BooleanAnalyzerColumnDelegate(_annotationFactory));
        }
    }

    /**
     * Processes a single row: hands the value of each configured column to
     * that column's delegate and, when more than one column is configured,
     * annotates the row with the combination of values it holds.
     *
     * @param row
     *            the row to read the column values from
     * @param distinctCount
     *            passed on unchanged to the column delegates and to the
     *            annotation factory
     */
    @Override
    public void run(InputRow row, int distinctCount) {
        final Boolean[] values = new Boolean[_columns.length];
        for (int i = 0; i < values.length; i++) {
            final InputColumn<Boolean> column = _columns[i];
            final Boolean value = row.getValue(column);
            final BooleanAnalyzerColumnDelegate delegate = _columnDelegates.get(column);
            values[i] = value;
            delegate.run(value, row, distinctCount);
        }

        // collect all combinations of booleans
        if (_columns.length > 1) {
            final ValueCombination<Boolean> valueCombination = new ValueCombination<Boolean>(values);
            RowAnnotation annotation = _valueCombinations.get(valueCombination);
            if (annotation == null) {
                // first time this combination is seen
                annotation = _annotationFactory.createAnnotation();
                _valueCombinations.put(valueCombination, annotation);
            }
            _annotationFactory.annotate(row, distinctCount, annotation);
        }
    }

    /**
     * Builds the analyzer result from the state collected so far.
     *
     * @return a result holding the per-column crosstab and the value
     *         combination crosstab; the latter is null when only a single
     *         column (or none) is configured
     */
    @Override
    public BooleanAnalyzerResult getResult() {
        final Crosstab<Number> columnStatisticsCrosstab = createColumnStatisticsCrosstab();

        final Crosstab<Number> valueCombinationCrosstab;
        if (_columns.length > 1) {
            valueCombinationCrosstab = createValueCombinationCrosstab();
        } else {
            valueCombinationCrosstab = null;
        }

        return new BooleanAnalyzerResult(columnStatisticsCrosstab, valueCombinationCrosstab);
    }

    /**
     * Builds the crosstab of row/null/true/false counts per column. Cells
     * with a non-zero null, true or false count get an
     * {@link AnnotatedRowsResult} attached for the corresponding annotation.
     */
    private Crosstab<Number> createColumnStatisticsCrosstab() {
        final CrosstabDimension measureDimension = new CrosstabDimension(DIMENSION_MEASURE);
        measureDimension.addCategory(MEASURE_ROW_COUNT);
        measureDimension.addCategory(MEASURE_NULL_COUNT);
        measureDimension.addCategory(MEASURE_TRUE_COUNT);
        measureDimension.addCategory(MEASURE_FALSE_COUNT);

        final CrosstabDimension columnDimension = new CrosstabDimension(DIMENSION_COLUMN);
        for (InputColumn<Boolean> column : _columns) {
            columnDimension.addCategory(column.getName());
        }

        final Crosstab<Number> crosstab = new Crosstab<Number>(Number.class, columnDimension, measureDimension);

        for (InputColumn<Boolean> column : _columns) {
            final CrosstabNavigator<Number> nav = crosstab.navigate().where(columnDimension, column.getName());
            final BooleanAnalyzerColumnDelegate delegate = _columnDelegates.get(column);

            nav.where(measureDimension, MEASURE_ROW_COUNT).put(delegate.getRowCount());

            final int nullCount = delegate.getNullCount();
            nav.where(measureDimension, MEASURE_NULL_COUNT).put(nullCount);
            if (nullCount > 0) {
                nav.attach(new AnnotatedRowsResult(delegate.getNullAnnotation(), _annotationFactory, column));
            }

            final RowAnnotation trueAnnotation = delegate.getTrueAnnotation();
            nav.where(measureDimension, MEASURE_TRUE_COUNT).put(trueAnnotation.getRowCount());
            if (trueAnnotation.getRowCount() > 0) {
                nav.attach(new AnnotatedRowsResult(trueAnnotation, _annotationFactory, column));
            }

            final RowAnnotation falseAnnotation = delegate.getFalseAnnotation();
            nav.where(measureDimension, MEASURE_FALSE_COUNT).put(falseAnnotation.getRowCount());
            if (falseAnnotation.getRowCount() > 0) {
                nav.attach(new AnnotatedRowsResult(falseAnnotation, _annotationFactory, column));
            }
        }

        return crosstab;
    }

    /**
     * Builds the crosstab of observed value combinations, most frequent
     * first. Each combination becomes one measure category; its cells hold 1
     * for true, 0 for false and null for a null value in the respective
     * column, plus the combination's row count in the
     * {@link #VALUE_COMBINATION_COLUMN_FREQUENCY} category.
     */
    private Crosstab<Number> createValueCombinationCrosstab() {
        final CrosstabDimension measureDimension = new CrosstabDimension(DIMENSION_MEASURE);
        final CrosstabDimension columnDimension = new CrosstabDimension(DIMENSION_COLUMN);
        for (InputColumn<Boolean> column : _columns) {
            columnDimension.addCategory(column.getName());
        }
        columnDimension.addCategory(VALUE_COMBINATION_COLUMN_FREQUENCY);

        final Crosstab<Number> valueCombinationCrosstab = new Crosstab<Number>(Number.class, columnDimension,
                measureDimension);

        final SortedSet<Entry<ValueCombination<Boolean>, RowAnnotation>> entries = new TreeSet<Map.Entry<ValueCombination<Boolean>, RowAnnotation>>(
                frequentValueCombinationComparator);
        entries.addAll(_valueCombinations.entrySet());

        int row = 0;
        for (Entry<ValueCombination<Boolean>, RowAnnotation> entry : entries) {
            // first entry is the most frequent, last entry the least frequent,
            // everything in between is named by its zero-based position
            final String measureName;
            if (row == 0) {
                measureName = MEASURE_MOST_FREQUENT;
            } else if (row + 1 == entries.size()) {
                measureName = MEASURE_LEAST_FREQUENT;
            } else {
                measureName = "Combination " + row;
            }
            measureDimension.addCategory(measureName);

            final CrosstabNavigator<Number> nav = valueCombinationCrosstab.where(measureDimension, measureName);
            final ValueCombination<Boolean> valueCombination = entry.getKey();
            final RowAnnotation annotation = entry.getValue();

            nav.where(columnDimension, VALUE_COMBINATION_COLUMN_FREQUENCY);
            nav.put(annotation.getRowCount());
            nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, _columns));

            for (int i = 0; i < valueCombination.getValueCount(); i++) {
                final InputColumn<Boolean> column = _columns[i];
                final Boolean value = valueCombination.getValueAt(i);

                // encode true as 1 and false as 0; null values stay null
                Byte numberValue = null;
                if (value != null) {
                    if (value.booleanValue()) {
                        numberValue = 1;
                    } else {
                        numberValue = 0;
                    }
                }

                nav.where(columnDimension, column.getName());
                nav.put(numberValue);
            }

            row++;
        }

        return valueCombinationCrosstab;
    }
}