/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.beans;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.TreeSet;

import org.datacleaner.api.AnalyzerResultReducer;
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
import org.datacleaner.util.CrosstabReducerHelper;
import org.datacleaner.util.ValueCombination;

/**
 * Reduces a collection of partial {@link BooleanAnalyzerResult}s into a single
 * result by merging their column statistics crosstabs and by re-building the
 * value combination crosstab from the summed frequencies of every value
 * combination found in the partial results.
 */
public class BooleanAnalyzerReducer implements AnalyzerResultReducer<BooleanAnalyzerResult> {

    /**
     * Orders value combination entries by their frequency (the entry value),
     * highest first assuming {@code CrosstabReducerHelper.subtract(a, b)}
     * yields {@code a - b}. Entries with the same frequency are ordered by
     * the reversed natural order of their value combination.
     *
     * Only the sign of the frequency difference is used. Narrowing the
     * difference itself to an {@code int} could flip the sign (or yield 0)
     * for differences outside the {@code int} range and would treat
     * fractional differences as ties.
     */
    private static final Comparator<Map.Entry<ValueCombination<Number>, Number>> frequentValueCombinationComparator =
            (o1, o2) -> {
                final Number difference = CrosstabReducerHelper.subtract(o2.getValue(), o1.getValue());
                final double signSource = difference.doubleValue();
                if (signSource > 0) {
                    return 1;
                }
                if (signSource < 0) {
                    return -1;
                }
                // same frequency: fall back to the ordering of the combinations
                return o2.getKey().compareTo(o1.getKey());
            };

    /**
     * Merges the given partial results into one result.
     *
     * @param partialResults
     *            the partial results to merge
     * @return a new result holding the merged column statistics crosstab and
     *         the rebuilt value combination crosstab, or {@code null} if
     *         {@code partialResults} is empty
     * @throws IllegalStateException
     *             if the value combination crosstabs of the partial results
     *             do not share the same column dimension (see
     *             {@link #createDimensionsValueCombinationCrosstab(List, Crosstab)})
     */
    @Override
    public BooleanAnalyzerResult reduce(final Collection<? extends BooleanAnalyzerResult> partialResults) {
        if (partialResults.isEmpty()) {
            return null;
        }

        // Create the dimensions
        final List<CrosstabDimension> columnStatisticCrosstabDimensions = new ArrayList<>();
        final List<CrosstabDimension> columnValueCombinationCrosstabDimensions = new ArrayList<>();
        for (final BooleanAnalyzerResult partialResult : partialResults) {
            final Crosstab<Number> partialColumnStatisticsCrosstab = partialResult.getColumnStatisticsCrosstab();
            final Crosstab<Number> partialValueCombinationCrosstab = partialResult.getValueCombinationCrosstab();
            CrosstabReducerHelper.createDimensionsColumnCrosstab(columnStatisticCrosstabDimensions,
                    partialColumnStatisticsCrosstab);
            createDimensionsValueCombinationCrosstab(columnValueCombinationCrosstabDimensions,
                    partialValueCombinationCrosstab);
        }

        final Crosstab<Number> newResultColumnStatistics =
                new Crosstab<>(Number.class, columnStatisticCrosstabDimensions);
        final Crosstab<Number> newResultColumnValueCombination =
                new Crosstab<>(Number.class, columnValueCombinationCrosstabDimensions);

        final Map<ValueCombination<Number>, Number> valueCombinations = new HashMap<>();

        // add the partial results
        for (final BooleanAnalyzerResult partialResult : partialResults) {
            final Crosstab<Number> partialColumnStatisticsCrosstab = partialResult.getColumnStatisticsCrosstab();
            final Crosstab<Number> partialValueCombinationCrosstab = partialResult.getValueCombinationCrosstab();

            if (partialColumnStatisticsCrosstab != null) {
                final CrosstabDimension columnDimension =
                        partialColumnStatisticsCrosstab.getDimension(BooleanAnalyzer.DIMENSION_COLUMN);
                final CrosstabDimension measureDimension =
                        partialColumnStatisticsCrosstab.getDimension(BooleanAnalyzer.DIMENSION_MEASURE);
                CrosstabReducerHelper.addData(newResultColumnStatistics, partialColumnStatisticsCrosstab,
                        columnDimension, measureDimension);
            }

            // gather the sum of all possible value combinations found in the
            // partial crosstabs
            if (partialValueCombinationCrosstab != null) {
                addValueCombinationsCrosstabDimension(valueCombinations, partialValueCombinationCrosstab);
            }
        }

        // create a new measure dimension for Value Combination crosstab
        createMeasureDimensionValueCombinationCrosstab(valueCombinations, newResultColumnValueCombination);

        return new BooleanAnalyzerResult(newResultColumnStatistics, newResultColumnValueCombination);
    }

    /**
     * Creates the measure categories of the value combination crosstab from
     * the value combinations, sorted with the frequency comparator of this
     * class, and writes one crosstab row per combination.
     *
     * The first entry is named {@link BooleanAnalyzer#MEASURE_MOST_FREQUENT},
     * the last one {@link BooleanAnalyzer#MEASURE_LEAST_FREQUENT} and every
     * entry in between {@link BooleanAnalyzer#DIMENSION_COMBINATION_PREFIX}
     * followed by its row index. Nothing is written unless
     * {@code CrosstabReducerHelper.findDimension} reports the measure
     * dimension for the given crosstab.
     *
     * @param valueCombinations
     *            the value combinations mapped to their summed frequency
     * @param valueCombinationCrosstab
     *            the crosstab to add the measure categories and values to
     */
    public void createMeasureDimensionValueCombinationCrosstab(
            final Map<ValueCombination<Number>, Number> valueCombinations,
            final Crosstab<Number> valueCombinationCrosstab) {

        if (CrosstabReducerHelper.findDimension(valueCombinationCrosstab, BooleanAnalyzer.DIMENSION_MEASURE)) {

            final SortedSet<Entry<ValueCombination<Number>, Number>> entries =
                    new TreeSet<>(frequentValueCombinationComparator);
            entries.addAll(valueCombinations.entrySet());

            final CrosstabNavigator<Number> nav = new CrosstabNavigator<>(valueCombinationCrosstab);
            final CrosstabDimension measureDimension =
                    valueCombinationCrosstab.getDimension(BooleanAnalyzer.DIMENSION_MEASURE);
            final CrosstabDimension columnDimension =
                    valueCombinationCrosstab.getDimension(BooleanAnalyzer.DIMENSION_COLUMN);
            final List<String> columnDimCategories = columnDimension.getCategories();

            int row = 0;
            for (final Entry<ValueCombination<Number>, Number> entry : entries) {

                // create the category
                final String measureName;
                if (row == 0) {
                    measureName = BooleanAnalyzer.MEASURE_MOST_FREQUENT;
                } else if (row == entries.size() - 1) {
                    measureName = BooleanAnalyzer.MEASURE_LEAST_FREQUENT;
                } else {
                    measureName = BooleanAnalyzer.DIMENSION_COMBINATION_PREFIX + row;
                }
                measureDimension.addCategory(measureName);

                // extract data: the values of the combination first, the
                // frequency in the last column category
                final Number[] values = new Number[columnDimCategories.size()];
                final ValueCombination<Number> key = entry.getKey();
                for (int i = 0; i < key.getValueCount(); i++) {
                    values[i] = key.getValueAt(i);
                }
                values[columnDimCategories.size() - 1] = entry.getValue();

                // put data into crosstab
                for (int i = 0; i < columnDimCategories.size(); i++) {
                    nav.where(columnDimension, columnDimCategories.get(i));
                    nav.where(measureDimension, measureName);
                    nav.put(values[i]);
                }
                row++;
            }
        }
    }

    /**
     * Gathers the sum of all possible value combinations of a partial
     * crosstab. For every measure category of the partial crosstab, the
     * values of all column categories except the last one form the value
     * combination, and the value found in the
     * {@link BooleanAnalyzer#VALUE_COMBINATION_COLUMN_FREQUENCY} column is
     * added to the frequency already registered for that combination (a
     * missing frequency counts as 0).
     *
     * @param valueCombMapList
     *            the map of value combinations to summed frequencies; updated
     *            in place
     * @param partialCrosstab
     *            the partial value combination crosstab to read from
     */
    public void addValueCombinationsCrosstabDimension(final Map<ValueCombination<Number>, Number> valueCombMapList,
            final Crosstab<Number> partialCrosstab) {

        final CrosstabNavigator<Number> nav = new CrosstabNavigator<>(partialCrosstab);
        final CrosstabDimension columnDimension = partialCrosstab.getDimension(BooleanAnalyzer.DIMENSION_COLUMN);
        final CrosstabDimension measureDimension = partialCrosstab.getDimension(BooleanAnalyzer.DIMENSION_MEASURE);
        final List<String> columnDimCategories = columnDimension.getCategories();
        final List<String> measureCategories = measureDimension.getCategories();

        for (final String measureCategory : measureCategories) {
            // the last column category is left out of the combination
            final Number[] values = new Number[columnDimCategories.size() - 1];
            for (int i = 0; i < columnDimCategories.size() - 1; i++) {
                nav.where(columnDimension, columnDimCategories.get(i));
                final CrosstabNavigator<Number> where = nav.where(measureDimension, measureCategory);
                final Number value = where.safeGet(null);
                // NOTE(review): this tests the FIRST column category on every
                // iteration, not category i - confirm whether get(i) was
                // intended. Kept as-is to preserve existing behavior.
                if (!columnDimCategories.get(0).equals(BooleanAnalyzer.VALUE_COMBINATION_COLUMN_FREQUENCY)) {
                    values[i] = value;
                }
            }

            final CrosstabNavigator<Number> where =
                    nav.where(columnDimension, BooleanAnalyzer.VALUE_COMBINATION_COLUMN_FREQUENCY);
            final Number frequency = where.safeGet(null);
            final Number frequencyVal = frequency != null ? frequency : 0;

            final ValueCombination<Number> valComb = new ValueCombination<>(values);
            final Number combination = valueCombMapList.get(valComb);
            if (combination == null) {
                valueCombMapList.put(valComb, frequencyVal);
            } else {
                final Number newValue = CrosstabReducerHelper.sum(combination, frequencyVal);
                valueCombMapList.put(valComb, newValue);
            }
        }
    }

    /**
     * Registers the dimensions of the reduced value combination crosstab.
     * The first non-null partial crosstab contributes its column dimension,
     * together with a new, empty measure dimension. For every further partial
     * crosstab it is only verified that its column dimension is already
     * known. A {@code null} partial crosstab is ignored.
     *
     * @param crosstabDimensions
     *            the dimensions collected so far; updated in place
     * @param partialCrosstab
     *            the partial value combination crosstab, may be {@code null}
     * @throws IllegalStateException
     *             if dimensions were already collected and
     *             {@code CrosstabReducerHelper.dimensionExits} does not find
     *             the column dimension of the partial crosstab among them
     */
    public void createDimensionsValueCombinationCrosstab(final List<CrosstabDimension> crosstabDimensions,
            final Crosstab<Number> partialCrosstab) throws IllegalStateException {
        if (partialCrosstab != null) {
            final CrosstabDimension columnDimension = partialCrosstab.getDimension(BooleanAnalyzer.DIMENSION_COLUMN);
            if (crosstabDimensions.isEmpty()) {
                crosstabDimensions.add(columnDimension);
                // the Value Combination crosstab gets an empty measure
                // dimension because the measure categories need to be recreated
                // based on the value combinations found
                final CrosstabDimension measureDimension = new CrosstabDimension(BooleanAnalyzer.DIMENSION_MEASURE);
                crosstabDimensions.add(measureDimension);
            } else {
                // trying to be smart
                if (!CrosstabReducerHelper.dimensionExits(crosstabDimensions, columnDimension)) {
                    throw new IllegalStateException(
                            "The crosstabs do not have the same categories in dimension Column");
                }
            }
        }
    }
}