/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans.valuedist; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import javax.inject.Inject; import org.datacleaner.api.AnalyzerResultReducer; import org.datacleaner.api.InputColumn; import org.datacleaner.api.Provided; import org.datacleaner.result.ReducedSingleValueDistributionResult; import org.datacleaner.result.ValueCountingAnalyzerResult; import org.datacleaner.result.ValueFrequency; import org.datacleaner.storage.RowAnnotationFactory; /** * A reducer of {@link ValueDistributionAnalyzerResult}s. */ public class ValueDistributionAnalyzerResultReducer implements AnalyzerResultReducer<ValueDistributionAnalyzerResult> { @Inject @Provided RowAnnotationFactory _rowAnnotationFactory; @Override public ValueDistributionAnalyzerResult reduce( final Collection<? extends ValueDistributionAnalyzerResult> analyzerResults) { if (hasGroupedResults(analyzerResults)) { return reduceGroupedResults(analyzerResults); } else { return reduceSingleResults(analyzerResults); } } private ValueDistributionAnalyzerResult reduceSingleResults( final Collection<? extends ValueDistributionAnalyzerResult> analyzerResults) { final Map<String, Integer> reducedValueCounts = new HashMap<>(); Integer nullCount = 0; final ValueDistributionAnalyzerResult first = analyzerResults.iterator().next(); for (final ValueDistributionAnalyzerResult partialResult : analyzerResults) { if ((partialResult instanceof SingleValueDistributionResult) || (partialResult instanceof ReducedSingleValueDistributionResult)) { nullCount = reduceValueCounts(reducedValueCounts, nullCount, partialResult); } else { throw new IllegalStateException( "Unsupported type of " + ValueDistributionAnalyzerResult.class.getSimpleName() + ": " + partialResult.getClass().getSimpleName()); } } return new ReducedSingleValueDistributionResult(first.getName(), reducedValueCounts, nullCount); } private boolean hasGroupedResults(final Collection<? extends ValueDistributionAnalyzerResult> analyzerResults) { for (final ValueDistributionAnalyzerResult valueDistributionAnalyzerResult : analyzerResults) { if (valueDistributionAnalyzerResult instanceof GroupedValueDistributionResult) { return true; } } return false; } private ValueDistributionAnalyzerResult reduceGroupedResults( final Collection<? extends ValueDistributionAnalyzerResult> analyzerResults) { final Map<String, List<ValueDistributionAnalyzerResult>> groupedMap = new HashMap<>(); final ValueDistributionAnalyzerResult first = analyzerResults.iterator().next(); for (final ValueDistributionAnalyzerResult partialResult : analyzerResults) { if (partialResult instanceof GroupedValueDistributionResult) { final GroupedValueDistributionResult groupedPartialResult = (GroupedValueDistributionResult) partialResult; for (final ValueCountingAnalyzerResult childValueCountingResult : groupedPartialResult .getGroupResults()) { final ValueDistributionAnalyzerResult childValueDistributionResult = (ValueDistributionAnalyzerResult) childValueCountingResult; final String groupName = childValueCountingResult.getName(); if (groupedMap.containsKey(groupName)) { final List<ValueDistributionAnalyzerResult> list = groupedMap.get(groupName); list.add(childValueDistributionResult); } else { final List<ValueDistributionAnalyzerResult> list = new ArrayList<>(); list.add(childValueDistributionResult); groupedMap.put(groupName, list); } } } else { throw new IllegalStateException( "Unsupported type of " + ValueDistributionAnalyzerResult.class.getSimpleName() + ": " + partialResult.getClass().getSimpleName()); } } final List<ValueDistributionAnalyzerResult> reducedChildResults = new ArrayList<>(); final Collection<List<ValueDistributionAnalyzerResult>> groupedLists = groupedMap.values(); for (final List<ValueDistributionAnalyzerResult> list : groupedLists) { final ValueDistributionAnalyzerResult reducedChildResult = reduce(list); reducedChildResults.add(reducedChildResult); } final InputColumn<?> inputColumn = ((GroupedValueDistributionResult) first).getColumn(); final InputColumn<String> groupColumn = ((GroupedValueDistributionResult) first).getGroupColumn(); return new GroupedValueDistributionResult(inputColumn, groupColumn, reducedChildResults); } private Integer reduceValueCounts(final Map<String, Integer> reducedValueCounts, Integer nullCount, final ValueDistributionAnalyzerResult partialResult) { final Collection<ValueFrequency> valueCounts = partialResult.getValueCounts(); for (final ValueFrequency valueFrequency : valueCounts) { if (!valueFrequency.isComposite()) { nullCount = recordNonCompositeValueFrequency(reducedValueCounts, nullCount, valueFrequency); } else { for (final ValueFrequency childValueFrequency : valueFrequency.getChildren()) { nullCount = recordNonCompositeValueFrequency(reducedValueCounts, nullCount, childValueFrequency); } } } return nullCount; } private Integer recordNonCompositeValueFrequency(final Map<String, Integer> reducedValueCounts, Integer nullCount, final ValueFrequency valueFrequency) { final String value = valueFrequency.getValue(); final int count = valueFrequency.getCount(); if (value != null) { if (reducedValueCounts.containsKey(value)) { final Integer oldCount = reducedValueCounts.get(value); reducedValueCounts.put(value, oldCount + count); } else { reducedValueCounts.put(value, count); } } else { nullCount = nullCount + count; } return nullCount; } }