/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.valuedist;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.metamodel.schema.ColumnType;
import org.apache.metamodel.schema.MutableColumn;
import org.datacleaner.data.MetaModelInputColumn;
import org.datacleaner.data.MockInputRow;
import org.datacleaner.result.ReducedSingleValueDistributionResult;
import org.datacleaner.result.ValueCountList;
import org.datacleaner.result.ValueCountingAnalyzerResult;
import org.junit.Test;
/**
 * Tests for {@link ValueDistributionAnalyzerResultReducer}: partial results produced by separate
 * {@link ValueDistributionAnalyzer} instances are reduced into a single result, and the aggregated
 * counts, unique values and top values of that result are verified.
 */
public class ValueDistributionAnalyzerResultReducerTest {

    /**
     * Reduces two ungrouped partial results.
     *
     * <p>
     * Both partial results are verified first so that a failure in the reduced result can be attributed
     * to the reducer rather than to the analyzers feeding it.
     * </p>
     */
    @Test
    public void testReduceSingleResults() throws Exception {
        // First partial result: hello=1+1, world=3, locallyUniqueWord=1 (6 values in total).
        final ValueDistributionAnalyzer valueDist1 =
                new ValueDistributionAnalyzer(new MetaModelInputColumn(new MutableColumn("col")), true);
        valueDist1.runInternal(new MockInputRow(), "hello", 1);
        valueDist1.runInternal(new MockInputRow(), "hello", 1);
        valueDist1.runInternal(new MockInputRow(), "world", 3);
        valueDist1.runInternal(new MockInputRow(), "locallyUniqueWord", 1);

        final ValueDistributionAnalyzerResult partialResult1 = valueDist1.getResult();
        final ValueCountList partialTopValues1 = ((SingleValueDistributionResult) partialResult1).getTopValues();
        assertEquals(2, partialTopValues1.getActualSize());
        assertEquals("[world->3]", partialTopValues1.getValueCounts().get(0).toString());
        assertEquals("[hello->2]", partialTopValues1.getValueCounts().get(1).toString());
        assertEquals(0, partialResult1.getNullCount());
        assertEquals(1, partialResult1.getUniqueCount().intValue());
        assertTrue(partialResult1.getUniqueValues().contains("locallyUniqueWord"));

        // Second partial result: hello=5+1, world=7, locallyUniqueWord=1, globallyUniqueWord=1
        // (15 values in total).
        final ValueDistributionAnalyzer valueDist2 =
                new ValueDistributionAnalyzer(new MetaModelInputColumn(new MutableColumn("col")), true);
        valueDist2.runInternal(new MockInputRow(), "hello", 5);
        valueDist2.runInternal(new MockInputRow(), "hello", 1);
        valueDist2.runInternal(new MockInputRow(), "world", 7);
        valueDist2.runInternal(new MockInputRow(), "locallyUniqueWord", 1);
        valueDist2.runInternal(new MockInputRow(), "globallyUniqueWord", 1);

        final ValueDistributionAnalyzerResult partialResult2 = valueDist2.getResult();
        final ValueCountList partialTopValues2 = ((SingleValueDistributionResult) partialResult2).getTopValues();
        assertEquals(2, partialTopValues2.getActualSize());
        assertEquals("[world->7]", partialTopValues2.getValueCounts().get(0).toString());
        assertEquals("[hello->6]", partialTopValues2.getValueCounts().get(1).toString());
        assertEquals(0, partialResult2.getNullCount());
        assertEquals(2, partialResult2.getUniqueCount().intValue());
        assertTrue(partialResult2.getUniqueValues().contains("locallyUniqueWord"));
        assertTrue(partialResult2.getUniqueValues().contains("globallyUniqueWord"));

        final List<ValueDistributionAnalyzerResult> partialResults = new ArrayList<>();
        partialResults.add(partialResult1);
        partialResults.add(partialResult2);

        final ValueDistributionAnalyzerResultReducer reducer = new ValueDistributionAnalyzerResultReducer();
        final ValueDistributionAnalyzerResult reducedValueDistributionResult = reducer.reduce(partialResults);
        final ReducedSingleValueDistributionResult reducedResult =
                (ReducedSingleValueDistributionResult) reducedValueDistributionResult;

        assertEquals(0, reducedResult.getNullCount());
        assertEquals(Integer.valueOf(4), reducedResult.getDistinctCount());
        // 6 values from the first partial result plus 15 from the second.
        assertEquals(21, reducedResult.getTotalCount());
        // "locallyUniqueWord" occurs once in each partial result, so after reduction only
        // "globallyUniqueWord" is expected to remain unique.
        assertEquals(Integer.valueOf(1), reducedResult.getUniqueCount());
        assertEquals("[globallyUniqueWord]", reducedResult.getUniqueValues().toString());

        final ValueCountList reducedTopValues = reducedResult.getTopValues();
        assertEquals(2, reducedTopValues.getActualSize());
        assertEquals("[world->10]", reducedTopValues.getValueCounts().get(0).toString());
        assertEquals("[hello->8]", reducedTopValues.getValueCounts().get(1).toString());
    }

    /**
     * Reduces two grouped partial results, each holding the groups "group1" and "group2", and verifies
     * the aggregates of every reduced group regardless of the order in which the groups are returned.
     */
    @Test
    public void testReduceGroupedResults() throws Exception {
        final ValueDistributionAnalyzer valueDist1 =
                new ValueDistributionAnalyzer(new MetaModelInputColumn(new MutableColumn("col")),
                        new MetaModelInputColumn(new MutableColumn("groupCol", ColumnType.STRING))
                                .narrow(String.class),
                        true);
        valueDist1.runInternal(new MockInputRow(), "hello", "group1", 1);
        valueDist1.runInternal(new MockInputRow(), "world", "group1", 3);
        valueDist1.runInternal(new MockInputRow(), "locallyUniqueWord", "group1", 1);
        valueDist1.runInternal(new MockInputRow(), "hello", "group2", 1);

        final ValueDistributionAnalyzerResult partialResult1 = valueDist1.getResult();
        final Collection<? extends ValueCountingAnalyzerResult> partialSingleResultList1 =
                ((GroupedValueDistributionResult) partialResult1).getGroupResults();

        // Confirm what we got from the first analyzer...
        {
            assertEquals(2, partialSingleResultList1.size());
            final Iterator<? extends ValueCountingAnalyzerResult> iterator = partialSingleResultList1.iterator();

            final SingleValueDistributionResult group1Analyzer1 = (SingleValueDistributionResult) iterator.next();
            assertEquals("group1", group1Analyzer1.getName());
            assertEquals(0, group1Analyzer1.getNullCount());
            assertEquals(2, group1Analyzer1.getUniqueCount().intValue());
            assertTrue(group1Analyzer1.getUniqueValues().contains("hello"));
            assertTrue(group1Analyzer1.getUniqueValues().contains("locallyUniqueWord"));
            assertEquals(5, group1Analyzer1.getTotalCount());

            final SingleValueDistributionResult group2Analyzer1 = (SingleValueDistributionResult) iterator.next();
            assertEquals("group2", group2Analyzer1.getName());
            assertEquals(0, group2Analyzer1.getNullCount());
            assertEquals(1, group2Analyzer1.getUniqueCount().intValue());
            assertTrue(group2Analyzer1.getUniqueValues().contains("hello"));
            assertEquals(1, group2Analyzer1.getTotalCount());
        }

        final ValueDistributionAnalyzer valueDist2 =
                new ValueDistributionAnalyzer(new MetaModelInputColumn(new MutableColumn("col")),
                        new MetaModelInputColumn(new MutableColumn("groupCol", ColumnType.STRING))
                                .narrow(String.class),
                        true);
        valueDist2.runInternal(new MockInputRow(), "hello", "group1", 6);
        valueDist2.runInternal(new MockInputRow(), "locallyUniqueWord", "group1", 1);
        valueDist2.runInternal(new MockInputRow(), "globallyUniqueWord", "group1", 1);
        valueDist2.runInternal(new MockInputRow(), "world", "group2", 7);

        final ValueDistributionAnalyzerResult partialResult2 = valueDist2.getResult();

        // Confirm what we got from the second analyzer...
        {
            final Collection<? extends ValueCountingAnalyzerResult> partialSingleResultList2 =
                    ((GroupedValueDistributionResult) partialResult2).getGroupResults();
            assertEquals(2, partialSingleResultList2.size());
            final Iterator<? extends ValueCountingAnalyzerResult> iterator2 = partialSingleResultList2.iterator();

            final SingleValueDistributionResult group1Analyzer2 = (SingleValueDistributionResult) iterator2.next();
            assertEquals("group1", group1Analyzer2.getName());
            assertEquals(0, group1Analyzer2.getNullCount());
            assertEquals(2, group1Analyzer2.getUniqueCount().intValue());
            assertTrue(group1Analyzer2.getUniqueValues().contains("globallyUniqueWord"));
            assertTrue(group1Analyzer2.getUniqueValues().contains("locallyUniqueWord"));
            assertEquals(8, group1Analyzer2.getTotalCount());

            final SingleValueDistributionResult group2Analyzer2 = (SingleValueDistributionResult) iterator2.next();
            assertEquals("group2", group2Analyzer2.getName());
            assertEquals(0, group2Analyzer2.getNullCount());
            assertEquals(0, group2Analyzer2.getUniqueCount().intValue());
            assertEquals(7, group2Analyzer2.getTotalCount());
        }

        final List<ValueDistributionAnalyzerResult> partialResults = new ArrayList<>();
        partialResults.add(partialResult1);
        partialResults.add(partialResult2);

        final ValueDistributionAnalyzerResultReducer reducer = new ValueDistributionAnalyzerResultReducer();
        final ValueDistributionAnalyzerResult reducedResult = reducer.reduce(partialResults);

        // Assert the aggregates from the reduced groups
        {
            final GroupedValueDistributionResult groupedReducedResult = (GroupedValueDistributionResult) reducedResult;
            final Collection<? extends ValueCountingAnalyzerResult> reducedGroups =
                    groupedReducedResult.getGroupResults();

            // Fail with a clear message on a missing or spurious group instead of running into a
            // NoSuchElementException below or silently ignoring extra groups.
            assertEquals(2, reducedGroups.size());

            final Iterator<? extends ValueCountingAnalyzerResult> reducedGroupsIterator = reducedGroups.iterator();
            final ReducedSingleValueDistributionResult firstReducedGroup =
                    (ReducedSingleValueDistributionResult) reducedGroupsIterator.next();
            final ReducedSingleValueDistributionResult secondReducedGroup =
                    (ReducedSingleValueDistributionResult) reducedGroupsIterator.next();

            // The order is non-deterministic, so work out which result is which group before asserting.
            // The constant-first comparison keeps a null name an assertion failure rather than an NPE.
            final ReducedSingleValueDistributionResult reducedGroup1;
            final ReducedSingleValueDistributionResult reducedGroup2;
            if ("group1".equals(firstReducedGroup.getName())) {
                reducedGroup1 = firstReducedGroup;
                reducedGroup2 = secondReducedGroup;
            } else {
                reducedGroup1 = secondReducedGroup;
                reducedGroup2 = firstReducedGroup;
            }

            // group1: 5 values from the first analyzer plus 8 from the second.
            assertReducedGroup(reducedGroup1, "group1", "globallyUniqueWord", 13);
            // group2: 1 value from the first analyzer plus 7 from the second.
            assertReducedGroup(reducedGroup2, "group2", "hello", 8);
        }
    }

    /**
     * Asserts the aggregates of one reduced group: its name, a null count of zero, a unique count of
     * one, the presence of the given unique value and the given total count.
     *
     * @param group the reduced group result to verify
     * @param expectedName the expected group name
     * @param expectedUniqueValue the value expected among the group's unique values
     * @param expectedTotalCount the expected total count of the group
     */
    private static void assertReducedGroup(final ReducedSingleValueDistributionResult group,
            final String expectedName, final String expectedUniqueValue, final int expectedTotalCount) {
        assertEquals(expectedName, group.getName());
        assertEquals(0, group.getNullCount());
        assertEquals(1, group.getUniqueCount().intValue());
        assertTrue(group.getUniqueValues().contains(expectedUniqueValue));
        assertEquals(expectedTotalCount, group.getTotalCount());
    }
}