/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.valuedist;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.Collection;
import org.apache.metamodel.schema.MutableColumn;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.connection.Datastore;
import org.datacleaner.data.MetaModelInputColumn;
import org.datacleaner.data.MockInputColumn;
import org.datacleaner.data.MockInputRow;
import org.datacleaner.descriptors.AnalyzerDescriptor;
import org.datacleaner.descriptors.Descriptors;
import org.datacleaner.descriptors.MetricDescriptor;
import org.datacleaner.descriptors.MetricParameters;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.result.GroupedValueCountingAnalyzerResult;
import org.datacleaner.result.ValueCountList;
import org.datacleaner.result.ValueCountingAnalyzerResult;
import org.datacleaner.test.TestHelper;
import org.junit.Test;
/**
 * Unit tests for {@link ValueDistributionAnalyzer}.
 *
 * <p>
 * The tests cover the analyzer's descriptor, its distributable flag when used
 * through an {@link AnalysisJobBuilder}, the unique/distinct/null/total
 * counters of its result, the "Value count" metric and the textual rendering
 * of both single-column and grouped results.
 * </p>
 */
public class ValueDistributionAnalyzerTest {

    /**
     * Creates the analyzer shared by the single-column tests: one input column
     * named "col" and the constructor's boolean flag set to {@code true}.
     *
     * @return a fresh analyzer that has not consumed any rows yet
     */
    private static ValueDistributionAnalyzer newSingleColumnAnalyzer() {
        return new ValueDistributionAnalyzer(new MetaModelInputColumn(new MutableColumn("col")), true);
    }

    /**
     * Verifies that the component builder reports the analyzer as
     * distributable before any input column is added and after each of the
     * two input columns is added.
     */
    @Test
    public void testComponentBuilderIsDistributable() {
        final Datastore datastore = TestHelper.createSampleDatabaseDatastore("orderdb");
        final DataCleanerConfigurationImpl configuration =
                new DataCleanerConfigurationImpl().withDatastores(datastore);

        try (AnalysisJobBuilder ajb = new AnalysisJobBuilder(configuration)) {
            ajb.setDatastore(datastore);
            ajb.addSourceColumns("customers.country", "customers.city");

            final AnalyzerComponentBuilder<ValueDistributionAnalyzer> componentBuilder =
                    ajb.addAnalyzer(ValueDistributionAnalyzer.class);
            assertTrue(componentBuilder.isDistributable());

            componentBuilder.addInputColumn(ajb.getSourceColumnByName("country"));
            assertTrue(componentBuilder.isDistributable());

            componentBuilder.addInputColumn(ajb.getSourceColumnByName("city"));
            assertTrue(componentBuilder.isDistributable());
        }
    }

    /**
     * Pins the shape of the analyzer's descriptor: number of initialize
     * methods, configured properties and provided properties, plus the
     * display name.
     */
    @Test
    public void testDescriptor() {
        final AnalyzerDescriptor<?> desc = Descriptors.ofAnalyzer(ValueDistributionAnalyzer.class);
        assertEquals(0, desc.getInitializeMethods().size());
        assertEquals(6, desc.getConfiguredProperties().size());
        assertEquals(1, desc.getProvidedProperties().size());
        assertEquals("Value distribution", desc.getDisplayName());
    }

    /**
     * Feeds values one at a time and checks the result counters after every
     * step. The last argument of {@code runInternal} acts as the number of
     * occurrences of the value: the total count grows by exactly that amount
     * in each step below.
     */
    @Test
    public void testGetCounts() {
        final ValueDistributionAnalyzer vd = newSingleColumnAnalyzer();

        // nothing consumed yet: every counter is zero
        assertEquals(0, vd.getResult().getUniqueCount().intValue());
        assertEquals(0, vd.getResult().getNullCount());
        assertEquals(0, vd.getResult().getDistinctCount().intValue());
        assertEquals(0, vd.getResult().getTotalCount());

        // hello=1
        vd.runInternal(new MockInputRow(), "hello", 1);
        assertEquals(1, vd.getResult().getUniqueCount().intValue());
        assertEquals(1, vd.getResult().getDistinctCount().intValue());
        assertEquals(1, vd.getResult().getTotalCount());

        // hello=1, world=1
        vd.runInternal(new MockInputRow(), "world", 1);
        assertEquals(2, vd.getResult().getUniqueCount().intValue());
        assertEquals(2, vd.getResult().getDistinctCount().intValue());
        assertEquals(2, vd.getResult().getTotalCount());

        // hello=1, world=1, foobar=2: foobar is distinct but not unique
        vd.runInternal(new MockInputRow(), "foobar", 2);
        assertEquals(2, vd.getResult().getUniqueCount().intValue());
        assertEquals(3, vd.getResult().getDistinctCount().intValue());
        assertEquals(4, vd.getResult().getTotalCount());

        // hello=1, world=2, foobar=2: world stops being unique
        vd.runInternal(new MockInputRow(), "world", 1);
        assertEquals(1, vd.getResult().getUniqueCount().intValue());
        assertEquals(3, vd.getResult().getDistinctCount().intValue());
        assertEquals(5, vd.getResult().getTotalCount());

        // hello=4, world=2, foobar=2: no unique values remain
        vd.runInternal(new MockInputRow(), "hello", 3);
        assertEquals(0, vd.getResult().getUniqueCount().intValue());
        assertEquals(3, vd.getResult().getDistinctCount().intValue());
        assertEquals(8, vd.getResult().getTotalCount());

        // the first null raises the distinct count but is not reported as unique
        vd.runInternal(new MockInputRow(), null, 1);
        assertEquals(0, vd.getResult().getUniqueCount().intValue());
        assertEquals(1, vd.getResult().getNullCount());
        assertEquals(4, vd.getResult().getDistinctCount().intValue());
        assertEquals(9, vd.getResult().getTotalCount());

        // further nulls only add to the null and total counts
        vd.runInternal(new MockInputRow(), null, 3);
        assertEquals(0, vd.getResult().getUniqueCount().intValue());
        assertEquals(4, vd.getResult().getNullCount());
        assertEquals(4, vd.getResult().getDistinctCount().intValue());
        assertEquals(12, vd.getResult().getTotalCount());
    }

    /**
     * Checks the "Value count" metric: its parameter suggestions and the
     * values it yields for a single value, an {@code IN [...]} list and a
     * {@code NOT IN [...]} list.
     */
    @Test
    public void testGetValueCountMetric() {
        final ValueDistributionAnalyzer vd = newSingleColumnAnalyzer();

        // resulting distribution: hello=4, world=2, foobar=2, null=4 (total 12)
        vd.runInternal(new MockInputRow(), "hello", 1);
        vd.runInternal(new MockInputRow(), "world", 1);
        vd.runInternal(new MockInputRow(), "foobar", 2);
        vd.runInternal(new MockInputRow(), "world", 1);
        vd.runInternal(new MockInputRow(), "hello", 3);
        vd.runInternal(new MockInputRow(), null, 1);
        vd.runInternal(new MockInputRow(), null, 3);

        final ValueCountingAnalyzerResult result = vd.getResult();
        final AnalyzerDescriptor<?> desc = Descriptors.ofAnalyzer(ValueDistributionAnalyzer.class);
        final MetricDescriptor metric = desc.getResultMetric("Value count");

        final Collection<String> suggestions = metric.getMetricParameterSuggestions(result);
        assertEquals("[hello, foobar, world]", suggestions.toString());

        assertEquals(4, metric.getValue(result, new MetricParameters("hello")));
        assertEquals(2, metric.getValue(result, new MetricParameters("world")));

        // hello (4) + world (2)
        assertEquals(6, metric.getValue(result, new MetricParameters("IN [hello,world]")));

        // total (12) - foobar (2) - world (2); the four nulls are part of the remainder
        assertEquals(8, metric.getValue(result, new MetricParameters("NOT IN [foobar,world]")));
    }

    /**
     * Checks the top values and the textual rendering of a single-column
     * (non-grouped) result.
     */
    @Test
    public void testGetValueDistribution() {
        final ValueDistributionAnalyzer vd = newSingleColumnAnalyzer();

        vd.runInternal(new MockInputRow(), "hello", 1);
        vd.runInternal(new MockInputRow(), "hello", 1);
        vd.runInternal(new MockInputRow(), "world", 3);

        final ValueCountingAnalyzerResult result = vd.getResult();

        // fail with a readable assertion rather than a ClassCastException on the cast below
        assertTrue(result instanceof SingleValueDistributionResult);
        final ValueCountList topValues = ((SingleValueDistributionResult) result).getTopValues();

        assertEquals(2, topValues.getActualSize());
        assertEquals("[world->3]", topValues.getValueCounts().get(0).toString());
        assertEquals("[hello->2]", topValues.getValueCounts().get(1).toString());

        assertEquals(0, result.getNullCount());
        assertEquals(0, result.getUniqueCount().intValue());

        final String[] resultLines = result.toString().split("\n");
        assertEquals(3, resultLines.length);
        assertEquals("Value distribution for: col", resultLines[0]);
        assertEquals(" - world: 3", resultLines[1]);
        assertEquals(" - hello: 2", resultLines[2]);
    }

    /**
     * Runs the analyzer with a second (grouping) column and checks the
     * textual rendering of the grouped result line by line. The second string
     * passed to {@code runInternal} shows up as the group name in the output.
     */
    @Test
    public void testGroupedRun() {
        final ValueDistributionAnalyzer vd = new ValueDistributionAnalyzer(new MockInputColumn<>("foo", String.class),
                new MockInputColumn<>("bar", String.class), true);

        vd.runInternal(new MockInputRow(), "Copenhagen N", "2200", 3);
        vd.runInternal(new MockInputRow(), "Copenhagen E", "2100", 2);
        vd.runInternal(new MockInputRow(), "Copenhagen", "1732", 4);
        vd.runInternal(new MockInputRow(), "Coppenhagen", "1732", 3);

        final ValueCountingAnalyzerResult result = vd.getResult();
        assertTrue(result instanceof GroupedValueCountingAnalyzerResult);

        final String resultString = result.toString();
        final String[] resultLines = resultString.split("\n");
        assertEquals(11, resultLines.length);

        // walk the rendered lines in order; i always points at the next expected line
        int i = 0;
        assertEquals("Value distribution for column: foo", resultLines[i++]);
        assertEquals("", resultLines[i++]);
        assertEquals("Group: 1732", resultLines[i++]);
        assertEquals(" - Copenhagen: 4", resultLines[i++]);
        assertEquals(" - Coppenhagen: 3", resultLines[i++]);
        assertEquals("", resultLines[i++]);
        assertEquals("Group: 2100", resultLines[i++]);
        assertEquals(" - Copenhagen E: 2", resultLines[i++]);
        assertEquals("", resultLines[i++]);
        assertEquals("Group: 2200", resultLines[i++]);
        assertEquals(" - Copenhagen N: 3", resultLines[i++]);
    }
}