/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.spark;

import static org.junit.Assert.*;

import java.io.File;
import java.net.URI;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.io.FileUtils;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.FileResource;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.beans.CompletenessAnalyzerResult;
import org.datacleaner.beans.StringAnalyzerResult;
import org.datacleaner.beans.uniqueness.UniqueKeyCheckAnalyzerResult;
import org.datacleaner.beans.valuedist.GroupedValueDistributionResult;
import org.datacleaner.beans.valuedist.ValueDistributionAnalyzerResult;
import org.datacleaner.beans.valuematch.ValueMatchAnalyzerResult;
import org.datacleaner.beans.writers.WriteDataResult;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.result.ReducedSingleValueDistributionResult;
import org.datacleaner.result.ValueCountingAnalyzerResult;
import org.datacleaner.util.SystemProperties;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

public class SparkAnalysisRunnerTest {

    private static class TestSparkJobLifeCycleListener implements SparkJobLifeCycleListener {

        private static final long serialVersionUID = 1L;

        final AtomicBoolean _jobStartCalled = new AtomicBoolean();
        final AtomicBoolean _jobEndCalled = new AtomicBoolean();

        @Override
        public void onPartitionProcessingStart(final SparkJobContext sparkJobContext) {
            // Unfortunately, serialization only goes one way, so we can't assert on this.
            System.out.println("Node start");
        }

        @Override
        public void onPartitionProcessingEnd(final SparkJobContext sparkJobContext) {
            // Unfortunately, serialization only goes one way, so we can't assert on this.
            System.out.println("Node end");
        }

        @Override
        public void onJobStart(final SparkJobContext sparkJobContext) {
            _jobStartCalled.set(true);
        }

        @Override
        public void onJobEnd(final SparkJobContext sparkJobContext) {
            _jobEndCalled.set(true);
        }
    }

    private static final int MIN_PARTITIONS_MULTIPLE = 4;

    @Rule
    public TestName testName = new TestName();

    @BeforeClass
    public static void init() {
        // use local filesystem as default during tests
        System.setProperty(SystemProperties.DEFAULT_RESOURCE_SCHEME, "file");
    }

    @Test
    public void testVanillaScenario() throws Exception {
        final AnalysisResultFuture result = runAnalysisJob("DCTest - " + getName(),
                URI.create("src/test/resources/vanilla-job.analysis.xml"), "vanilla-job", false);

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        assertEquals(2, result.getResultMap().size());
        final List<AnalyzerResult> results = result.getResults();
        assertEquals(2, results.size());

        final StringAnalyzerResult stringAnalyzerResult = result.getResults(StringAnalyzerResult.class).get(0);
        assertEquals("[MetaModelInputColumn[resources.person_names.txt.company]]",
                Arrays.toString(stringAnalyzerResult.getColumns()));

        final int rowCount = stringAnalyzerResult.getRowCount(stringAnalyzerResult.getColumns()[0]);
        assertEquals(7, rowCount);

        final int upperCaseChars =
                stringAnalyzerResult.getEntirelyUpperCaseCount(stringAnalyzerResult.getColumns()[0]);
        assertEquals(7, upperCaseChars);
    }

    @Test
    public void testFixedWidthJobScenario() throws Exception {
        final AnalysisResultFuture result = runAnalysisJob("DCTest - " + getName(),
                URI.create("src/test/resources/fixed-width-job.analysis.xml"), "fixed-width-job", false);

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }
    }

    @Test
    public void testEscalatedValueDistributionScenario() throws Exception {
        final AnalysisResultFuture result = runAnalysisJob("DCTest - " + getName(),
                URI.create("src/test/resources/escalated-job.analysis.xml"), "escalated-job", false);

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        assertEquals(3, result.getResultMap().size());
        final List<AnalyzerResult> results = result.getResults();
        assertEquals(3, results.size());

        final List<? extends ValueDistributionAnalyzerResult> valueDistributionAnalyzerResults =
                result.getResults(ValueDistributionAnalyzerResult.class);
        assertEquals(2, valueDistributionAnalyzerResults.size());

        final ValueDistributionAnalyzerResult vdAnalyzerResult = valueDistributionAnalyzerResults.get(0);
        assertEquals(7, vdAnalyzerResult.getTotalCount());
    }

    @Test
    public void testWriteDataScenarioNoResult() throws Exception {
        final AnalysisResultFuture result = runWriteDataScenario(false);

        final List<AnalyzerResult> results = result.getResults();
        assertEquals(0, results.size());
    }

    @Test
    public void testWriteDataScenarioSaveResult() throws Exception {
        final AnalysisResultFuture result = runWriteDataScenario(true);

        final List<AnalyzerResult> results = result.getResults();
        assertEquals(1, results.size());

        final WriteDataResult writeDataResult = result.getResults(WriteDataResult.class).get(0);
        assertEquals(7, writeDataResult.getWrittenRowCount());
    }

    private AnalysisResultFuture runWriteDataScenario(final boolean saveResult) throws Exception {
        final String outputPath = "target/write-job.csv";
        final File outputFile = new File(outputPath);
        if (outputFile.exists() && outputFile.isDirectory()) {
            FileUtils.deleteDirectory(outputFile);
        }

        final AnalysisResultFuture result;
        final SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("DCTest - " + getName());
        try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {
            final SparkJobContext sparkJobContext;
            if (saveResult) {
                sparkJobContext = new SparkJobContext(URI.create("src/test/resources/conf_local.xml"),
                        URI.create("src/test/resources/write-job.analysis.xml"), null, sparkContext);
            } else {
                sparkJobContext = new SparkJobContext(URI.create("src/test/resources/conf_local.xml"),
                        URI.create("src/test/resources/write-job.analysis.xml"),
                        URI.create("src/test/resources/jobProperties/noResult.properties"), sparkContext);
            }

            final AnalysisJob job = sparkJobContext.getAnalysisJob();
            assertNotNull(job);
            assertEquals("write-job", sparkJobContext.getJobName());

            final SparkAnalysisRunner sparkAnalysisRunner =
                    new SparkAnalysisRunner(sparkContext, sparkJobContext, MIN_PARTITIONS_MULTIPLE);
            result = sparkAnalysisRunner.run(job);
        }

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        assertTrue(outputFile.isDirectory());

        // file resource is capable of viewing the directory like it is a single file
        final FileResource fileResource = new FileResource(outputFile);
        final String str = fileResource.read(in -> {
            return FileHelper.readInputStreamAsString(in, "UTF8");
        });
        final String[] lines = str.replaceAll("\r", "").split("\n");
        assertEquals("\"COUNTRY\",\"CUSTOMERNUMBER\"", lines[0]);
        assertEquals("\"Denmark\",\"HI\"", lines[1]);

        // asserting 8 lines is important - 7 data lines and 1 header line
        assertEquals(8, lines.length);

        return result;
    }

    @Test
    public void testOutputDataStreamsScenario() throws Exception {
        final AnalysisResultFuture result = runAnalysisJob("DCTest - testOutputDataStreamsScenario",
                URI.create("src/test/resources/melon-job.analysis.xml"), "melon-job", false);

        final List<AnalyzerResult> results = result.getResults();
        assertEquals(3, results.size());

        final CompletenessAnalyzerResult completenessAnalyzerResult =
                result.getResults(CompletenessAnalyzerResult.class).get(0);
        assertEquals(7, completenessAnalyzerResult.getTotalRowCount());
        assertEquals(7, completenessAnalyzerResult.getValidRowCount());
        assertEquals(0, completenessAnalyzerResult.getInvalidRowCount());

        final ValueMatchAnalyzerResult incompleteValueMatcherAnalyzerResult =
                result.getResults(ValueMatchAnalyzerResult.class).get(0);
        assertEquals(0, incompleteValueMatcherAnalyzerResult.getTotalCount());
        assertEquals(Integer.valueOf(0), incompleteValueMatcherAnalyzerResult.getCount("Kasper"));

        final ValueMatchAnalyzerResult completeValueMatcherAnalyzerResult =
                result.getResults(ValueMatchAnalyzerResult.class).get(1);
        assertEquals(7, completeValueMatcherAnalyzerResult.getTotalCount());
        assertEquals(Integer.valueOf(1), completeValueMatcherAnalyzerResult.getCount("Tomasz"));
        assertEquals(Integer.valueOf(6), completeValueMatcherAnalyzerResult.getUnexpectedValueCount());
    }

    @Test
    public void testOutputDataStreamsNonDistributableScenario() throws Exception {
        final AnalysisResultFuture result;
        result = runAnalysisJob("DCTest - testOutputDataStreamsNonDistributableScenario",
                URI.create("src/test/resources/non-dist-melon-job.analysis.xml"), "non-dist-melon-job", true);

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        final List<AnalyzerResult> results = result.getResults();
        assertEquals(3, results.size());

        final CompletenessAnalyzerResult completenessAnalyzerResult =
                result.getResults(CompletenessAnalyzerResult.class).get(0);
        assertEquals(7, completenessAnalyzerResult.getTotalRowCount());
        assertEquals(7, completenessAnalyzerResult.getValidRowCount());
        assertEquals(0, completenessAnalyzerResult.getInvalidRowCount());

        final ValueMatchAnalyzerResult incompleteValueMatcherAnalyzerResult =
                result.getResults(ValueMatchAnalyzerResult.class).get(0);
        assertEquals(0, incompleteValueMatcherAnalyzerResult.getTotalCount());
        assertEquals(Integer.valueOf(0), incompleteValueMatcherAnalyzerResult.getCount("Kasper"));

        final UniqueKeyCheckAnalyzerResult uniqueKeyCheckAnalyzerResult =
                result.getResults(UniqueKeyCheckAnalyzerResult.class).get(0);
        assertEquals(7, uniqueKeyCheckAnalyzerResult.getRowCount());
        assertEquals(7, uniqueKeyCheckAnalyzerResult.getUniqueCount());
        assertEquals(0, uniqueKeyCheckAnalyzerResult.getNonUniqueCount());
        assertEquals(0, uniqueKeyCheckAnalyzerResult.getNullCount());

        // TODO: It would also be nice to have a flag indicating
        // distributable/non-distributable job that we could assert.
    }

    @Test
    public void testValueDistributionReducer() throws Exception {
        final AnalysisResultFuture result = runAnalysisJob("DCTest - testValueDistributionReducer",
                URI.create("src/test/resources/distributable-value-dist.analysis.xml"),
                "distributable-value-dist", true);

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        final List<AnalyzerResult> results = result.getResults();
        assertEquals(1, results.size());

        final ValueDistributionAnalyzerResult completeValueDistributionAnalyzerResult =
                result.getResults(ValueDistributionAnalyzerResult.class).get(0);
        assertEquals(7, completeValueDistributionAnalyzerResult.getTotalCount());
        assertEquals(Integer.valueOf(7), completeValueDistributionAnalyzerResult.getUniqueCount());
        assertEquals(Integer.valueOf(7), completeValueDistributionAnalyzerResult.getDistinctCount());
        assertEquals(0, completeValueDistributionAnalyzerResult.getNullCount());
    }

    @Test
    public void testGroupedValueDistributionReducer() throws Exception {
        final AnalysisResultFuture result = runAnalysisJob("DCTest - testGroupedValueDistributionReducer",
                URI.create("src/test/resources/distributable-grouped-value-dist.analysis.xml"),
                "distributable-grouped-value-dist", true);

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        final List<AnalyzerResult> results = result.getResults();
        assertEquals(1, results.size());

        final ValueDistributionAnalyzerResult completeValueDistributionAnalyzerResult =
                result.getResults(ValueDistributionAnalyzerResult.class).get(0);
        assertEquals(GroupedValueDistributionResult.class, completeValueDistributionAnalyzerResult.getClass());

        final GroupedValueDistributionResult completeGroupedResult =
                (GroupedValueDistributionResult) completeValueDistributionAnalyzerResult;
        final Iterator<? extends ValueCountingAnalyzerResult> iterator =
                completeGroupedResult.getGroupResults().iterator();
        final ReducedSingleValueDistributionResult group1 = (ReducedSingleValueDistributionResult) iterator.next();
        final ReducedSingleValueDistributionResult group2 = (ReducedSingleValueDistributionResult) iterator.next();

        if (group1.getName().equals("Denmark")) {
            checkGroup(group1, "Denmark", 4, 4, 4, 0);
            checkGroup(group2, "Netherlands", 3, 3, 3, 0);
        } else {
            checkGroup(group2, "Denmark", 4, 4, 4, 0);
            checkGroup(group1, "Netherlands", 3, 3, 3, 0);
        }
    }

    @Test
    public void testJsonDatastore() throws Exception {
        final String appName = "DCTest - " + getName();
        final AnalysisResultFuture result =
                runAnalysisJob(appName, URI.create("src/test/resources/json-job.analysis.xml"), "json-job", false);

        final List<AnalyzerResult> results = result.getResults();
        assertNotNull(results);
        assertEquals(1, results.size());

        final ValueDistributionAnalyzerResult valueDistributionAnalyzerResult =
                result.getResults(ValueDistributionAnalyzerResult.class).get(0);
        assertEquals("[[blue->3], [green->2], [<unique>->1]]",
                valueDistributionAnalyzerResult.getValueCounts().toString());
        assertEquals(1, valueDistributionAnalyzerResult.getUniqueCount().intValue());
        assertEquals("[brown]", valueDistributionAnalyzerResult.getUniqueValues().toString());
    }

    @Test
    public void testFixedWidthFiles() throws Exception {
        final String appName = "DCTest - " + getName();
        final AnalysisResultFuture result = runAnalysisJob(appName,
                URI.create("src/test/resources/fixed-width-job.analysis.xml"), "fixed-width-job", false);

        final List<AnalyzerResult> results = result.getResults();
        assertNotNull(results);
        assertEquals(2, results.size());

        final ValueDistributionAnalyzerResult valueDistributionAnalyzerResult =
                result.getResults(ValueDistributionAnalyzerResult.class).get(0);
        assertEquals("[[<unique>->6]]", valueDistributionAnalyzerResult.getValueCounts().toString());
        assertEquals("[Mrs. Foobar Foo, Bar, Foo, John Doe, Asbjørn Leeth, Jane Doe, Sørensen, Kasper]",
                valueDistributionAnalyzerResult.getUniqueValues().toString());

        final StringAnalyzerResult stringAnalyzerResult = result.getResults(StringAnalyzerResult.class).get(0);
        assertNotNull(stringAnalyzerResult);
    }

    @Test
    public void testLifeCycleListener() throws Exception {
        final String appName = "DCTest - " + getName();
        final URI analysisJobXmlPath = URI.create("src/test/resources/json-job.analysis.xml");
        final String expectedAnalysisJobName = "json-job";

        final TestSparkJobLifeCycleListener sparkJobLifeCycleListener = new TestSparkJobLifeCycleListener();
        runAnalysisJob(appName, analysisJobXmlPath, expectedAnalysisJobName, false, sparkJobLifeCycleListener);

        assertTrue(sparkJobLifeCycleListener._jobStartCalled.get());
        assertTrue(sparkJobLifeCycleListener._jobEndCalled.get());
    }

    // Asserts the group's name and its total, unique, distinct and null counts.
    private void checkGroup(final ReducedSingleValueDistributionResult group, final String groupName,
            final int expectedTotalCount, final int expectedUniqueCount, final int expectedDistinctCount,
            final int expectedNullCount) {
        assertEquals(groupName, group.getName());
        assertEquals(expectedTotalCount, group.getTotalCount());
        assertEquals(Integer.valueOf(expectedUniqueCount), group.getUniqueCount());
        assertEquals(Integer.valueOf(expectedDistinctCount), group.getDistinctCount());
        assertEquals(expectedNullCount, group.getNullCount());
    }

    private String getName() {
        return testName.getMethodName();
    }

    private AnalysisResultFuture runAnalysisJob(final String appName, final URI analysisJobXmlPath,
            final String expectedAnalysisJobName, final boolean useMinPartitions) throws Exception {
        return runAnalysisJob(appName, analysisJobXmlPath, expectedAnalysisJobName, useMinPartitions, null);
    }

    // Runs the given analysis job on a local Spark context and returns the result. Optionally registers a
    // life cycle listener and, when useMinPartitions is true, passes MIN_PARTITIONS_MULTIPLE to the runner.
    private AnalysisResultFuture runAnalysisJob(final String appName, final URI analysisJobXmlPath,
            final String expectedAnalysisJobName, final boolean useMinPartitions,
            final SparkJobLifeCycleListener sparkJobLifeCycleListener) throws Exception {
        final AnalysisResultFuture result;
        final SparkConf sparkConf = new SparkConf().setMaster("local").setAppName(appName);
        try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {
            final SparkJobContext sparkJobContext = new SparkJobContext(
                    URI.create("src/test/resources/conf_local.xml"), analysisJobXmlPath, null, sparkContext);
            if (sparkJobLifeCycleListener != null) {
                sparkJobContext.addSparkJobLifeCycleListener(sparkJobLifeCycleListener);
            }

            final AnalysisJob job = sparkJobContext.getAnalysisJob();
            assertNotNull(job);
            assertEquals(expectedAnalysisJobName, sparkJobContext.getJobName());

            final SparkAnalysisRunner sparkAnalysisRunner = new SparkAnalysisRunner(sparkContext, sparkJobContext,
                    useMinPartitions ? MIN_PARTITIONS_MULTIPLE : null);
            result = sparkAnalysisRunner.run(job);
        }

        if (result.isErrornous()) {
            throw (Exception) result.getErrors().get(0);
        }

        return result;
    }
}