/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.cluster;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.schema.ColumnType;
import org.apache.metamodel.schema.Schema;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.beans.CompletenessAnalyzer;
import org.datacleaner.beans.CompletenessAnalyzer.Condition;
import org.datacleaner.beans.CompletenessAnalyzerResult;
import org.datacleaner.beans.NumberAnalyzer;
import org.datacleaner.beans.NumberAnalyzerResult;
import org.datacleaner.beans.StringAnalyzer;
import org.datacleaner.beans.StringAnalyzerResult;
import org.datacleaner.beans.filter.EqualsFilter;
import org.datacleaner.beans.transform.ConcatenatorTransformer;
import org.datacleaner.beans.valuematch.ValueMatchAnalyzer;
import org.datacleaner.beans.valuematch.ValueMatchAnalyzerResult;
import org.datacleaner.beans.writers.InsertIntoTableAnalyzer;
import org.datacleaner.beans.writers.WriteBufferSizeOption;
import org.datacleaner.components.maxrows.MaxRowsFilter;
import org.datacleaner.components.maxrows.MaxRowsFilter.Category;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.configuration.DataCleanerEnvironment;
import org.datacleaner.configuration.DataCleanerEnvironmentImpl;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreCatalog;
import org.datacleaner.connection.DatastoreCatalogImpl;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.connection.JdbcDatastore;
import org.datacleaner.connection.UpdateableDatastoreConnection;
import org.datacleaner.data.MetaModelInputColumn;
import org.datacleaner.descriptors.Descriptors;
import org.datacleaner.descriptors.SimpleDescriptorProvider;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.ComponentJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.job.builder.FilterComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.concurrent.MultiThreadedTaskRunner;
import org.datacleaner.job.concurrent.SingleThreadedTaskRunner;
import org.datacleaner.job.concurrent.TaskRunner;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.JobStatus;
import org.datacleaner.test.TestHelper;
import org.junit.Assert;
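/**
 * Helper class with reusable test scenarios for exercising {@link ClusterManager}
 * implementations. Each {@code run...Job} method builds an {@link AnalysisJob},
 * executes it with a {@link DistributedAnalysisRunner} and asserts on the
 * outcome.
 */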
public class ClusterTestHelper {

    /**
     * Creates a {@link DataCleanerConfiguration} object (based on a few
     * parameters), typically to use in test methods of this class.
     *
     * @param testName
     * @param multiThreaded
     * @return
     */
    public static DataCleanerConfiguration createConfiguration(final String testName, final boolean multiThreaded) {
        final JdbcDatastore csvDatastore =
                new JdbcDatastore("csv", "jdbc:h2:mem:" + testName, "org.h2.Driver", "SA", "", true);
        final UpdateableDatastoreConnection con = csvDatastore.openConnection();
        con.getUpdateableDataContext().executeUpdate(callback -> {
            final Schema schema = callback.getDataContext().getDefaultSchema();
            if (schema.getTableByName("testtable") != null) {
                return;
            }
            callback.createTable(schema, "testtable").withColumn("id").ofType(ColumnType.INTEGER).withColumn("name")
                    .ofType(ColumnType.VARCHAR).execute();
        });
        con.close();

        final Datastore databaseDatastore = TestHelper.createSampleDatabaseDatastore("orderdb");
        final DatastoreCatalog datastoreCatalog = new DatastoreCatalogImpl(databaseDatastore, csvDatastore);

        final TaskRunner taskRunner;
        if (multiThreaded) {
            taskRunner = new MultiThreadedTaskRunner(20);
        } else {
            taskRunner = new SingleThreadedTaskRunner();
        }

        final SimpleDescriptorProvider descriptorProvider = new SimpleDescriptorProvider(true);
        descriptorProvider.addFilterBeanDescriptor(Descriptors.ofFilter(MaxRowsFilter.class));
        descriptorProvider.addTransformerBeanDescriptor(Descriptors.ofTransformer(MockTransformerThatWillFail.class));
        descriptorProvider.addTransformerBeanDescriptor(Descriptors.ofTransformer(ConcatenatorTransformer.class));
        descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(InsertIntoTableAnalyzer.class));
        descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(CompletenessAnalyzer.class));
        descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(ValueMatchAnalyzer.class));
        descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(MockAnalyzerWithBadReducer.class));

        final DataCleanerEnvironment environment =
                new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner).withDescriptorProvider(descriptorProvider);

        return new DataCleanerConfigurationImpl().withDatastoreCatalog(datastoreCatalog).withEnvironment(environment);
    }
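    // Usage sketch (hypothetical; "someClusterManager" stands in for whichever
    // ClusterManager implementation a concrete test supplies):
    //
    //   DataCleanerConfiguration configuration =
    //           ClusterTestHelper.createConfiguration("MyClusterTest", true);
    //   ClusterTestHelper.runConcatAndInsertJob(configuration, someClusterManager);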
    /**
     * Runs a job that verifies that errors (caused by the
     * {@link MockTransformerThatWillFail} dummy component) are picked up
     * correctly from the slave nodes.
     *
     * @param configuration
     * @param clusterManager
     * @return the list of errors returned, to perform further assertions
     */
    public static List<Throwable> runErrorHandlingJob(final DataCleanerConfiguration configuration,
            final ClusterManager clusterManager) {
        final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration);
        jobBuilder.setDatastore("orderdb");
        jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER");

        final TransformerComponentBuilder<MockTransformerThatWillFail> transformer =
                jobBuilder.addTransformer(MockTransformerThatWillFail.class);
        transformer.addInputColumns(jobBuilder.getSourceColumns());

        final AnalyzerComponentBuilder<CompletenessAnalyzer> analyzer =
                jobBuilder.addAnalyzer(CompletenessAnalyzer.class);
        analyzer.addInputColumns(transformer.getOutputColumns());
        analyzer.setConfiguredProperty("Conditions",
                new CompletenessAnalyzer.Condition[] { CompletenessAnalyzer.Condition.NOT_BLANK_OR_NULL });

        // build the job
        final AnalysisJob job = jobBuilder.toAnalysisJob();

        // run the job in a distributed fashion
        final DistributedAnalysisRunner runner = new DistributedAnalysisRunner(configuration, clusterManager);
        final AnalysisResultFuture resultFuture = runner.run(job);

        // the job may already have failed at this point, so both statuses are
        // acceptable here
        switch (resultFuture.getStatus()) {
        case NOT_FINISHED:
        case ERRORNOUS:
            break;
        default:
            Assert.fail("Unexpected job status: " + resultFuture.getStatus());
        }

        resultFuture.await();

        if (resultFuture.isSuccessful()) {
            Assert.fail("Job that was supposed to fail was successful! Results: " + resultFuture.getResultMap());
        }

        Assert.assertEquals(JobStatus.ERRORNOUS, resultFuture.getStatus());

        final List<Throwable> errors = resultFuture.getErrors();
        Assert.assertNotNull(errors);
        Assert.assertFalse(errors.isEmpty());

        jobBuilder.close();

        return errors;
    }
    public static void runBasicAnalyzersJob(final DataCleanerConfiguration configuration,
            final ClusterManager clusterManager) throws Throwable {
        // build a job that runs basic string and number analyzers on customer
        // data
        final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration);
        jobBuilder.setDatastore("orderdb");
        jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER");
        jobBuilder.addSourceColumns("CUSTOMERS.COUNTRY");

        final AnalyzerComponentBuilder<StringAnalyzer> stringAnalyzer = jobBuilder.addAnalyzer(StringAnalyzer.class);
        stringAnalyzer.addInputColumns(jobBuilder.getAvailableInputColumns(String.class));

        final AnalyzerComponentBuilder<NumberAnalyzer> numberAnalyzer = jobBuilder.addAnalyzer(NumberAnalyzer.class);
        numberAnalyzer.addInputColumns(jobBuilder.getAvailableInputColumns(Number.class));

        final AnalysisJob job = jobBuilder.toAnalysisJob();

        // run the job in a distributed fashion
        final DistributedAnalysisRunner runner = new DistributedAnalysisRunner(configuration, clusterManager);
        final AnalysisResultFuture resultFuture = runner.run(job);
        jobBuilder.close();

        Assert.assertTrue(resultFuture.getStatus() == JobStatus.NOT_FINISHED
                || resultFuture.getStatus() == JobStatus.SUCCESSFUL);

        resultFuture.await();

        if (resultFuture.isErrornous()) {
            final List<Throwable> errors = resultFuture.getErrors();
            throw errors.get(0);
        }

        Assert.assertEquals(JobStatus.SUCCESSFUL, resultFuture.getStatus());

        final List<AnalyzerResult> results = resultFuture.getResults();
        Assert.assertEquals(2, results.size());

        for (final AnalyzerResult analyzerResult : results) {
            Assert.assertNotNull(analyzerResult);
            if (analyzerResult instanceof StringAnalyzerResult) {
                final StringAnalyzerResult stringAnalyzerResult = (StringAnalyzerResult) analyzerResult;
                final InputColumn<String>[] columns = stringAnalyzerResult.getColumns();
                Assert.assertEquals(1, columns.length);

                final InputColumn<String> column = columns[0];
                Assert.assertEquals("COUNTRY", column.getName());

                // test reduction: various ways of aggregating crosstab metrics
                // - min, max, avg, sum
                Assert.assertEquals(214, stringAnalyzerResult.getRowCount(column));
                Assert.assertEquals(0, stringAnalyzerResult.getMinWords(column));
                Assert.assertEquals(2, stringAnalyzerResult.getMaxWords(column));
                Assert.assertEquals(5.34, stringAnalyzerResult.getAvgChars(column), 0.1d);
                Assert.assertEquals(1091, stringAnalyzerResult.getTotalCharCount(column));
            } else if (analyzerResult instanceof NumberAnalyzerResult) {
                final NumberAnalyzerResult numberAnalyzerResult = (NumberAnalyzerResult) analyzerResult;
                final InputColumn<? extends Number>[] columns = numberAnalyzerResult.getColumns();
                Assert.assertEquals(1, columns.length);

                final InputColumn<? extends Number> column = columns[0];
                Assert.assertEquals("CUSTOMERNUMBER", column.getName());

                Assert.assertEquals(214, numberAnalyzerResult.getRowCount(column));
                Assert.assertEquals(298175.0, numberAnalyzerResult.getSum(column).doubleValue(), 0.1);
                Assert.assertEquals(1393.34, numberAnalyzerResult.getMean(column).doubleValue(), 0.1);
                Assert.assertEquals(5106, numberAnalyzerResult.getHighestValue(column).doubleValue(), 0.1);
                Assert.assertEquals(103.0, numberAnalyzerResult.getLowestValue(column).doubleValue(), 0.1);
                Assert.assertEquals(1646.7, numberAnalyzerResult.getStandardDeviation(column).doubleValue(), 0.8);
                // the median cannot be computed from partial results, so it is
                // expected to be null
                Assert.assertEquals(null, numberAnalyzerResult.getMedian(column));
            } else {
                Assert.fail("Unexpected analyzer result found: " + analyzerResult);
            }
        }
    }
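    // Note on the assertions above: in a distributed run, each slave produces
    // a partial crosstab and the master reduces them - counts and sums are
    // added up, min/max metrics are compared across slaves, and averages are
    // recombined - so the expected values match those of a non-distributed run.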
    public static void runCompletenessAndValueMatcherAnalyzerJob(final DataCleanerConfiguration configuration,
            final ClusterManager clusterManager) throws Throwable {
        // build a job that runs a completeness analyzer and a value matcher on
        // customer data
        final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration);
        jobBuilder.setDatastore("orderdb");
        jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER", "CUSTOMERS.CONTACTFIRSTNAME",
                "CUSTOMERS.CONTACTLASTNAME", "CUSTOMERS.COUNTRY", "CUSTOMERS.ADDRESSLINE2");

        final List<MetaModelInputColumn> cols = jobBuilder.getSourceColumns();

        final AnalyzerComponentBuilder<CompletenessAnalyzer> completeness =
                jobBuilder.addAnalyzer(CompletenessAnalyzer.class);
        completeness.addInputColumns(cols);
        final Condition[] conditions = new CompletenessAnalyzer.Condition[cols.size()];
        for (int i = 0; i < conditions.length; i++) {
            conditions[i] = Condition.NOT_BLANK_OR_NULL;
        }
        completeness.setConfiguredProperty("Conditions", conditions);

        final AnalyzerComponentBuilder<ValueMatchAnalyzer> valueMatch =
                jobBuilder.addAnalyzer(ValueMatchAnalyzer.class);
        valueMatch.addInputColumn(jobBuilder.getSourceColumnByName("COUNTRY"));
        valueMatch.setConfiguredProperty("Expected values",
                new String[] { "United States", "USA", "Denmark", "Danmark", "Netherlands" });

        final AnalysisJob job = jobBuilder.toAnalysisJob();

        // run the job in a distributed fashion
        final DistributedAnalysisRunner runner = new DistributedAnalysisRunner(configuration, clusterManager);
        final AnalysisResultFuture resultFuture = runner.run(job);
        jobBuilder.close();

        if (resultFuture.getStatus() == JobStatus.NOT_FINISHED) {
            resultFuture.await();
            if (resultFuture.isErrornous()) {
                final List<Throwable> errors = resultFuture.getErrors();
                throw errors.get(0);
            }
        }

        Assert.assertEquals(JobStatus.SUCCESSFUL, resultFuture.getStatus());

        final List<AnalyzerResult> results = resultFuture.getResults();
        Assert.assertEquals(2, results.size());

        for (final AnalyzerResult analyzerResult : results) {
            Assert.assertNotNull(analyzerResult);
            if (analyzerResult instanceof CompletenessAnalyzerResult) {
                // check completeness analyzer result
                final CompletenessAnalyzerResult completenessAnalyzerResult =
                        (CompletenessAnalyzerResult) analyzerResult;
                Assert.assertEquals(193, completenessAnalyzerResult.getInvalidRowCount());
                final List<InputRow> rows = completenessAnalyzerResult.getSampleRows();
                Assert.assertNotNull(rows);
                Assert.assertTrue("No annotated rows available in CompletenessAnalyzer's result", rows.size() > 0);
            } else if (analyzerResult instanceof ValueMatchAnalyzerResult) {
                final ValueMatchAnalyzerResult valueMatchAnalyzerResult = (ValueMatchAnalyzerResult) analyzerResult;
                Assert.assertEquals(10, valueMatchAnalyzerResult.getNullCount());
                Assert.assertEquals(150, valueMatchAnalyzerResult.getUnexpectedValueCount().intValue());

                List<InputRow> rows =
                        valueMatchAnalyzerResult.getAnnotatedRowsForUnexpectedValues().getSampleRows();
                Assert.assertTrue(rows.size() > 0);
                Assert.assertTrue(rows.size() <= 150);

                Assert.assertEquals(8, valueMatchAnalyzerResult.getCount("Denmark").intValue());
                rows = new ArrayList<>(valueMatchAnalyzerResult.getAnnotatedRowsForValue("Denmark").getSampleRows());
                Assert.assertEquals(8, rows.size());
                Collections.sort(rows, (o1, o2) -> (int) (o1.getId() - o2.getId()));
                Assert.assertEquals("MetaModelInputRow[Row[values=[145, Jytte, Petersen, Denmark, null]]]",
                        rows.get(0).toString());
                Assert.assertEquals("MetaModelInputRow[Row[values=[287, Jytte, Pedersen, Denmark, 1734 Kbh]]]",
                        rows.get(2).toString());
            } else {
                Assert.fail("Unexpected analyzer result found: " + analyzerResult);
            }
        }
    }

    public static void runExistingMaxRowsJob(final DataCleanerConfiguration configuration,
            final ClusterManager clusterManager) throws Throwable {
        final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration);
        jobBuilder.setDatastore("orderdb");
        jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER", "CUSTOMERS.CONTACTFIRSTNAME",
                "CUSTOMERS.CONTACTLASTNAME");

        final InputColumn<?> col1 = jobBuilder.getSourceColumnByName("CONTACTFIRSTNAME");
        final InputColumn<?> col2 = jobBuilder.getSourceColumnByName("CONTACTLASTNAME");

        final FilterComponentBuilder<MaxRowsFilter, Category> filter = jobBuilder.addFilter(MaxRowsFilter.class);
        filter.getComponentInstance().setFirstRow(5);
        filter.getComponentInstance().setMaxRows(20);

        final AnalyzerComponentBuilder<StringAnalyzer> analyzer = jobBuilder.addAnalyzer(StringAnalyzer.class);
        analyzer.addInputColumn(col1);
        analyzer.addInputColumn(col2);
        analyzer.setRequirement(filter, MaxRowsFilter.Category.VALID);

        final AnalysisJob job = jobBuilder.toAnalysisJob();
        jobBuilder.close();

        final DistributedAnalysisRunner runner = new DistributedAnalysisRunner(configuration, clusterManager);
        try {
            runner.run(job);
            Assert.fail("Exception expected");
        } catch (final Exception e) {
            Assert.assertEquals("Job is not distributable!", e.getMessage());
        }
    }
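    // runExistingMaxRowsJob expects the runner to reject the job up front:
    // a MaxRowsFilter constrains absolute record positions, which cannot be
    // preserved once the record stream is partitioned across slave nodes,
    // hence the "Job is not distributable!" exception.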
    /**
     * Runs a simple job that is fully distributable and should be able to
     * execute in all contexts. The job does one transformation (concatenates
     * two fields) and inserts this field, together with a source field, into
     * another table.
     *
     * @param configuration
     * @param clusterManager
     * @throws Throwable
     */
    public static void runConcatAndInsertJob(final DataCleanerConfiguration configuration,
            final ClusterManager clusterManager) throws Throwable {
        // build a job that concatenates names and inserts the concatenated
        // names into a file
        final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration);
        jobBuilder.setDatastore("orderdb");
        jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER", "CUSTOMERS.CONTACTFIRSTNAME",
                "CUSTOMERS.CONTACTLASTNAME");

        // concatenate firstname + lastname
        final TransformerComponentBuilder<ConcatenatorTransformer> concatenator =
                jobBuilder.addTransformer(ConcatenatorTransformer.class);
        concatenator.addInputColumn(jobBuilder.getSourceColumnByName("CONTACTFIRSTNAME"));
        concatenator.addInputColumn(jobBuilder.getSourceColumnByName("CONTACTLASTNAME"));
        concatenator.setConfiguredProperty("Separator", " ");

        // insert into CSV file
        final Datastore csvDatastore = configuration.getDatastoreCatalog().getDatastore("csv");
        final Datastore dbDatastore = configuration.getDatastoreCatalog().getDatastore("orderdb");
        final DatastoreConnection csvCon = csvDatastore.openConnection();
        final DatastoreConnection dbCon = dbDatastore.openConnection();
        try {
            final Schema schema = csvCon.getDataContext().getDefaultSchema();
            final String schemaName = schema.getName();
            final String tableName = schema.getTable(0).getName();

            final AnalyzerComponentBuilder<InsertIntoTableAnalyzer> insert =
                    jobBuilder.addAnalyzer(InsertIntoTableAnalyzer.class);
            insert.setConfiguredProperty("Datastore", csvDatastore);
            insert.addInputColumn(jobBuilder.getSourceColumnByName("CUSTOMERNUMBER"));
            insert.addInputColumn(concatenator.getOutputColumns().get(0));
            insert.setConfiguredProperty("Schema name", schemaName);
            insert.setConfiguredProperty("Table name", tableName);
            insert.setConfiguredProperty("Column names", new String[] { "id", "name" });
            insert.setConfiguredProperty("Buffer size", WriteBufferSizeOption.TINY);

            // build the job
            final AnalysisJob job = jobBuilder.toAnalysisJob();

            // run the job in a distributed fashion
            final DistributedAnalysisRunner runner = new DistributedAnalysisRunner(configuration, clusterManager);
            final AnalysisResultFuture resultFuture = runner.run(job);

            if (resultFuture.getStatus() == JobStatus.NOT_FINISHED) {
                resultFuture.await();
                if (resultFuture.isErrornous()) {
                    final List<Throwable> errors = resultFuture.getErrors();
                    throw errors.get(0);
                }
            }

            Assert.assertEquals(JobStatus.SUCCESSFUL, resultFuture.getStatus());

            // check that the target table has the same amount of records as
            // the CUSTOMERS table of orderdb
            try (DataSet ds1 = dbCon.getDataContext().query().from("CUSTOMERS").selectCount().execute();
                    DataSet ds2 = csvCon.getDataContext().query().from(tableName).selectCount().execute()) {
                Assert.assertTrue(ds1.next());
                Assert.assertTrue(ds2.next());
                Assert.assertEquals(ds1.getRow().toString(), ds2.getRow().toString());
            }

            // await multiple times to ensure that second time isn't distorting
            // the result
            resultFuture.await();
            resultFuture.await();

            // check that the analysis result elements are there...
            final Map<ComponentJob, AnalyzerResult> resultMap = resultFuture.getResultMap();
            Assert.assertEquals(1, resultMap.size());
            Assert.assertEquals("{ImmutableAnalyzerJob[name=null,analyzer=Insert into table]=214 inserts executed}",
                    resultMap.toString());
        } finally {
            dbCon.close();
            csvCon.close();
            jobBuilder.close();
        }
    }

    public static void runNoExpectedRecordsJob(final DataCleanerConfiguration configuration) throws Throwable {
        final AnalysisJob job;
        {
            try (AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration)) {
                // build a job with a filter that is expected to match no
                // records at all
                jobBuilder.setDatastore("orderdb");
                jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER", "CUSTOMERS.CONTACTFIRSTNAME",
                        "CUSTOMERS.CONTACTLASTNAME");

                final FilterComponentBuilder<EqualsFilter, EqualsFilter.Category> equalsFilter =
                        jobBuilder.addFilter(EqualsFilter.class);
                equalsFilter.addInputColumn(jobBuilder.getSourceColumnByName("CUSTOMERNUMBER"));
                equalsFilter.getComponentInstance().setValues(new String[] { "-1000000" });

                final AnalyzerComponentBuilder<StringAnalyzer> stringAnalyzer =
                        jobBuilder.addAnalyzer(StringAnalyzer.class);
                stringAnalyzer.addInputColumns(jobBuilder.getAvailableInputColumns(String.class));
                stringAnalyzer.setRequirement(equalsFilter, EqualsFilter.Category.EQUALS);

                job = jobBuilder.toAnalysisJob();
            }
        }

        final DistributedAnalysisRunner analysisRunner =
                new DistributedAnalysisRunner(configuration, new ClusterManager() {
                    @Override
                    public JobDivisionManager getJobDivisionManager() {
                        throw new IllegalStateException(
                                "Since this job should yield 0 expected records, this method should not be invoked");
                    }

                    @Override
                    public AnalysisResultFuture dispatchJob(final AnalysisJob job,
                            final DistributedJobContext context) throws Exception {
                        throw new IllegalStateException(
                                "Since this job should yield 0 expected records, this method should not be invoked");
                    }
                });

        final AnalysisResultFuture resultFuture = analysisRunner.run(job);
        resultFuture.await();

        if (resultFuture.isErrornous()) {
            throw resultFuture.getErrors().get(0);
        }

        final List<AnalyzerResult> results = resultFuture.getResults();
        Assert.assertEquals(1, results.size());

        final AnalyzerResult analyzerResult = results.get(0);
        Assert.assertTrue(analyzerResult instanceof StringAnalyzerResult);
    }
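    // The anonymous ClusterManager used above is intentionally unusable: since
    // the EqualsFilter reduces the number of expected records to zero, the
    // DistributedAnalysisRunner should never dispatch anything to slave nodes,
    // and both callback methods fail loudly if that assumption is violated.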
    public static void runCancelJobJob(final DataCleanerConfiguration configuration,
            final ClusterManager clusterManager) throws Throwable {
        // build a job that concatenates names and inserts the concatenated
        // names into a file
        final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration);
        jobBuilder.setDatastore("orderdb");
        jobBuilder.addSourceColumns("CUSTOMERS.CUSTOMERNUMBER", "CUSTOMERS.CONTACTFIRSTNAME",
                "CUSTOMERS.CONTACTLASTNAME");

        // concatenate firstname + lastname
        final TransformerComponentBuilder<ConcatenatorTransformer> concatenator =
                jobBuilder.addTransformer(ConcatenatorTransformer.class);
        concatenator.addInputColumn(jobBuilder.getSourceColumnByName("CONTACTFIRSTNAME"));
        concatenator.addInputColumn(jobBuilder.getSourceColumnByName("CONTACTLASTNAME"));
        concatenator.setConfiguredProperty("Separator", " ");

        // insert into CSV file
        final Datastore csvDatastore = configuration.getDatastoreCatalog().getDatastore("csv");
        final Datastore dbDatastore = configuration.getDatastoreCatalog().getDatastore("orderdb");
        final DatastoreConnection csvCon = csvDatastore.openConnection();
        final DatastoreConnection dbCon = dbDatastore.openConnection();
        try {
            final Schema schema = csvCon.getDataContext().getDefaultSchema();
            final String schemaName = schema.getName();
            final String tableName = schema.getTable(0).getName();

            final AnalyzerComponentBuilder<InsertIntoTableAnalyzer> insert =
                    jobBuilder.addAnalyzer(InsertIntoTableAnalyzer.class);
            insert.setConfiguredProperty("Datastore", csvDatastore);
            insert.addInputColumn(jobBuilder.getSourceColumnByName("CUSTOMERNUMBER"));
            insert.addInputColumn(concatenator.getOutputColumns().get(0));
            insert.setConfiguredProperty("Schema name", schemaName);
            insert.setConfiguredProperty("Table name", tableName);
            insert.setConfiguredProperty("Column names", new String[] { "id", "name" });
            insert.setConfiguredProperty("Buffer size", WriteBufferSizeOption.TINY);

            // build the job
            final AnalysisJob job = jobBuilder.toAnalysisJob();

            // run the job in a distributed fashion, then cancel it immediately
            final DistributedAnalysisRunner runner = new DistributedAnalysisRunner(configuration, clusterManager);
            final AnalysisResultFuture resultFuture = runner.run(job);

            resultFuture.cancel();

            Assert.assertTrue(resultFuture.isCancelled());
        } finally {
            dbCon.close();
            csvCon.close();
            jobBuilder.close();
        }
    }
}