/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.job; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.stream.Collectors; import org.apache.metamodel.util.FileResource; import org.apache.metamodel.util.ToStringComparator; import org.datacleaner.api.AnalyzerResult; import org.datacleaner.api.InputColumn; import org.datacleaner.api.OutputDataStream; import org.datacleaner.beans.CompletenessAnalyzerResult; import org.datacleaner.beans.StringAnalyzerResult; import org.datacleaner.beans.dategap.DateGapAnalyzerResult; import org.datacleaner.beans.dategap.DateGapTextRenderer; import org.datacleaner.beans.transform.DateMaskMatcherTransformer; import org.datacleaner.beans.valuedist.ValueDistributionAnalyzerResult; import org.datacleaner.components.convert.ConvertToDateTransformer; import org.datacleaner.components.fuse.CoalesceUnit; import org.datacleaner.configuration.DataCleanerConfiguration; import org.datacleaner.configuration.DataCleanerConfigurationImpl; import org.datacleaner.configuration.DataCleanerEnvironment; import org.datacleaner.configuration.DataCleanerEnvironmentImpl; import org.datacleaner.configuration.SourceColumnMapping; import org.datacleaner.connection.CsvDatastore; import org.datacleaner.connection.Datastore; import org.datacleaner.connection.DatastoreCatalog; import org.datacleaner.connection.DatastoreCatalogImpl; import org.datacleaner.connection.DatastoreConnection; import org.datacleaner.connection.ExcelDatastore; import org.datacleaner.connection.SchemaNavigator; import org.datacleaner.data.MetaModelInputColumn; import org.datacleaner.descriptors.ClasspathScanDescriptorProvider; import org.datacleaner.descriptors.ComponentDescriptor; import org.datacleaner.descriptors.ConfiguredPropertyDescriptor; import org.datacleaner.descriptors.DescriptorProvider; import org.datacleaner.descriptors.Descriptors; import org.datacleaner.descriptors.SimpleDescriptorProvider; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.job.builder.AnalyzerComponentBuilder; import org.datacleaner.job.builder.ComponentBuilder; import org.datacleaner.job.builder.TransformerComponentBuilder; import org.datacleaner.job.concurrent.MultiThreadedTaskRunner; import org.datacleaner.job.runner.AnalysisResultFuture; import org.datacleaner.job.runner.AnalysisRunner; import org.datacleaner.job.runner.AnalysisRunnerImpl; import org.datacleaner.reference.ReferenceDataCatalog; import org.datacleaner.reference.ReferenceDataCatalogImpl; import org.datacleaner.reference.SimpleSynonymCatalog; import org.datacleaner.reference.SynonymCatalog; import org.datacleaner.result.CrosstabResult; import org.datacleaner.result.renderer.CrosstabTextRenderer; import org.datacleaner.test.MockAnalyzer; import org.datacleaner.test.TestHelper; import junit.framework.TestCase; public class JaxbJobReaderTest extends TestCase { private final DescriptorProvider descriptorProvider = new ClasspathScanDescriptorProvider().scanPackage("org.datacleaner", true); private final DatastoreCatalog datastoreCatalog = new DatastoreCatalogImpl(TestHelper.createSampleDatabaseDatastore("my database")); private final DataCleanerConfigurationImpl conf = new DataCleanerConfigurationImpl().withDatastoreCatalog(datastoreCatalog) .withEnvironment(new DataCleanerEnvironmentImpl().withDescriptorProvider(descriptorProvider)); // see #1196 - Synonym lookup changes has broken old jobs public void testReadJobWhereOutputColumnsHasBeenAddedToComponent() throws Exception { final SynonymCatalog synonymCatalog = new SimpleSynonymCatalog("Job titles"); final Collection<SynonymCatalog> synonyms = Collections.singletonList(synonymCatalog); final ReferenceDataCatalog referenceDataCatalog = new ReferenceDataCatalogImpl(Collections.emptyList(), synonyms, Collections.emptyList()); final DataCleanerConfigurationImpl conf = this.conf.withReferenceDataCatalog(referenceDataCatalog); final JobReader<InputStream> reader = new JaxbJobReader(conf); final AnalysisJob job = reader.read( new FileInputStream(new File("src/test/resources/example-job-job-title-analytics.analysis.xml"))); assertNotNull(job); } public void testReadComponentNames() throws Exception { final JobReader<InputStream> reader = new JaxbJobReader(conf); final AnalysisJob job = reader.read(new FileInputStream(new File("src/test/resources/example-job-component-names.xml"))); assertEquals(1, job.getAnalyzerJobs().size()); assertEquals("analyzer_1", job.getAnalyzerJobs().iterator().next().getName()); assertEquals(2, job.getFilterJobs().size()); assertEquals("single_word_1", job.getFilterJobs().iterator().next().getName()); assertEquals(1, job.getTransformerJobs().size()); assertEquals("email_std_1", job.getTransformerJobs().iterator().next().getName()); } public void testReadOnlyColumnNamePaths() throws Exception { final JobReader<InputStream> reader = new JaxbJobReader(conf); final FileInputStream source = new FileInputStream(new File("src/test/resources/example-job-only-columns-names-paths.analysis.xml")); final AnalysisJobMetadata metadata = reader.readMetadata(source); assertNotNull(metadata); assertEquals("UKContactData.csv", metadata.getDatastoreName()); assertEquals("[RecordId, Company, FirstName, LastName, AddressLine1, AddressLine2, AddressLine3, " + "AddressLine4, City, State, Country, Postcode]", metadata.getSourceColumnPaths().toString()); } public void testReadMetadataFull() throws Exception { final JobReader<InputStream> reader = new JaxbJobReader(conf); final AnalysisJobMetadata metadata = reader.readMetadata(new FileInputStream(new File("src/test/resources/example-job-metadata.xml"))); assertEquals("Kasper Sørensen", metadata.getAuthor()); assertEquals("my database", metadata.getDatastoreName()); assertEquals("Job metadata", metadata.getJobName()); assertEquals("An example job with complete metadata", metadata.getJobDescription()); assertEquals("1.1", metadata.getJobVersion()); assertEquals("[PUBLIC.PERSONS.FIRSTNAME, PUBLIC.PERSONS.LASTNAME]", metadata.getSourceColumnPaths().toString()); assertEquals("propertyValue", metadata.getProperties().get("propertyName")); assertNotNull(metadata.getCreatedDate()); assertNotNull(metadata.getUpdatedDate()); } public void testReadMetadataNone() throws Exception { final JobReader<InputStream> reader = new JaxbJobReader(new DataCleanerConfigurationImpl()); final AnalysisJobMetadata metadata = reader.readMetadata(new FileInputStream(new File("src/test/resources/example-job-valid.xml"))); assertNull(metadata.getAuthor()); assertNull(metadata.getJobName()); assertNull(metadata.getJobDescription()); assertNull(metadata.getJobVersion()); assertTrue(metadata.getProperties().isEmpty()); assertEquals("my database", metadata.getDatastoreName()); assertEquals("[PUBLIC.EMPLOYEES.FIRSTNAME, PUBLIC.EMPLOYEES.LASTNAME, PUBLIC.EMPLOYEES.EMAIL]", metadata.getSourceColumnPaths().toString()); assertNull(metadata.getCreatedDate()); assertNull(metadata.getUpdatedDate()); } public void testSimpleFilter() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-simple-filter.xml")); assertEquals(1, jobBuilder.getFilterComponentBuilders().size()); assertEquals(3, jobBuilder.getAnalyzerComponentBuilders().size()); final AnalysisJob analysisJob = jobBuilder.toAnalysisJob(); final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(conf).run(analysisJob); final List<AnalyzerResult> results = resultFuture.getResults(); assertEquals(3, results.size()); // sort it to make sure test is deterministic Collections.sort(results, ToStringComparator.getComparator()); // the first result is for the unfiltered String analyzer final CrosstabResult res3 = (CrosstabResult) results.get(0); assertEquals(1, res3.getCrosstab().where("Column", "FIRSTNAME").where("Measures", "Min words").get()); assertEquals(2, res3.getCrosstab().where("Column", "FIRSTNAME").where("Measures", "Max words").get()); // this result represents the single manager (one unique and no repeated // values) final ValueDistributionAnalyzerResult res1 = (ValueDistributionAnalyzerResult) results.get(1); assertEquals("[[<unique>->1]]", res1.getValueCounts().toString()); assertEquals(1, res1.getUniqueCount().intValue()); // this result represents all the employees: Two repeated values and 18 // unique final ValueDistributionAnalyzerResult res2 = (ValueDistributionAnalyzerResult) results.get(2); assertEquals(18, res2.getUniqueCount().intValue()); assertEquals("[[<unique>->18], [Gerard->2], [Leslie->2]]", res2.getValueCounts().toString()); } public void testNamedInputs() throws Exception { final JaxbJobReader factory = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = factory.create(new File("src/test/resources/example-job-named-inputs.xml")); assertEquals(true, jobBuilder.isConfigured()); assertEquals(2, jobBuilder.getTransformerComponentBuilders().size()); final List<AnalyzerComponentBuilder<?>> analyzerJobBuilders = jobBuilder.getAnalyzerComponentBuilders(); assertEquals(1, analyzerJobBuilders.size()); final AnalyzerComponentBuilder<?> analyzerJobBuilder = analyzerJobBuilders.get(0); final AnalyzerJob analyzerJob = analyzerJobBuilder.toAnalyzerJob(); final ComponentConfiguration configuration = analyzerJob.getConfiguration(); final InputColumn<?> col1 = (InputColumn<?>) configuration .getProperty(analyzerJob.getDescriptor().getConfiguredProperty("From column")); assertEquals("date 1", col1.getName()); final InputColumn<?> col2 = (InputColumn<?>) configuration .getProperty(analyzerJob.getDescriptor().getConfiguredProperty("To column")); assertEquals("date 2", col2.getName()); final AnalysisJob analysisJob = jobBuilder.toAnalysisJob(); final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(conf).run(analysisJob); final List<AnalyzerResult> results = resultFuture.getResults(); assertEquals(1, results.size()); final DateGapAnalyzerResult result = (DateGapAnalyzerResult) results.get(0); final String[] resultLines = new DateGapTextRenderer().render(result).split("\n"); assertEquals(58, resultLines.length); assertEquals(" - time gap: 2003-01-18 to 2003-01-29", resultLines[0]); assertEquals(" - time gap: 2003-02-09 to 2003-02-11", resultLines[1]); assertEquals(" - time gap: 2003-05-16 to 2003-05-20", resultLines[2]); assertEquals(" - time gap: 2003-07-23 to 2003-07-24", resultLines[3]); assertEquals(" - time gap: 2003-08-21 to 2003-08-25", resultLines[4]); assertEquals(" - time gap: 2003-09-02 to 2003-09-03", resultLines[5]); assertEquals(" - time gap: 2003-11-03 to 2003-11-04", resultLines[6]); assertEquals(" - time gap: 2003-12-17 to 2004-01-02", resultLines[7]); assertEquals(" - time gap: 2004-05-24 to 2004-05-26", resultLines[8]); assertEquals(" - time gap: 2004-09-22 to 2004-09-27", resultLines[9]); assertEquals(" - time gap: 2004-12-24 to 2005-01-05", resultLines[10]); assertEquals(" - time gap: 2005-05-28 to 2005-05-29", resultLines[11]); assertEquals(" - time overlap: 2003-01-09 to 2003-01-18", resultLines[12]); assertEquals(" - time overlap: 2003-01-31 to 2003-02-07", resultLines[13]); assertEquals(" - time overlap: 2005-05-29 to 2005-06-08", resultLines[57]); } public void testInvalidRead() throws Exception { final JaxbJobReader factory = new JaxbJobReader(new DataCleanerConfigurationImpl()); try { factory.create(new File("src/test/resources/example-job-invalid.xml")); fail("Exception expected"); } catch (final IllegalArgumentException e) { final String message = e.getMessage(); assertTrue(message, message.startsWith("javax.xml.bind.UnmarshalException")); assertTrue(message, message.toLowerCase().contains("uri:\"http://eobjects.org/analyzerbeans/job/1.0\"")); assertTrue(message, message.contains("\"datacontext\"")); } } public void testMissingDatastore() throws Exception { final JaxbJobReader factory = new JaxbJobReader(new DataCleanerConfigurationImpl()); try { factory.create(new File("src/test/resources/example-job-valid.xml")); fail("Exception expected"); } catch (final NoSuchDatastoreException e) { assertEquals("No such datastore: my database", e.getMessage()); } } public void testMissingTransformerDescriptor() throws Exception { final JaxbJobReader factory = new JaxbJobReader(conf); try { factory.create(new File("src/test/resources/example-job-missing-descriptor.xml")); fail("Exception expected"); } catch (final NoSuchComponentException e) { assertEquals("No such TransformerType descriptor: tokenizerDescriptor", e.getMessage()); } } public void testValidJob() throws Exception { final JaxbJobReader factory = new JaxbJobReader(conf); final AnalysisJobBuilder builder = factory.create(new File("src/test/resources/example-job-valid.xml")); assertTrue(builder.isConfigured()); final List<MetaModelInputColumn> sourceColumns = builder.getSourceColumns(); assertEquals(3, sourceColumns.size()); assertEquals("MetaModelInputColumn[PUBLIC.EMPLOYEES.FIRSTNAME]", sourceColumns.get(0).toString()); assertEquals("MetaModelInputColumn[PUBLIC.EMPLOYEES.LASTNAME]", sourceColumns.get(1).toString()); assertEquals("MetaModelInputColumn[PUBLIC.EMPLOYEES.EMAIL]", sourceColumns.get(2).toString()); assertEquals(1, builder.getTransformerComponentBuilders().size()); assertEquals("[TransformedInputColumn[id=trans-0001-0002,name=username], " + "TransformedInputColumn[id=trans-0001-0003,name=domain]]", builder.getTransformerComponentBuilders().get(0).getOutputColumns().toString()); assertEquals("[TransformedInputColumn[id=trans-0001-0002,name=username], " + "TransformedInputColumn[id=trans-0001-0003,name=domain], " + "MetaModelInputColumn[PUBLIC.EMPLOYEES.FIRSTNAME], " + "MetaModelInputColumn[PUBLIC.EMPLOYEES.LASTNAME]]", Arrays.toString(builder.getAnalyzerComponentBuilders().get(0).toAnalyzerJob().getInput())); final List<AnalyzerResult> results = new AnalysisRunnerImpl(conf).run(builder.toAnalysisJob()).getResults(); assertEquals(1, results.size()); final CrosstabResult crosstabResult = (CrosstabResult) results.get(0); final String[] resultLines = crosstabResult.toString(-1).split("\n"); assertEquals(85, resultLines.length); assertEquals("Crosstab:", resultLines[0]); assertEquals("FIRSTNAME,Avg chars: 5.391304347826087", resultLines[1]); assertEquals("FIRSTNAME,Avg white spaces: 0.043478260869565216", resultLines[2]); assertEquals("FIRSTNAME,Blank count: 0", resultLines[3]); assertEquals("FIRSTNAME,Diacritic chars: 0", resultLines[4]); assertEquals("FIRSTNAME,Digit chars: 0", resultLines[5]); } public void testUsingSourceAlternateDatastore() throws Throwable { final Datastore datastore = TestHelper.createSampleDatabaseDatastore("another datastore name"); final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder analysisJobBuilder = reader.create(new FileInputStream(new File("src/test/resources/example-job-valid.xml")), null, datastore); final AnalysisJob analysisJob = analysisJobBuilder.toAnalysisJob(); assertEquals("another datastore name", analysisJob.getDatastore().getName()); } public void testUsingSourceColumnMapping() throws Throwable { final Datastore datastore = TestHelper.createSampleDatabaseDatastore("another datastore name"); final JobReader<InputStream> reader = new JaxbJobReader(conf); final AnalysisJobMetadata metadata = reader.readMetadata(new FileInputStream(new File("src/test/resources/example-job-valid.xml"))); final SourceColumnMapping sourceColumnMapping = new SourceColumnMapping(metadata.getSourceColumnPaths()); assertFalse(sourceColumnMapping.isSatisfied()); assertEquals("[PUBLIC.EMPLOYEES.EMAIL, PUBLIC.EMPLOYEES.FIRSTNAME, PUBLIC.EMPLOYEES.LASTNAME]", sourceColumnMapping.getPaths().toString()); sourceColumnMapping.setDatastore(datastore); final DatastoreConnection con = datastore.openConnection(); final SchemaNavigator sn = con.getSchemaNavigator(); sourceColumnMapping.setColumn("PUBLIC.EMPLOYEES.EMAIL", sn.convertToColumn("PUBLIC.CUSTOMERS.PHONE")); sourceColumnMapping .setColumn("PUBLIC.EMPLOYEES.FIRSTNAME", sn.convertToColumn("PUBLIC.CUSTOMERS.CONTACTFIRSTNAME")); sourceColumnMapping .setColumn("PUBLIC.EMPLOYEES.LASTNAME", sn.convertToColumn("PUBLIC.CUSTOMERS.CONTACTLASTNAME")); assertEquals("[]", sourceColumnMapping.getUnmappedPaths().toString()); assertTrue(sourceColumnMapping.isSatisfied()); final AnalysisJob job = reader.read(new FileInputStream(new File("src/test/resources/example-job-valid.xml")), sourceColumnMapping); assertEquals("another datastore name", job.getDatastore().getName()); assertEquals("[MetaModelInputColumn[PUBLIC.CUSTOMERS.CONTACTFIRSTNAME], " + "MetaModelInputColumn[PUBLIC.CUSTOMERS.CONTACTLASTNAME], " + "MetaModelInputColumn[PUBLIC.CUSTOMERS.PHONE]]", job.getSourceColumns().toString()); final AnalysisRunner runner = new AnalysisRunnerImpl(conf); final AnalysisResultFuture resultFuture = runner.run(job); if (!resultFuture.isSuccessful()) { throw resultFuture.getErrors().get(0); } final AnalyzerResult res = resultFuture.getResults().get(0); assertTrue(res instanceof StringAnalyzerResult); final String[] resultLines = new CrosstabTextRenderer().render((CrosstabResult) res).split("\n"); assertEquals( " username domain CONTACTFIRSTNAME CONTACTLASTNAME ", resultLines[0]); assertEquals( "Row count 214 214 214 214 ", resultLines[1]); assertEquals( "Null count 214 214 1 0 ", resultLines[2]); } public void testReadVariables() throws Exception { final CsvDatastore datastore = new CsvDatastore("date-datastore", "src/test/resources/example-dates.csv"); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withDatastores(datastore) .withEnvironment(new DataCleanerEnvironmentImpl().withDescriptorProvider(descriptorProvider)); final JaxbJobReader reader = new JaxbJobReader(configuration); final File file = new File("src/test/resources/example-job-variables-ods.analysis.xml"); assertTrue(file.exists()); final AnalysisJobBuilder ajb = reader.create(file); final AnalysisJobBuilder odsjb = ajb.getAnalyzerComponentBuilders().get(0).getOutputDataStreamJobBuilder("Complete rows"); final List<TransformerComponentBuilder<?>> tjbs = odsjb.getTransformerComponentBuilders(); final DateMaskMatcherTransformer dateMaskMatcherTransformer1 = (DateMaskMatcherTransformer) tjbs.get(0).getComponentInstance(); assertEquals("[yyyy-MM-dd]", Arrays.toString(dateMaskMatcherTransformer1.getDateMasks())); final DateMaskMatcherTransformer dateMaskMatcherTransformer2 = (DateMaskMatcherTransformer) tjbs.get(1).getComponentInstance(); assertEquals("[yy-dd-MM]", Arrays.toString(dateMaskMatcherTransformer2.getDateMasks())); final ConvertToDateTransformer convertToDateTransformer = (ConvertToDateTransformer) tjbs.get(2).getComponentInstance(); assertEquals("[yyyy-MM-dd]", Arrays.toString(convertToDateTransformer.getDateMasks())); assertEquals("2000-01-01", new SimpleDateFormat("yyyy-MM-dd").format(convertToDateTransformer.getNullReplacement())); } public void testReadChainOfFilters() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-chain-of-filters.xml")); assertNotNull(jobBuilder); assertEquals(3, jobBuilder.getFilterComponentBuilders().size()); assertEquals(1, jobBuilder.getAnalyzerComponentBuilders().size()); assertEquals(0, jobBuilder.getTransformerComponentBuilders().size()); } public void testReadAndExecuteOutputDataStreams() throws Throwable { final JobReader<InputStream> reader = new JaxbJobReader(conf); final AnalysisJob job = reader.read( new FileInputStream(new File("src/test/resources/example-job-output-dataset.analysis.xml"))); assertEquals(1, job.getAnalyzerJobs().size()); final AnalyzerJob analyzerJob = job.getAnalyzerJobs().get(0); assertEquals("Completeness analyzer", analyzerJob.getDescriptor().getDisplayName()); assertEquals(2, analyzerJob.getOutputDataStreamJobs().length); final OutputDataStreamJob completeOutputDataStreamJob = analyzerJob.getOutputDataStreamJobs()[0]; assertEquals("Complete rows", completeOutputDataStreamJob.getOutputDataStream().getName()); assertEquals(2, completeOutputDataStreamJob.getJob().getAnalyzerJobs().size()); final AnalyzerJob completeStringAnalyzer = completeOutputDataStreamJob.getJob().getAnalyzerJobs().get(0); assertEquals("String analyzer", completeStringAnalyzer.getDescriptor().getDisplayName()); assertEquals(1, completeStringAnalyzer.getInput().length); assertEquals("Concat of FIRSTNAME,LASTNAME", completeStringAnalyzer.getInput()[0].getName()); final AnalyzerJob completeNumberAnalyzer = completeOutputDataStreamJob.getJob().getAnalyzerJobs().get(1); assertEquals("Number analyzer", completeNumberAnalyzer.getDescriptor().getDisplayName()); assertEquals(1, completeNumberAnalyzer.getInput().length); assertEquals("REPORTSTO", completeNumberAnalyzer.getInput()[0].getName()); assertEquals(2, analyzerJob.getOutputDataStreamJobs().length); final OutputDataStreamJob incompleteOutputDataStreamJob = analyzerJob.getOutputDataStreamJobs()[1]; assertEquals("Incomplete rows", incompleteOutputDataStreamJob.getOutputDataStream().getName()); assertEquals(2, incompleteOutputDataStreamJob.getJob().getAnalyzerJobs().size()); final AnalyzerJob incompleteStringAnalyzer = incompleteOutputDataStreamJob.getJob().getAnalyzerJobs().get(0); assertEquals("String analyzer", incompleteStringAnalyzer.getDescriptor().getDisplayName()); assertEquals(1, incompleteStringAnalyzer.getInput().length); assertEquals("Concat of FIRSTNAME,LASTNAME", incompleteStringAnalyzer.getInput()[0].getName()); final AnalyzerJob incompleteNumberAnalyzer = incompleteOutputDataStreamJob.getJob().getAnalyzerJobs().get(1); assertEquals("Number analyzer", incompleteNumberAnalyzer.getDescriptor().getDisplayName()); assertEquals(1, incompleteNumberAnalyzer.getInput().length); assertEquals("REPORTSTO", incompleteNumberAnalyzer.getInput()[0].getName()); final MultiThreadedTaskRunner taskRunner = new MultiThreadedTaskRunner(16); final DataCleanerEnvironment environment = new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner); final Datastore datastore = TestHelper.createSampleDatabaseDatastore("testoutputdatastream"); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withDatastores(datastore).withEnvironment(environment); final OutputDataStreamJob[] outputDataStreamJobs = analyzerJob.getOutputDataStreamJobs(); final AnalyzerJob analyzerJob2 = outputDataStreamJobs[0].getJob().getAnalyzerJobs().get(0); final AnalyzerJob analyzerJob3 = outputDataStreamJobs[1].getJob().getAnalyzerJobs().get(0); // now run the job(s) final AnalysisRunnerImpl runner = new AnalysisRunnerImpl(configuration); final AnalysisResultFuture resultFuture = runner.run(job); resultFuture.await(); if (resultFuture.isErrornous()) { throw resultFuture.getErrors().get(0); } assertEquals(5, resultFuture.getResults().size()); final CompletenessAnalyzerResult result1 = (CompletenessAnalyzerResult) resultFuture.getResult(analyzerJob); assertNotNull(result1); assertEquals(23, result1.getValidRowCount()); assertEquals(0, result1.getInvalidRowCount()); final StringAnalyzerResult result2 = (StringAnalyzerResult) resultFuture.getResult(analyzerJob2); assertNotNull(result2); assertEquals(23, result2.getRowCount(result2.getColumns()[0])); assertEquals(0, result2.getNullCount(result2.getColumns()[0])); final StringAnalyzerResult result3 = (StringAnalyzerResult) resultFuture.getResult(analyzerJob3); assertNotNull(result3); assertEquals(0, result3.getRowCount(result3.getColumns()[0])); } public void testPlainSearchReplaceJobUpgrade() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/version_4_5_3_plain_search_replace.analysis.xml")); assertTrue(jobBuilder.isConfigured()); } public void testCoalesceJobWithTranformerColumns() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-coalesce-issue.analysis.xml")); final List<ComponentBuilder> componentBuilders = new ArrayList<>(jobBuilder.getComponentBuilders()); assertEquals(5, componentBuilders.size()); final ComponentBuilder componentBuilder = componentBuilders.get(3); final ComponentDescriptor<?> descriptor = componentBuilder.getDescriptor(); assertEquals("Fuse / Coalesce fields", descriptor.getDisplayName()); assertTrue(componentBuilder.isConfigured()); final ConfiguredPropertyDescriptor configuredPropertyDescriptor = componentBuilder.getDescriptor().getConfiguredProperty("Units"); final CoalesceUnit[] units = (CoalesceUnit[]) componentBuilder.getConfiguredProperty(configuredPropertyDescriptor); assertEquals("EQ name", units[0].getInputColumnNames()[0]); assertEquals("NEQ name", units[0].getInputColumnNames()[1]); } public void testCoalesceJobWithCombinedTranformerColumns() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-coalesce-combined-columns.analysis.xml")); final List<ComponentBuilder> componentBuilders = new ArrayList<>(jobBuilder.getComponentBuilders()); assertEquals(3, componentBuilders.size()); final ComponentBuilder componentBuilder = componentBuilders.get(1); final ComponentDescriptor<?> descriptor = componentBuilder.getDescriptor(); assertEquals("Fuse / Coalesce fields", descriptor.getDisplayName()); assertTrue(componentBuilder.isConfigured()); final ConfiguredPropertyDescriptor configuredPropertyDescriptor = componentBuilder.getDescriptor().getConfiguredProperty("Units"); final CoalesceUnit[] units = (CoalesceUnit[]) componentBuilder.getConfiguredProperty(configuredPropertyDescriptor); assertEquals("PUBLIC.CUSTOMERS.CONTACTLASTNAME", units[0].getInputColumnNames()[0]); assertEquals("CONTACTLASTNAME (Upper case)", units[0].getInputColumnNames()[1]); } public void testCoalesceJobWithInputColumns() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-coalesce-inputcolumns.analysis.xml")); final List<ComponentBuilder> componentBuilders = new ArrayList<>(jobBuilder.getComponentBuilders()); assertEquals(2, componentBuilders.size()); final ComponentBuilder componentBuilder = componentBuilders.get(0); final ComponentDescriptor<?> descriptor = componentBuilder.getDescriptor(); assertEquals("Fuse / Coalesce fields", descriptor.getDisplayName()); assertTrue(componentBuilder.isConfigured()); final ConfiguredPropertyDescriptor configuredPropertyDescriptor = componentBuilder.getDescriptor().getConfiguredProperty("Units"); final CoalesceUnit[] units = (CoalesceUnit[]) componentBuilder.getConfiguredProperty(configuredPropertyDescriptor); assertEquals("PUBLIC.CUSTOMERS.CONTACTLASTNAME", units[0].getInputColumnNames()[0]); assertEquals("PUBLIC.CUSTOMERS.CONTACTFIRSTNAME", units[0].getInputColumnNames()[1]); assertEquals("PUBLIC.CUSTOMERS.PHONE", units[1].getInputColumnNames()[0]); assertEquals("PUBLIC.CUSTOMERS.CITY", units[1].getInputColumnNames()[1]); } public void testUnionJob() throws Exception { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-union.analysis.xml")); final List<ComponentBuilder> componentBuilders = new ArrayList<>(jobBuilder.getComponentBuilders()); assertEquals(1, componentBuilders.size()); final ComponentBuilder componentBuilder = componentBuilders.get(0); final ComponentDescriptor<?> descriptor = componentBuilder.getDescriptor(); assertEquals("Union", descriptor.getDisplayName()); assertTrue(componentBuilder.isConfigured()); final ConfiguredPropertyDescriptor configuredPropertyDescriptor = componentBuilder.getDescriptor().getConfiguredProperty("Units"); final CoalesceUnit[] units = (CoalesceUnit[]) componentBuilder.getConfiguredProperty(configuredPropertyDescriptor); assertEquals("PUBLIC.CUSTOMERS.CONTACTLASTNAME", units[0].getInputColumnNames()[0]); assertEquals("PUBLIC.EMPLOYEES.LASTNAME", units[0].getInputColumnNames()[1]); assertEquals("CONTACTLASTNAME", units[0].getSuggestedOutputColumnName()); assertEquals("PUBLIC.CUSTOMERS.CONTACTFIRSTNAME", units[1].getInputColumnNames()[0]); assertEquals("PUBLIC.EMPLOYEES.FIRSTNAME", units[1].getInputColumnNames()[1]); assertEquals("CONTACTFIRSTNAME", units[1].getSuggestedOutputColumnName()); final List<OutputDataStream> outputDataStreams = componentBuilder.getOutputDataStreams(); assertEquals(1, outputDataStreams.size()); final OutputDataStream outputDataStream = outputDataStreams.get(0); assertEquals("output", outputDataStream.getName()); } /** * Validates whether the values of datastore column names of a job which is written to disk using the * {@link JaxbJobWriter} and then read as a new job using the {@link JaxbJobReader} are the same before * and after when the datastore column names contains line feeds. */ public void testReadJobWithMultipleLinedColumnNames() throws Exception { final Datastore datastore = new ExcelDatastore("doubles", null, "src/test/resources/double.xlsx"); final SimpleDescriptorProvider descriptorProvider = new SimpleDescriptorProvider(); descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(MockAnalyzer.class)); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withDatastoreCatalog(new DatastoreCatalogImpl(datastore)) .withEnvironment(new DataCleanerEnvironmentImpl().withDescriptorProvider(descriptorProvider)); final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration); jobBuilder.setDatastore(datastore); jobBuilder.addSourceColumns( jobBuilder.getDatastoreConnection().getDataContext().getDefaultSchema().getTable(0).getColumns()); jobBuilder.addAnalyzer(MockAnalyzer.class).addInputColumns(jobBuilder.getSourceColumns()); final AnalysisJob originalJob = jobBuilder.toAnalysisJob(); jobBuilder.close(); final JaxbJobWriter writer = new JaxbJobWriter(new DataCleanerConfigurationImpl(), new JaxbJobMetadataFactoryImpl()); final File jobFile = File.createTempFile("double", ".analysis.xml"); writer.write(originalJob, new FileOutputStream(jobFile)); final AnalysisJob readJob = new JaxbJobReader(configuration).read(new FileInputStream(jobFile)); assertEquals(originalJob.getSourceColumns().stream().map(InputColumn::getName).collect(Collectors.toList()), readJob.getSourceColumns().stream().map(InputColumn::getName).collect(Collectors.toList())); } public void testJobWithTemplateProperties() throws IOException { final JaxbJobReader reader = new JaxbJobReader(conf); final AnalysisJobBuilder jobBuilder = reader.create(new File("src/test/resources/example-job-template.xml")); final List<ComponentBuilder> componentBuilders = new ArrayList<>(jobBuilder.getComponentBuilders()); assertEquals(1, componentBuilders.size()); final ComponentBuilder componentBuilder = componentBuilders.get(0); final ComponentDescriptor<?> descriptor = componentBuilder.getDescriptor(); assertEquals("Completeness analyzer", descriptor.getDisplayName()); final AnalysisJobBuilder outputDataStreamJobBuilder = componentBuilder.getOutputDataStreamJobBuilder( "Complete rows"); final List<ComponentBuilder> componentBuilders2 = new ArrayList<>(outputDataStreamJobBuilder .getComponentBuilders()); assertEquals(1, componentBuilders2.size()); final ComponentBuilder createCsvComponentBuilder = componentBuilders2.get(0); assertEquals("Create CSV file", createCsvComponentBuilder.getDescriptor().getDisplayName()); final LinkedList<Object> linkedList = new LinkedList<>(createCsvComponentBuilder.getConfiguredProperties() .values()); final FileResource propertyFile = (FileResource) linkedList .get(3); String absolutePath = propertyFile.getFile().getAbsolutePath(); absolutePath = absolutePath.replace("\\", "/"); absolutePath = absolutePath.replace("C:", ""); assertEquals("/tmp/ignite/hotfolder/dc_input - 2016-12-12 14:14:56 - samples.csv", absolutePath); } public void testJobWithTemplateProperties2() throws IOException { final DescriptorProvider descriptorProvider = new ClasspathScanDescriptorProvider().scanPackage( "org.datacleaner", true); final Datastore datastore = TestHelper.createSampleDatabaseDatastore("my db"); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withDatastores(datastore) .withEnvironment(new DataCleanerEnvironmentImpl().withDescriptorProvider(descriptorProvider)); ; final JaxbJobReader reader = new JaxbJobReader(configuration); final AnalysisJobBuilder jobBuilder = reader.create(new File( "src/test/resources/JaxbJobWriterTest-testWriteCsvTemplate.xml")); final List<ComponentBuilder> componentBuilders = new ArrayList<>(jobBuilder.getComponentBuilders()); assertEquals(1, componentBuilders.size()); final ComponentBuilder componentBuilder = componentBuilders.get(0); final ComponentDescriptor<?> descriptor = componentBuilder.getDescriptor(); assertEquals("Create CSV file", descriptor.getDisplayName()); final LinkedList<Object> linkedList = new LinkedList<>(componentBuilder.getConfiguredProperties().values()); final FileResource propertyFile = (FileResource) linkedList.get(3); String absolutePath = propertyFile.getFile().getAbsolutePath(); absolutePath = absolutePath.replace("\\", "/"); absolutePath = absolutePath.replace("C:", ""); assertEquals("/Users/claudiap/Documents/OutgoingHotFolder/myFile/1482244133378-samples.csv", absolutePath); } }