/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.components.fuse;

import java.util.Arrays;
import java.util.List;

import org.apache.metamodel.DataContext;
import org.apache.metamodel.MetaModelHelper;
import org.apache.metamodel.data.Row;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.configuration.DataCleanerEnvironmentImpl;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.data.MetaModelInputColumn;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.AnalyzerJob;
import org.datacleaner.job.OutputDataStreamJob;
import org.datacleaner.job.TransformerJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.concurrent.MultiThreadedTaskRunner;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunnerImpl;
import org.datacleaner.result.ListResult;
import org.datacleaner.test.MockAnalyzer;
import org.datacleaner.test.MockOutputDataStreamAnalyzer;
import org.datacleaner.test.TestHelper;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;

/**
 * Integration tests for {@link FuseStreamsComponent}, executed against the
 * "orderdb" sample datastore. Each scenario is run twice: once with a default
 * configuration and once with a configuration whose environment uses a
 * {@link MultiThreadedTaskRunner} with 4 threads.
 */
public class FuseStreamsComponentIntegrationTest {

    private static final int COUNT_EMPLOYEES = 23;
    private static final int COUNT_CUSTOMERS = 214;

    private final Datastore datastore = TestHelper.createSampleDatabaseDatastore("orderdb");

    private final DataCleanerConfigurationImpl singleThreadedConfiguration =
            new DataCleanerConfigurationImpl().withDatastores(datastore);

    private final DataCleanerConfigurationImpl multiThreadedConfiguration =
            new DataCleanerConfigurationImpl().withDatastores(datastore)
                    .withEnvironment(new DataCleanerEnvironmentImpl()
                            .withTaskRunner(new MultiThreadedTaskRunner(4)));

    /** Every test in this class is bounded by a 20 second timeout. */
    @Rule
    public Timeout globalTimeout = Timeout.seconds(20);

    /**
     * Guards the row-count constants used by the other tests by counting the
     * rows of the "customers" and "employees" tables directly.
     */
    @Test
    public void testAssumptionsAboutOrderdb() throws Exception {
        try (DatastoreConnection connection = datastore.openConnection()) {
            final DataContext dataContext = connection.getDataContext();

            Assert.assertEquals(COUNT_CUSTOMERS, countRows(dataContext, "customers"));
            Assert.assertEquals(COUNT_EMPLOYEES, countRows(dataContext, "employees"));
        }
    }

    @Test
    public void testUnionTablesSingleThreaded() throws Throwable {
        testUnionTables(singleThreadedConfiguration);
    }

    @Test
    public void testUnionTablesMultiThreaded() throws Throwable {
        testUnionTables(multiThreadedConfiguration);
    }

    @Test
    public void testFuseOutputDataStreamsSingleThreaded() throws Throwable {
        testFuseOutputDataStreams(singleThreadedConfiguration);
    }

    @Test
    public void testFuseOutputDataStreamsMultiThreaded() throws Throwable {
        testFuseOutputDataStreams(multiThreadedConfiguration);
    }

    @Test
    @Ignore("Not yet implemented")
    public void testFuseSourceTableAndOutputDataStream() throws Exception {
        Assert.fail("Not yet implemented");
    }

    /**
     * Fuses name columns of the "customers" and "employees" tables into a
     * single output stream, feeds that stream to a {@link MockAnalyzer} and
     * checks that the analyzer collected one value per row of both tables.
     *
     * @param configuration
     *            the configuration used to build and to run the job
     * @throws Throwable
     *             the first error reported by the job, if it did not succeed
     */
    private static void testUnionTables(final DataCleanerConfiguration configuration) throws Throwable {
        final AnalysisJob job;
        try (AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration)) {
            jobBuilder.setDatastore("orderdb");
            jobBuilder.addSourceColumns("customers.contactfirstname", "customers.contactlastname");
            jobBuilder.addSourceColumns("employees.firstname", "employees.lastname");

            final CoalesceUnit firstNames = new CoalesceUnit("firstname", "contactfirstname");
            final CoalesceUnit lastNames = new CoalesceUnit("lastname", "contactlastname");
            final CoalesceUnit[] units = { firstNames, lastNames };

            final TransformerComponentBuilder<FuseStreamsComponent> fuseBuilder =
                    jobBuilder.addTransformer(FuseStreamsComponent.class);
            fuseBuilder.addInputColumns(jobBuilder.getSourceColumns());
            fuseBuilder.setConfiguredProperty(FuseStreamsComponent.PROPERTY_UNITS, units);

            final AnalysisJobBuilder fusedJobBuilder =
                    fuseBuilder.getOutputDataStreamJobBuilder(FuseStreamsComponent.OUTPUT_DATA_STREAM_NAME);
            final List<MetaModelInputColumn> fusedColumns = fusedJobBuilder.getSourceColumns();
            Assert.assertEquals("[MetaModelInputColumn[output.FIRSTNAME], MetaModelInputColumn[output.LASTNAME]]",
                    fusedColumns.toString());

            final AnalyzerComponentBuilder<MockAnalyzer> analyzerBuilder =
                    fusedJobBuilder.addAnalyzer(MockAnalyzer.class);
            analyzerBuilder.addInputColumns(fusedColumns);

            job = jobBuilder.toAnalysisJob();
        }
        Assert.assertNotNull(job);

        final AnalysisRunnerImpl runner = new AnalysisRunnerImpl(configuration);
        final AnalysisResultFuture resultFuture = runner.run(job);
        Assert.assertNotNull(resultFuture);
        awaitAndRethrowFirstError(resultFuture);

        final List<AnalyzerResult> results = resultFuture.getResults();
        Assert.assertEquals(1, results.size());

        // the list is expected to hold as many records as the "employees"
        // and "customers" tables contain TOGETHER.
        final ListResult<?> listResult = (ListResult<?>) results.get(0);
        Assert.assertEquals(COUNT_CUSTOMERS + COUNT_EMPLOYEES, listResult.getValues().size());
    }

    /**
     * Lets a {@link MockOutputDataStreamAnalyzer} publish two output streams,
     * attaches one shared {@link FuseStreamsComponent} to both of them, and
     * consumes the fused stream with a {@link MockAnalyzer}. Both the
     * structure of the resulting job and the number of values collected by the
     * mock analyzer are verified.
     *
     * @param configuration
     *            the configuration used to build and to run the job
     * @throws Throwable
     *             the first error reported by the job, if it did not succeed
     */
    private static void testFuseOutputDataStreams(final DataCleanerConfiguration configuration) throws Throwable {
        final AnalysisJob job;
        try (AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration)) {
            jobBuilder.setDatastore("orderdb");
            jobBuilder.addSourceColumns("customers.customernumber");

            // an analyzer that produces two streams
            final AnalyzerComponentBuilder<MockOutputDataStreamAnalyzer> streamProducer =
                    jobBuilder.addAnalyzer(MockOutputDataStreamAnalyzer.class);
            streamProducer.addInputColumns(jobBuilder.getSourceColumns());

            final AnalysisJobBuilder firstStreamBuilder =
                    streamProducer.getOutputDataStreamJobBuilder(MockOutputDataStreamAnalyzer.STREAM_NAME1);
            final AnalysisJobBuilder secondStreamBuilder =
                    streamProducer.getOutputDataStreamJobBuilder(MockOutputDataStreamAnalyzer.STREAM_NAME2);

            // register one and the same fuse component on both streams
            final TransformerComponentBuilder<FuseStreamsComponent> fuseBuilder =
                    firstStreamBuilder.addTransformer(FuseStreamsComponent.class);
            final TransformerComponentBuilder<FuseStreamsComponent> sameFuseBuilder =
                    secondStreamBuilder.addTransformer(fuseBuilder);
            Assert.assertSame(fuseBuilder, sameFuseBuilder);

            // the fuse component reads the columns of both streams
            final List<MetaModelInputColumn> firstStreamColumns = firstStreamBuilder.getSourceColumns();
            final List<MetaModelInputColumn> secondStreamColumns = secondStreamBuilder.getSourceColumns();
            fuseBuilder.addInputColumns(firstStreamColumns);
            fuseBuilder.addInputColumns(secondStreamColumns);

            final CoalesceUnit[] units =
                    { new CoalesceUnit(firstStreamColumns.get(0), secondStreamColumns.get(0)) };
            fuseBuilder.setConfiguredProperty(FuseStreamsComponent.PROPERTY_UNITS, units);

            // a mock analyzer consumes the fused output
            final AnalysisJobBuilder fusedJobBuilder =
                    fuseBuilder.getOutputDataStreamJobBuilder(FuseStreamsComponent.OUTPUT_DATA_STREAM_NAME);
            final AnalyzerComponentBuilder<MockAnalyzer> mockAnalyzerBuilder =
                    fusedJobBuilder.addAnalyzer(MockAnalyzer.class);
            mockAnalyzerBuilder.addInputColumns(fusedJobBuilder.getSourceColumns());

            job = jobBuilder.toAnalysisJob();
        }

        // verify the job structure and pick up the mock analyzer on the way
        final AnalyzerJob mockAnalyzer = findMockAnalyzer(job);

        // execute the job
        final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(configuration).run(job);
        awaitAndRethrowFirstError(resultFuture);

        final ListResult<?> listResult = (ListResult<?>) resultFuture.getResult(mockAnalyzer);

        // a bit of dark math here ... The MockOutputDataStream discards 1/3 of
        // records but emits two records for the rest. And then 2 additional
        // records in the end.
        final int discardedRecords = COUNT_CUSTOMERS / 3;
        final int retainedRecords = COUNT_CUSTOMERS - discardedRecords;
        final int expectedValues = 2 * retainedRecords + 2;
        Assert.assertEquals(expectedValues, listResult.getValues().size());
    }

    /**
     * Asserts on the structure of a job built by
     * {@link #testFuseOutputDataStreams(DataCleanerConfiguration)} and returns
     * the {@link MockAnalyzer} job that consumes the fused stream.
     *
     * @param job
     *            the job to inspect
     * @return the analyzer job found on the first output data stream of the
     *         fuse transformer
     */
    private static AnalyzerJob findMockAnalyzer(final AnalysisJob job) {
        final AnalyzerJob streamProducer = job.getAnalyzerJobs().get(0);
        Assert.assertEquals(MockOutputDataStreamAnalyzer.class,
                streamProducer.getDescriptor().getComponentClass());

        final OutputDataStreamJob[] producedStreams = streamProducer.getOutputDataStreamJobs();
        Assert.assertEquals(2, producedStreams.length);

        final TransformerJob fuseOfFirstStream = producedStreams[0].getJob().getTransformerJobs().get(0);
        Assert.assertEquals(FuseStreamsComponent.class, fuseOfFirstStream.getDescriptor().getComponentClass());

        // the fuse transformer takes its input from both streams
        Assert.assertEquals(
                "[MetaModelInputColumn[foo bar records.foo], MetaModelInputColumn[foo bar records.bar], "
                        + "MetaModelInputColumn[counter records.count], MetaModelInputColumn[counter records.uuid]]",
                Arrays.toString(fuseOfFirstStream.getInput()));

        // ... and it is the very same transformer job on the second stream
        final TransformerJob fuseOfSecondStream = producedStreams[1].getJob().getTransformerJobs().get(0);
        Assert.assertSame(fuseOfFirstStream, fuseOfSecondStream);

        final AnalyzerJob mockAnalyzer =
                fuseOfFirstStream.getOutputDataStreamJobs()[0].getJob().getAnalyzerJobs().get(0);
        Assert.assertEquals(MockAnalyzer.class, mockAnalyzer.getDescriptor().getComponentClass());
        return mockAnalyzer;
    }

    /**
     * Waits for the job to finish and, if it is reported as erroneous, throws
     * the first of its errors so that the calling test fails with it.
     *
     * @param resultFuture
     *            the future of the running job
     * @throws Throwable
     *             the first error of the job, if there is any
     */
    private static void awaitAndRethrowFirstError(final AnalysisResultFuture resultFuture) throws Throwable {
        resultFuture.await();
        if (resultFuture.isErrornous()) {
            throw resultFuture.getErrors().get(0);
        }
    }

    /**
     * Runs a count query against the given table.
     *
     * @param dataContext
     *            the data context to query
     * @param tableName
     *            the name of the table whose rows are counted
     * @return the first value of the single row produced by the query
     */
    private static Object countRows(final DataContext dataContext, final String tableName) {
        final Row countRow = MetaModelHelper.executeSingleRowQuery(dataContext,
                dataContext.query().from(tableName).selectCount().toQuery());
        return countRow.getValue(0);
    }
}