/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.job.tasks;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.components.convert.ConvertToNumberTransformer;
import org.datacleaner.components.maxrows.MaxRowsFilter;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.data.MutableInputColumn;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.FilterComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunner;
import org.datacleaner.job.runner.AnalysisRunnerImpl;
import org.datacleaner.result.ListResult;
import org.datacleaner.test.MockAnalyzer;
import junit.framework.TestCase;
public class ConsumeRowTaskTest extends TestCase {
@SuppressWarnings("unchecked")
public void testMultiRowTransformer() throws Throwable {
final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl();
final InputColumn<?> countingColumn;
final AnalysisJob job;
// build example job
try (AnalysisJobBuilder builder = new AnalysisJobBuilder(configuration)) {
// number_col,string_col
// 3,foo
// 10,bar
// 0,baz
builder.setDatastore(new CsvDatastore("foo", "src/test/resources/multi_row_transformer_test.csv"));
builder.addSourceColumns("number_col");
final TransformerComponentBuilder<ConvertToNumberTransformer> convertTransformer =
builder.addTransformer(ConvertToNumberTransformer.class)
.addInputColumn(builder.getSourceColumnByName("number_col"));
final MutableInputColumn<?> numberColumn = convertTransformer.getOutputColumns().get(0);
final TransformerComponentBuilder<MockMultiRowTransformer> multiRowTransformer =
builder.addTransformer(MockMultiRowTransformer.class).addInputColumn(numberColumn);
final List<MutableInputColumn<?>> mockTransformerColumns = multiRowTransformer.getOutputColumns();
countingColumn = mockTransformerColumns.get(0);
assertEquals("Mock multi row transformer (1)", countingColumn.getName());
builder.addAnalyzer(MockAnalyzer.class).addInputColumns(mockTransformerColumns);
job = builder.toAnalysisJob();
}
final ListResult<InputRow> result;
// run job
{
final AnalysisRunner runner = new AnalysisRunnerImpl(configuration);
final AnalysisResultFuture resultFuture = runner.run(job);
if (resultFuture.isErrornous()) {
throw resultFuture.getErrors().get(0);
}
result = (ListResult<InputRow>) resultFuture.getResults().get(0);
}
final List<InputRow> list = result.getValues();
// we expect 13 rows (3 + 10 + 0)
assertEquals(13, list.size());
assertEquals(1, list.get(0).getValue(countingColumn));
assertEquals(2, list.get(1).getValue(countingColumn));
assertEquals(3, list.get(2).getValue(countingColumn));
assertEquals(1, list.get(3).getValue(countingColumn));
assertEquals(2, list.get(4).getValue(countingColumn));
assertEquals(3, list.get(5).getValue(countingColumn));
assertEquals(4, list.get(6).getValue(countingColumn));
assertEquals(5, list.get(7).getValue(countingColumn));
assertEquals(6, list.get(8).getValue(countingColumn));
assertEquals(7, list.get(9).getValue(countingColumn));
assertEquals(8, list.get(10).getValue(countingColumn));
assertEquals(9, list.get(11).getValue(countingColumn));
assertEquals(10, list.get(12).getValue(countingColumn));
// assert that all generated rows have unique ids
final Set<Long> ids = new HashSet<>();
for (final InputRow row : list) {
final long id = row.getId();
if (ids.contains(id)) {
fail("Multiple rows with id " + id);
}
ids.add(id);
}
}
@SuppressWarnings("unchecked")
public void testConsumeRowTaskForComplexJob() throws Throwable {
final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl();
final AnalysisJob job;
// build example job
try (AnalysisJobBuilder builder = new AnalysisJobBuilder(configuration)) {
builder.setDatastore(new CsvDatastore("Names", "src/test/resources/example-name-lengths.csv"));
builder.addSourceColumns("name");
final FilterComponentBuilder<MaxRowsFilter, MaxRowsFilter.Category> filterJobBuilder =
builder.addFilter(MaxRowsFilter.class);
filterJobBuilder.setConfiguredProperty("Max rows", 10);
final TransformerComponentBuilder<ConvertToNumberTransformer> convertTransformer =
builder.addTransformer(ConvertToNumberTransformer.class)
.addInputColumn(builder.getSourceColumnByName("name"));
final MutableInputColumn<?> numberColumn = convertTransformer.getOutputColumns().get(0);
convertTransformer.setRequirement(filterJobBuilder, MaxRowsFilter.Category.VALID);
builder.addAnalyzer(MockAnalyzer.class).addInputColumns(numberColumn);
job = builder.toAnalysisJob();
}
final ListResult<InputRow> result;
// run job
{
final AnalysisRunner runner = new AnalysisRunnerImpl(configuration);
final AnalysisResultFuture resultFuture = runner.run(job);
if (resultFuture.isErrornous()) {
throw resultFuture.getErrors().get(0);
}
result = (ListResult<InputRow>) resultFuture.getResults().get(0);
}
final List<InputRow> list = result.getValues();
assertEquals(10, list.size());
}
}