/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.job.runner;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.metamodel.DataContext;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.data.Row;
import org.apache.metamodel.jdbc.JdbcDataContext;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.CollectionUtils;
import org.apache.metamodel.util.LazyRef;
import org.datacleaner.api.InputColumn;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.data.MetaModelInputRow;
import org.datacleaner.job.concurrent.ForkTaskListener;
import org.datacleaner.job.concurrent.RunNextTaskTaskListener;
import org.datacleaner.job.concurrent.TaskListener;
import org.datacleaner.job.concurrent.TaskRunnable;
import org.datacleaner.job.tasks.ConsumeRowTask;
import org.datacleaner.job.tasks.RunRowProcessingPublisherTask;
import org.datacleaner.util.SystemProperties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link RowProcessingPublisher} implementation for source {@link Table}s.
 */
public final class SourceTableRowProcessingPublisher extends AbstractRowProcessingPublisher {

    private static final Logger logger = LoggerFactory.getLogger(SourceTableRowProcessingPublisher.class);

    private final Set<Column> _physicalColumns = new LinkedHashSet<>();
    private final LazyRef<RowProcessingQueryOptimizer> _queryOptimizerRef;

    /**
     * Constructor to use for creating a
     * {@link SourceTableRowProcessingPublisher} which feeds data from a source
     * datastore.
     *
     * @param publishers
     *            the {@link RowProcessingPublishers} collection that this
     *            publisher belongs to
     * @param stream
     *            the {@link RowProcessingStream} whose table this publisher
     *            reads from
     */
    public SourceTableRowProcessingPublisher(final RowProcessingPublishers publishers,
            final RowProcessingStream stream) {
        super(publishers, stream);
        _queryOptimizerRef = createQueryOptimizerRef();

        final boolean aggressiveOptimizeSelectClause =
                SystemProperties.getBoolean(SystemProperties.QUERY_SELECTCLAUSE_OPTIMIZE, false);
        if (!aggressiveOptimizeSelectClause) {
            // unless aggressive SELECT clause optimization is enabled, pre-register all
            // source columns of this table so they are included in the physical query
            final Collection<InputColumn<?>> sourceColumns = stream.getAnalysisJob().getSourceColumns();
            final List<Column> columns = new ArrayList<>();
            for (final InputColumn<?> sourceColumn : sourceColumns) {
                final Column column = sourceColumn.getPhysicalColumn();
                if (column != null && getTable().equals(column.getTable())) {
                    columns.add(column);
                }
            }
            addPhysicalColumns(columns.toArray(new Column[columns.size()]));
        }
    }
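
    // Usage note (an illustrative sketch, assuming SystemProperties.getBoolean
    // consults standard JVM system properties): the aggressive SELECT clause
    // optimization checked in the constructor could then be enabled before the
    // publisher is constructed, e.g.:
    //
    //   System.setProperty(SystemProperties.QUERY_SELECTCLAUSE_OPTIMIZE, "true");
    //
    // With the flag enabled, the constructor skips pre-registering every source
    // column, so only columns actually requested by consumers end up in the
    // physical query.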

    private Table getTable() {
        return getStream().getTable();
    }

    /**
     * Inspects the primary keys of the table being processed. If all primary
     * key columns are among the source columns of the AnalysisJob, they will
     * be added to the physically queried columns.
     *
     * Adding the primary keys to the query is a trade-off: It helps a lot in
     * making e.g. annotated rows referenceable back to the source table, but
     * it may also make the job heavier to execute, since a lot of (unique)
     * values will be retrieved.
     */
    public void addPrimaryKeysIfSourced() {
        final Column[] primaryKeyColumns = getTable().getPrimaryKeys();
        if (primaryKeyColumns == null || primaryKeyColumns.length == 0) {
            logger.info("No primary keys defined for table {}, not pre-selecting primary keys",
                    getTable().getName());
            return;
        }

        final Collection<InputColumn<?>> sourceInputColumns = getAnalysisJob().getSourceColumns();
        final List<Column> sourceColumns = CollectionUtils.map(sourceInputColumns, InputColumn::getPhysicalColumn);

        for (final Column primaryKeyColumn : primaryKeyColumns) {
            if (!sourceColumns.contains(primaryKeyColumn)) {
                logger.info("Primary key column {} not added to source columns, not pre-selecting primary keys",
                        primaryKeyColumn.getName());
                return;
            }
        }

        addPhysicalColumns(primaryKeyColumns);
    }

    private LazyRef<RowProcessingQueryOptimizer> createQueryOptimizerRef() {
        return new LazyRef<RowProcessingQueryOptimizer>() {
            @Override
            protected RowProcessingQueryOptimizer fetch() {
                final Datastore datastore = getAnalysisJob().getDatastore();
                try (DatastoreConnection con = datastore.openConnection()) {
                    final DataContext dataContext = con.getDataContext();

                    final Column[] columnArray = _physicalColumns.toArray(new Column[_physicalColumns.size()]);
                    final Query baseQuery = dataContext.query().from(getTable()).select(columnArray).toQuery();
                    logger.debug("Base query for row processing: {}", baseQuery);

                    // try to optimize
                    return new RowProcessingQueryOptimizerImpl(datastore, getConsumersSorted(), baseQuery);
                } catch (final RuntimeException e) {
                    logger.error("Failed to build query optimizer! {}", e.getMessage(), e);
                    throw e;
                }
            }
        };
    }

    @Override
    public void onAllConsumersRegistered() {
        // can safely load query optimizer in separate thread here
        _queryOptimizerRef.requestLoad();
    }

    public void addPhysicalColumns(final Column... columns) {
        for (final Column column : columns) {
            if (!getTable().equals(column.getTable())) {
                throw new IllegalArgumentException("Column does not pertain to the correct table. Expected table: "
                        + getTable() + ", actual table: " + column.getTable());
            }
            _physicalColumns.add(column);
        }
    }
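
    // Illustrative call sequence (a sketch; the setup and variable names are
    // hypothetical, not taken from this class or its callers):
    //
    //   final SourceTableRowProcessingPublisher publisher =
    //           new SourceTableRowProcessingPublisher(publishers, stream);
    //   publisher.addPhysicalColumns(customerIdColumn, customerNameColumn);
    //   publisher.addPrimaryKeysIfSourced(); // optional: make annotated rows referenceable
    //   publisher.onAllConsumersRegistered(); // triggers async loading of the query optimizer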

    @Override
    protected RowProcessingQueryOptimizer getQueryOptimizer() {
        final RowProcessingQueryOptimizer optimizer = _queryOptimizerRef.get();
        if (optimizer == null) {
            final Throwable e = _queryOptimizerRef.getError();
            if (e instanceof RuntimeException) {
                throw (RuntimeException) e;
            }
            throw new IllegalStateException(e);
        }
        return optimizer;
    }

    @Override
    protected boolean processRowsInternal(final AnalysisListener analysisListener,
            final RowProcessingMetrics rowProcessingMetrics) {
        final RowProcessingQueryOptimizer queryOptimizer = getQueryOptimizer();
        final Query finalQuery = queryOptimizer.getOptimizedQuery();

        // if the query is paged, seed the row id generator with the firstRow offset so
        // that physical row ids reflect positions in the source table
        final RowIdGenerator idGenerator;
        if (finalQuery.getFirstRow() == null) {
            idGenerator = new SimpleRowIdGenerator();
        } else {
            idGenerator = new SimpleRowIdGenerator(finalQuery.getFirstRow());
        }

        analysisListener.rowProcessingBegin(getAnalysisJob(), rowProcessingMetrics);

        final ConsumeRowHandler consumeRowHandler = createConsumeRowHandler();
        final RowConsumerTaskListener taskListener =
                new RowConsumerTaskListener(getAnalysisJob(), analysisListener, getTaskRunner());

        final Datastore datastore = getAnalysisJob().getDatastore();

        try (DatastoreConnection con = datastore.openConnection()) {
            final DataContext dataContext = con.getDataContext();

            if (logger.isDebugEnabled()) {
                final String queryString;
                if (dataContext instanceof JdbcDataContext) {
                    final JdbcDataContext jdbcDataContext = (JdbcDataContext) dataContext;
                    queryString = jdbcDataContext.getQueryRewriter().rewriteQuery(finalQuery);
                } else {
                    queryString = finalQuery.toSql();
                }
                logger.debug("Final query: {}", queryString);
                logger.debug("Final query firstRow={}, maxRows={}", finalQuery.getFirstRow(), finalQuery.getMaxRows());
            }

            // represents the distinct count of rows as well as the number of
            // tasks to execute
            int numTasks = 0;

            try (DataSet dataSet = dataContext.executeQuery(finalQuery)) {
                while (dataSet.next()) {
                    if (taskListener.isErrornous()) {
                        break;
                    }
                    numTasks++;
                    final Row metaModelRow = dataSet.getRow();
                    final int rowId = idGenerator.nextPhysicalRowId();
                    final MetaModelInputRow inputRow = new MetaModelInputRow(rowId, metaModelRow);
                    final ConsumeRowTask task = new ConsumeRowTask(consumeRowHandler, rowProcessingMetrics, inputRow,
                            analysisListener, numTasks);
                    getTaskRunner().run(task, taskListener);
                }
            }
            taskListener.awaitTasks(numTasks);
        }

        return !taskListener.isErrornous();
    }

    @Override
    protected boolean runRowProcessingInternal(final List<TaskRunnable> postProcessingTasks) {
        final TaskListener runCompletionListener =
                new ForkTaskListener("run row processing (" + getStream() + ")", getTaskRunner(), postProcessingTasks);

        final RowProcessingMetrics rowProcessingMetrics = getRowProcessingMetrics();
        final RunRowProcessingPublisherTask runTask = new RunRowProcessingPublisherTask(this, rowProcessingMetrics);

        final TaskListener initFinishedListener =
                new RunNextTaskTaskListener(getTaskRunner(), runTask, runCompletionListener);

        // kick off the initialization
        initializeConsumers(initFinishedListener);

        return true;
    }
}