/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.components.fuse;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import javax.inject.Named;

import org.apache.metamodel.query.Query;
import org.apache.metamodel.schema.ColumnType;
import org.apache.metamodel.schema.ColumnTypeImpl;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.MultiStreamComponent;
import org.datacleaner.api.OutputDataStream;
import org.datacleaner.api.OutputRowCollector;
import org.datacleaner.components.categories.CompositionCategory;
import org.datacleaner.job.output.OutputDataStreamBuilder;
import org.datacleaner.job.output.OutputDataStreams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Component that fuses rows coming from several input streams into a single
 * output data stream named {@link #OUTPUT_DATA_STREAM_NAME}.
 *
 * <p>
 * Every configured {@link CoalesceUnit} contributes one column to the output
 * stream. For each incoming row the values of the unit's input columns are
 * read from the row and reduced to a single value by a
 * {@link CoalesceFunction}.
 * </p>
 */
@Named("Union")
@Description("Lets you combine multiple streams into one. Providing what is equivalent to a union of tables.\n\n"
        + "Use it to fuse data streams coming from different source tables. "
        + "You can define new fields whose values represent whatever is available from one of the input streams.")
@Categorized(CompositionCategory.class)
public class FuseStreamsComponent extends MultiStreamComponent {

    /** Name of the single output data stream published by this component. */
    public static final String OUTPUT_DATA_STREAM_NAME = "output";

    /** Name of the configured property holding the input columns. */
    public static final String PROPERTY_INPUTS = "Inputs";

    /** Name of the configured property holding the coalesce units. */
    public static final String PROPERTY_UNITS = "Units";

    private static final Logger logger = LoggerFactory.getLogger(FuseStreamsComponent.class);

    /** All input columns made available to the coalesce units. */
    @Configured(PROPERTY_INPUTS)
    InputColumn<?>[] _inputs;

    /** The coalesce units as configured; one output column is derived per valid unit. */
    @Configured(PROPERTY_UNITS)
    CoalesceUnit[] _units;

    /** Receiver of fused rows; assigned in {@link #initializeOutputDataStream}. */
    private OutputRowCollector _outputRowCollector;

    /** Function that reduces the values of one unit to a single value; created in {@link #init()}. */
    private CoalesceFunction _coalesceFunction;

    /** The configured units after {@code updateInputColumns(_inputs)}; built in {@link #init()}. */
    private CoalesceUnit[] _initializedUnits;

    /**
     * Creates an unconfigured component. {@link #_inputs} and {@link #_units}
     * have to be provided before {@link #init()} is invoked.
     */
    public FuseStreamsComponent() {
    }

    /**
     * Creates a component with the given coalesce units. The array is stored
     * as-is (no copy is made) and the input columns are not set by this
     * constructor.
     *
     * @param units
     *            the coalesce units to use
     */
    public FuseStreamsComponent(final CoalesceUnit... units) {
        this();
        this._units = units;
    }

    /**
     * Prepares the component for processing rows: creates the coalesce
     * function and stores, for every configured unit, the result of
     * {@code updateInputColumns(_inputs)}.
     */
    @Initialize
    public void init() {
        // NOTE(review): the meaning of the 'false' flag is defined by
        // CoalesceFunction - confirm there.
        _coalesceFunction = new CoalesceFunction(false);
        _initializedUnits = new CoalesceUnit[_units.length];
        for (int i = 0; i < _units.length; i++) {
            _initializedUnits[i] = _units[i].updateInputColumns(_inputs);
        }
    }

    /**
     * Configures the transformer using the coalesce units provided. The input
     * columns of the component are set to the concatenation (in unit order) of
     * the input columns of all units, after which {@link #init()} is invoked.
     *
     * @param units
     *            the coalesce units that define the output columns; the array
     *            is stored as-is
     */
    public void configureUsingCoalesceUnits(final CoalesceUnit... units) {
        final List<InputColumn<?>> input = new ArrayList<>();
        for (final CoalesceUnit coalesceUnit : units) {
            Collections.addAll(input, coalesceUnit.getInputColumns());
        }
        _inputs = input.toArray(new InputColumn<?>[0]);
        _units = units;
        init();
    }

    /**
     * Fuses a single input row: for every initialized unit the values of its
     * input columns are read from the row and coalesced into one value. The
     * resulting values (one per unit, in unit order) are pushed to the output
     * row collector.
     *
     * @param inputRow
     *            the row to read the values from
     */
    @Override
    public void run(final InputRow inputRow) {
        if (_outputRowCollector == null) {
            // The collector is only assigned in initializeOutputDataStream(...).
            // If that hook has not run (presumably because no component
            // consumes the output stream - confirm against the
            // HasOutputDataStreams contract) there is nowhere to send the row,
            // so skip it instead of failing with a NullPointerException.
            return;
        }

        final Object[] output = new Object[_initializedUnits.length];
        for (int i = 0; i < _initializedUnits.length; i++) {
            final CoalesceUnit unit = _initializedUnits[i];
            final InputColumn<?>[] inputColumns = unit.getInputColumns();
            final List<Object> values = inputRow.getValues(inputColumns);
            output[i] = _coalesceFunction.coalesce(values);
        }

        // Guarded because Arrays.toString(...) would otherwise be evaluated
        // for every row even when debug logging is switched off.
        if (logger.isDebugEnabled()) {
            logger.debug("Fused values for row: {}", Arrays.toString(output));
        }

        _outputRowCollector.putValues(output);
    }

    /**
     * Describes the output data stream of this component. One column is added
     * per coalesce unit for which the number of updated input columns equals
     * the number of input column names of the unit; other units are skipped
     * with an info log message.
     *
     * @return an array holding the single output data stream, or an empty
     *         array if no unit qualified
     */
    @Override
    public OutputDataStream[] getOutputDataStreams() {
        final OutputDataStreamBuilder builder = OutputDataStreams.pushDataStream(OUTPUT_DATA_STREAM_NAME);

        boolean foundOutputDataStream = false;
        for (final CoalesceUnit unit : _units) {
            // Not necessarily initialized yet, so no _initializedUnits available
            final InputColumn<?>[] updatedInputColumns = unit.getUpdatedInputColumns(_inputs, false);
            if (unit.getInputColumnNames().length == updatedInputColumns.length) {
                // Valid Unit
                foundOutputDataStream = true;
                final CoalesceUnit updatedCoalesceUnit = unit.getUpdatedCoalesceUnit(updatedInputColumns);
                final Class<?> dataType = updatedCoalesceUnit.getOutputDataType();
                final String columnName = updatedCoalesceUnit.getSuggestedOutputColumnName();
                final ColumnType columnType = ColumnTypeImpl.convertColumnType(dataType);
                builder.withColumn(columnName, columnType);
            } else {
                logger.info("Missing columns detected, skipping coalesce unit");
            }
        }

        if (!foundOutputDataStream) {
            return new OutputDataStream[0];
        }
        return new OutputDataStream[] { builder.toOutputDataStream() };
    }

    /**
     * Stores the collector that {@link #run(InputRow)} pushes fused rows to.
     * The stream and query arguments are not used by this implementation.
     *
     * @param outputDataStream
     *            the output data stream being initialized (unused)
     * @param query
     *            the query posed on the output data stream (unused)
     * @param outputRowCollector
     *            the collector that receives the fused rows
     */
    @Override
    public void initializeOutputDataStream(final OutputDataStream outputDataStream, final Query query,
            final OutputRowCollector outputRowCollector) {
        _outputRowCollector = outputRowCollector;
    }
}