/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.spark.functions;

import java.util.List;

import org.apache.metamodel.schema.Table;
import org.apache.spark.api.java.function.Function;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.data.MockInputRow;
import org.datacleaner.spark.SparkJobContext;

import scala.Tuple2;

/**
 * Mapper function that converts Object arrays representing physical records
 * into the format of an {@link InputRow}.
 *
 * Currently this is implemented very strictly by just investigating the column
 * indices of the job's source columns.
 *
 * It is assumed that the job is based on a single source {@link Table}.
 */
public class ValuesToInputRowFunction implements Function<Tuple2<Object[], Long>, InputRow> {

    private static final long serialVersionUID = 1L;

    private final SparkJobContext _sparkJobContext;

    public ValuesToInputRowFunction(final SparkJobContext sparkJobContext) {
        _sparkJobContext = sparkJobContext;
    }

    @Override
    public InputRow call(final Tuple2<Object[], Long> tuple) throws Exception {
        final Object[] values = tuple._1;
        final Long rowNumber = tuple._2;

        // use the row number (from the pairing with a Long index) as the InputRow id
        final MockInputRow inputRow = new MockInputRow(rowNumber.intValue());

        final List<InputColumn<?>> sourceColumns = _sparkJobContext.getAnalysisJob().getSourceColumns();
        for (final InputColumn<?> sourceColumn : sourceColumns) {
            assert sourceColumn.isPhysicalColumn();

            // look up each value by the physical column's index in the record array
            final int columnIndex = sourceColumn.getPhysicalColumn().getColumnNumber();
            final Object value = values[columnIndex];
            inputRow.put(sourceColumn, value);
        }

        return inputRow;
    }
}
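
/*
 * Usage sketch (an assumption, not part of the original source): roughly how
 * this mapper could be wired into a Spark driver. The variable names
 * "valuesRdd" and "sparkJobContext" are hypothetical; zipWithIndex() and map()
 * are standard Spark Java API calls, and zipWithIndex() produces the
 * Tuple2<Object[], Long> pairs that call() expects.
 *
 *   JavaRDD<Object[]> valuesRdd = ...; // physical records read from the single source Table
 *   JavaPairRDD<Object[], Long> indexed = valuesRdd.zipWithIndex();
 *   JavaRDD<InputRow> inputRows = indexed.map(new ValuesToInputRowFunction(sparkJobContext));
 */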