/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.components.fuse;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.datacleaner.api.Convertable;
import org.datacleaner.api.InputColumn;
import org.datacleaner.util.ReflectionUtils;

/**
 * Represents a set of columns to be coalesced.
 *
 * A unit is either created from actual {@link InputColumn}s, or from column
 * names only. In the latter case no columns are attached: {@link #getInputColumns()}
 * returns {@code null} until a new unit is derived through
 * {@link #updateInputColumns(InputColumn[])}.
 */
@Convertable(CoalesceUnitConverter.class)
public class CoalesceUnit {

    private final String[] _inputColumnNames;

    // transient cached view of columns
    private final InputColumn<?>[] _inputColumns;

    /**
     * Creates a CoalesceUnit from a list of columns.
     *
     * @param inputColumns the columns to coalesce
     * @throws IllegalArgumentException if the list is null or empty
     */
    public CoalesceUnit(final List<? extends InputColumn<?>> inputColumns) {
        this(toArray(inputColumns));
    }

    /**
     * Create an uninitialized CoalesceUnit... This makes it into a pure factory.
     *
     * @param columnNames the names of the columns to coalesce; no
     *            {@link InputColumn}s are attached to the created unit
     */
    public CoalesceUnit(final String... columnNames) {
        _inputColumnNames = columnNames;
        _inputColumns = null;
    }

    /**
     * Creates a CoalesceUnit from an array of columns.
     *
     * @param inputColumns the columns to coalesce
     * @throws IllegalArgumentException if the array is null or empty
     */
    public CoalesceUnit(final InputColumn<?>... inputColumns) {
        if (inputColumns == null || inputColumns.length == 0) {
            throw new IllegalArgumentException("InputColumns cannot be null or empty");
        }
        _inputColumns = inputColumns;
        _inputColumnNames = getInputColumnNames(inputColumns);
    }

    /**
     * Converts the list to an array, passing a null list on as a null array so
     * that the array constructor can report it as an illegal argument instead
     * of failing with a NullPointerException.
     */
    private static InputColumn<?>[] toArray(final List<? extends InputColumn<?>> inputColumns) {
        if (inputColumns == null) {
            return null;
        }
        return inputColumns.toArray(new InputColumn[inputColumns.size()]);
    }

    /**
     * Returns the part of the name after its last '.', or the whole name if it
     * contains no '.'.
     */
    private static String getSimpleName(final String name) {
        final int dotIndex = name.lastIndexOf('.');
        if (dotIndex == -1) {
            return name;
        }
        return name.substring(dotIndex + 1);
    }

    /**
     * Builds the names of the given columns: the qualified label of the
     * physical column for physical columns, the column name otherwise.
     */
    private static String[] getInputColumnNames(final InputColumn<?>[] inputColumns) {
        final String[] result = new String[inputColumns.length];
        for (int i = 0; i < inputColumns.length; i++) {
            final InputColumn<?> inputColumn = inputColumns[i];
            if (inputColumn.isPhysicalColumn()) {
                result[i] = inputColumn.getPhysicalColumn().getQualifiedLabel();
            } else {
                result[i] = inputColumn.getName();
            }
        }
        return result;
    }

    /**
     * Gets the names of the columns in this unit.
     *
     * @return the names freshly derived from the attached columns if there are
     *         any, otherwise the names this unit was created with
     */
    public String[] getInputColumnNames() {
        if (_inputColumns != null) {
            // use updated column names if possible - they may have changed
            return getInputColumnNames(_inputColumns);
        }
        return _inputColumnNames;
    }

    /**
     * Creates new {@link CoalesceUnit} if {@link InputColumn}s has changed
     *
     * @param newInputColumns List of new {@link InputColumn}s
     * @return New {@link CoalesceUnit} if {@link InputColumn}s has truly changed, otherwise this.
     * @throws IllegalArgumentException if the columns differ from the current
     *             ones and the new array is null or empty
     */
    public CoalesceUnit getUpdatedCoalesceUnit(final InputColumn<?>[] newInputColumns) {
        if (Arrays.equals(_inputColumns, newInputColumns)) {
            return this;
        }
        return new CoalesceUnit(newInputColumns);
    }

    /**
     * Refreshes the current transient setup of {@link InputColumn}s in the
     * {@link CoalesceUnit}. This is necessary to do before any job execution to
     * ensure that the {@link InputColumn} references are intact and don't point
     * to e.g. a copy of the input columns from a cloned job.
     *
     * Not doing this will result in issues such as
     * <a href="https://github.com/datacleaner/DataCleaner/issues/923">issue #923</a>
     *
     * @param allInputColumns the columns to resolve this unit's column names against
     * @return A CoalesceUnit containing the updated columns; this unit itself
     *         if the resolved columns equal the current ones.
     * @throws CoalesceUnitMissingColumnException if one of this unit's column
     *             names cannot be resolved
     */
    public CoalesceUnit updateInputColumns(final InputColumn<?>[] allInputColumns) {
        return getUpdatedCoalesceUnit(getUpdatedInputColumns(allInputColumns, true));
    }

    /**
     * Resolves the column names of this unit against the given columns.
     *
     * @param allInputColumns the columns to search
     * @param exceptionOnMissing whether a name that cannot be resolved should
     *            raise an exception; if false such names are skipped, so the
     *            result may hold fewer columns than this unit has names
     * @return the resolved columns, in the order of this unit's column names
     * @throws CoalesceUnitMissingColumnException if {@code exceptionOnMissing}
     *             is true and a name cannot be resolved
     */
    public InputColumn<?>[] getUpdatedInputColumns(final InputColumn<?>[] allInputColumns,
            final boolean exceptionOnMissing) {
        final String[] newInputColumnNames = getInputColumnNames();
        final List<InputColumn<?>> newInputColumns = new ArrayList<>(newInputColumnNames.length);
        for (final String newInputColumnName : newInputColumnNames) {
            final InputColumn<?> updatedInputColumn = findInputColumn(allInputColumns, newInputColumnName);
            if (updatedInputColumn == null) {
                if (exceptionOnMissing) {
                    final List<String> names =
                            Arrays.stream(allInputColumns).map(InputColumn::getName).collect(Collectors.toList());
                    throw new CoalesceUnitMissingColumnException(this, newInputColumnName,
                            "Column '" + newInputColumnName + "' not found. Available columns: " + names);
                }
            } else {
                newInputColumns.add(updatedInputColumn);
            }
        }
        return newInputColumns.toArray(new InputColumn[newInputColumns.size()]);
    }

    /**
     * Finds the column matching the given name. Matching is attempted in four
     * rounds of decreasing strictness; the first hit of the earliest successful
     * round wins.
     *
     * @return the matching column, or null if no round produced a match
     */
    private InputColumn<?> findInputColumn(final InputColumn<?>[] allInputColumns, final String inputColumnName) {
        // Exact match round on path.
        for (final InputColumn<?> inputColumn : allInputColumns) {
            if (inputColumnName.contains(".") && inputColumn.isPhysicalColumn()
                    && inputColumnName.equals(inputColumn.getPhysicalColumn().getQualifiedLabel())) {
                return inputColumn;
            }
        }

        // Trimmed and case-insensitive path match round.
        for (final InputColumn<?> inputColumn : allInputColumns) {
            if (inputColumnName.contains(".") && inputColumn.isPhysicalColumn()
                    && inputColumnName.trim().equalsIgnoreCase(inputColumn.getPhysicalColumn().getQualifiedLabel())) {
                return inputColumn;
            }
        }

        // Legacy: Exact name match round
        for (final InputColumn<?> inputColumn : allInputColumns) {
            if (inputColumnName.equals(inputColumn.getName())) {
                return inputColumn;
            }
        }

        // Legacy: Trimmed and case-insensitive name match round.
        for (final InputColumn<?> inputColumn : allInputColumns) {
            if (inputColumnName.trim().equalsIgnoreCase(inputColumn.getName().trim())) {
                return inputColumn;
            }
        }

        return null;
    }

    /**
     * Gets the columns attached to this unit.
     *
     * @return the attached columns, or null if this unit was created from
     *         column names only
     */
    public InputColumn<?>[] getInputColumns() {
        return _inputColumns;
    }

    /**
     * Determines a data type that all attached columns' data types conform to.
     *
     * @return the common data type of the attached columns, or
     *         {@code Object.class} if the types are unrelated, or if this unit
     *         has no attached columns (it was created from names only)
     */
    public Class<?> getOutputDataType() {
        if (_inputColumns == null) {
            // name-only unit: nothing is known about the types
            return Object.class;
        }

        Class<?> candidate = null;
        for (final InputColumn<?> inputColumn : _inputColumns) {
            final Class<?> dataType = inputColumn.getDataType();
            if (candidate == null) {
                candidate = dataType;
            } else if (candidate == Object.class) {
                // cannot get any more general than this
                return candidate;
            } else if (candidate != dataType) {
                if (ReflectionUtils.is(dataType, candidate)) {
                    // keep the current candidate
                } else if (ReflectionUtils.is(candidate, dataType)) {
                    candidate = dataType;
                } else {
                    return Object.class;
                }
            }
        }

        if (candidate == null) {
            return Object.class;
        }
        return candidate;
    }

    @Override
    public int hashCode() {
        // same value as the conventional "prime * 1 + hash" form
        return 31 + Arrays.hashCode(getInputColumnNames());
    }

    /**
     * Two units are equal when they are of the same class and have equal
     * column names, in the same order.
     */
    @Override
    public boolean equals(final Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final CoalesceUnit other = (CoalesceUnit) obj;
        return Arrays.equals(getInputColumnNames(), other.getInputColumnNames());
    }

    @Override
    public String toString() {
        return "CoalesceUnit[inputColumnNames=" + Arrays.toString(getInputColumnNames()) + "]";
    }

    /**
     * Suggests a name for the output column: the first column name of this
     * unit, stripped of everything up to and including its last '.'.
     *
     * Requires this unit to have at least one column name.
     */
    public String getSuggestedOutputColumnName() {
        return getSimpleName(getInputColumnNames()[0]);
    }
}