/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.beans.filter;

import java.util.ArrayList;
import java.util.List;

import javax.inject.Named;

import org.apache.metamodel.query.FilterItem;
import org.apache.metamodel.query.OperatorType;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.query.SelectItem;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Alias;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Distributed;
import org.datacleaner.api.HasLabelAdvice;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.QueryOptimizedFilter;
import org.datacleaner.components.categories.FilterCategory;

/**
 * Filter that sorts rows into {@link NullCheckCategory#NULL} or
 * {@link NullCheckCategory#NOT_NULL} depending on whether the configured
 * columns hold null values (and, optionally, empty strings).
 *
 * Whether a single null field is enough, or all fields have to be null, is
 * controlled by the {@link EvaluationMode}.
 */
@Named("Null check")
@Alias("Not null")
@Description("Filter rows that contain null values.")
@Categorized(FilterCategory.class)
@Distributed(true)
public class NullCheckFilter implements QueryOptimizedFilter<NullCheckFilter.NullCheckCategory>, HasLabelAdvice {

    /**
     * The outcomes of this filter.
     */
    public enum NullCheckCategory {
        @Alias("INVALID")
        NULL,

        @Alias("VALID")
        NOT_NULL
    }

    /**
     * Determines how the individual field checks are combined into a verdict
     * for the whole record.
     */
    public enum EvaluationMode implements HasName {
        ALL_FIELDS("When all fields are NULL, the record is considered NULL"),

        ANY_FIELD("When any field is NULL, the record is considered NULL");

        private final String _name;

        EvaluationMode(final String name) {
            _name = name;
        }

        @Override
        public String getName() {
            return _name;
        }
    }

    @Configured
    @Description("Select columns that should NOT have null values")
    InputColumn<?>[] columns;

    @Configured
    @Description("Consider empty strings (\"\") as null also?")
    boolean considerEmptyStringAsNull = false;

    @Configured("Evaluation mode")
    EvaluationMode evaluationMode = EvaluationMode.ANY_FIELD;

    /**
     * Creates an unconfigured filter; the fields keep their declared defaults.
     */
    public NullCheckFilter() {
    }

    /**
     * Creates a filter that uses the default evaluation mode
     * ({@link EvaluationMode#ANY_FIELD}).
     *
     * @param columns
     *            the columns to inspect
     * @param considerEmptyStringAsNull
     *            whether empty strings count as null
     */
    public NullCheckFilter(final InputColumn<?>[] columns, final boolean considerEmptyStringAsNull) {
        this(columns, considerEmptyStringAsNull, EvaluationMode.ANY_FIELD);
    }

    /**
     * Creates a fully configured filter.
     *
     * @param columns
     *            the columns to inspect
     * @param considerEmptyStringAsNull
     *            whether empty strings count as null
     * @param evaluationMode
     *            how the per-field checks are combined
     */
    public NullCheckFilter(final InputColumn<?>[] columns, final boolean considerEmptyStringAsNull,
            final EvaluationMode evaluationMode) {
        this.columns = columns;
        this.considerEmptyStringAsNull = considerEmptyStringAsNull;
        this.evaluationMode = evaluationMode;
    }

    /**
     * Suggests a label only when exactly one column is configured.
     *
     * @return "&lt;column name&gt; is null?" for a single configured column,
     *         otherwise {@code null}
     */
    @Override
    public String getSuggestedLabel() {
        final boolean singleColumn = columns != null && columns.length == 1;
        return singleColumn ? columns[0].getName() + " is null?" : null;
    }

    public void setConsiderEmptyStringAsNull(final boolean considerEmptyStringAsNull) {
        this.considerEmptyStringAsNull = considerEmptyStringAsNull;
    }

    /**
     * Only the {@link EvaluationMode#ANY_FIELD} mode is reported as
     * optimizable, regardless of the category.
     */
    @Override
    public boolean isOptimizable(final NullCheckCategory category) {
        // can be further improved but requires changes to optimizeQuery(...)
        return evaluationMode == EvaluationMode.ANY_FIELD;
    }

    /**
     * Adds where-conditions for the requested category to the query.
     *
     * For {@link NullCheckCategory#NOT_NULL} one condition per column is added
     * directly to the query. For any other category the per-column conditions
     * are collected and added as a single composite filter item.
     *
     * @param q
     *            the query to add conditions to
     * @param category
     *            the category whose rows are requested
     * @return the same query instance that was passed in
     * @throws IllegalStateException
     *             if a configured column has no physical column
     */
    @Override
    public Query optimizeQuery(final Query q, final NullCheckCategory category) {
        if (category != NullCheckCategory.NOT_NULL) {
            // if NULL all filter items will be OR'ed.
            final List<FilterItem> alternatives = new ArrayList<>();
            for (final InputColumn<?> inputColumn : columns) {
                final SelectItem item = new SelectItem(physicalColumnOf(inputColumn));
                alternatives.add(new FilterItem(item, OperatorType.EQUALS_TO, null));
                if (includesEmptyStringCheck(inputColumn)) {
                    alternatives.add(new FilterItem(item, OperatorType.EQUALS_TO, ""));
                }
            }
            final FilterItem[] children = alternatives.toArray(new FilterItem[0]);
            q.where(new FilterItem(children));
            return q;
        }

        for (final InputColumn<?> inputColumn : columns) {
            final Column physical = physicalColumnOf(inputColumn);
            q.where(physical, OperatorType.DIFFERENT_FROM, null);
            if (includesEmptyStringCheck(inputColumn)) {
                q.where(physical, OperatorType.DIFFERENT_FROM, "");
            }
        }
        return q;
    }

    /**
     * Resolves the physical column behind an input column, failing when there
     * is none.
     */
    private Column physicalColumnOf(final InputColumn<?> inputColumn) {
        final Column physical = inputColumn.getPhysicalColumn();
        if (physical == null) {
            throw new IllegalStateException("Cannot optimize on non-physical column: " + inputColumn);
        }
        return physical;
    }

    /**
     * Tells whether the query should also compare the column against the empty
     * string: only when that option is enabled and the column's data type is
     * String.
     */
    private boolean includesEmptyStringCheck(final InputColumn<?> inputColumn) {
        return considerEmptyStringAsNull && inputColumn.getDataType() == String.class;
    }

    /**
     * Categorizes a row according to the configured evaluation mode.
     *
     * @param inputRow
     *            the row to inspect
     * @return the category of the row
     */
    @Override
    public NullCheckCategory categorize(final InputRow inputRow) {
        return evaluationMode.equals(EvaluationMode.ANY_FIELD) ? categorizeAnyFieldMode(inputRow)
                : categorizeAllFieldMode(inputRow);
    }

    /**
     * A row is NULL as soon as one of the configured columns holds a value
     * that counts as null.
     */
    private NullCheckCategory categorizeAnyFieldMode(final InputRow inputRow) {
        for (final InputColumn<?> inputColumn : columns) {
            if (countsAsNull(inputRow.getValue(inputColumn))) {
                return NullCheckCategory.NULL;
            }
        }
        return NullCheckCategory.NOT_NULL;
    }

    /**
     * A row is NOT_NULL as soon as one of the configured columns holds a value
     * that does not count as null; otherwise it is NULL.
     */
    private NullCheckCategory categorizeAllFieldMode(final InputRow inputRow) {
        for (final InputColumn<?> inputColumn : columns) {
            if (!countsAsNull(inputRow.getValue(inputColumn))) {
                return NullCheckCategory.NOT_NULL;
            }
        }
        return NullCheckCategory.NULL;
    }

    /**
     * Tells whether a value is null, or an empty string while empty strings
     * are configured to count as null.
     */
    private boolean countsAsNull(final Object value) {
        return value == null || (considerEmptyStringAsNull && "".equals(value));
    }
}