/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.HasOutputDataStreams;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.MappedProperty;
import org.datacleaner.api.OutputDataStream;
import org.datacleaner.api.OutputRowCollector;
import org.datacleaner.job.output.OutputDataStreamBuilder;
import org.datacleaner.job.output.OutputDataStreams;
import org.datacleaner.storage.InMemoryRowAnnotationFactory2;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import com.google.common.base.Strings;
@Named("Completeness analyzer")
@Description("Asserts the completeness of your data by ensuring that all required fields are filled.")
public class CompletenessAnalyzer implements Analyzer<CompletenessAnalyzerResult>, HasOutputDataStreams {
public enum Condition implements HasName {
NOT_BLANK_OR_NULL("Not <blank> or <null>", Condition::isNotNullOrEmpty),
NOT_NULL("Not <null>", e -> (e != null)),
NULL("<null> required", e -> (e == null)),
BLANK_OR_NULL("<blank> or <null> required", Condition::isNullOrEmpty);
private final String _name;
private final Predicate<Object> _predicate;
Condition(final String name, final Predicate<Object> predicate) {
_name = name;
_predicate = predicate;
}
private static boolean isNotNullOrEmpty(final Object value) {
return !isNullOrEmpty(value);
}
private static boolean isNullOrEmpty(final Object value) {
if (value instanceof String) {
return Strings.isNullOrEmpty((String) value);
}
return value == null;
}
@Override
public String getName() {
return _name;
}
public boolean isValid(final Object argument) {
return _predicate.test(argument);
}
}
public enum EvaluationMode implements HasName {
ALL_FIELDS("When all fields are incomplete, the record is incomplete"),
ANY_FIELD("When any field is incomplete, the record is incomplete");
private final String _name;
EvaluationMode(final String name) {
_name = name;
}
@Override
public String getName() {
return _name;
}
}
public static final String OUTPUT_STREAM_COMPLETE = "Complete rows";
public static final String OUTPUT_STREAM_INCOMPLETE = "Incomplete rows";
public static final String PROPERTY_VALUES = "Values";
public static final String PROPERTY_CONDITIONS = "Conditions";
public static final String PROPERTY_EVALUATION_MODE = "Evaluation mode";
public static final String PROPERTY_ADDITIONAL_OUTPUT_VALUES = "Additional output values";
// Do not inject the shared RowAnnotations, available rows are always
// needed.
private final RowAnnotationFactory _annotationFactory = new InMemoryRowAnnotationFactory2();
private final RowAnnotation _invalidRecords = _annotationFactory.createAnnotation();
private final AtomicInteger _rowCount = new AtomicInteger();
@Inject
@Configured(order = 1, value = PROPERTY_VALUES)
@Description("Values to check for completeness")
InputColumn<?>[] _valueColumns;
@Inject
@Configured(order = 2, value = PROPERTY_CONDITIONS)
@Description("The conditions of which a value is determined to be filled or not")
@MappedProperty(PROPERTY_VALUES)
Condition[] _conditions;
@Inject
@Configured(order = 3, value = PROPERTY_EVALUATION_MODE)
EvaluationMode _evaluationMode = EvaluationMode.ANY_FIELD;
@Inject
@Configured(order = 100, value = PROPERTY_ADDITIONAL_OUTPUT_VALUES, required = false)
@Description("Optional additional values to add to output data streams")
InputColumn<?>[] _additionalOutputValueColumns;
private OutputRowCollector _completeRowCollector;
private OutputRowCollector _incompleteRowCollector;
private List<InputColumn<?>> _outputDataStreamColumns;
@Initialize
public void init() {
_rowCount.set(0);
_outputDataStreamColumns = createOutputDataStreamColumns();
}
private List<InputColumn<?>> createOutputDataStreamColumns() {
final List<InputColumn<?>> outputDataStreamColumns = new ArrayList<>();
Collections.addAll(outputDataStreamColumns, _valueColumns);
if (_additionalOutputValueColumns != null) {
for (final InputColumn<?> inputColumn : _additionalOutputValueColumns) {
if (!outputDataStreamColumns.contains(inputColumn)) {
outputDataStreamColumns.add(inputColumn);
}
}
}
return outputDataStreamColumns;
}
@Override
public void run(final InputRow row, final int distinctCount) {
_rowCount.addAndGet(distinctCount);
boolean allInvalid = true;
for (int i = 0; i < _valueColumns.length; i++) {
final Object value = row.getValue(_valueColumns[i]);
final boolean valid = _conditions[i].isValid(value);
if (_evaluationMode == EvaluationMode.ANY_FIELD && !valid) {
_annotationFactory.annotate(row, distinctCount, _invalidRecords);
if (_incompleteRowCollector != null) {
_incompleteRowCollector.putValues(row.getValues(_outputDataStreamColumns).toArray());
}
return;
}
if (valid) {
allInvalid = false;
}
}
if (_evaluationMode == EvaluationMode.ALL_FIELDS && allInvalid) {
_annotationFactory.annotate(row, distinctCount, _invalidRecords);
if (_incompleteRowCollector != null) {
_incompleteRowCollector.putValues(row.getValues(_outputDataStreamColumns).toArray());
}
return;
}
if (_completeRowCollector != null) {
_completeRowCollector.putValues(row.getValues(_outputDataStreamColumns).toArray());
}
}
@Override
public CompletenessAnalyzerResult getResult() {
return new CompletenessAnalyzerResult(_rowCount.get(), _invalidRecords, _annotationFactory, _valueColumns);
}
public void setConditions(final Condition[] conditions) {
_conditions = conditions;
}
public void setValueColumns(final InputColumn<?>[] valueColumns) {
_valueColumns = valueColumns;
}
/**
* Shortcut method to fill all conditions (of existing columns) to a single
* condition.
*
* @param condition
*/
public void fillAllConditions(final Condition condition) {
if (_valueColumns != null) {
final Condition[] conditions = new Condition[_valueColumns.length];
Arrays.fill(conditions, condition);
_conditions = conditions;
}
}
@Override
public OutputDataStream[] getOutputDataStreams() {
final OutputDataStreamBuilder completeStreamBuilder = OutputDataStreams.pushDataStream(OUTPUT_STREAM_COMPLETE);
final OutputDataStreamBuilder incompleteStreamBuilder =
OutputDataStreams.pushDataStream(OUTPUT_STREAM_INCOMPLETE);
for (final InputColumn<?> column : createOutputDataStreamColumns()) {
completeStreamBuilder.withColumnLike(column);
incompleteStreamBuilder.withColumnLike(column);
}
return new OutputDataStream[] { completeStreamBuilder.toOutputDataStream(),
incompleteStreamBuilder.toOutputDataStream() };
}
@Override
public void initializeOutputDataStream(final OutputDataStream outputDataStream, final Query query,
final OutputRowCollector outputRowCollector) {
if (outputDataStream.getName().equals(OUTPUT_STREAM_COMPLETE)) {
_completeRowCollector = outputRowCollector;
} else {
_incompleteRowCollector = outputRowCollector;
}
}
}