/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.valuedist;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.inject.Inject;
import javax.inject.Named;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.ColumnProperty;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;
import org.datacleaner.api.ExternalDocumentation.DocumentationType;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Provided;
import org.datacleaner.storage.CollectionFactory;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.storage.RowAnnotations;
import org.datacleaner.util.NullTolerableComparator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Named("Value distribution")
@Description("Gets the distributions of values that occur in a dataset.\nOften used as an initial way to see if a "
+ "lot of repeated values are to be expected, if nulls occur and if a few un-repeated values add exceptions "
+ "to the typical usage-pattern.")
@ExternalDocumentation(
{ @DocumentationLink(title = "Analyzer rundown", url = "https://www.youtube.com/watch?v=hZWxB_eu_A0",
type = DocumentationType.VIDEO, version = "4.0") })
@Concurrent(true)
public class ValueDistributionAnalyzer implements Analyzer<ValueDistributionAnalyzerResult> {
public static final String PROPERTY_COLUMN = "Column";
public static final String PROPERTY_GROUP_COLUMN = "Group column";
public static final String PROPERTY_RECORD_UNIQUE_VALUES = "Record unique values";
public static final String PROPERTY_RECORD_DRILL_DOWN_INFORMATION = "Record drill-down information";
private static final Logger logger = LoggerFactory.getLogger(ValueDistributionAnalyzer.class);
private final Map<String, ValueDistributionGroup> _valueDistributionGroups;
@Inject
@Configured(value = PROPERTY_COLUMN, order = 1)
@ColumnProperty(escalateToMultipleJobs = true)
InputColumn<?> _column;
@Inject
@Configured(value = PROPERTY_GROUP_COLUMN, required = false, order = 2)
InputColumn<String> _groupColumn;
@Inject
@Configured(value = PROPERTY_RECORD_UNIQUE_VALUES, required = false, order = 3)
boolean _recordUniqueValues = true;
@Inject
@Configured(value = PROPERTY_RECORD_DRILL_DOWN_INFORMATION, required = false, order = 4)
@Description(
"Record extra information to allow drilling to the records that represent a particular value in the distribution")
boolean _recordDrillDownInformation = true;
@Inject
@Configured(value = "Top n most frequent values", required = false, order = 5)
@Deprecated
Integer _topFrequentValues;
@Inject
@Configured(value = "Bottom n most frequent values", required = false, order = 6)
@Deprecated
Integer _bottomFrequentValues;
@Inject
@Provided
RowAnnotationFactory _annotationFactory;
/**
* Constructor used for testing and ad-hoc purposes
*
* @param column
* @param recordUniqueValues
*/
public ValueDistributionAnalyzer(final InputColumn<?> column, final boolean recordUniqueValues) {
this(column, null, recordUniqueValues);
}
/**
* Constructor used for testing and ad-hoc purposes
*
* @param column
* @param groupColumn
* @param recordUniqueValues
*/
public ValueDistributionAnalyzer(final InputColumn<?> column, final InputColumn<String> groupColumn,
final boolean recordUniqueValues) {
this();
_column = column;
_groupColumn = groupColumn;
_recordUniqueValues = recordUniqueValues;
_annotationFactory = RowAnnotations.getDefaultFactory();
}
/**
* Main constructor
*/
public ValueDistributionAnalyzer() {
_valueDistributionGroups = new TreeMap<>(NullTolerableComparator.get(String.class));
}
@Override
public void run(final InputRow row, final int distinctCount) {
final Object value = row.getValue(_column);
if (_groupColumn == null) {
runInternal(row, value, distinctCount);
} else {
final String group = row.getValue(_groupColumn);
runInternal(row, value, group, distinctCount);
}
}
public void runInternal(final InputRow row, final Object value, final int distinctCount) {
runInternal(row, value, _column.getName(), distinctCount);
}
public void runInternal(final InputRow row, final Object value, final String group, final int distinctCount) {
final ValueDistributionGroup valueDistributionGroup = getValueDistributionGroup(group);
final String stringValue;
if (value == null) {
logger.debug("value is null");
stringValue = null;
} else {
stringValue = value.toString();
}
valueDistributionGroup.run(row, stringValue, distinctCount);
}
private ValueDistributionGroup getValueDistributionGroup(final String group) {
ValueDistributionGroup valueDistributionGroup = _valueDistributionGroups.get(group);
if (valueDistributionGroup == null) {
synchronized (this) {
valueDistributionGroup = _valueDistributionGroups.get(group);
if (valueDistributionGroup == null) {
final InputColumn<?>[] inputColumns;
if (_groupColumn == null) {
inputColumns = new InputColumn[] { _column };
} else {
inputColumns = new InputColumn[] { _column, _groupColumn };
}
valueDistributionGroup =
new ValueDistributionGroup(group, _annotationFactory, _recordDrillDownInformation,
inputColumns);
_valueDistributionGroups.put(group, valueDistributionGroup);
}
}
}
return valueDistributionGroup;
}
@Override
public ValueDistributionAnalyzerResult getResult() {
if (_groupColumn == null) {
logger.info("getResult() invoked, processing single group");
final ValueDistributionGroup valueDistributionGroup = getValueDistributionGroup(_column.getName());
return valueDistributionGroup.createResult(_recordUniqueValues);
} else {
logger.info("getResult() invoked, processing {} groups", _valueDistributionGroups.size());
final SortedSet<SingleValueDistributionResult> groupedResults = new TreeSet<>();
for (final String group : _valueDistributionGroups.keySet()) {
final ValueDistributionGroup valueDistributibutionGroup = getValueDistributionGroup(group);
final SingleValueDistributionResult result =
valueDistributibutionGroup.createResult(_recordUniqueValues);
groupedResults.add(result);
}
return new GroupedValueDistributionResult(_column, _groupColumn, groupedResults);
}
}
public void setAnnotationFactory(final RowAnnotationFactory annotationFactory) {
_annotationFactory = annotationFactory;
}
/**
*
* @param collectionFactory
* @deprecated use of this property is no longer adviced. It will be phased
* out in later versions of DataCleaner
*/
@Deprecated
public void setCollectionFactory(final CollectionFactory collectionFactory) {
// do nothing
}
public void setColumn(final InputColumn<?> column) {
_column = column;
}
public void setGroupColumn(final InputColumn<String> groupColumn) {
_groupColumn = groupColumn;
}
public void setRecordDrillDownInformation(final boolean recordDrillDownInformation) {
_recordDrillDownInformation = recordDrillDownInformation;
}
public void setRecordUniqueValues(final boolean recordUniqueValues) {
_recordUniqueValues = recordUniqueValues;
}
}