/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.components.group;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import javax.inject.Named;
import org.apache.metamodel.query.FunctionType;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.schema.ColumnType;
import org.apache.metamodel.util.AggregateBuilder;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Close;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Distributed;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.MappedProperty;
import org.datacleaner.api.MultiStreamComponent;
import org.datacleaner.api.OutputDataStream;
import org.datacleaner.api.OutputRowCollector;
import org.datacleaner.components.categories.CompositionCategory;
import org.datacleaner.job.output.OutputDataStreamBuilder;
import org.datacleaner.job.output.OutputDataStreams;
@Named("Grouper")
@Description("A component that allows grouping and aggregating values with the same key.")
@Categorized(value = CompositionCategory.class)
@Distributed(false)
public class GrouperTransformer extends MultiStreamComponent {
public enum AggregationType implements HasName {
CONCAT_VALUES("Concatenate values"), FIRST_VALUE("Select first value"), LAST_VALUE("Select last value"),
RANDOM_VALUE("Select random value"), CREATE_LIST("Create list of values"), SUM("Calculate sum"),
AVG("Calculate average");
private final String _name;
AggregationType(final String name) {
_name = name;
}
@Override
public String getName() {
return _name;
}
public AggregateBuilder<?> createAggregateBuilder(final SortationType sortationType, final boolean skipNulls,
final String concatenationSeparator) {
switch (this) {
case CONCAT_VALUES:
return new ConcatAggregateBuilder(sortationType, skipNulls, concatenationSeparator);
case CREATE_LIST:
return new CreateListAggregateBuilder(sortationType, skipNulls);
case FIRST_VALUE:
return FunctionType.FIRST.createAggregateBuilder();
case LAST_VALUE:
return FunctionType.LAST.createAggregateBuilder();
case SUM:
return FunctionType.SUM.createAggregateBuilder();
case AVG:
return FunctionType.AVG.createAggregateBuilder();
case RANDOM_VALUE:
return FunctionType.RANDOM.createAggregateBuilder();
default:
throw new UnsupportedOperationException();
}
}
public void addColumnToOutputStream(final OutputDataStreamBuilder outputDataStreamBuilder,
final InputColumn<?> inputColumn) {
switch (this) {
case FIRST_VALUE:
case LAST_VALUE:
case RANDOM_VALUE:
outputDataStreamBuilder.withColumnLike(inputColumn);
break;
case SUM:
case AVG:
outputDataStreamBuilder.withColumn(inputColumn.getName(), ColumnType.NUMBER);
break;
case CONCAT_VALUES:
outputDataStreamBuilder.withColumn(inputColumn.getName(), ColumnType.STRING);
break;
case CREATE_LIST:
outputDataStreamBuilder.withColumn(inputColumn.getName(), ColumnType.LIST);
break;
default:
throw new UnsupportedOperationException("Unsupported aggregation type: " + this);
}
}
}
public static final String PROPERTY_GROUP_KEY = "Group key";
public static final String PROPERTY_AGGREGATED_VALUES = "Aggregated values";
public static final String PROPERTY_AGGREGATION_TYPES = "AggregationTypes";
public static final String PROPERTY_VALUE_SORTATION = "Value sortation";
private static final Object NULL_KEY = new Object();
private final ConcurrentMap<Object, List<AggregateBuilder<?>>> _aggregateBuilders = new ConcurrentHashMap<>();
@Configured(order = 1, value = PROPERTY_GROUP_KEY)
InputColumn<?> groupKey;
@Configured(order = 2, value = PROPERTY_AGGREGATED_VALUES)
InputColumn<?>[] aggregatedValues;
@Configured(order = 3, value = PROPERTY_AGGREGATION_TYPES)
@MappedProperty(PROPERTY_AGGREGATED_VALUES)
AggregationType[] aggregationTypes;
@Configured(order = 4, value = PROPERTY_VALUE_SORTATION)
SortationType valueSortation = SortationType.NONE;
@Configured
String concatenationSeparator = ", ";
@Configured
boolean skipNullGroupKeys = true;
@Configured
boolean skipNullValues = true;
private OutputRowCollector _rowCollector;
@Initialize
public void init() {
_aggregateBuilders.clear();
}
@Override
public OutputDataStream[] getOutputDataStreams() {
final OutputDataStreamBuilder outputDataStreamBuilder = OutputDataStreams.pushDataStream("output");
outputDataStreamBuilder.withColumnLike(groupKey);
outputDataStreamBuilder.withColumn("row_count", ColumnType.INTEGER);
for (int i = 0; i < aggregatedValues.length; i++) {
final InputColumn<?> inputColumn = aggregatedValues[i];
final AggregationType aggregationType =
(aggregationTypes.length <= i ? AggregationType.CREATE_LIST : aggregationTypes[i]);
if (aggregationType != null) {
aggregationType.addColumnToOutputStream(outputDataStreamBuilder, inputColumn);
}
}
final OutputDataStream stream = outputDataStreamBuilder.toOutputDataStream();
return new OutputDataStream[] { stream };
}
@Override
public void initializeOutputDataStream(final OutputDataStream stream, final Query q,
final OutputRowCollector collector) {
_rowCollector = collector;
}
@Override
protected void run(final InputRow row) {
if (_rowCollector == null) {
// nothing to do
return;
}
Object key = row.getValue(groupKey);
if (key == null) {
if (skipNullGroupKeys) {
// skip it
return;
} else {
key = NULL_KEY;
}
}
synchronized (_aggregateBuilders) {
final List<AggregateBuilder<?>> aggregateBuilders = getAggregateBuilders(key);
final long rowId = row.getId();
// send rowId to COUNT function
aggregateBuilders.get(0).add(rowId);
for (int i = 0; i < aggregatedValues.length; i++) {
final Object value = row.getValue(aggregatedValues[i]);
final AggregateBuilder<?> aggregateBuilder = aggregateBuilders.get(i + 1);
if (aggregateBuilder instanceof AbstractRowNumberAwareAggregateBuilder) {
((AbstractRowNumberAwareAggregateBuilder<?>) aggregateBuilder).add(value, rowId);
} else {
aggregateBuilder.add(value);
}
}
}
}
private List<AggregateBuilder<?>> getAggregateBuilders(final Object key) {
List<AggregateBuilder<?>> collectionOfAggregateBuilders = _aggregateBuilders.get(key);
if (collectionOfAggregateBuilders == null) {
final List<AggregateBuilder<?>> newCollectionOfValues = new ArrayList<>(aggregationTypes.length);
// add COUNT aggregation as first
newCollectionOfValues.add(FunctionType.COUNT.createAggregateBuilder());
for (final AggregationType aggregationType : aggregationTypes) {
final AggregateBuilder<?> aggregateBuilder =
aggregationType.createAggregateBuilder(valueSortation, skipNullValues, concatenationSeparator);
newCollectionOfValues.add(aggregateBuilder);
}
final List<AggregateBuilder<?>> previousCollectionOfValues =
_aggregateBuilders.putIfAbsent(key, newCollectionOfValues);
if (previousCollectionOfValues == null) {
collectionOfAggregateBuilders = newCollectionOfValues;
} else {
collectionOfAggregateBuilders = previousCollectionOfValues;
}
}
return collectionOfAggregateBuilders;
}
@Close
public void close() {
final Set<Entry<Object, List<AggregateBuilder<?>>>> entrySet = _aggregateBuilders.entrySet();
for (final Entry<Object, List<AggregateBuilder<?>>> entry : entrySet) {
final Object key = entry.getKey();
final List<AggregateBuilder<?>> aggregateBuilders = entry.getValue();
final Object[] values = new Object[2 + aggregatedValues.length];
values[0] = key == NULL_KEY ? null : key;
values[1] = aggregateBuilders.get(0).getAggregate();
for (int i = 1; i < aggregateBuilders.size(); i++) {
final AggregateBuilder<?> aggregateBuilder = aggregateBuilders.get(i);
values[i + 1] = aggregateBuilder.getAggregate();
}
_rowCollector.putValues(values);
}
}
}