/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.execution.steps;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.table.TableShard;
import org.diqube.execution.consumers.AbstractThreadedColumnBuiltConsumer;
import org.diqube.execution.consumers.AbstractThreadedRowIdConsumer;
import org.diqube.execution.consumers.ColumnBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.execution.consumers.GroupConsumer;
import org.diqube.execution.consumers.GroupDeltaConsumer;
import org.diqube.execution.consumers.RowIdConsumer;
import org.diqube.execution.exception.ExecutablePlanBuildException;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.querystats.QueryableColumnShard;
import org.diqube.queries.QueryRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
* Executes a GROUP BY clause.
*
* <p>
* As each group that is produced will end up as one row in the overall result table of the query, we choose one row
* ID per group that identifies the group. This identifying row ID is also called the "group ID". This group ID though
* is not identifying the group globally, but only on one {@link TableShard}, as other table shards will choose other
* row IDs that may reference the same group (same group = group with the same values in the group-by-fields).
* <p>
* There are three output consumers that are fed with data by this step: {@link RowIdConsumer}s that will be fed with
* the groupIDs/identifying row IDs (= can be used to resolve any values that need to be resolved for a group). In
* addition to that the {@link GroupConsumer}s and {@link GroupDeltaConsumer}s will be fed with the actual grouping of
* the row IDs.
*
* <p>
* The columns which should be grouped by are expected to be {@link StandardColumnShard}s.
*
* <p>
* Input: 1 {@link RowIdConsumer}, 1 optional {@link ColumnBuiltConsumer} <br>
* Output: {@link RowIdConsumer} and/or {@link GroupConsumer} and/or {@link GroupDeltaConsumer}.
*
* @author Bastian Gloeckle
*/
public class GroupStep extends AbstractThreadedExecutablePlanStep {
  private static final Logger logger = LoggerFactory.getLogger(GroupStep.class);

  /** Becomes <code>true</code> as soon as none of the group-by columns is waiting to be built anymore. */
  private final AtomicBoolean allColumnsBuilt = new AtomicBoolean(false);

  /**
   * Names of those group-by columns that were not available in {@link #defaultEnv} when {@link #initialize()} ran and
   * for which no "column built" notification has been received yet.
   */
  private Set<String> columnsThatNeedToBeBuilt;

  private final AbstractThreadedColumnBuiltConsumer columnBuiltConsumer =
      new AbstractThreadedColumnBuiltConsumer(this) {
        @Override
        protected void allSourcesAreDone() {
          // Nothing to do here, this step only tracks which columns have been built.
        }

        @Override
        protected void doColumnBuilt(String colName) {
          columnsThatNeedToBeBuilt.remove(colName);
          if (columnsThatNeedToBeBuilt.isEmpty())
            allColumnsBuilt.set(true);
        }
      };

  /** Becomes <code>true</code> when the row ID input reports that all its sources are done. */
  private final AtomicBoolean sourceIsEmpty = new AtomicBoolean(false);

  /** Row IDs received from the input that have not been grouped yet. */
  private final ConcurrentLinkedDeque<Long> rowIds = new ConcurrentLinkedDeque<>();

  private final AbstractThreadedRowIdConsumer rowIdConsumer = new AbstractThreadedRowIdConsumer(this) {
    @Override
    public void allSourcesAreDone() {
      GroupStep.this.sourceIsEmpty.set(true);
    }

    @Override
    protected void doConsume(Long[] rowIds) {
      // Iterate the boxed values directly, there is no need to unbox and re-box each ID.
      for (Long rowId : rowIds)
        GroupStep.this.rowIds.add(rowId);
    }
  };

  /**
   * The {@link Grouper} that controls all the groupings. If the grouping should be made by multiple fields, this
   * grouper will automatically take care of that.
   */
  private Grouper headGrouper;

  /** All groups found so far: group ID to the row IDs that have been assigned to that group. */
  private final Map<Long, List<Long>> groups = new HashMap<>();

  private final List<String> colNamesToGroupBy;

  private final ExecutionEnvironment defaultEnv;

  /**
   * @param stepId
   *          ID of this step, passed on to the superclass.
   * @param queryRegistry
   *          passed on to the superclass.
   * @param env
   *          The environment the column shards of the group-by columns are resolved from.
   * @param colNamesToGroupBy
   *          Names of the columns to group by, in the order the grouping should be applied.
   */
  public GroupStep(int stepId, QueryRegistry queryRegistry, ExecutionEnvironment env, List<String> colNamesToGroupBy) {
    super(stepId, queryRegistry);
    this.defaultEnv = env;
    this.colNamesToGroupBy = colNamesToGroupBy;
  }

  /**
   * Finds out which of the group-by columns are not yet available in the environment and therefore need to be waited
   * for.
   */
  @Override
  public void initialize() {
    columnsThatNeedToBeBuilt = new ConcurrentSkipListSet<>(colNamesToGroupBy);
    columnsThatNeedToBeBuilt.removeIf(colName -> defaultEnv.getColumnShard(colName) != null);

    // If every column is available already, there is nothing to wait for. Without this, allColumnsBuilt would only be
    // set by an incoming "column built" notification and execute() would keep returning early until one arrives.
    if (columnsThatNeedToBeBuilt.isEmpty())
      allColumnsBuilt.set(true);
  }

  /**
   * Create a Grouper that will do the grouping for the columns specified, starting from the specified index. That means
   * the resulting Supplier will supply a new {@link Grouper} instance that will group by all columns in
   * columnsToGroupBy with index starting from the provided one.
   */
  private Supplier<Grouper> createGroupers(List<String> columnsToGroupBy, int index) {
    return () -> {
      if (index == columnsToGroupBy.size())
        // Use a leaf grouper after the last non-leaf grouper.
        return new Grouper();

      QueryableColumnShard shard = defaultEnv.getColumnShard(columnsToGroupBy.get(index));
      return new Grouper(shard, createGroupers(columnsToGroupBy, index + 1));
    };
  }

  /**
   * Groups all row IDs that were received since the last call and informs the output consumers about new group IDs,
   * the changes to the groups and the full groups. Returns without doing anything as long as the column built input is
   * wired and not all group-by columns are available. Reports that this step is done as soon as the row ID input is
   * done and no row IDs are left to process.
   */
  @Override
  protected void execute() {
    if (columnBuiltConsumer.getNumberOfTimesWired() > 0 && !allColumnsBuilt.get())
      // we wait until our columns are all built.
      return;

    if (headGrouper == null)
      // create groupers. Do this just now, as we know that now really all columns are available!
      headGrouper = createGroupers(colNamesToGroupBy, 0).get();

    // Drain everything that is currently queued.
    List<Long> activeRowIds = new ArrayList<>();
    Long newRowId;
    while ((newRowId = rowIds.poll()) != null)
      activeRowIds.add(newRowId);

    if (!activeRowIds.isEmpty()) {
      // use headGrouper to group the new RowIDs, collect the new groupings in a new map.
      Map<Long, List<Long>> changesGroups = new HashMap<>();
      headGrouper.groupRowIds(activeRowIds, changesGroups);

      logger.trace("Grouped new rowIds (limit each): {}",
          Maps.transformValues(changesGroups, lst -> Iterables.limit(lst, 50)));

      // This is a live view on both key sets, it therefore has to be evaluated before "groups" is updated below.
      Set<Long> newGroupIds = Sets.difference(changesGroups.keySet(), groups.keySet());

      if (!newGroupIds.isEmpty()) {
        // If we started new groups, we need to resolve the values of the group-by fields (if they are selected, e.g.).
        // As each groupID is in fact a rowID (of one arbitrary row that is inside the group), we find those new row IDs
        // and send them to RowID consumers.
        Long[] newRowIdsArray = newGroupIds.stream().toArray(Long[]::new);
        logger.trace("New group IDs (limit): {}", Iterables.limit(Arrays.asList(newRowIdsArray), 100));
        forEachOutputConsumerOfType(RowIdConsumer.class, c -> c.consume(newRowIdsArray));
      }

      // Merge the changes into the overall groups. The lists of "changesGroups" are copied, not shared.
      for (Entry<Long, List<Long>> change : changesGroups.entrySet())
        groups.computeIfAbsent(change.getKey(), groupId -> new ArrayList<>()).addAll(change.getValue());

      forEachOutputConsumerOfType(GroupDeltaConsumer.class, c -> c.consumeGroupDeltas(changesGroups));
      forEachOutputConsumerOfType(GroupConsumer.class, c -> c.consumeGroups(groups));
    }

    // Check the flag first and the queue second: the flag is set after the last row IDs have been queued.
    if (sourceIsEmpty.get() && rowIds.isEmpty()) {
      forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
      doneProcessing();
    }
  }

  @Override
  protected void validateWiredStatus() throws ExecutablePlanBuildException {
    if (rowIdConsumer.getNumberOfTimesWired() == 0)
      throw new ExecutablePlanBuildException("RowId input not wired.");
    // ColumnBuiltConsumer does not have to be wired.
  }

  @Override
  protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
    if (!(consumer instanceof DoneConsumer) && !(consumer instanceof RowIdConsumer)
        && !(consumer instanceof GroupConsumer) && !(consumer instanceof GroupDeltaConsumer))
      throw new IllegalArgumentException("Only RowIdConsumer, GroupConsumer and GroupDeltaConsumer accepted.");
  }

  @Override
  protected List<GenericConsumer> inputConsumers() {
    return Arrays.asList(new GenericConsumer[] { rowIdConsumer, columnBuiltConsumer });
  }

  /**
   * A {@link Grouper} is capable of grouping row IDs by one column and additionally forward the grouping requests to
   * other groupers which will group by different columns.
   *
   * <p>
   * Each grouper is in one of two states:
   * <ul>
   * <li>Leaf: These are the groupers that do not have any delegate groupers (= they follow the grouper of the column
   * that was specified last in the GROUP BY stmt). Leaf groupers do not actually group anything, but identify the
   * groupId of a (new) group and record the new additions to a group. Each Leaf {@link Grouper} represents one group.
   * <li>Non-Leaf: These split any newly incoming rowIDs by the value of that row in the given column. After these
   * rowIDs have been grouped, each group is forwarded to a delegate grouper to group it further (or, if the delegate is
   * a leaf, to record the group).
   * </ul>
   */
  private static class Grouper {
    /** Column to group by; <code>null</code> for leaf groupers. */
    private final QueryableColumnShard column;

    /** Delegate groupers by column value ID; <code>null</code> for leaf groupers. */
    private final Map<Long, Grouper> delegateGroupers;

    /** Only used by leaf groupers: the first row ID this grouper ever received, identifying the group. */
    private Long groupId = null;

    private final boolean isLeaf;

    /** Creates the delegate for a column value ID seen for the first time; <code>null</code> for leaf groupers. */
    private final Supplier<Grouper> delegateGroupersFactory;

    /**
     * Creates a non-leaf grouper.
     *
     * @param column
     *          The column whose value IDs are used to split up the row IDs.
     * @param delegateGroupersFactory
     *          Supplies a new grouper for each distinct column value ID.
     */
    public Grouper(QueryableColumnShard column, Supplier<Grouper> delegateGroupersFactory) {
      this.column = column;
      this.delegateGroupersFactory = delegateGroupersFactory;
      this.delegateGroupers = new HashMap<>();
      this.isLeaf = false;
    }

    /**
     * Creates a leaf grouper.
     */
    public Grouper() {
      this.column = null;
      this.delegateGroupersFactory = null;
      this.delegateGroupers = null;
      this.isLeaf = true;
    }

    /**
     * Groups the given row IDs and records the result.
     *
     * @param rowIds
     *          The row IDs to group. A leaf grouper takes its group ID from the first element on its first call.
     * @param changes
     *          Output: Receives one entry per affected group, mapping from group ID to the row IDs that were added.
     */
    public void groupRowIds(List<Long> rowIds, Map<Long, List<Long>> changes) {
      if (isLeaf) {
        if (groupId == null)
          groupId = rowIds.iterator().next();
        changes.put(groupId, rowIds);
        return;
      }

      Map<Long, Long> rowIdToColValId = column.resolveColumnValueIdsForRows(rowIds);

      // Invert the mapping: collect the row IDs for each column value ID.
      Map<Long, List<Long>> columnValueToRowIds = new HashMap<>();
      for (Entry<Long, Long> e : rowIdToColValId.entrySet())
        columnValueToRowIds.computeIfAbsent(e.getValue(), colValueId -> new ArrayList<>()).add(e.getKey());

      // Add the row IDs to delegate groupers based on their column value id.
      for (Entry<Long, List<Long>> e : columnValueToRowIds.entrySet()) {
        Grouper delegate = delegateGroupers.computeIfAbsent(e.getKey(), colValueId -> delegateGroupersFactory.get());
        delegate.groupRowIds(e.getValue(), changes);
      }
    }
  }

  @Override
  protected String getAdditionalToStringDetails() {
    return "colsToGroupBy=" + colNamesToGroupBy;
  }
}