/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.execution.steps;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Function;
import java.util.stream.Stream;
import org.diqube.data.column.ColumnShard;
import org.diqube.execution.consumers.AbstractThreadedColumnDictIdConsumer;
import org.diqube.execution.consumers.ColumnDictIdConsumer;
import org.diqube.execution.consumers.ColumnValueConsumer;
import org.diqube.execution.consumers.ColumnVersionBuiltConsumer;
import org.diqube.execution.consumers.DoneConsumer;
import org.diqube.execution.consumers.GenericConsumer;
import org.diqube.execution.exception.ExecutablePlanBuildException;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.VersionedExecutionEnvironment;
import org.diqube.queries.QueryRegistry;
import org.diqube.queries.QueryUuid;
import org.diqube.queries.QueryUuid.QueryUuidThreadState;
import org.diqube.util.Pair;
import org.diqube.util.Triple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
/**
* A step that takes the output of a {@link ResolveColumnDictIdsStep} and transforms the column value IDs into final
* values by looking them up in the column dictionaries.
*
 * <p>
 * This step takes the order in which the inputs provide new values into account. This is needed because
 * {@link ResolveColumnDictIdsStep} might be based on a {@link ColumnVersionBuiltConsumer}, in which case the value of
 * a specific column/row combination might change while the pipeline executes. The later the dict IDs are resolved,
 * the more up-to-date the resolved column value is; later calls therefore need to overwrite the results of earlier
 * ones.
*
* <p>
* Input: one or multiple {@link ColumnDictIdConsumer}<br>
* Output: {@link ColumnValueConsumer}s
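 *
 * <p>
 * The overwrite rule applied when a value ID for a column/row combination is reported again is (sketch):
 *
 * <pre>
 * if the current env is no VersionedExecutionEnvironment: keep the current value (it is final)
 * else if the new env is no VersionedExecutionEnvironment: take the new value (it is final)
 * else: keep the value of the env with the higher version
 * </pre>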
*
* @author Bastian Gloeckle
*/
public class ResolveValuesStep extends AbstractThreadedExecutablePlanStep {
private static final Logger logger = LoggerFactory.getLogger(ResolveValuesStep.class);
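  /** Set to true as soon as all input sources reported that they will not provide any more data. */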
private AtomicBoolean sourcesAreEmpty = new AtomicBoolean(false);
private AbstractThreadedColumnDictIdConsumer columnDictIdConsumer = new AbstractThreadedColumnDictIdConsumer(this) {
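    // shared sentinel for the putIfAbsent below; it is only ever copied, never mutated, so sharing one instance is
    // safe.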
private final ConcurrentMap<Long, Pair<ExecutionEnvironment, Long>> EMPTY_VALUE = new ConcurrentHashMap<>();
@Override
protected void allSourcesAreDone() {
ResolveValuesStep.this.sourcesAreEmpty.set(true);
}
@Override
protected void doConsume(ExecutionEnvironment env, String colName, Map<Long, Long> rowIdToColumnDictId) {
      // acquire the read lock: multiple threads may execute the following code concurrently, but none while the
      // write-locked map swap in the execute() method is in progress.
rowIdReadWriteLock.readLock().lock();
try {
        // make sure there is exactly one canonical String object for this column name in the map
        inputColsAndRows.putIfAbsent(colName, EMPTY_VALUE);
        // fetch that canonical key String (the same object instance for all threads!)
        colName = inputColsAndRows.floorKey(colName);
        // .. now we can use that String object to sync upon - the following code will only be executed by one thread
        // at a time for a single colName.
synchronized (colName) {
logger.debug("Integrating column value IDs for col {} from {} for rowIds (limit) {}", colName, env,
Iterables.limit(rowIdToColumnDictId.keySet(), 100));
// prepare new value map.
ConcurrentMap<Long, Pair<ExecutionEnvironment, Long>> newRowIdToColValueId =
new ConcurrentHashMap<>(inputColsAndRows.get(colName));
          // for each input rowId/columnDictId pair, check whether a newer value is available already; if not, store
          // the new value (see the overwrite rule in the class comment).
          rowIdToColumnDictId.forEach((rowId, columnDictId) -> newRowIdToColValueId.merge( //
              rowId, // rowId of the entry to inspect
              new Pair<>(env, columnDictId), // use this as the new value
              (currentValue, newValue) -> {
                ExecutionEnvironment currentEnv = currentValue.getLeft();
                ExecutionEnvironment newEnv = newValue.getLeft();
                if (!(currentEnv instanceof VersionedExecutionEnvironment))
                  return currentValue;
                if (!(newEnv instanceof VersionedExecutionEnvironment))
                  return newValue;
                if (((VersionedExecutionEnvironment) currentEnv)
                    .getVersion() < ((VersionedExecutionEnvironment) newEnv).getVersion())
                  return newValue;
                return currentValue;
              }));
          // be sure to use exactly the same String object as key here again, as other threads might already be
          // synchronizing on it.
inputColsAndRows.put(colName, newRowIdToColValueId);
}
} finally {
rowIdReadWriteLock.readLock().unlock();
}
}
};
  /**
   * Map from colName to a map from rowId to a pair of the {@link ExecutionEnvironment} to resolve the value from and
   * the column value ID to look up. For each column/row combination this always holds the newest env/value ID pair
   * received so far.
   */
private ConcurrentNavigableMap<String, ConcurrentMap<Long, Pair<ExecutionEnvironment, Long>>> inputColsAndRows =
new ConcurrentSkipListMap<>();
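  /**
   * Guards {@link #inputColsAndRows}: the consumer's doConsume holds the read lock while integrating new values
   * (multiple threads may do that concurrently), whereas {@link #execute()} takes the write lock to atomically swap
   * in a fresh map.
   */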
private ReadWriteLock rowIdReadWriteLock = new ReentrantReadWriteLock();
public ResolveValuesStep(int stepId, QueryRegistry queryRegistry) {
super(stepId, queryRegistry);
}
@Override
public void execute() {
rowIdReadWriteLock.writeLock().lock();
ConcurrentNavigableMap<String, ConcurrentMap<Long, Pair<ExecutionEnvironment, Long>>> activeColsAndRows;
try {
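      // atomically detach everything collected so far; as we hold the write lock, no doConsume can interleave, so
      // activeColsAndRows is a consistent snapshot and the fresh map starts out empty.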
activeColsAndRows = inputColsAndRows;
inputColsAndRows = new ConcurrentSkipListMap<>();
if (sourcesAreEmpty.get() && activeColsAndRows.isEmpty() && inputColsAndRows.isEmpty()) {
// there won't be any input at all. Stop processing.
forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
doneProcessing();
return;
}
} finally {
rowIdReadWriteLock.writeLock().unlock();
}
    if (!activeColsAndRows.isEmpty()) {
logger.debug("Starting to resolve values...");
QueryUuidThreadState uuidState = QueryUuid.getCurrentThreadState();
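      // capture the query UUID thread state; the parallel stream below executes on worker threads which need it set
      // explicitly (and cleared again) for each element they process.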
Map<String, Map<Long, Object>> valuesPerColumn = activeColsAndRows.entrySet().stream() //
.parallel().flatMap( //
new Function<Entry<String, ConcurrentMap<Long, Pair<ExecutionEnvironment, Long>>>, Stream<Triple<String, Long, Object>>>() {
@Override
public Stream<Triple<String, Long, Object>> apply(
Entry<String, ConcurrentMap<Long, Pair<ExecutionEnvironment, Long>>> e) {
QueryUuid.setCurrentThreadState(uuidState);
try {
String colName = e.getKey();
List<Triple<String, Long, Object>> res = new ArrayList<>();
                  // group by ExecutionEnv and columnValueId, so we do not have to decompress a specific colValueId
                  // multiple times
Map<ExecutionEnvironment, SortedMap<Long, List<Long>>> envToColumnValueIdToRowId = new HashMap<>();
for (Entry<Long, Pair<ExecutionEnvironment, Long>> rowIdColValueIdEntry : e.getValue().entrySet()) {
Long rowId = rowIdColValueIdEntry.getKey();
Long columnValueId = rowIdColValueIdEntry.getValue().getRight();
ExecutionEnvironment env = rowIdColValueIdEntry.getValue().getLeft();
                  envToColumnValueIdToRowId.computeIfAbsent(env, k -> new TreeMap<>())
                      .computeIfAbsent(columnValueId, k -> new ArrayList<>()).add(rowId);
}
for (ExecutionEnvironment env : envToColumnValueIdToRowId.keySet()) {
SortedMap<Long, List<Long>> columnValueIdToRowId = envToColumnValueIdToRowId.get(env);
Long[] sortedColumnValueIds =
columnValueIdToRowId.keySet().toArray(new Long[columnValueIdToRowId.keySet().size()]);
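                  // batch-decompress all value IDs needed for this env at once; the dictionary returns the values in
                  // the same order as the (sorted) input IDs.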
ColumnShard columnShard = env.getColumnShard(colName);
Object[] values = columnShard.getColumnShardDictionary().decompressValues(sortedColumnValueIds);
for (int i = 0; i < sortedColumnValueIds.length; i++) {
Long columnValueId = sortedColumnValueIds[i];
Object value = values[i];
for (Long rowId : columnValueIdToRowId.get(columnValueId))
res.add(new Triple<>(colName, rowId, value));
}
}
return res.stream();
} finally {
QueryUuid.clearCurrent();
}
}
})
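          // collect the (colName, rowId, value) triples into a map keyed by column name.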
          .collect(() -> new HashMap<String, Map<Long, Object>>(), (map, triple) -> {
            String colName = triple.getLeft();
            Long rowId = triple.getMiddle();
            Object value = triple.getRight();
            map.computeIfAbsent(colName, k -> new HashMap<>()).put(rowId, value);
          }, (map1, map2) -> {
            for (String colName : map2.keySet())
              map1.computeIfAbsent(colName, k -> new HashMap<>()).putAll(map2.get(colName));
          });
QueryUuid.setCurrentThreadState(uuidState);
for (String colName : valuesPerColumn.keySet()) {
logger.trace("Resolved values, sending them out now (limit): {}, {}", colName,
Iterables.limit(valuesPerColumn.get(colName).entrySet(), 10));
forEachOutputConsumerOfType(ColumnValueConsumer.class, c -> c.consume(colName, valuesPerColumn.get(colName)));
}
}
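    // if all sources are done and no new input arrived while we were resolving, signal that we're done.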
if (sourcesAreEmpty.get() && inputColsAndRows.isEmpty()) {
forEachOutputConsumerOfType(GenericConsumer.class, c -> c.sourceIsDone());
doneProcessing();
}
}
@Override
public List<GenericConsumer> inputConsumers() {
return Arrays.asList(new GenericConsumer[] { columnDictIdConsumer });
}
@Override
protected void validateOutputConsumer(GenericConsumer consumer) throws IllegalArgumentException {
if (!(consumer instanceof DoneConsumer) && !(consumer instanceof ColumnValueConsumer))
throw new IllegalArgumentException("Only ColumnValueConsumer supported!");
}
@Override
protected void validateWiredStatus() throws ExecutablePlanBuildException {
    // intentionally empty: we do not track the wire status here, since our single input consumer may be wired
    // multiple times.
}
@Override
protected String getAdditionalToStringDetails() {
return null;
}
}