/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.server.queryremote.flatten;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ExecutorService;
import java.util.stream.Collectors;
import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import javax.inject.Inject;
import org.apache.thrift.TException;
import org.diqube.cluster.ClusterManager;
import org.diqube.config.Config;
import org.diqube.config.ConfigKey;
import org.diqube.connection.ConnectionOrLocalHelper;
import org.diqube.connection.ServiceProvider;
import org.diqube.context.AutoInstatiate;
import org.diqube.data.column.AdjustableStandardColumnShard;
import org.diqube.data.column.StandardColumnShard;
import org.diqube.data.flatten.FlattenedTable;
import org.diqube.data.table.Table;
import org.diqube.data.table.TableShard;
import org.diqube.executionenv.FlattenedTableInstanceManager;
import org.diqube.executionenv.TableRegistry;
import org.diqube.flatten.FlattenManager;
import org.diqube.flatten.Flattener;
import org.diqube.flatten.QueryMasterFlattenService;
import org.diqube.remote.cluster.thrift.ClusterFlattenService;
import org.diqube.remote.cluster.thrift.RFlattenException;
import org.diqube.remote.cluster.thrift.ROptionalUuid;
import org.diqube.remote.cluster.thrift.RRetryLaterException;
import org.diqube.server.metadata.ServerTableMetadataPublisher;
import org.diqube.server.metadata.ServerTableMetadataPublisher.MergeImpossibleException;
import org.diqube.threads.ExecutorManager;
import org.diqube.thrift.base.thrift.RNodeAddress;
import org.diqube.thrift.base.thrift.RUUID;
import org.diqube.thrift.base.util.RUuidUtil;
import org.diqube.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
/**
* Handler for {@link ClusterFlattenService}, which handles flattening local tables.
*
* <p>
* See JavaDoc on {@link FlattenRunnable}.
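*
* <p>
* Rough call sequence: the query master calls {@link #flattenAllLocalShards(RUUID, String, String, List,
* RNodeAddress)} on all query remotes serving the table; the remotes exchange their row-count deltas via
* {@link #shardsFlattened(RUUID, Map, RNodeAddress)} and finally report back to the master via
* {@link #flattenDone(RUUID, RUUID, RNodeAddress)} or {@link #flattenFailed(RUUID, RFlattenException)}.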
*
* @author Bastian Gloeckle
*/
@AutoInstatiate
public class ClusterFlattenServiceHandler implements ClusterFlattenService.Iface {
private static final Logger logger = LoggerFactory.getLogger(ClusterFlattenServiceHandler.class);
@Inject
private TableRegistry tableRegistry;
@Inject
private QueryMasterFlattenService queryMasterFlattenService;
@Inject
private ExecutorManager executorManager;
@Inject
private ConnectionOrLocalHelper connectionOrLocalHelper;
@Inject
private ClusterManager clusterManager;
@Inject
private FlattenedTableInstanceManager flattenedTableInstanceManager;
@Inject
private FlattenManager flattenManager;
@Inject
private ServerTableMetadataPublisher metadataPublisher;
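/**
* Timeout for a {@link FlattenRunnable}: if the results of the other flatteners have not arrived this many seconds
* after the flattening started, the request fails.
*/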
@Config(ConfigKey.FLATTEN_TIMEOUT_SECONDS)
private int flattenTimeoutSeconds;
private ExecutorService flatteningExecutor;
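/** Details of each flatten request that is currently being executed on this node, keyed by flatten request UUID. */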
private Map<UUID, FlattenRequestDetails> requestDetails = new ConcurrentHashMap<>();
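/**
* Flatten request UUID by ID of the thread currently executing that request. Used by the
* {@link UncaughtExceptionHandler} to find the request a dying thread was working on.
*/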
private Map<Long, UUID> requestIdByThreadId = new ConcurrentHashMap<>();
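/**
* UUID of the request that is currently flattening a given (tableName, flattenBy) pair. Used to merge simultaneous
* equal requests, see {@link #flattenAllLocalShards(RUUID, String, String, List, RNodeAddress)}.
*/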
private Map<Pair<String, String>, UUID> currentFlattenRequest = new ConcurrentHashMap<>();
@PostConstruct
public void initialize() {
flatteningExecutor = executorManager.newCachedThreadPoolWithMax("flatten-%d", new UncaughtExceptionHandler() {
@Override
public void uncaughtException(Thread t, Throwable e) {
UUID requestUuid = requestIdByThreadId.remove(t.getId());
logger.warn("Uncaught exception while processing flatten request {}", requestUuid, e);
if (requestUuid != null) {
FlattenRequestDetails details = requestDetails.remove(requestUuid);
if (details != null) {
synchronized (details.sync) {
currentFlattenRequest.remove(details.requestPair);
}
// Try to inform all result nodes that our flattening failed.
for (Pair<RNodeAddress, UUID> resultPair : details.resultAddresses) {
try (ServiceProvider<ClusterFlattenService.Iface> serviceProv =
connectionOrLocalHelper.getService(ClusterFlattenService.Iface.class, resultPair.getLeft(), null)) {
serviceProv.getService().flattenFailed(RUuidUtil.toRUuid(resultPair.getRight()),
new RFlattenException(e.getMessage()));
} catch (Exception e2) {
logger.error(
"Could not send 'flattening failed' for flattening request {} to result node {}. Ignoring.",
resultPair.getRight(), resultPair.getLeft(), e2);
}
}
}
}
}
}, 3);
}
@PreDestroy
public void shutdown() {
flatteningExecutor.shutdownNow();
}
@Override
public ROptionalUuid getLatestValidFlattening(String tableName, String flattenBy)
throws RFlattenException, TException {
Table table = tableRegistry.getTable(tableName);
if (table == null)
throw new RFlattenException("Table '" + tableName + "' unknown.");
// Flag it so it is not removed from the FlattenedTableInstanceManager for some time (which should be long enough
// for our caller to receive answers from all remotes and issue its query).
Pair<UUID, FlattenedTable> newest =
flattenedTableInstanceManager.getNewestFlattenedTableVersionAndFlagIt(tableName, flattenBy);
if (newest == null)
return new ROptionalUuid();
// Check if the FlattenedTable is "valid", i.e. it flattened exactly those shards that are currently available in
// the table. As the table can be extended/shrunk over time (add/remove .control files), it could happen that
// something flattened is no longer valid.
FlattenedTable flattenedTable = newest.getRight();
Set<Long> firstRowIds = table.getShards().stream().map(shard -> shard.getLowestRowId()).collect(Collectors.toSet());
// We consider the flattening valid if the flattenedTable was based on exactly those "firstRowId" values of
// TableShards that the current table contains: a shard (identified by its firstRowId) never changes its content.
// Shards can only be added or removed, so if the sets of firstRowIds are equal, the tables should be equal.
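// Example (illustrative numbers): the current table has shards starting at rowIds {0, 100, 200}, but the flattened
// table was built from shards starting at {0, 100} -> the sets differ, the flattening is stale and not re-used.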
if (!flattenedTable.getOriginalFirstRowIdsOfShards().equals(firstRowIds))
return new ROptionalUuid();
ROptionalUuid res = new ROptionalUuid();
res.setUuid(RUuidUtil.toRUuid(newest.getLeft()));
return res;
}
/**
* Flattens a table locally asynchronously and informs the {@link ClusterFlattenService} at the result address about
* the process.
*
* <p>
* Note that the query master calls this method. If the execution fails, the query master should retry the process, as
* explained on {@link FlattenRunnable}.
*
* <p>
* Note that requests might be merged and therefore different cluster nodes might actually return different
* flattenedIds in the end - the master should take care of that by retrying the flatten process.
*
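* <p>
* Purely illustrative sketch of a call (variable names are hypothetical; the exact flatten-by syntax is defined by
* {@link Flattener}):
*
* <pre>
* {@code
* RUUID requestId = RUuidUtil.toRUuid(UUID.randomUUID());
* // otherFlatteners: all other cluster nodes that serve parts of "myTable".
* handler.flattenAllLocalShards(requestId, "myTable", "lineItems[*]", otherFlatteners, ourAddress);
* }
* </pre>
*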
* @param flattenRequestId
* A unique ID.
* @param tableName
* Name of the table to be flattened.
* @param flattenBy
* The field by which should be flattened. See {@link Flattener}.
* @param otherFlatteners
* The other nodes that contain parts of the source table and with which this node should communicate in order
* to clean up any rowId overlaps of the flattened table.
* @param resultAddress
* Address where there's a {@link ClusterFlattenService} that will receive the results. Can be
* <code>null</code> to not send any results (only valid locally, since Thrift does not support null values).
* @throws RFlattenException
* @throws TException
*/
@Override
public void flattenAllLocalShards(RUUID flattenRequestId, String tableName, String flattenBy,
List<RNodeAddress> otherFlatteners, RNodeAddress resultAddress) throws RFlattenException, TException {
Table table = tableRegistry.getTable(tableName);
if (table == null)
throw new RFlattenException("Table '" + tableName + "' not available.");
UUID requestUuid = RUuidUtil.toUuid(flattenRequestId);
FlattenRequestDetails details = new FlattenRequestDetails();
details.sync = new Object();
details.otherFlattenResults = new ConcurrentLinkedDeque<>();
details.resultAddresses = new ConcurrentLinkedDeque<>();
if (resultAddress != null)
details.resultAddresses.push(new Pair<>(resultAddress, requestUuid));
details.requestPair = new Pair<>(tableName, flattenBy);
requestDetails.put(requestUuid, details);
// Try to identify another request that is currently running and processing the same table/flatten-by combination.
Pair<String, String> flattenRequestPair = new Pair<>(tableName, flattenBy);
UUID otherRequestUuid = currentFlattenRequest.putIfAbsent(flattenRequestPair, requestUuid);
if (otherRequestUuid != null && !requestUuid.equals(otherRequestUuid)) {
FlattenRequestDetails otherDetails = requestDetails.get(otherRequestUuid);
if (otherDetails != null) {
synchronized (otherDetails.sync) {
if (otherRequestUuid.equals(currentFlattenRequest.get(flattenRequestPair))) {
logger.info(
"Requested to flatten '{}' by '{}' in request id {}, but there is another flatten being "
+ "executed for the same combination currently in request ID {}. Telling the latter request to"
+ " inform the first request as soon as it is done.",
tableName, flattenBy, requestUuid, otherRequestUuid);
if (resultAddress != null)
otherDetails.resultAddresses.push(new Pair<>(resultAddress, requestUuid));
requestDetails.remove(requestUuid); // clean up our stuff.
return;
}
}
}
}
logger.info("Starting to flatten '{}' by '{}', request ID {}, result addr {}, other flatteners: {}", tableName,
flattenBy, requestUuid, resultAddress, otherFlatteners);
// execute asynchronously.
flatteningExecutor.execute(new FlattenRunnable(requestUuid, details, table, tableName, flattenBy, otherFlatteners));
}
@Override
public void shardsFlattened(RUUID flattenRequestId, Map<Long, Long> origShardFirstRowIdToFlattenedNumberOfRowsDelta,
RNodeAddress flattener) throws TException, RRetryLaterException {
UUID requestId = RUuidUtil.toUuid(flattenRequestId);
logger.trace("Received a shardsFlattened result for flatten request {} from {}.", requestId, flattener);
FlattenRequestDetails details = requestDetails.get(requestId);
if (details == null)
throw new RRetryLaterException("Local data not ready.");
synchronized (details.sync) {
details.otherFlattenResults.add(origShardFirstRowIdToFlattenedNumberOfRowsDelta);
details.sync.notifyAll();
}
}
@Override
public void flattenDone(RUUID flattenRequestId, RUUID flattenedTableId, RNodeAddress flattener) throws TException {
// executed on query master node.
queryMasterFlattenService.singleRemoteCompletedFlattening(RUuidUtil.toUuid(flattenRequestId),
RUuidUtil.toUuid(flattenedTableId), flattener);
}
@Override
public void flattenFailed(RUUID flattenRequestId, RFlattenException flattenException) throws TException {
// executed on query master node.
queryMasterFlattenService.singleRemoteFailedFlattening(RUuidUtil.toUuid(flattenRequestId),
flattenException.getMessage());
}
/**
* Helper class which holds information of one flatten request that is currently being executed.
*/
private static class FlattenRequestDetails {
/** sync object used to sync access and inform the {@link FlattenRunnable} about new data received from remotes */
Object sync;
/** Addresses and their respective requestId which requested to execute this flatten process */
Deque<Pair<RNodeAddress, UUID>> resultAddresses;
/**
* Results from other flatteners currently processing the same request. Maps from
* "firstRowId of the unflattened tableShard which was flattened" to
* "delta of rows the flattened shard has compared to the original".
*/
Deque<Map<Long, Long>> otherFlattenResults;
/** Pair of "tableName" and "flatten-by" */
Pair<String, String> requestPair;
}
/**
* Runnable that executes the flattening on this node, informs all other cluster nodes of our results and incorporates
* the changes of those other cluster nodes into our result.
*
* <p>
* This process is a little bit more complex than it might seem at first. The Query Master triggers the flattening of
* the requested table on all query remotes simultaneously. This runnable is executed on each of those. To achieve a
* valid flattened version of the table, these query remotes have to talk to each other: When flattening, a table
* typically has more rows than before, which means each TableShard of that table has more rows. Each row, though,
* must have a unique rowId, so the query remotes need to exchange information to avoid overlapping rowIds.
*
* <p>
* In addition to that, the {@link ClusterFlattenServiceHandler} merges multiple simultaneous requests to flatten the
* same table by the same flatten-by-field: When one request is being processed and a second, equal, request is
* received, that second request should be answered together with the first one - we do not want to calculate the very
* same flattening twice. <br/>
* There is another small but important detail here: If two query masters decide to flatten the same thing at the
* same time, part of the query remotes would receive the request of the first master first and the other part that
* of the second master first. The query remotes would therefore not work on the same "flatten request ID", and in
* that case we simply fail the flattening process. This surfaces when each query remote tries to inform the other
* flatteners about its results: it retries for 10s, and if after that not all flatteners have accepted the
* information (which is bound to the request ID), the flattening fails. Each query master is required to retry,
* though, so its next request should succeed.
*
* <p>
* Additionally: when one flattening has completed and a different query later requests the same flattening, we
* re-use most of the data of the old flattening (if that data is still valid, i.e. the flattened table was built on
* the same TableShards the source table currently has). Nevertheless, we might now need to adjust the rowIds
* differently than before (e.g. because another query remote has more/fewer TableShards loaded and therefore
* flattens differently). For the latter case we use FlattenedTableUtil.
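*
* <p>
* A small, purely illustrative example of the rowId adjustment (numbers made up): node A holds the original
* TableShard with firstRowId 0, node B the one with firstRowId 10. A's flattened shard has 15 rows more than its
* original (delta +15), so A sends the map {0=15} to B. B's original firstRowId 10 is greater than 0, so B shifts
* its flattened shard by 15 rowIds, removing the overlap with A's now-larger flattened shard.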
*/
private class FlattenRunnable implements Runnable {
private FlattenRequestDetails details;
private UUID requestUuid;
private Table table;
private String tableName;
private String flattenBy;
private List<RNodeAddress> otherFlatteners;
/* package */ FlattenRunnable(UUID requestUuid, FlattenRequestDetails details, Table table, String tableName,
String flattenBy, List<RNodeAddress> otherFlatteners) {
this.requestUuid = requestUuid;
this.details = details;
this.table = table;
this.tableName = tableName;
this.flattenBy = flattenBy;
this.otherFlatteners = otherFlatteners;
}
@Override
public void run() {
long timeoutTime = System.nanoTime() + flattenTimeoutSeconds * 1_000_000_000L;
requestIdByThreadId.put(Thread.currentThread().getId(), requestUuid);
UUID flattenedTableId = requestUuid; // result UUID for the flattened table if we succeed.
// fetch table shards in one go - otherwise they might change inside the table while we're processing them!
List<TableShard> inputShardsSorted = table.getShards().stream()
.sorted((s1, s2) -> Long.compare(s1.getLowestRowId(), s2.getLowestRowId())).collect(Collectors.toList());
FlattenedTable flattenedTable =
flattenManager.createFlattenedTable(table, inputShardsSorted, flattenBy, flattenedTableId);
List<TableShard> flattenedShardsSorted = flattenedTable.getShards().stream()
.sorted((s1, s2) -> Long.compare(s1.getLowestRowId(), s2.getLowestRowId())).collect(Collectors.toList());
logger.trace(
"Flattening '{}' by '{}', request ID {}, completed local computation, sending results to other flatteners.",
tableName, flattenBy, requestUuid);
// calculate the deltas in row-count that we need to distribute to other flatteners.
Map<Long, Long> origShardFirstRowIdToFlattenedNumberOfRowsDelta = new HashMap<>();
NavigableMap<Long, TableShard> origShardFirstRowIdToFlattenedShard = new TreeMap<>();
for (int i = 0; i < inputShardsSorted.size(); i++) {
origShardFirstRowIdToFlattenedNumberOfRowsDelta.put(inputShardsSorted.get(i).getLowestRowId(),
flattenedShardsSorted.get(i).getNumberOfRowsInShard() - inputShardsSorted.get(i).getNumberOfRowsInShard());
origShardFirstRowIdToFlattenedShard.put(inputShardsSorted.get(i).getLowestRowId(),
flattenedShardsSorted.get(i));
}
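// Example (illustrative numbers): an input shard starting at rowId 100 with 10 rows whose flattened counterpart
// has 25 rows yields the entry 100 -> 15; all flattened shards whose original firstRowId is > 100 (local and
// remote ones) must later be shifted by this delta.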
for (RNodeAddress otherFlattener : otherFlatteners) {
boolean retry = true;
// Retry for 10s. This is crucial to the overall algorithm: the cluster computing the flattening might be
// divided and processing different requestIds, so this needs to fail comparably quickly. It will fail if after
// the retries our request ID is still not accepted by the otherFlatteners. See the class comment of this
// runnable.
int retryCountLeft = 10;
while (retry) {
retry = false;
try (ServiceProvider<ClusterFlattenService.Iface> serviceProv =
connectionOrLocalHelper.getService(ClusterFlattenService.Iface.class, otherFlattener, null)) {
serviceProv.getService().shardsFlattened(RUuidUtil.toRUuid(requestUuid),
origShardFirstRowIdToFlattenedNumberOfRowsDelta, clusterManager.getOurNodeAddress().createRemote());
} catch (RRetryLaterException e) {
if (retryCountLeft == 0)
// let the uncaughtExceptionHandler handle this...
throw new RuntimeException(
"Exception while communicating with other flatteners (retry exhausted): " + e.getMessage(), e);
logger.trace("Received a retry exception from {}: {}. Will retry.", otherFlattener, e.getMessage());
// we retry sending our results since the other flatteners might not have initialized for this request yet.
try {
Thread.sleep(1000); // 1s
} catch (InterruptedException e1) {
// let uncaughtExceptionHandler clean up.
throw new RuntimeException("Interrupted while waiting for other flatteners to get ready.", e1);
}
retryCountLeft--;
retry = true;
} catch (Exception e) {
// let the uncaughtExceptionHandler handle this...
throw new RuntimeException("Exception while communicating with other flatteners: " + e.getMessage(), e);
}
}
}
// We flattened our shards. Before we can use them, though, we need to adjust their rowIds: the flattened
// table still has the firstRowIds of the original table set, but the new table will (typically) have more
// rows -> we have overlapping rowIds between tableShards. This includes both local and remote table shards,
// so we need to adjust the rowIds accordingly!
// Be sure to work on our local results, too.
details.otherFlattenResults.add(origShardFirstRowIdToFlattenedNumberOfRowsDelta);
int numberOfOtherFlattenersResponded = -1; // -1 because we put our own result in the deque in the line above.
logger.trace("Flattening '{}' by '{}', request ID {}, waiting for results from other flatteners", tableName,
flattenBy, requestUuid);
while (numberOfOtherFlattenersResponded < otherFlatteners.size()) {
synchronized (details.sync) {
if (details.otherFlattenResults.isEmpty())
try {
details.sync.wait(1000); // 1s
} catch (InterruptedException e) {
// let the uncaughtExceptionHandler handle this...
throw new RuntimeException("Interrupted while waiting for results of other nodes: " + e.getMessage(), e);
}
}
if (System.nanoTime() > timeoutTime)
throw new RuntimeException("Timed out waiting for other flatteners to calculate their result.");
Map<Long, Long> otherFlattenerResult;
while ((otherFlattenerResult = details.otherFlattenResults.poll()) != null) {
logger.trace("Working on flattener result (limit): {}",
Iterables.limit(otherFlattenerResult.entrySet(), 100));
numberOfOtherFlattenersResponded++;
for (Entry<Long, Long> otherEntry : otherFlattenerResult.entrySet()) {
long otherOrigFirstRowId = otherEntry.getKey();
long otherFlattenedNumberOfRowsDelta = otherEntry.getValue();
Map<Long, TableShard> affectedShards =
origShardFirstRowIdToFlattenedShard.tailMap(otherOrigFirstRowId, false);
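// Note: tailMap(.., false) is exclusive, i.e. only shards whose original firstRowId is strictly greater
// than the other node's shard's firstRowId are shifted.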
for (Entry<Long, TableShard> tableShardEntry : affectedShards.entrySet()) {
logger.trace("Adjusting tableShard which was originally at rowId {}", tableShardEntry.getKey());
for (StandardColumnShard colShard : tableShardEntry.getValue().getColumns().values()) {
((AdjustableStandardColumnShard) colShard)
.adjustToFirstRowId(colShard.getFirstRowId() + otherFlattenedNumberOfRowsDelta);
}
}
}
}
}
// Okay, all results from other flatteners received and incorporated, we're done!
flattenedTableInstanceManager.registerFlattenedTableVersion(flattenedTableId, flattenedTable, tableName,
flattenBy);
// not in "finally", since we do not want to clear this here if we have an exception -> the
// uncaughtExceptionHandler will handle that case!
synchronized (details.sync) {
requestIdByThreadId.remove(Thread.currentThread().getId());
requestDetails.remove(requestUuid);
currentFlattenRequest.remove(new Pair<>(tableName, flattenBy));
}
logger.info("Finished flattening '{}' by '{}', request ID {}.", tableName, flattenBy, requestUuid);
for (Pair<RNodeAddress, UUID> resultPair : details.resultAddresses) {
try (ServiceProvider<ClusterFlattenService.Iface> serviceProv =
connectionOrLocalHelper.getService(ClusterFlattenService.Iface.class, resultPair.getLeft(), null)) {
logger.trace("Sending result of flatten '{}' by '{}' to {} (its request ID was {})", tableName, flattenBy,
resultPair.getLeft(), resultPair.getRight());
serviceProv.getService().flattenDone(RUuidUtil.toRUuid(resultPair.getRight()),
RUuidUtil.toRUuid(flattenedTableId), clusterManager.getOurNodeAddress().createRemote());
} catch (Exception e) {
logger.warn("Could not send flattening result {}/{} to requesting machine at {}. Ignoring.", requestUuid,
resultPair.getRight(), resultPair.getLeft(), e);
}
}
// Lastly, trigger the computation of the metadata for the flattened table. We do this last since we do not
// expect it to fail: the original table was loaded successfully (and has valid metadata), so the merging etc.
// for the flattened table should work well, too.
try {
metadataPublisher.publishMetadataOfTableShards(flattenedTable.getName(), flattenedTable.getShards());
} catch (MergeImpossibleException e) {
logger.error("Metadata of flattened table '{}' could not be computed.", e);
// as we do not expect this to happen, just log and ignore.
}
}
}
}