/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.raptor.metadata;
import com.facebook.presto.raptor.NodeSupplier;
import com.facebook.presto.raptor.RaptorColumnHandle;
import com.facebook.presto.raptor.storage.organization.ShardOrganizerDao;
import com.facebook.presto.raptor.util.DaoSupplier;
import com.facebook.presto.spi.Node;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.Type;
import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.common.base.Ticker;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.ExecutionError;
import com.google.common.util.concurrent.UncheckedExecutionException;
import io.airlift.log.Logger;
import io.airlift.units.Duration;
import org.h2.jdbc.JdbcConnection;
import org.skife.jdbi.v2.Handle;
import org.skife.jdbi.v2.IDBI;
import org.skife.jdbi.v2.ResultIterator;
import org.skife.jdbi.v2.exceptions.DBIException;
import org.skife.jdbi.v2.tweak.HandleConsumer;
import org.skife.jdbi.v2.util.ByteArrayMapper;
import javax.inject.Inject;
import java.sql.Connection;
import java.sql.JDBCType;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.Set;
import java.util.StringJoiner;
import java.util.UUID;
import static com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_ERROR;
import static com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_EXTERNAL_BATCH_ALREADY_EXISTS;
import static com.facebook.presto.raptor.storage.ColumnIndexStatsUtils.jdbcType;
import static com.facebook.presto.raptor.storage.ShardStats.MAX_BINARY_INDEX_SIZE;
import static com.facebook.presto.raptor.util.ArrayUtil.intArrayFromBytes;
import static com.facebook.presto.raptor.util.ArrayUtil.intArrayToBytes;
import static com.facebook.presto.raptor.util.DatabaseUtil.bindOptionalInt;
import static com.facebook.presto.raptor.util.DatabaseUtil.isSyntaxOrAccessError;
import static com.facebook.presto.raptor.util.DatabaseUtil.metadataError;
import static com.facebook.presto.raptor.util.DatabaseUtil.runIgnoringConstraintViolation;
import static com.facebook.presto.raptor.util.DatabaseUtil.runTransaction;
import static com.facebook.presto.raptor.util.UuidUtil.uuidFromBytes;
import static com.facebook.presto.raptor.util.UuidUtil.uuidToBytes;
import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR;
import static com.facebook.presto.spi.StandardErrorCode.NO_NODES_AVAILABLE;
import static com.facebook.presto.spi.StandardErrorCode.SERVER_STARTING_UP;
import static com.facebook.presto.spi.StandardErrorCode.TRANSACTION_CONFLICT;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Throwables.propagateIfInstanceOf;
import static com.google.common.collect.Iterables.partition;
import static java.lang.Boolean.TRUE;
import static java.lang.Math.multiplyExact;
import static java.lang.String.format;
import static java.sql.Statement.RETURN_GENERATED_KEYS;
import static java.util.Arrays.asList;
import static java.util.Collections.nCopies;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static java.util.stream.Collectors.toMap;
import static java.util.stream.Collectors.toSet;
public class DatabaseShardManager
implements ShardManager
{
private static final Logger log = Logger.get(DatabaseShardManager.class);
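    // Per-table shard index tables are named x_shards_t<tableId> (see shardIndexTable)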
private static final String INDEX_TABLE_PREFIX = "x_shards_t";
private static final int MAX_ADD_COLUMN_ATTEMPTS = 100;
private final IDBI dbi;
private final DaoSupplier<ShardDao> shardDaoSupplier;
private final ShardDao dao;
private final NodeSupplier nodeSupplier;
private final AssignmentLimiter assignmentLimiter;
private final Ticker ticker;
private final Duration startupGracePeriod;
private final long startTime;
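    // Maps node identifiers to their integer node IDs, creating the node row on first use (see loadNodeId)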
private final LoadingCache<String, Integer> nodeIdCache = CacheBuilder.newBuilder()
.maximumSize(10_000)
.build(new CacheLoader<String, Integer>()
{
@Override
public Integer load(String nodeIdentifier)
{
return loadNodeId(nodeIdentifier);
}
});
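    // Bucket assignments are cached for only one second so that reassignments
    // (e.g. after a node leaves the cluster) are picked up quickly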
private final LoadingCache<Long, Map<Integer, String>> bucketAssignmentsCache = CacheBuilder.newBuilder()
.expireAfterWrite(1, SECONDS)
.build(new CacheLoader<Long, Map<Integer, String>>()
{
@Override
public Map<Integer, String> load(Long distributionId)
{
return loadBucketAssignments(distributionId);
}
});
@Inject
public DatabaseShardManager(
@ForMetadata IDBI dbi,
DaoSupplier<ShardDao> shardDaoSupplier,
NodeSupplier nodeSupplier,
AssignmentLimiter assignmentLimiter,
Ticker ticker,
MetadataConfig config)
{
this(dbi, shardDaoSupplier, nodeSupplier, assignmentLimiter, ticker, config.getStartupGracePeriod());
}
public DatabaseShardManager(
IDBI dbi,
DaoSupplier<ShardDao> shardDaoSupplier,
NodeSupplier nodeSupplier,
AssignmentLimiter assignmentLimiter,
Ticker ticker,
Duration startupGracePeriod)
{
this.dbi = requireNonNull(dbi, "dbi is null");
this.shardDaoSupplier = requireNonNull(shardDaoSupplier, "shardDaoSupplier is null");
this.dao = shardDaoSupplier.onDemand();
this.nodeSupplier = requireNonNull(nodeSupplier, "nodeSupplier is null");
this.assignmentLimiter = requireNonNull(assignmentLimiter, "assignmentLimiter is null");
this.ticker = requireNonNull(ticker, "ticker is null");
this.startupGracePeriod = requireNonNull(startupGracePeriod, "startupGracePeriod is null");
this.startTime = ticker.read();
}
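    // Creates the per-table shard index table. Each indexed column gets min/max
    // columns used to prune shards by predicate, and the covering UNIQUE index
    // lets shard-selection queries be answered from the index alone.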
@Override
public void createTable(long tableId, List<ColumnInfo> columns, boolean bucketed, OptionalLong temporalColumnId)
{
StringJoiner tableColumns = new StringJoiner(",\n ", " ", ",\n").setEmptyValue("");
for (ColumnInfo column : columns) {
String columnType = sqlColumnType(column.getType());
if (columnType != null) {
tableColumns.add(minColumn(column.getColumnId()) + " " + columnType);
tableColumns.add(maxColumn(column.getColumnId()) + " " + columnType);
}
}
StringJoiner coveringIndexColumns = new StringJoiner(", ");
// Add the max temporal column first to accelerate queries that usually scan recent data
temporalColumnId.ifPresent(id -> coveringIndexColumns.add(maxColumn(id)));
temporalColumnId.ifPresent(id -> coveringIndexColumns.add(minColumn(id)));
String sql;
if (bucketed) {
coveringIndexColumns
.add("bucket_number")
.add("shard_id")
.add("shard_uuid");
sql = "" +
"CREATE TABLE " + shardIndexTable(tableId) + " (\n" +
" shard_id BIGINT NOT NULL,\n" +
" shard_uuid BINARY(16) NOT NULL,\n" +
" bucket_number INT NOT NULL\n," +
tableColumns +
" PRIMARY KEY (bucket_number, shard_uuid),\n" +
" UNIQUE (shard_id),\n" +
" UNIQUE (shard_uuid),\n" +
" UNIQUE (" + coveringIndexColumns + ")\n" +
")";
}
else {
coveringIndexColumns
.add("node_ids")
.add("shard_id")
.add("shard_uuid");
sql = "" +
"CREATE TABLE " + shardIndexTable(tableId) + " (\n" +
" shard_id BIGINT NOT NULL,\n" +
" shard_uuid BINARY(16) NOT NULL,\n" +
" node_ids VARBINARY(128) NOT NULL,\n" +
tableColumns +
" PRIMARY KEY (node_ids, shard_uuid),\n" +
" UNIQUE (shard_id),\n" +
" UNIQUE (shard_uuid),\n" +
" UNIQUE (" + coveringIndexColumns + ")\n" +
")";
}
try (Handle handle = dbi.open()) {
handle.execute(sql);
}
catch (DBIException e) {
throw metadataError(e);
}
}
@Override
public void dropTable(long tableId)
{
runTransaction(dbi, (handle, status) -> {
lockTable(handle, tableId);
ShardDao shardDao = shardDaoSupplier.attach(handle);
shardDao.insertDeletedShards(tableId);
shardDao.dropShardNodes(tableId);
shardDao.dropShards(tableId);
handle.attach(ShardOrganizerDao.class).dropOrganizerJobs(tableId);
MetadataDao dao = handle.attach(MetadataDao.class);
dao.dropColumns(tableId);
dao.dropTable(tableId);
return null;
});
// TODO: add a cleanup process for leftover index tables
// It is not possible to drop the index tables in a transaction.
try (Handle handle = dbi.open()) {
handle.execute("DROP TABLE " + shardIndexTable(tableId));
}
catch (DBIException e) {
log.warn(e, "Failed to drop index table %s", shardIndexTable(tableId));
}
}
@Override
public void addColumn(long tableId, ColumnInfo column)
{
String columnType = sqlColumnType(column.getType());
if (columnType == null) {
return;
}
String sql = format("ALTER TABLE %s ADD COLUMN (%s %s, %s %s)",
shardIndexTable(tableId),
minColumn(column.getColumnId()), columnType,
maxColumn(column.getColumnId()), columnType);
int attempts = 0;
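        // ALTER TABLE on the index table can fail transiently (presumably due to
        // concurrent DDL or lock timeouts), so retry until it succeeds or the
        // column is reported to already exist.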
while (true) {
attempts++;
            try (Handle handle = dbi.open()) {
                handle.execute(sql);
                return;
            }
            catch (DBIException e) {
                if (isSyntaxOrAccessError(e)) {
                    // exit when column already exists
                    return;
                }
                if (attempts >= MAX_ADD_COLUMN_ATTEMPTS) {
                    throw metadataError(e);
                }
                log.warn(e, "Failed to alter table on attempt %s, will retry. SQL: %s", attempts, sql);
                try {
                    SECONDS.sleep(3);
                }
                catch (InterruptedException ie) {
                    throw metadataError(ie);
                }
            }
}
}
@Override
public void commitShards(long transactionId, long tableId, List<ColumnInfo> columns, Collection<ShardInfo> shards, Optional<String> externalBatchId, long updateTime)
{
// attempt to fail up front with a proper exception
if (externalBatchId.isPresent() && dao.externalBatchExists(externalBatchId.get())) {
throw new PrestoException(RAPTOR_EXTERNAL_BATCH_ALREADY_EXISTS, "External batch already exists: " + externalBatchId.get());
}
Map<String, Integer> nodeIds = toNodeIdMap(shards);
runCommit(transactionId, (handle) -> {
externalBatchId.ifPresent(shardDaoSupplier.attach(handle)::insertExternalBatch);
lockTable(handle, tableId);
insertShardsAndIndex(tableId, columns, shards, nodeIds, handle);
ShardStats stats = shardStats(shards);
MetadataDao metadata = handle.attach(MetadataDao.class);
metadata.updateTableStats(tableId, shards.size(), stats.getRowCount(), stats.getCompressedSize(), stats.getUncompressedSize());
metadata.updateTableVersion(tableId, updateTime);
});
}
@Override
public void replaceShardUuids(long transactionId, long tableId, List<ColumnInfo> columns, Set<UUID> oldShardUuids, Collection<ShardInfo> newShards, OptionalLong updateTime)
{
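        // Replace old shards with new ones atomically within the commit: insert
        // the new shards, delete the old ones, and apply the resulting deltas to
        // the table statistics and version.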
Map<String, Integer> nodeIds = toNodeIdMap(newShards);
runCommit(transactionId, (handle) -> {
lockTable(handle, tableId);
if (!updateTime.isPresent() && handle.attach(MetadataDao.class).isMaintenanceBlockedLocked(tableId)) {
throw new PrestoException(TRANSACTION_CONFLICT, "Maintenance is blocked for table");
}
ShardStats newStats = shardStats(newShards);
long rowCount = newStats.getRowCount();
long compressedSize = newStats.getCompressedSize();
long uncompressedSize = newStats.getUncompressedSize();
for (List<ShardInfo> shards : partition(newShards, 1000)) {
insertShardsAndIndex(tableId, columns, shards, nodeIds, handle);
}
for (List<UUID> uuids : partition(oldShardUuids, 1000)) {
ShardStats stats = deleteShardsAndIndex(tableId, ImmutableSet.copyOf(uuids), handle);
rowCount -= stats.getRowCount();
compressedSize -= stats.getCompressedSize();
uncompressedSize -= stats.getUncompressedSize();
}
long shardCount = newShards.size() - oldShardUuids.size();
if (!oldShardUuids.isEmpty() || !newShards.isEmpty()) {
MetadataDao metadata = handle.attach(MetadataDao.class);
metadata.updateTableStats(tableId, shardCount, rowCount, compressedSize, uncompressedSize);
updateTime.ifPresent(time -> metadata.updateTableVersion(tableId, time));
}
});
}
private void runCommit(long transactionId, HandleConsumer callback)
{
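        // Mark the transaction committed and run the callback within a single
        // database transaction, retrying transient failures with a linearly
        // increasing backoff (2s, 4s, ...).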
int maxAttempts = 5;
for (int attempt = 1; attempt <= maxAttempts; attempt++) {
try {
dbi.useTransaction((handle, status) -> {
ShardDao dao = shardDaoSupplier.attach(handle);
if (commitTransaction(dao, transactionId)) {
callback.useHandle(handle);
dao.deleteCreatedShards(transactionId);
}
});
return;
}
catch (DBIException e) {
propagateIfInstanceOf(e.getCause(), PrestoException.class);
if (attempt == maxAttempts) {
throw metadataError(e);
}
log.warn(e, "Failed to commit shards on attempt %d, will retry.", attempt);
try {
SECONDS.sleep(multiplyExact(attempt, 2));
}
catch (InterruptedException ie) {
throw metadataError(ie);
}
}
}
}
private static boolean commitTransaction(ShardDao dao, long transactionId)
{
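        // finalizeTransaction updates the transaction row only if it has not been
        // finalized yet. If it was already finalized, a previously successful
        // commit makes this call an idempotent no-op; otherwise the commit raced
        // with a rollback and must fail.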
if (dao.finalizeTransaction(transactionId, true) != 1) {
if (TRUE.equals(dao.transactionSuccessful(transactionId))) {
return false;
}
throw new PrestoException(TRANSACTION_CONFLICT, "Transaction commit failed. Please retry the operation.");
}
return true;
}
private ShardStats deleteShardsAndIndex(long tableId, Set<UUID> shardUuids, Handle handle)
throws SQLException
{
String args = Joiner.on(",").join(nCopies(shardUuids.size(), "?"));
ImmutableSet.Builder<Long> shardIdSet = ImmutableSet.builder();
long rowCount = 0;
long compressedSize = 0;
long uncompressedSize = 0;
String selectShards = format("" +
"SELECT shard_id, row_count, compressed_size, uncompressed_size\n" +
"FROM shards\n" +
"WHERE shard_uuid IN (%s)", args);
try (PreparedStatement statement = handle.getConnection().prepareStatement(selectShards)) {
bindUuids(statement, shardUuids);
try (ResultSet rs = statement.executeQuery()) {
while (rs.next()) {
shardIdSet.add(rs.getLong("shard_id"));
rowCount += rs.getLong("row_count");
compressedSize += rs.getLong("compressed_size");
uncompressedSize += rs.getLong("uncompressed_size");
}
}
}
Set<Long> shardIds = shardIdSet.build();
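        // Every requested UUID must still exist; a missing shard means a
        // concurrent transaction already deleted it.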
if (shardIds.size() != shardUuids.size()) {
throw transactionConflict();
}
ShardDao dao = shardDaoSupplier.attach(handle);
dao.insertDeletedShards(shardUuids);
String where = " WHERE shard_id IN (" + args + ")";
String deleteFromShardNodes = "DELETE FROM shard_nodes " + where;
String deleteFromShards = "DELETE FROM shards " + where;
String deleteFromShardIndex = "DELETE FROM " + shardIndexTable(tableId) + where;
try (PreparedStatement statement = handle.getConnection().prepareStatement(deleteFromShardNodes)) {
bindLongs(statement, shardIds);
statement.executeUpdate();
}
for (String sql : asList(deleteFromShards, deleteFromShardIndex)) {
try (PreparedStatement statement = handle.getConnection().prepareStatement(sql)) {
bindLongs(statement, shardIds);
if (statement.executeUpdate() != shardIds.size()) {
throw transactionConflict();
}
}
}
return new ShardStats(rowCount, compressedSize, uncompressedSize);
}
private static void bindUuids(PreparedStatement statement, Iterable<UUID> uuids)
throws SQLException
{
int i = 1;
for (UUID uuid : uuids) {
statement.setBytes(i, uuidToBytes(uuid));
i++;
}
}
private static void bindLongs(PreparedStatement statement, Iterable<Long> values)
throws SQLException
{
int i = 1;
for (long value : values) {
statement.setLong(i, value);
i++;
}
}
private static void insertShardsAndIndex(long tableId, List<ColumnInfo> columns, Collection<ShardInfo> shards, Map<String, Integer> nodeIds, Handle handle)
throws SQLException
{
if (shards.isEmpty()) {
return;
}
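        // Shards in a single commit are assumed to be either all bucketed or all
        // unbucketed, so the first shard determines which insert path is used.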
boolean bucketed = shards.iterator().next().getBucketNumber().isPresent();
Connection connection = handle.getConnection();
try (IndexInserter indexInserter = new IndexInserter(connection, tableId, columns)) {
for (List<ShardInfo> batch : partition(shards, batchSize(connection))) {
List<Long> shardIds = insertShards(connection, tableId, batch);
if (!bucketed) {
insertShardNodes(connection, nodeIds, shardIds, batch);
}
for (int i = 0; i < batch.size(); i++) {
ShardInfo shard = batch.get(i);
Set<Integer> shardNodes = shard.getNodeIdentifiers().stream()
.map(nodeIds::get)
.collect(toSet());
indexInserter.insert(
shardIds.get(i),
shard.getShardUuid(),
shard.getBucketNumber(),
shardNodes,
shard.getColumnStats());
}
indexInserter.execute();
}
}
}
private static int batchSize(Connection connection)
{
// H2 does not return generated keys properly
// https://github.com/h2database/h2database/issues/156
return (connection instanceof JdbcConnection) ? 1 : 1000;
}
private Map<String, Integer> toNodeIdMap(Collection<ShardInfo> shards)
{
Set<String> identifiers = shards.stream()
.map(ShardInfo::getNodeIdentifiers)
.flatMap(Collection::stream)
.collect(toSet());
return Maps.toMap(identifiers, this::getOrCreateNodeId);
}
@Override
public Set<ShardMetadata> getNodeShards(String nodeIdentifier)
{
return dao.getNodeShards(nodeIdentifier, null);
}
@Override
public Set<ShardMetadata> getNodeShards(String nodeIdentifier, long tableId)
{
return dao.getNodeShards(nodeIdentifier, tableId);
}
@Override
public ResultIterator<BucketShards> getShardNodes(long tableId, TupleDomain<RaptorColumnHandle> effectivePredicate)
{
return new ShardIterator(tableId, false, Optional.empty(), effectivePredicate, dbi);
}
@Override
public ResultIterator<BucketShards> getShardNodesBucketed(long tableId, boolean merged, Map<Integer, String> bucketToNode, TupleDomain<RaptorColumnHandle> effectivePredicate)
{
return new ShardIterator(tableId, merged, Optional.of(bucketToNode), effectivePredicate, dbi);
}
@Override
public void assignShard(long tableId, UUID shardUuid, String nodeIdentifier, boolean gracePeriod)
{
if (gracePeriod && (nanosSince(startTime).compareTo(startupGracePeriod) < 0)) {
throw new PrestoException(SERVER_STARTING_UP, "Cannot reassign shards while server is starting");
}
int nodeId = getOrCreateNodeId(nodeIdentifier);
runTransaction(dbi, (handle, status) -> {
ShardDao dao = shardDaoSupplier.attach(handle);
Set<Integer> nodes = new HashSet<>(fetchLockedNodeIds(handle, tableId, shardUuid));
if (nodes.add(nodeId)) {
updateNodeIds(handle, tableId, shardUuid, nodes);
dao.insertShardNode(shardUuid, nodeId);
}
return null;
});
}
@Override
public void unassignShard(long tableId, UUID shardUuid, String nodeIdentifier)
{
int nodeId = getOrCreateNodeId(nodeIdentifier);
runTransaction(dbi, (handle, status) -> {
ShardDao dao = shardDaoSupplier.attach(handle);
Set<Integer> nodes = new HashSet<>(fetchLockedNodeIds(handle, tableId, shardUuid));
if (nodes.remove(nodeId)) {
updateNodeIds(handle, tableId, shardUuid, nodes);
dao.deleteShardNode(shardUuid, nodeId);
}
return null;
});
}
@Override
public Map<String, Long> getNodeBytes()
{
return dao.getNodeSizes().stream()
.collect(toMap(NodeSize::getNodeIdentifier, NodeSize::getSizeInBytes));
}
@Override
public long beginTransaction()
{
return dao.insertTransaction();
}
@Override
public void rollbackTransaction(long transactionId)
{
dao.finalizeTransaction(transactionId, false);
}
@Override
public void createBuckets(long distributionId, int bucketCount)
{
Iterator<String> nodeIterator = cyclingShuffledIterator(getNodeIdentifiers());
List<Integer> bucketNumbers = new ArrayList<>();
List<Integer> nodeIds = new ArrayList<>();
for (int bucket = 0; bucket < bucketCount; bucket++) {
bucketNumbers.add(bucket);
nodeIds.add(getOrCreateNodeId(nodeIterator.next()));
}
runIgnoringConstraintViolation(() -> dao.insertBuckets(distributionId, bucketNumbers, nodeIds));
}
@Override
public Map<Integer, String> getBucketAssignments(long distributionId)
{
try {
return bucketAssignmentsCache.getUnchecked(distributionId);
}
catch (UncheckedExecutionException | ExecutionError e) {
throw Throwables.propagate(e.getCause());
}
}
@Override
public void updateBucketAssignment(long distributionId, int bucketNumber, String nodeId)
{
dao.updateBucketNode(distributionId, bucketNumber, getOrCreateNodeId(nodeId));
}
@Override
public List<Distribution> getDistributions()
{
return dao.listActiveDistributions();
}
@Override
public long getDistributionSizeInBytes(long distributionId)
{
return dao.getDistributionSizeBytes(distributionId);
}
@Override
    public List<BucketNode> getBucketNodes(long distributionId)
    {
        return dao.getBucketNodes(distributionId);
}
@Override
public Set<UUID> getExistingShardUuids(long tableId, Set<UUID> shardUuids)
{
try (Handle handle = dbi.open()) {
String args = Joiner.on(",").join(nCopies(shardUuids.size(), "?"));
String selectShards = format(
"SELECT shard_uuid FROM %s WHERE shard_uuid IN (%s)",
shardIndexTable(tableId), args);
ImmutableSet.Builder<UUID> existingShards = ImmutableSet.builder();
try (PreparedStatement statement = handle.getConnection().prepareStatement(selectShards)) {
bindUuids(statement, shardUuids);
try (ResultSet rs = statement.executeQuery()) {
while (rs.next()) {
existingShards.add(uuidFromBytes(rs.getBytes("shard_uuid")));
}
}
}
return existingShards.build();
}
catch (SQLException e) {
throw Throwables.propagate(e);
}
}
private List<BucketNode> getBuckets(long distributionId)
{
return dao.getBucketNodes(distributionId);
}
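    // Computes the bucket-to-node map for a distribution, reassigning any bucket
    // whose node is no longer part of the cluster to a live node.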
private Map<Integer, String> loadBucketAssignments(long distributionId)
{
Set<String> nodeIds = getNodeIdentifiers();
Iterator<String> nodeIterator = cyclingShuffledIterator(nodeIds);
ImmutableMap.Builder<Integer, String> assignments = ImmutableMap.builder();
for (BucketNode bucketNode : getBuckets(distributionId)) {
int bucket = bucketNode.getBucketNumber();
String nodeId = bucketNode.getNodeIdentifier();
if (!nodeIds.contains(nodeId)) {
if (nanosSince(startTime).compareTo(startupGracePeriod) < 0) {
throw new PrestoException(SERVER_STARTING_UP, "Cannot reassign buckets while server is starting");
}
assignmentLimiter.checkAssignFrom(nodeId);
String oldNodeId = nodeId;
// TODO: use smarter system to choose replacement node
nodeId = nodeIterator.next();
dao.updateBucketNode(distributionId, bucket, getOrCreateNodeId(nodeId));
log.info("Reassigned bucket %s for distribution ID %s from %s to %s", bucket, distributionId, oldNodeId, nodeId);
}
assignments.put(bucket, nodeId);
}
return assignments.build();
}
private Set<String> getNodeIdentifiers()
{
Set<String> nodeIds = nodeSupplier.getWorkerNodes().stream()
.map(Node::getNodeIdentifier)
.collect(toSet());
if (nodeIds.isEmpty()) {
throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available for bucket assignments");
}
return nodeIds;
}
private int getOrCreateNodeId(String nodeIdentifier)
{
try {
return nodeIdCache.getUnchecked(nodeIdentifier);
}
catch (UncheckedExecutionException | ExecutionError e) {
throw Throwables.propagate(e.getCause());
}
}
private int loadNodeId(String nodeIdentifier)
{
Integer id = dao.getNodeId(nodeIdentifier);
if (id != null) {
return id;
}
// creating a node is idempotent
runIgnoringConstraintViolation(() -> dao.insertNode(nodeIdentifier));
id = dao.getNodeId(nodeIdentifier);
if (id == null) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, "node does not exist after insert");
}
return id;
}
private Duration nanosSince(long nanos)
{
return new Duration(ticker.read() - nanos, NANOSECONDS);
}
private static List<Long> insertShards(Connection connection, long tableId, List<ShardInfo> shards)
throws SQLException
{
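        // Batch-insert the shard rows and read back the generated shard IDs;
        // the driver must return one generated key per inserted row, in order.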
String sql = "" +
"INSERT INTO shards (shard_uuid, table_id, create_time, row_count, compressed_size, uncompressed_size, bucket_number)\n" +
"VALUES (?, ?, CURRENT_TIMESTAMP, ?, ?, ?, ?)";
try (PreparedStatement statement = connection.prepareStatement(sql, RETURN_GENERATED_KEYS)) {
for (ShardInfo shard : shards) {
statement.setBytes(1, uuidToBytes(shard.getShardUuid()));
statement.setLong(2, tableId);
statement.setLong(3, shard.getRowCount());
statement.setLong(4, shard.getCompressedSize());
statement.setLong(5, shard.getUncompressedSize());
bindOptionalInt(statement, 6, shard.getBucketNumber());
statement.addBatch();
}
statement.executeBatch();
ImmutableList.Builder<Long> builder = ImmutableList.builder();
try (ResultSet keys = statement.getGeneratedKeys()) {
while (keys.next()) {
builder.add(keys.getLong(1));
}
}
List<Long> shardIds = builder.build();
if (shardIds.size() != shards.size()) {
throw new PrestoException(RAPTOR_ERROR, "Wrong number of generated keys for inserted shards");
}
return shardIds;
}
}
private static void insertShardNodes(Connection connection, Map<String, Integer> nodeIds, List<Long> shardIds, List<ShardInfo> shards)
throws SQLException
{
checkArgument(shardIds.size() == shards.size(), "lists are not the same size");
String sql = "INSERT INTO shard_nodes (shard_id, node_id) VALUES (?, ?)";
try (PreparedStatement statement = connection.prepareStatement(sql)) {
for (int i = 0; i < shards.size(); i++) {
for (String identifier : shards.get(i).getNodeIdentifiers()) {
statement.setLong(1, shardIds.get(i));
statement.setInt(2, nodeIds.get(identifier));
statement.addBatch();
}
}
statement.executeBatch();
}
}
private static Collection<Integer> fetchLockedNodeIds(Handle handle, long tableId, UUID shardUuid)
{
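        // SELECT ... FOR UPDATE locks the shard's index row until the enclosing
        // transaction ends, serializing concurrent assign/unassign operations.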
String sql = format(
"SELECT node_ids FROM %s WHERE shard_uuid = ? FOR UPDATE",
shardIndexTable(tableId));
byte[] nodeArray = handle.createQuery(sql)
.bind(0, uuidToBytes(shardUuid))
.map(ByteArrayMapper.FIRST)
.first();
return intArrayFromBytes(nodeArray);
}
private static void updateNodeIds(Handle handle, long tableId, UUID shardUuid, Set<Integer> nodeIds)
{
String sql = format(
"UPDATE %s SET node_ids = ? WHERE shard_uuid = ?",
shardIndexTable(tableId));
handle.execute(sql, intArrayToBytes(nodeIds), uuidToBytes(shardUuid));
}
private static void lockTable(Handle handle, long tableId)
{
if (handle.attach(MetadataDao.class).getLockedTableId(tableId) == null) {
throw transactionConflict();
}
}
private static PrestoException transactionConflict()
{
return new PrestoException(TRANSACTION_CONFLICT, "Table was updated by a different transaction. Please retry the operation.");
}
public static String shardIndexTable(long tableId)
{
return INDEX_TABLE_PREFIX + tableId;
}
public static String minColumn(long columnId)
{
checkArgument(columnId >= 0, "invalid columnId %s", columnId);
return format("c%s_min", columnId);
}
public static String maxColumn(long columnId)
{
checkArgument(columnId >= 0, "invalid columnId %s", columnId);
return format("c%s_max", columnId);
}
private static String sqlColumnType(Type type)
{
JDBCType jdbcType = jdbcType(type);
if (jdbcType != null) {
switch (jdbcType) {
case BOOLEAN:
return "boolean";
case BIGINT:
return "bigint";
case DOUBLE:
return "double";
case INTEGER:
return "int";
case VARBINARY:
return format("varbinary(%s)", MAX_BINARY_INDEX_SIZE);
}
}
return null;
}
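    // Cycles endlessly over a shuffled copy of the collection; used to spread
    // bucket assignments across nodes in random round-robin order.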
private static <T> Iterator<T> cyclingShuffledIterator(Collection<T> collection)
{
List<T> list = new ArrayList<>(collection);
Collections.shuffle(list);
return Iterables.cycle(list).iterator();
}
private static ShardStats shardStats(Collection<ShardInfo> shards)
{
return new ShardStats(
shards.stream().mapToLong(ShardInfo::getRowCount).sum(),
shards.stream().mapToLong(ShardInfo::getCompressedSize).sum(),
shards.stream().mapToLong(ShardInfo::getUncompressedSize).sum());
}
private static class ShardStats
{
private final long rowCount;
private final long compressedSize;
private final long uncompressedSize;
public ShardStats(long rowCount, long compressedSize, long uncompressedSize)
{
this.rowCount = rowCount;
this.compressedSize = compressedSize;
this.uncompressedSize = uncompressedSize;
}
public long getRowCount()
{
return rowCount;
}
public long getCompressedSize()
{
return compressedSize;
}
public long getUncompressedSize()
{
return uncompressedSize;
}
}
}