/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.execution.scheduler;
import com.facebook.presto.execution.RemoteTask;
import com.facebook.presto.execution.SqlStageExecution;
import com.facebook.presto.metadata.Split;
import com.facebook.presto.spi.Node;
import com.facebook.presto.split.EmptySplit;
import com.facebook.presto.split.SplitSource;
import com.facebook.presto.sql.planner.plan.PlanNodeId;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import static com.facebook.presto.execution.scheduler.ScheduleResult.BlockedReason.SPLIT_QUEUES_FULL;
import static com.facebook.presto.execution.scheduler.ScheduleResult.BlockedReason.WAITING_FOR_SOURCE;
import static com.facebook.presto.spi.StandardErrorCode.NO_NODES_AVAILABLE;
import static com.facebook.presto.util.Failures.checkCondition;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.util.concurrent.Futures.nonCancellationPropagating;
import static io.airlift.concurrent.MoreFutures.getFutureValue;
import static java.util.Objects.requireNonNull;
/**
 * Schedules splits for a source-partitioned stage: repeatedly fetches batches of
 * splits from a {@link SplitSource}, places them on nodes via a
 * {@link SplitPlacementPolicy}, and hands them to the stage's remote tasks.
 *
 * <p>Thread-safety: {@link #schedule()} is {@code synchronized}; all mutable state
 * ({@code batchFuture}, {@code pendingSplits}, {@code state}) is accessed under it.
 */
public class SourcePartitionedScheduler
        implements StageScheduler
{
    // Lifecycle: no splits scheduled yet -> at least one split scheduled
    // (possibly a synthetic EmptySplit) -> source exhausted and closed.
    private enum State
    {
        INITIALIZED,
        SPLITS_SCHEDULED,
        FINISHED
    }

    private final SqlStageExecution stage;
    private final SplitSource splitSource;
    private final SplitPlacementPolicy splitPlacementPolicy;
    private final int splitBatchSize;
    private final PlanNodeId partitionedNode;

    // In-flight request for the next batch of splits; null when no fetch is pending
    private ListenableFuture<List<Split>> batchFuture;
    // Splits fetched but not yet assigned (e.g. because task split queues were full)
    private Set<Split> pendingSplits = ImmutableSet.of();
    private State state = State.INITIALIZED;

    /**
     * @param stage the stage execution that receives the scheduled splits
     * @param partitionedNode plan node id the splits belong to
     * @param splitSource source of splits for the partitioned plan node
     * @param splitPlacementPolicy policy mapping splits to worker nodes
     * @param splitBatchSize maximum number of splits fetched per batch; must be positive
     * @throws NullPointerException if any reference argument is null
     * @throws IllegalArgumentException if {@code splitBatchSize} is not positive
     */
    public SourcePartitionedScheduler(
            SqlStageExecution stage,
            PlanNodeId partitionedNode,
            SplitSource splitSource,
            SplitPlacementPolicy splitPlacementPolicy,
            int splitBatchSize)
    {
        this.stage = requireNonNull(stage, "stage is null");
        this.splitSource = requireNonNull(splitSource, "splitSource is null");
        this.splitPlacementPolicy = requireNonNull(splitPlacementPolicy, "splitPlacementPolicy is null");
        checkArgument(splitBatchSize > 0, "splitBatchSize must be at least one");
        this.splitBatchSize = splitBatchSize;
        // BUG FIX: partitionedNode was previously stored without a null check,
        // deferring the NPE to assignSplits(); fail fast like the other fields
        this.partitionedNode = requireNonNull(partitionedNode, "partitionedNode is null");
    }

    /**
     * Performs one scheduling step: fetches the next batch of splits if needed,
     * assigns as many pending splits as placement allows, and reports whether the
     * stage is blocked (waiting for the source or for split queue capacity) or done.
     */
    @Override
    public synchronized ScheduleResult schedule()
    {
        // try to get the next batch if necessary
        if (pendingSplits.isEmpty()) {
            if (batchFuture == null) {
                if (splitSource.isFinished()) {
                    return handleNoMoreSplits();
                }
                batchFuture = splitSource.getNextBatch(splitBatchSize);

                long start = System.nanoTime();
                Futures.addCallback(batchFuture, new FutureCallback<List<Split>>()
                {
                    @Override
                    public void onSuccess(List<Split> result)
                    {
                        stage.recordGetSplitTime(start);
                    }

                    @Override
                    public void onFailure(Throwable t)
                    {
                        // ignored: the failure surfaces via getFutureValue(batchFuture)
                        // on the next schedule() call; this callback only records timing
                    }
                });
            }
            if (!batchFuture.isDone()) {
                // wrap batch future so cancellation is not propagated
                ListenableFuture<List<Split>> blocked = nonCancellationPropagating(batchFuture);
                return new ScheduleResult(false, ImmutableSet.of(), blocked, WAITING_FOR_SOURCE, 0);
            }
            pendingSplits = ImmutableSet.copyOf(getFutureValue(batchFuture));
            batchFuture = null;
        }

        if (!pendingSplits.isEmpty() && state == State.INITIALIZED) {
            state = State.SPLITS_SCHEDULED;
        }

        // assign the splits
        SplitPlacementResult splitPlacementResult = splitPlacementPolicy.computeAssignments(pendingSplits);
        Multimap<Node, Split> splitAssignment = splitPlacementResult.getAssignments();
        Set<RemoteTask> newTasks = assignSplits(splitAssignment);

        // remove assigned splits
        pendingSplits = ImmutableSet.copyOf(Sets.difference(pendingSplits, ImmutableSet.copyOf(splitAssignment.values())));

        // if not all splits were consumed, return a partial result
        if (!pendingSplits.isEmpty()) {
            newTasks = ImmutableSet.<RemoteTask>builder()
                    .addAll(newTasks)
                    .addAll(finalizeTaskCreationIfNecessary())
                    .build();
            return new ScheduleResult(false, newTasks, splitPlacementResult.getBlocked(), SPLIT_QUEUES_FULL, splitAssignment.values().size());
        }

        // all splits assigned - check if the source is finished
        boolean finished = splitSource.isFinished();
        if (finished) {
            splitSource.close();
        }
        return new ScheduleResult(finished, newTasks, splitAssignment.values().size());
    }

    /**
     * Called when the split source is exhausted. If no split was ever scheduled,
     * a synthetic empty split is scheduled so downstream stages see at least one
     * task; otherwise the scheduler transitions to FINISHED.
     */
    private ScheduleResult handleNoMoreSplits()
    {
        switch (state) {
            case INITIALIZED:
                // we have not scheduled a single split so far
                return scheduleEmptySplit();
            case SPLITS_SCHEDULED:
                state = State.FINISHED;
                splitSource.close();
                return new ScheduleResult(true, ImmutableSet.of(), 0);
        }
        throw new IllegalStateException("SourcePartitionedScheduler expected to be in INITIALIZED or SPLITS_SCHEDULED state but is in " + state);
    }

    @Override
    public void close()
    {
        splitSource.close();
    }

    /**
     * Schedules a single {@link EmptySplit} on an arbitrary node so the stage
     * produces a (trivial) task even when the source yielded no splits.
     *
     * @throws com.facebook.presto.spi.PrestoException with {@code NO_NODES_AVAILABLE}
     *         if there is no node to run the query on
     */
    private ScheduleResult scheduleEmptySplit()
    {
        state = State.SPLITS_SCHEDULED;

        List<Node> nodes = splitPlacementPolicy.allNodes();
        checkCondition(!nodes.isEmpty(), NO_NODES_AVAILABLE, "No nodes available to run query");
        Node node = nodes.iterator().next();

        Split emptySplit = new Split(
                splitSource.getConnectorId(),
                splitSource.getTransactionHandle(),
                new EmptySplit(splitSource.getConnectorId()));
        Set<RemoteTask> emptyTask = assignSplits(ImmutableMultimap.of(node, emptySplit));
        // not finished: the SPLITS_SCHEDULED branch of handleNoMoreSplits()
        // reports completion (and closes the source) on the next schedule() call
        return new ScheduleResult(false, emptyTask, 1);
    }

    /**
     * Delivers the node-to-splits assignment to the stage, returning any tasks
     * that were newly created as a result.
     */
    private Set<RemoteTask> assignSplits(Multimap<Node, Split> splitAssignment)
    {
        ImmutableSet.Builder<RemoteTask> newTasks = ImmutableSet.builder();
        for (Entry<Node, Collection<Split>> taskSplits : splitAssignment.asMap().entrySet()) {
            // source partitioned tasks can only receive broadcast data; otherwise it would have a different distribution
            newTasks.addAll(stage.scheduleSplits(taskSplits.getKey(), ImmutableMultimap.<PlanNodeId, Split>builder()
                    .putAll(partitionedNode, taskSplits.getValue())
                    .build()));
        }
        return newTasks.build();
    }

    /**
     * For non-leaf stages, locks down the node set and creates (empty) tasks on
     * every node that has none yet, so sub-stages waiting on the full task set
     * can proceed. Leaf stages have no such dependents and skip this.
     */
    private Set<RemoteTask> finalizeTaskCreationIfNecessary()
    {
        // only lock down tasks if there is a sub stage that could block waiting for this stage to create all tasks
        if (stage.getFragment().isLeaf()) {
            return ImmutableSet.of();
        }

        splitPlacementPolicy.lockDownNodes();

        Set<Node> scheduledNodes = stage.getScheduledNodes();
        Set<RemoteTask> newTasks = splitPlacementPolicy.allNodes().stream()
                .filter(node -> !scheduledNodes.contains(node))
                .flatMap(node -> stage.scheduleSplits(node, ImmutableMultimap.of()).stream())
                .collect(toImmutableSet());

        // notify listeners that we have scheduled all tasks so they can set no more buffers or exchange splits
        stage.transitionToSchedulingSplits();

        return newTasks;
    }
}