/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tajo.master.querymaster;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.state.*;
import org.apache.hadoop.yarn.util.Records;
import org.apache.tajo.ExecutionBlockId;
import org.apache.tajo.QueryIdFactory;
import org.apache.tajo.QueryUnitId;
import org.apache.tajo.catalog.*;
import org.apache.tajo.catalog.proto.CatalogProtos;
import org.apache.tajo.catalog.statistics.ColumnStats;
import org.apache.tajo.catalog.statistics.StatisticsUtil;
import org.apache.tajo.catalog.statistics.TableStats;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.engine.planner.PlannerUtil;
import org.apache.tajo.engine.planner.global.DataChannel;
import org.apache.tajo.engine.planner.global.ExecutionBlock;
import org.apache.tajo.engine.planner.global.MasterPlan;
import org.apache.tajo.engine.planner.logical.GroupbyNode;
import org.apache.tajo.engine.planner.logical.NodeType;
import org.apache.tajo.engine.planner.logical.ScanNode;
import org.apache.tajo.engine.planner.logical.StoreTableNode;
import org.apache.tajo.ipc.TajoMasterProtocol;
import org.apache.tajo.master.*;
import org.apache.tajo.master.TaskRunnerGroupEvent.EventType;
import org.apache.tajo.master.event.*;
import org.apache.tajo.master.event.QueryUnitAttemptScheduleEvent.QueryUnitAttemptScheduleContext;
import org.apache.tajo.storage.AbstractStorageManager;
import org.apache.tajo.storage.fragment.FileFragment;
import java.io.IOException;
import java.net.URI;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import static org.apache.tajo.conf.TajoConf.ConfVars;
import static org.apache.tajo.ipc.TajoWorkerProtocol.ShuffleType;
/**
* SubQuery plays a role in controlling an ExecutionBlock and is a finite state machine.
*/
public class SubQuery implements EventHandler<SubQueryEvent> {
/** Logger shared by the subquery and its nested transition classes. */
private static final Log LOG = LogFactory.getLog(SubQuery.class);
/** Global plan that owns the execution block of this subquery. */
private MasterPlan masterPlan;
/** Execution block controlled by this subquery; it also supplies the subquery id (see getId()). */
private ExecutionBlock block;
/** Priority forwarded with container requests (see allocateContainers). */
private int priority;
/** Output schema; assigned in finalizeStats() from the first outgoing data channel. */
private Schema schema;
/** Output table meta; assigned in finalizeStats(). */
private TableMeta meta;
/** Aggregated output statistics; assigned in finalizeStats(). */
private TableStats resultStatistics;
/** Aggregated input statistics; assigned in finalizeStats(). */
private TableStats inputStatistics;
/** Handler obtained from the query master task context; every outgoing event goes through it. */
private EventHandler<Event> eventHandler;
private final AbstractStorageManager sm;
/** Created by InitAndRequestContainer#initTaskScheduler; null until then (stopScheduler() checks for null). */
private AbstractTaskScheduler taskScheduler;
private QueryMasterTask.QueryMasterTaskContext context;
/** Diagnostic messages appended through addDiagnostic(String). */
private final List<String> diagnostics = new ArrayList<String>();
/** Clock time recorded by setStartTime(). */
private long startTime;
/** Clock time recorded by setFinishTime(). */
private long finishTime;
/** Tasks registered through addTask(QueryUnit), keyed by their id. */
volatile Map<QueryUnitId, QueryUnit> tasks = new ConcurrentHashMap<QueryUnitId, QueryUnit>();
/** Containers recorded by ContainerLaunchTransition, keyed by container id. */
volatile Map<ContainerId, Container> containers = new ConcurrentHashMap<ContainerId, Container>();
// Transition hooks are instantiated once and shared by every state-machine instance.
private static final DiagnosticsUpdateTransition DIAGNOSTIC_UPDATE_TRANSITION = new DiagnosticsUpdateTransition();
private static final InternalErrorTransition INTERNAL_ERROR_TRANSITION = new InternalErrorTransition();
private static final ContainerLaunchTransition CONTAINER_LAUNCH_TRANSITION = new ContainerLaunchTransition();
private static final TaskCompletedTransition TASK_COMPLETED_TRANSITION = new TaskCompletedTransition();
private static final AllocatedContainersCancelTransition CONTAINERS_CANCEL_TRANSITION =
new AllocatedContainersCancelTransition();
private static final SubQueryCompleteTransition SUBQUERY_COMPLETED_TRANSITION =
new SubQueryCompleteTransition();
/** Per-instance state machine, built from stateMachineFactory in the constructor. */
private StateMachine<SubQueryState, SubQueryEventType, SubQueryEvent> stateMachine;
/**
 * Transition table shared by all SubQuery instances; each instance obtains its own
 * state machine through {@code stateMachineFactory.make(this)}.
 *
 * <p>Each (state, event type) pair is registered exactly once. YARN's StateMachineFactory
 * keeps only the last registration for a pair, so listing SQ_CONTAINER_ALLOCATED again in an
 * "ignore-able" set would silently replace CONTAINERS_CANCEL_TRANSITION with a no-op and
 * containers allocated after completion would never be cancelled.
 */
protected static final StateMachineFactory<SubQuery, SubQueryState,
    SubQueryEventType, SubQueryEvent> stateMachineFactory =
    new StateMachineFactory <SubQuery, SubQueryState,
        SubQueryEventType, SubQueryEvent> (SubQueryState.NEW)

        // Transitions from NEW state
        .addTransition(SubQueryState.NEW,
            EnumSet.of(SubQueryState.INITED, SubQueryState.ERROR, SubQueryState.SUCCEEDED),
            SubQueryEventType.SQ_INIT,
            new InitAndRequestContainer())
        .addTransition(SubQueryState.NEW, SubQueryState.NEW,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        .addTransition(SubQueryState.NEW, SubQueryState.KILLED,
            SubQueryEventType.SQ_KILL)
        .addTransition(SubQueryState.NEW, SubQueryState.ERROR,
            SubQueryEventType.SQ_INTERNAL_ERROR,
            INTERNAL_ERROR_TRANSITION)

        // Transitions from INITED state
        .addTransition(SubQueryState.INITED, SubQueryState.RUNNING,
            SubQueryEventType.SQ_CONTAINER_ALLOCATED,
            CONTAINER_LAUNCH_TRANSITION)
        .addTransition(SubQueryState.INITED, SubQueryState.INITED,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        .addTransition(SubQueryState.INITED, SubQueryState.KILL_WAIT,
            SubQueryEventType.SQ_KILL)
        .addTransition(SubQueryState.INITED, SubQueryState.ERROR,
            SubQueryEventType.SQ_INTERNAL_ERROR,
            INTERNAL_ERROR_TRANSITION)

        // Transitions from RUNNING state
        .addTransition(SubQueryState.RUNNING, SubQueryState.RUNNING,
            SubQueryEventType.SQ_CONTAINER_ALLOCATED,
            CONTAINER_LAUNCH_TRANSITION)
        .addTransition(SubQueryState.RUNNING, SubQueryState.RUNNING,
            SubQueryEventType.SQ_TASK_COMPLETED,
            TASK_COMPLETED_TRANSITION)
        .addTransition(SubQueryState.RUNNING,
            EnumSet.of(SubQueryState.SUCCEEDED, SubQueryState.FAILED),
            SubQueryEventType.SQ_SUBQUERY_COMPLETED,
            SUBQUERY_COMPLETED_TRANSITION)
        .addTransition(SubQueryState.RUNNING, SubQueryState.RUNNING,
            SubQueryEventType.SQ_FAILED,
            TASK_COMPLETED_TRANSITION)
        .addTransition(SubQueryState.RUNNING, SubQueryState.RUNNING,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        .addTransition(SubQueryState.RUNNING, SubQueryState.KILL_WAIT,
            SubQueryEventType.SQ_KILL,
            new KillTasksTransition())
        .addTransition(SubQueryState.RUNNING, SubQueryState.ERROR,
            SubQueryEventType.SQ_INTERNAL_ERROR,
            INTERNAL_ERROR_TRANSITION)
        // Ignore-able Transition
        .addTransition(SubQueryState.RUNNING, SubQueryState.RUNNING,
            SubQueryEventType.SQ_START)

        // Transitions from KILL_WAIT state
        .addTransition(SubQueryState.KILL_WAIT, SubQueryState.KILL_WAIT,
            SubQueryEventType.SQ_CONTAINER_ALLOCATED,
            CONTAINERS_CANCEL_TRANSITION)
        .addTransition(SubQueryState.KILL_WAIT, SubQueryState.KILL_WAIT,
            EnumSet.of(SubQueryEventType.SQ_KILL))
        .addTransition(SubQueryState.KILL_WAIT, SubQueryState.KILL_WAIT,
            SubQueryEventType.SQ_TASK_COMPLETED,
            TASK_COMPLETED_TRANSITION)
        .addTransition(SubQueryState.KILL_WAIT,
            EnumSet.of(SubQueryState.SUCCEEDED, SubQueryState.FAILED, SubQueryState.KILLED),
            SubQueryEventType.SQ_SUBQUERY_COMPLETED,
            SUBQUERY_COMPLETED_TRANSITION)
        .addTransition(SubQueryState.KILL_WAIT, SubQueryState.KILL_WAIT,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        .addTransition(SubQueryState.KILL_WAIT, SubQueryState.KILL_WAIT,
            SubQueryEventType.SQ_FAILED,
            TASK_COMPLETED_TRANSITION)
        .addTransition(SubQueryState.KILL_WAIT, SubQueryState.ERROR,
            SubQueryEventType.SQ_INTERNAL_ERROR,
            INTERNAL_ERROR_TRANSITION)

        // Transitions from SUCCEEDED state
        .addTransition(SubQueryState.SUCCEEDED, SubQueryState.SUCCEEDED,
            SubQueryEventType.SQ_CONTAINER_ALLOCATED,
            CONTAINERS_CANCEL_TRANSITION)
        .addTransition(SubQueryState.SUCCEEDED, SubQueryState.SUCCEEDED,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        .addTransition(SubQueryState.SUCCEEDED, SubQueryState.ERROR,
            SubQueryEventType.SQ_INTERNAL_ERROR,
            INTERNAL_ERROR_TRANSITION)
        // Ignore-able events. SQ_CONTAINER_ALLOCATED is deliberately absent: it is handled
        // above by CONTAINERS_CANCEL_TRANSITION and must not be overridden here.
        .addTransition(SubQueryState.SUCCEEDED, SubQueryState.SUCCEEDED,
            EnumSet.of(
                SubQueryEventType.SQ_START,
                SubQueryEventType.SQ_KILL))

        // Transitions from FAILED state
        .addTransition(SubQueryState.FAILED, SubQueryState.FAILED,
            SubQueryEventType.SQ_CONTAINER_ALLOCATED,
            CONTAINERS_CANCEL_TRANSITION)
        .addTransition(SubQueryState.FAILED, SubQueryState.FAILED,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        .addTransition(SubQueryState.FAILED, SubQueryState.ERROR,
            SubQueryEventType.SQ_INTERNAL_ERROR,
            INTERNAL_ERROR_TRANSITION)
        // Ignore-able transitions. SQ_CONTAINER_ALLOCATED is deliberately absent: it is handled
        // above by CONTAINERS_CANCEL_TRANSITION and must not be overridden here.
        .addTransition(SubQueryState.FAILED, SubQueryState.FAILED,
            EnumSet.of(
                SubQueryEventType.SQ_START,
                SubQueryEventType.SQ_KILL,
                SubQueryEventType.SQ_FAILED))

        // Transitions from ERROR state
        .addTransition(SubQueryState.ERROR, SubQueryState.ERROR,
            SubQueryEventType.SQ_CONTAINER_ALLOCATED,
            CONTAINERS_CANCEL_TRANSITION)
        .addTransition(SubQueryState.ERROR, SubQueryState.ERROR,
            SubQueryEventType.SQ_DIAGNOSTIC_UPDATE,
            DIAGNOSTIC_UPDATE_TRANSITION)
        // Ignore-able transitions
        .addTransition(SubQueryState.ERROR, SubQueryState.ERROR,
            EnumSet.of(
                SubQueryEventType.SQ_START,
                SubQueryEventType.SQ_KILL,
                SubQueryEventType.SQ_FAILED,
                SubQueryEventType.SQ_INTERNAL_ERROR))

        .installTopology();
/** Read side of the lock created in the constructor; taken by getState() and the progress getters. */
private final Lock readLock;
/** Write side of the same lock; taken by handle(SubQueryEvent) around each state transition. */
private final Lock writeLock;
/** Number of objects reported by the task scheduler right after scheduling (set in InitAndRequestContainer). */
private int totalScheduledObjectsCount;
/** Numerator of getTaskProgress(). */
private int succeededObjectCount = 0;
// NOTE(review): the counters below are not updated in the visible part of this file;
// presumably the task-completion transitions maintain them - confirm.
private int completedTaskCount = 0;
private int succeededTaskCount = 0;
private int killedObjectCount = 0;
private int failedObjectCount = 0;
/** Scheduler context created in InitAndRequestContainer#initTaskScheduler. */
private TaskSchedulerContext schedulerContext;
/**
 * Creates a subquery in its initial state for the given execution block.
 *
 * @param context    query master task context; also the source of the event handler
 * @param masterPlan global plan containing {@code block}
 * @param block      execution block controlled by this subquery
 * @param sm         storage manager used by this subquery
 */
public SubQuery(QueryMasterTask.QueryMasterTaskContext context, MasterPlan masterPlan, ExecutionBlock block, AbstractStorageManager sm) {
  this.context = context;
  this.masterPlan = masterPlan;
  this.block = block;
  this.sm = sm;
  this.eventHandler = context.getEventHandler();

  // Both lock views come from one ReentrantReadWriteLock.
  final ReadWriteLock stateLock = new ReentrantReadWriteLock();
  this.readLock = stateLock.readLock();
  this.writeLock = stateLock.writeLock();

  this.stateMachine = stateMachineFactory.make(this);
}
/**
 * Tells whether a state is NEW, INITED or RUNNING.
 *
 * @param state the state to test
 * @return true for NEW, INITED and RUNNING; false for any other value
 */
public static boolean isRunningState(SubQueryState state) {
  return state == SubQueryState.NEW
      || state == SubQueryState.INITED
      || state == SubQueryState.RUNNING;
}
/** @return the query master task context given to the constructor */
public QueryMasterTask.QueryMasterTaskContext getContext() {
  return this.context;
}
/** @return the global plan this subquery belongs to */
public MasterPlan getMasterPlan() {
  return this.masterPlan;
}
/** @return the first outgoing data channel of this subquery's execution block */
public DataChannel getDataChannel() {
  Iterator<DataChannel> outgoing = masterPlan.getOutgoingChannels(getId()).iterator();
  return outgoing.next();
}
/** @return the event handler obtained from the context at construction time */
public EventHandler<Event> getEventHandler() {
  return this.eventHandler;
}
/** @return the task scheduler, or null if it has not been initialized yet */
public AbstractTaskScheduler getTaskScheduler() {
  return this.taskScheduler;
}
/** Records the current time of the context clock as the start time. */
public void setStartTime() {
  this.startTime = this.context.getClock().getTime();
}
/** @return the time recorded by {@link #setStartTime()} */
@SuppressWarnings("UnusedDeclaration")
public long getStartTime() {
  return startTime;
}
/** Records the current time of the context clock as the finish time. */
public void setFinishTime() {
  this.finishTime = this.context.getClock().getTime();
}
/** @return the time recorded by {@link #setFinishTime()} */
@SuppressWarnings("UnusedDeclaration")
public long getFinishTime() {
  return finishTime;
}
/**
 * Returns the ratio of succeeded objects to scheduled objects.
 *
 * @return 0 while the subquery is NEW or when nothing has been scheduled (which would
 *         otherwise yield NaN from 0/0); otherwise succeeded / total scheduled
 */
public float getTaskProgress() {
  readLock.lock();
  try {
    if (getState() == SubQueryState.NEW || totalScheduledObjectsCount == 0) {
      return 0.0f;
    }
    return (float) succeededObjectCount / (float) totalScheduledObjectsCount;
  } finally {
    readLock.unlock();
  }
}
/**
 * Returns the average progress of the last attempts of all registered tasks.
 * Tasks without an attempt contribute zero.
 *
 * @return 0 while the subquery is NEW or when no task is registered (which would
 *         otherwise yield NaN from 0/0); otherwise the mean progress over all tasks
 */
public float getProgress() {
  List<QueryUnit> tempTasks;
  readLock.lock();
  try {
    if (getState() == SubQueryState.NEW) {
      return 0.0f;
    }
    // Snapshot under the lock; the progress values are read afterwards without it.
    tempTasks = new ArrayList<QueryUnit>(tasks.values());
  } finally {
    readLock.unlock();
  }

  if (tempTasks.isEmpty()) {
    return 0.0f;
  }

  float totalProgress = 0.0f;
  for (QueryUnit eachQueryUnit : tempTasks) {
    if (eachQueryUnit.getLastAttempt() != null) {
      totalProgress += eachQueryUnit.getLastAttempt().getProgress();
    }
  }
  return totalProgress / (float) tempTasks.size();
}
/** @return the current value of the succeeded-object counter */
public int getSucceededObjectCount() {
  return this.succeededObjectCount;
}
/** @return the number of objects recorded as scheduled during initialization */
public int getTotalScheduledObjectsCount() {
  return this.totalScheduledObjectsCount;
}
/** @return the execution block controlled by this subquery */
public ExecutionBlock getBlock() {
  return this.block;
}
/**
 * Registers a task under its own id, replacing any task previously stored with that id.
 *
 * @param task the task to register
 */
public void addTask(QueryUnit task) {
  QueryUnitId taskId = task.getId();
  tasks.put(taskId, task);
}
/**
 * Finalizes a successfully finished subquery: cleans up, computes the final statistics,
 * records the finish time and then publishes a SUCCEEDED completion event.
 */
public void complete() {
  cleanup();
  finalizeStats();
  setFinishTime();

  SubQueryCompletedEvent succeeded = new SubQueryCompletedEvent(getId(), SubQueryState.SUCCEEDED);
  eventHandler.handle(succeeded);
}
/**
 * Finalizes an abnormally finished subquery. Unlike {@link SubQuery#complete()},
 * no statistics are computed: it cleans up, records the finish time and publishes
 * a completion event carrying the given state.
 *
 * @param finalState The final subquery state
 */
public void abort(SubQueryState finalState) {
  // TODO -
  //   - committer.abortSubQuery(...)
  //   - record SubQuery Finish Time
  //   - CleanUp Tasks
  //   - Record History
  cleanup();
  setFinishTime();

  SubQueryCompletedEvent finished = new SubQueryCompletedEvent(getId(), finalState);
  eventHandler.handle(finished);
}
/** @return the state machine instance driving this subquery */
public StateMachine<SubQueryState, SubQueryEventType, SubQueryEvent> getStateMachine() {
  return stateMachine;
}
/** @param priority the priority to use for this subquery */
public void setPriority(int priority) {
  this.priority = priority;
}
/** @return the priority set through {@link #setPriority(int)} */
public int getPriority() {
  return priority;
}
/** @return the storage manager given to the constructor */
public AbstractStorageManager getStorageManager() {
  return this.sm;
}
/** @return the id of the underlying execution block, which identifies this subquery */
public ExecutionBlockId getId() {
  return this.block.getId();
}
/** @return an array holding the currently registered tasks */
public QueryUnit[] getQueryUnits() {
  QueryUnit[] target = new QueryUnit[tasks.size()];
  return tasks.values().toArray(target);
}
/**
 * @param qid id of the task to look up
 * @return the task registered under {@code qid}, or null if there is none
 */
public QueryUnit getQueryUnit(QueryUnitId qid) {
  QueryUnit found = tasks.get(qid);
  return found;
}
/** @return the output schema; null until the statistics have been finalized */
public Schema getSchema() {
  return this.schema;
}
/** @return the output table meta; null until the statistics have been finalized */
public TableMeta getTableMeta() {
  return this.meta;
}
/** @return the aggregated output statistics; null until the statistics have been finalized */
public TableStats getResultStats() {
  return this.resultStatistics;
}
/** @return the aggregated input statistics; null until the statistics have been finalized */
public TableStats getInputStats() {
  return this.inputStatistics;
}
/**
 * Returns the diagnostic messages recorded so far.
 *
 * <p>A snapshot is taken while the read lock is held. Handing out the live list would let
 * callers iterate it after the lock is released, concurrently with later additions.
 *
 * @return a copy of the diagnostics; changes to it do not affect this subquery
 */
public List<String> getDiagnostics() {
  readLock.lock();
  try {
    return new ArrayList<String>(diagnostics);
  } finally {
    readLock.unlock();
  }
}
/**
 * Appends one diagnostic message.
 *
 * @param diag the message to record
 */
protected void addDiagnostic(String diag) {
  this.diagnostics.add(diag);
}
/** @return the string form of this subquery's id */
public String toString() {
  return String.valueOf((Object) this.getId());
}
/** Two subqueries are equal when their ids are equal. */
@Override
public boolean equals(Object o) {
  if (!(o instanceof SubQuery)) {
    return false;
  }
  return getId().equals(((SubQuery) o).getId());
}
/** Hash code of the id, consistent with {@link #equals(Object)}. */
@Override
public int hashCode() {
  ExecutionBlockId id = getId();
  return id.hashCode();
}
/**
 * Orders subqueries by their ids.
 *
 * @param other the subquery to compare with
 * @return the result of comparing this id with the other id
 */
public int compareTo(SubQuery other) {
  ExecutionBlockId mine = getId();
  ExecutionBlockId theirs = other.getId();
  return mine.compareTo(theirs);
}
/** @return the current state of the state machine, read while holding the read lock */
public SubQueryState getState() {
  final SubQueryState current;
  readLock.lock();
  try {
    current = stateMachine.getCurrentState();
  } finally {
    readLock.unlock();
  }
  return current;
}
/**
 * Aggregates the statistics of all child blocks of a union block.
 *
 * <p>Child statistics that are null are skipped, both for the numeric totals and for the
 * column statistics (previously a null result stat caused a NullPointerException when its
 * column statistics were collected).
 *
 * @param subQuery the subquery whose child blocks are summed up
 * @return a two-element array: index 0 holds the input stats, index 1 the result stats
 */
public static TableStats[] computeStatFromUnionBlock(SubQuery subQuery) {
  TableStats[] stat = new TableStats[]{new TableStats(), new TableStats()};
  long[] avgRows = new long[]{0, 0};
  long[] numBytes = new long[]{0, 0};
  long[] readBytes = new long[]{0, 0};
  long[] numRows = new long[]{0, 0};
  int[] numBlocks = new int[]{0, 0};
  int[] numOutputs = new int[]{0, 0};

  List<ColumnStats> columnStatses = Lists.newArrayList();

  MasterPlan masterPlan = subQuery.getMasterPlan();
  for (ExecutionBlock block : masterPlan.getChilds(subQuery.getBlock())) {
    SubQuery childSubQuery = subQuery.context.getSubQuery(block.getId());
    // Index 0: input stats, index 1: result stats.
    TableStats[] childStatArray = new TableStats[]{
        childSubQuery.getInputStats(), childSubQuery.getResultStats()
    };
    for (int i = 0; i < 2; i++) {
      if (childStatArray[i] == null) {
        continue;
      }
      avgRows[i] += childStatArray[i].getAvgRows();
      numBlocks[i] += childStatArray[i].getNumBlocks();
      numBytes[i] += childStatArray[i].getNumBytes();
      readBytes[i] += childStatArray[i].getReadBytes();
      numOutputs[i] += childStatArray[i].getNumShuffleOutputs();
      numRows[i] += childStatArray[i].getNumRows();
    }
    // Apply the same null tolerance as the loop above.
    if (childStatArray[1] != null) {
      columnStatses.addAll(childStatArray[1].getColumnStats());
    }
  }

  for (int i = 0; i < 2; i++) {
    stat[i].setNumBlocks(numBlocks[i]);
    stat[i].setNumBytes(numBytes[i]);
    stat[i].setReadBytes(readBytes[i]);
    stat[i].setNumShuffleOutputs(numOutputs[i]);
    stat[i].setNumRows(numRows[i]);
    stat[i].setAvgRows(avgRows[i]);
  }
  stat[1].setColumnStats(columnStatses);
  return stat;
}
/**
 * Aggregates the statistics reported by the tasks of this subquery.
 *
 * <p>A task without any attempt contributes no input statistics; getProgress() already
 * treats a null last attempt as possible, so it is tolerated here as well.
 *
 * @return a two-element array: index 0 holds the input stats, index 1 the result stats
 */
private TableStats[] computeStatFromTasks() {
  List<TableStats> inputStatsList = Lists.newArrayList();
  List<TableStats> resultStatsList = Lists.newArrayList();
  for (QueryUnit unit : getQueryUnits()) {
    resultStatsList.add(unit.getStats());
    TableStats attemptInputStats =
        unit.getLastAttempt() == null ? null : unit.getLastAttempt().getInputStats();
    if (attemptInputStats != null) {
      inputStatsList.add(attemptInputStats);
    }
  }
  TableStats inputStats = StatisticsUtil.aggregateTableStat(inputStatsList);
  TableStats resultStats = StatisticsUtil.aggregateTableStat(resultStatsList);
  return new TableStats[]{inputStats, resultStats};
}
/** Stops the task scheduler if one has been created; does nothing otherwise. */
private void stopScheduler() {
  if (taskScheduler == null) {
    return;
  }
  taskScheduler.stop();
}
/** Requests remote cleanup of every container currently recorded for this subquery. */
private void releaseContainers() {
  Collection<Container> recorded = containers.values();
  eventHandler.handle(new TaskRunnerGroupEvent(EventType.CONTAINER_REMOTE_CLEANUP, getId(), recorded));
}
/**
 * Requests remote cleanup of a single container.
 *
 * <p>If the id is not recorded for this subquery, nothing is sent; previously a cleanup
 * event carrying a null container was published in that case.
 *
 * @param containerId id of the container to release
 */
public void releaseContainer(ContainerId containerId) {
  Container container = containers.get(containerId);
  if (container == null) {
    LOG.warn("No container " + containerId + " is registered in " + getId() + "; skip releasing it");
    return;
  }
  List<Container> list = new ArrayList<Container>(1);
  list.add(container);
  eventHandler.handle(new TaskRunnerGroupEvent(EventType.CONTAINER_REMOTE_CLEANUP, getId(), list));
}
/**
 * Computes the final statistics and fills in the schema and table meta of the
 * intermediate result.
 */
private void finalizeStats() {
  final TableStats[] computed;
  if (!block.hasUnion()) {
    computed = computeStatFromTasks();
  } else {
    computed = computeStatFromUnionBlock(this);
  }

  DataChannel outgoing = masterPlan.getOutgoingChannels(getId()).get(0);

  // CSV unless the plan contains a store node (i.e., CREATE or INSERT OVERWRITE).
  StoreTableNode storeNode = PlannerUtil.findTopNode(getBlock().getPlan(), NodeType.STORE);
  CatalogProtos.StoreType outputType = CatalogProtos.StoreType.CSV;
  if (storeNode != null) {
    outputType = storeNode.getStorageType();
  }

  this.schema = outgoing.getSchema();
  this.meta = CatalogUtil.newTableMeta(outputType, new Options());
  this.inputStatistics = computed[0];
  this.resultStatistics = computed[1];
}
/**
 * Feeds an event into the state machine while holding the write lock.
 *
 * <p>An event that is invalid in the current state is logged and converted into an
 * SQ_INTERNAL_ERROR event. The lock is acquired before entering the try block so that
 * the finally clause never unlocks a lock that was not obtained.
 *
 * @param event the event to process
 */
@Override
public void handle(SubQueryEvent event) {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Processing " + event.getSubQueryId() + " of type " + event.getType() + ", preState=" + getState());
  }

  writeLock.lock();
  try {
    SubQueryState oldState = getState();
    try {
      getStateMachine().doTransition(event.getType(), event);
    } catch (InvalidStateTransitonException e) {
      LOG.error("Can't handle this event at current state", e);
      eventHandler.handle(new SubQueryEvent(getId(),
          SubQueryEventType.SQ_INTERNAL_ERROR));
    }

    // Report the state change, if any, at debug level.
    if (LOG.isDebugEnabled()) {
      if (oldState != getState()) {
        LOG.debug(getId() + " SubQuery Transitioned from " + oldState + " to "
            + getState());
      }
    }
  } finally {
    writeLock.unlock();
  }
}
/**
 * Forwards a task request to the task scheduler.
 *
 * @param event the task request to forward
 */
public void handleTaskRequestEvent(TaskRequestEvent event) {
  this.taskScheduler.handleTaskRequestEvent(event);
}
private static class InitAndRequestContainer implements MultipleArcTransition<SubQuery,
SubQueryEvent, SubQueryState> {
/**
 * Handles SQ_INIT: records the start time, schedules the work of the execution block and
 * requests containers.
 *
 * @return SUCCEEDED for a union block or when nothing had to be scheduled, INITED when
 *         containers were requested, ERROR when any exception was thrown
 */
@Override
public SubQueryState transition(SubQuery subQuery, SubQueryEvent subQueryEvent) {
  subQuery.setStartTime();
  ExecutionBlock execBlock = subQuery.getBlock();

  try {
    // Union operator does not require actual query processing. It is performed logically.
    if (execBlock.hasUnion()) {
      subQuery.finalizeStats();
      return SubQueryState.SUCCEEDED;
    }

    ExecutionBlock parent = subQuery.getMasterPlan().getParent(subQuery.getBlock());
    DataChannel channel = subQuery.getMasterPlan().getChannel(subQuery.getId(), parent.getId());
    setShuffleIfNecessary(subQuery, channel);
    initTaskScheduler(subQuery);
    schedule(subQuery);
    subQuery.totalScheduledObjectsCount = subQuery.getTaskScheduler().remainingScheduledObjectNum();
    LOG.info(subQuery.totalScheduledObjectsCount + " objects are scheduled");

    if (subQuery.getTaskScheduler().remainingScheduledObjectNum() != 0) {
      subQuery.taskScheduler.start();
      allocateContainers(subQuery);
      return SubQueryState.INITED;
    }

    // Nothing to run: finish immediately.
    subQuery.stopScheduler();
    subQuery.finalizeStats();
    subQuery.eventHandler.handle(new SubQueryCompletedEvent(subQuery.getId(), SubQueryState.SUCCEEDED));
    return SubQueryState.SUCCEEDED;
  } catch (Exception e) {
    LOG.error("SubQuery (" + subQuery.getId() + ") ERROR: ", e);
    subQuery.setFinishTime();
    subQuery.eventHandler.handle(new SubQueryDiagnosticsUpdateEvent(subQuery.getId(), e.getMessage()));
    subQuery.eventHandler.handle(new SubQueryCompletedEvent(subQuery.getId(), SubQueryState.ERROR));
    return SubQueryState.ERROR;
  }
}
/**
 * Creates the scheduler context and the task scheduler of the given subquery and
 * initializes the scheduler with the query configuration.
 *
 * @param subQuery the subquery to equip with a task scheduler
 * @throws IOException declared by this method; propagated to the caller
 */
private void initTaskScheduler(SubQuery subQuery) throws IOException {
  final TajoConf conf = subQuery.context.getConf();

  TaskSchedulerContext newContext = new TaskSchedulerContext(subQuery.context,
      subQuery.getMasterPlan().isLeaf(subQuery.getId()), subQuery.getId());
  subQuery.schedulerContext = newContext;

  subQuery.taskScheduler = TaskSchedulerFactory.get(conf, subQuery.schedulerContext, subQuery);
  subQuery.taskScheduler.init(conf);
  LOG.info(subQuery.taskScheduler.getName() + " is chosen for the task scheduling for " + subQuery.getId());
}
/**
 * When the channel uses any shuffle type other than NONE_SHUFFLE, computes the number of
 * shuffle outputs and applies it to the subquery and channel.
 */
private static void setShuffleIfNecessary(SubQuery subQuery, DataChannel channel) {
  if (channel.getShuffleType() == ShuffleType.NONE_SHUFFLE) {
    return;
  }
  int shuffleOutputNum = calculateShuffleOutputNum(subQuery, channel);
  Repartitioner.setShuffleOutputNumForTwoPhase(subQuery, shuffleOutputNum, channel);
}
/**
 * Sums the memory of all workers known to the query master.
 *
 * @param subQuery the subquery whose context gives access to the query master
 * @return total memory in megabytes
 */
private static int getClusterTotalMemory(SubQuery subQuery) {
  List<TajoMasterProtocol.WorkerResourceProto> allWorkers =
      subQuery.context.getQueryMasterContext().getQueryMaster().getAllWorker();

  int sumMB = 0;
  Iterator<TajoMasterProtocol.WorkerResourceProto> cursor = allWorkers.iterator();
  while (cursor.hasNext()) {
    sumMB += cursor.next().getMemoryMB();
  }
  return sumMB;
}
/**
 * Determines the desired number of shuffle outputs (partitions) from the volume of the
 * input data. Three cases are distinguished by looking at the parent block: a parent with
 * exactly two scan nodes (join), a parent containing a group-by node (aggregation), and
 * everything else.
 *
 * @param subQuery the subquery whose output is shuffled
 * @param channel  the outgoing channel (not read by this method)
 * @return the number of shuffle outputs
 */
public static int calculateShuffleOutputNum(SubQuery subQuery, DataChannel channel) {
  final TajoConf conf = subQuery.context.getConf();
  final MasterPlan masterPlan = subQuery.getMasterPlan();
  final ExecutionBlock parent = masterPlan.getParent(subQuery.getBlock());

  GroupbyNode grpNode = null;
  if (parent != null) {
    grpNode = PlannerUtil.findMostBottomNode(parent.getPlan(), NodeType.GROUP_BY);
  }

  // Is this subquery the first step of join?
  if (parent != null && parent.getScanNodes().length == 2) {
    List<ExecutionBlock> siblings = masterPlan.getChilds(parent);

    ExecutionBlock outer = siblings.get(0);
    long outerVolume = getInputVolume(subQuery.masterPlan, subQuery.context, outer);

    ExecutionBlock inner = siblings.get(1);
    long innerVolume = getInputVolume(subQuery.masterPlan, subQuery.context, inner);

    LOG.info(subQuery.getId() + ", Outer volume: " + Math.ceil((double) outerVolume / 1048576) + "MB, "
        + "Inner volume: " + Math.ceil((double) innerVolume / 1048576) + "MB");

    long largerVolume = Math.max(outerVolume, innerVolume);
    int mb = (int) Math.ceil((double) largerVolume / 1048576);
    LOG.info(subQuery.getId() + ", Bigger Table's volume is approximately " + mb + " MB");

    int taskNum = (int) Math.ceil((double) mb /
        conf.getIntVar(ConfVars.DIST_QUERY_JOIN_PARTITION_VOLUME));

    int totalMem = getClusterTotalMemory(subQuery);
    LOG.info(subQuery.getId() + ", Total memory of cluster is " + totalMem + " MB");
    int slots = Math.max(totalMem / conf.getIntVar(ConfVars.TASK_DEFAULT_MEMORY), 1);

    // The size-based estimate is capped by the number of slots.
    taskNum = Math.min(taskNum, slots);
    LOG.info(subQuery.getId() + ", The determined number of join partitions is " + taskNum);

    // The shuffle output numbers of join may be inconsistent by execution block order.
    // Thus, we need to compare the number with DataChannel output numbers.
    // If the number is right, the number and DataChannel output numbers will be consistent.
    int outerOutputs = 0;
    for (DataChannel eachChannel : masterPlan.getOutgoingChannels(outer.getId())) {
      outerOutputs = Math.max(outerOutputs, eachChannel.getShuffleOutputNum());
    }
    int innerOutputs = 0;
    for (DataChannel eachChannel : masterPlan.getOutgoingChannels(inner.getId())) {
      innerOutputs = Math.max(innerOutputs, eachChannel.getShuffleOutputNum());
    }

    boolean sidesDisagree = outerOutputs != innerOutputs;
    boolean matchesNeitherSide = taskNum != outerOutputs && taskNum != innerOutputs;
    if (sidesDisagree && matchesNeitherSide) {
      taskNum = Math.max(outerOutputs, innerOutputs);
    }
    return taskNum;
  }

  // Is this subquery the first step of group-by?
  if (grpNode != null) {
    if (grpNode.getGroupingColumns().length == 0) {
      return 1;
    }

    long volume = getInputVolume(subQuery.masterPlan, subQuery.context, subQuery.block);
    int mb = (int) Math.ceil((double) volume / 1048576);
    LOG.info(subQuery.getId() + ", Table's volume is approximately " + mb + " MB");

    int taskNumBySize = (int) Math.ceil((double) mb /
        conf.getIntVar(ConfVars.DIST_QUERY_GROUPBY_PARTITION_VOLUME));

    int totalMem = getClusterTotalMemory(subQuery);
    LOG.info(subQuery.getId() + ", Total memory of cluster is " + totalMem + " MB");
    int slots = Math.max(totalMem / conf.getIntVar(ConfVars.TASK_DEFAULT_MEMORY), 1);

    int taskNum = Math.min(taskNumBySize, slots); // Maximum partitions
    LOG.info(subQuery.getId() + ", The determined number of aggregation partitions is " + taskNum);
    return taskNum;
  }

  // Neither join nor group-by.
  LOG.info("============>>>>> Unexpected Case! <<<<<================");
  long volume = getInputVolume(subQuery.masterPlan, subQuery.context, subQuery.block);
  int mb = (int) Math.ceil((double)volume / 1048576);
  LOG.info(subQuery.getId() + ", Table's volume is approximately " + mb + " MB");

  // One task per 128MB.
  int taskNum = (int) Math.ceil((double)mb / 128);
  LOG.info(subQuery.getId() + ", The determined number of partitions is " + taskNum);
  return taskNum;
}
/**
 * Schedules the work of a subquery according to the shape of its execution block:
 * a leaf block with one scan, a block with several scans (join), or anything else.
 *
 * @param subQuery the subquery to schedule
 * @throws IOException if the underlying fragment scheduling fails
 */
private static void schedule(SubQuery subQuery) throws IOException {
  final MasterPlan masterPlan = subQuery.getMasterPlan();
  final ExecutionBlock execBlock = subQuery.getBlock();

  // Case 1: Just Scan
  if (subQuery.getMasterPlan().isLeaf(execBlock.getId()) && execBlock.getScanNodes().length == 1) {
    scheduleFragmentsForLeafQuery(subQuery);
    return;
  }

  // Case 2: Join
  if (execBlock.getScanNodes().length > 1) {
    Repartitioner.scheduleFragmentsForJoinQuery(subQuery.schedulerContext, subQuery);
    return;
  }

  // Case 3: Others (Sort or Aggregation)
  int nonLeafTaskNum = getNonLeafTaskNum(subQuery);
  Repartitioner.scheduleFragmentsForNonLeafTasks(subQuery.schedulerContext, masterPlan, subQuery, nonLeafTaskNum);
}
/**
 * Determines the desired number of non-leaf tasks from the volume of the input data:
 * one task per 64MB, and at least one task.
 *
 * @param subQuery the subquery to size
 * @return the number of tasks, never less than 1
 */
public static int getNonLeafTaskNum(SubQuery subQuery) {
  // Size of the intermediate data feeding this block.
  long inputBytes = getInputVolume(subQuery.getMasterPlan(), subQuery.context, subQuery.getBlock());

  int mb = (int) Math.ceil((double)inputBytes / 1048576);
  LOG.info("Table's volume is approximately " + mb + " MB");

  int tasksBySize = (int) Math.ceil((double)mb / 64);
  int maxTaskNum = Math.max(1, tasksBySize);
  LOG.info("The determined number of non-leaf tasks is " + maxTaskNum);
  return maxTaskNum;
}
/**
 * Estimates the input volume of an execution block in bytes.
 *
 * <p>For a leaf block this is the largest table among its scan nodes. For any other block
 * it is the sum over its children: the result size of a child that has SUCCEEDED, or the
 * recursively estimated input volume of a child that has not.
 *
 * @param masterPlan the global plan
 * @param context    context giving access to table descriptors and subqueries
 * @param execBlock  the block to estimate
 * @return the estimated volume in bytes
 */
public static long getInputVolume(MasterPlan masterPlan, QueryMasterTask.QueryMasterTaskContext context,
                                  ExecutionBlock execBlock) {
  Map<String, TableDesc> tableMap = context.getTableDescMap();

  if (!masterPlan.isLeaf(execBlock)) {
    long aggregatedVolume = 0;
    for (ExecutionBlock childBlock : masterPlan.getChilds(execBlock)) {
      SubQuery child = context.getSubQuery(childBlock.getId());
      if (child != null && child.getState() == SubQueryState.SUCCEEDED) {
        aggregatedVolume += child.getResultStats().getNumBytes();
      } else {
        aggregatedVolume += getInputVolume(masterPlan, context, childBlock);
      }
    }
    return aggregatedVolume;
  }

  long maxVolume = 0;
  for (ScanNode eachScanNode : execBlock.getScanNodes()) {
    TableStats stat = tableMap.get(eachScanNode.getCanonicalName()).getStats();
    if (stat.getNumBytes() > maxVolume) {
      maxVolume = stat.getNumBytes();
    }
  }
  return maxVolume;
}
/**
 * Publishes a container request for the given subquery. The number of containers is
 * computed by the resource allocator from the estimated task number, with 512MB of
 * memory per task.
 *
 * @param subQuery the subquery that needs containers
 */
public static void allocateContainers(SubQuery subQuery) {
  final ExecutionBlock execBlock = subQuery.getBlock();

  //TODO consider disk slot
  final int requiredMemoryMBPerTask = 512;

  final int numRequest = subQuery.getContext().getResourceAllocator().calculateNumRequestContainers(
      subQuery.getContext().getQueryMasterContext().getWorkerContext(),
      subQuery.schedulerContext.getEstimatedTaskNum(),
      requiredMemoryMBPerTask
  );

  final Resource perContainer = Records.newRecord(Resource.class);
  perContainer.setMemory(requiredMemoryMBPerTask);

  LOG.info("Request Container for " + subQuery.getId() + " containers=" + numRequest);

  final Priority requestPriority = Records.newRecord(Priority.class);
  requestPriority.setPriority(subQuery.getPriority());

  subQuery.eventHandler.handle(
      new ContainerAllocationEvent(ContainerAllocatorEventType.CONTAINER_REQ,
          subQuery.getId(), requestPriority, perContainer, numRequest,
          subQuery.masterPlan.isLeaf(execBlock), 0.0f));
}
/**
 * Schedules the fragments of a leaf block that contains exactly one scan and records the
 * task size and the estimated task number in the scheduler context.
 *
 * @param subQuery the leaf subquery
 * @throws IOException if the fragments cannot be obtained
 */
private static void scheduleFragmentsForLeafQuery(SubQuery subQuery) throws IOException {
  final ExecutionBlock execBlock = subQuery.getBlock();
  final ScanNode[] scans = execBlock.getScanNodes();
  Preconditions.checkArgument(scans.length == 1, "Must be Scan Query");

  final ScanNode scan = scans[0];
  final TableDesc table = subQuery.context.getTableDescMap().get(scan.getCanonicalName());
  final TableMeta meta = table.getMeta();

  // A partitioned-table scan yields fragments for all partitions; any other scan yields
  // the splits of the table path, which may span several blocks or files.
  final Collection<FileFragment> fragments;
  if (scan.getType() != NodeType.PARTITIONS_SCAN) {
    Path inputPath = table.getPath();
    fragments = subQuery.getStorageManager().getSplits(scan.getCanonicalName(), meta, table.getSchema(), inputPath);
  } else {
    fragments = Repartitioner.getFragmentsFromPartitionedTable(subQuery.getStorageManager(), scan, table);
  }

  SubQuery.scheduleFragments(subQuery, fragments);

  if (!(subQuery.getTaskScheduler() instanceof DefaultTaskScheduler)) {
    TajoConf conf = subQuery.context.getConf();
    subQuery.schedulerContext.setTaskSize(conf.getIntVar(ConfVars.TASK_DEFAULT_SIZE) * 1024 * 1024);
    int estimatedTaskNum = (int) Math.ceil((double) table.getStats().getNumBytes() /
        (double) subQuery.schedulerContext.getTaskSize());
    subQuery.schedulerContext.setEstimatedTaskNum(estimatedTaskNum);
  } else {
    // With DefaultTaskScheduler there is one leaf task per fragment; the estimated
    // task number determines the initial number of containers.
    subQuery.schedulerContext.setTaskSize(fragments.size());
    subQuery.schedulerContext.setEstimatedTaskNum(fragments.size());
  }
}
}
/**
 * Hands a single fragment to the task scheduler of the given subquery.
 *
 * @param subQuery the owning subquery
 * @param fragment the fragment to schedule
 */
public static void scheduleFragment(SubQuery subQuery, FileFragment fragment) {
  FragmentScheduleEvent scheduleEvent = new FragmentScheduleEvent(
      TaskSchedulerEvent.EventType.T_SCHEDULE, subQuery.getId(), fragment);
  subQuery.taskScheduler.handle(scheduleEvent);
}
/**
 * Schedules every fragment of the collection, in iteration order.
 *
 * @param subQuery  the owning subquery
 * @param fragments the fragments to schedule
 */
public static void scheduleFragments(SubQuery subQuery, Collection<FileFragment> fragments) {
  Iterator<FileFragment> cursor = fragments.iterator();
  while (cursor.hasNext()) {
    scheduleFragment(subQuery, cursor.next());
  }
}
/**
 * Schedules every left fragment together with the same collection of broadcast fragments.
 *
 * @param subQuery           the owning subquery
 * @param leftFragments      the fragments scheduled one by one
 * @param broadcastFragments the fragments attached to each left fragment
 */
public static void scheduleFragments(SubQuery subQuery, Collection<FileFragment> leftFragments,
                                     Collection<FileFragment> broadcastFragments) {
  Iterator<FileFragment> cursor = leftFragments.iterator();
  while (cursor.hasNext()) {
    scheduleFragment(subQuery, cursor.next(), broadcastFragments);
  }
}
/**
 * Hands one left fragment, paired with a collection of right fragments, to the task
 * scheduler of the given subquery.
 *
 * @param subQuery       the owning subquery
 * @param leftFragment   the left fragment
 * @param rightFragments the fragments paired with it
 */
public static void scheduleFragment(SubQuery subQuery,
                                    FileFragment leftFragment, Collection<FileFragment> rightFragments) {
  FragmentScheduleEvent scheduleEvent = new FragmentScheduleEvent(
      TaskSchedulerEvent.EventType.T_SCHEDULE, subQuery.getId(), leftFragment, rightFragments);
  subQuery.taskScheduler.handle(scheduleEvent);
}
/**
 * Submits a set of fetches to the task scheduler of the given subquery. The
 * map is wrapped, unmodified, in a {@link FetchScheduleEvent} of type
 * {@code T_SCHEDULE} that carries the subquery's id.
 *
 * @param subQuery the subquery whose task scheduler receives the event
 * @param fetches  fetch URIs grouped under string keys
 */
public static void scheduleFetches(SubQuery subQuery, Map<String, List<URI>> fetches) {
  FetchScheduleEvent fetchEvent = new FetchScheduleEvent(
      TaskSchedulerEvent.EventType.T_SCHEDULE, subQuery.getId(), fetches);
  subQuery.taskScheduler.handle(fetchEvent);
}
/**
 * Creates a new {@link QueryUnit} for the given subquery and registers it
 * with that subquery via {@code addTask}.
 *
 * The unit is built from the configuration of the scheduler's master context,
 * an id derived from the scheduler's block id and {@code taskId}, the
 * scheduler's leaf-query flag and the subquery's event handler. Its logical
 * plan is set to the plan of the subquery's execution block.
 *
 * @param schedulerContext the scheduler context supplying configuration,
 *                         block id and leaf-query flag
 * @param queryUnitContext the attempt schedule context handed to the new unit
 * @param subQuery         the subquery that owns the new unit
 * @param taskId           the numeric id used to build the query unit id
 * @return the newly created and registered query unit
 */
public static QueryUnit newEmptyQueryUnit(TaskSchedulerContext schedulerContext,
                                          QueryUnitAttemptScheduleContext queryUnitContext,
                                          SubQuery subQuery, int taskId) {
  ExecutionBlock block = subQuery.getBlock();

  QueryUnit queryUnit = new QueryUnit(
      schedulerContext.getMasterContext().getConf(),
      queryUnitContext,
      QueryIdFactory.newQueryUnitId(schedulerContext.getBlockId(), taskId),
      schedulerContext.isLeafQuery(),
      subQuery.eventHandler);

  queryUnit.setLogicalPlan(block.getPlan());
  subQuery.addTask(queryUnit);

  return queryUnit;
}
/**
 * Transition that registers newly allocated containers with the subquery,
 * requests their remote launch and then posts {@code SQ_START}.
 *
 * Any throwable raised while doing so is reported as a diagnostics update
 * (its stack trace) followed by an {@code SQ_INTERNAL_ERROR} event.
 */
private static class ContainerLaunchTransition
    implements SingleArcTransition<SubQuery, SubQueryEvent> {

  @Override
  public void transition(SubQuery subQuery, SubQueryEvent event) {
    try {
      SubQueryContainerAllocationEvent launchRequest = (SubQueryContainerAllocationEvent) event;

      for (Container allocated : launchRequest.getAllocatedContainer()) {
        ContainerId containerId = allocated.getId();
        boolean alreadyRegistered = subQuery.containers.containsKey(containerId);

        if (alreadyRegistered) {
          String diagnostic = "Duplicated containers are allocated: " + containerId.toString();
          subQuery.eventHandler.handle(
              new SubQueryDiagnosticsUpdateEvent(subQuery.getId(), diagnostic));
          subQuery.eventHandler.handle(
              new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_INTERNAL_ERROR));
        }

        // NOTE(review): a duplicate is reported above, but processing deliberately
        // continues here: the container is still (re)registered and the launch
        // and SQ_START events below are still posted. Confirm this is intended.
        subQuery.containers.put(containerId, allocated);
      }

      LOG.info("SubQuery (" + subQuery.getId() + ") has " + subQuery.containers.size() + " containers!");

      TaskRunnerGroupEvent remoteLaunch = new TaskRunnerGroupEvent(
          EventType.CONTAINER_REMOTE_LAUNCH, subQuery.getId(), launchRequest.getAllocatedContainer());
      subQuery.eventHandler.handle(remoteLaunch);

      subQuery.eventHandler.handle(new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_START));
    } catch (Throwable t) {
      String stackTrace = ExceptionUtils.getStackTrace(t);
      subQuery.eventHandler.handle(new SubQueryDiagnosticsUpdateEvent(subQuery.getId(), stackTrace));
      subQuery.eventHandler.handle(new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_INTERNAL_ERROR));
    }
  }
}
/**
 * It is used in KILL_WAIT state against Container Allocated event.
 * It does not launch anything on the allocated containers; it posts a
 * {@code CONTAINER_REMOTE_CLEANUP} event for them so that they are handed
 * back, and logs how many containers were canceled.
 *
 * Any throwable raised while doing so is reported as a diagnostics update
 * (its stack trace) followed by an {@code SQ_INTERNAL_ERROR} event.
 */
private static class AllocatedContainersCancelTransition implements SingleArcTransition<SubQuery, SubQueryEvent> {

  @Override
  public void transition(SubQuery subQuery, SubQueryEvent event) {
    try {
      SubQueryContainerAllocationEvent canceledAllocation = (SubQueryContainerAllocationEvent) event;

      TaskRunnerGroupEvent remoteCleanup = new TaskRunnerGroupEvent(
          EventType.CONTAINER_REMOTE_CLEANUP, subQuery.getId(), canceledAllocation.getAllocatedContainer());
      subQuery.eventHandler.handle(remoteCleanup);

      String summary = String.format("[%s] %d allocated containers are canceled",
          subQuery.getId().toString(),
          canceledAllocation.getAllocatedContainer().size());
      LOG.info(summary);
    } catch (Throwable t) {
      String stackTrace = ExceptionUtils.getStackTrace(t);
      subQuery.eventHandler.handle(new SubQueryDiagnosticsUpdateEvent(subQuery.getId(), stackTrace));
      subQuery.eventHandler.handle(new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_INTERNAL_ERROR));
    }
  }
}
/**
 * Transition that accounts for a finished task of the subquery.
 *
 * If the task named by the event is unknown to the subquery, an error is
 * logged and {@code SQ_FAILED} is posted. Otherwise the matching
 * succeeded/killed/failed counter is incremented, a failed task additionally
 * triggers {@code SQ_KILL}, and once the three counters add up to the total
 * number of scheduled objects {@code SQ_SUBQUERY_COMPLETED} is posted.
 */
private static class TaskCompletedTransition implements SingleArcTransition<SubQuery, SubQueryEvent> {

  @Override
  public void transition(SubQuery subQuery,
                         SubQueryEvent event) {
    SubQueryTaskEvent taskEvent = (SubQueryTaskEvent) event;
    QueryUnit task = subQuery.getQueryUnit(taskEvent.getTaskId());

    if (task == null) { // the subquery does not know the reported task
      LOG.error(String.format("Task %s is absent", taskEvent.getTaskId()));
      subQuery.eventHandler.handle(new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_FAILED));
      return;
    }

    subQuery.completedTaskCount++;

    // NOTE(review): success is read from the event while killed/failed are read
    // from the task's current state. Kept as is; confirm whether all three
    // should use the state carried by the event.
    if (taskEvent.getState() == TaskState.SUCCEEDED) {
      subQuery.succeededObjectCount++;
    } else if (task.getState() == TaskState.KILLED) {
      subQuery.killedObjectCount++;
    } else if (task.getState() == TaskState.FAILED) {
      subQuery.failedObjectCount++;
      // if at least one task is failed, try to kill all tasks.
      subQuery.eventHandler.handle(new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_KILL));
    }

    LOG.info(String.format("[%s] Task Completion Event (Total: %d, Success: %d, Killed: %d, Failed: %d)",
        subQuery.getId(),
        subQuery.getTotalScheduledObjectsCount(),
        subQuery.succeededObjectCount,
        subQuery.killedObjectCount,
        subQuery.failedObjectCount));

    // Every scheduled object has reached a terminal outcome.
    if (subQuery.totalScheduledObjectsCount ==
        subQuery.succeededObjectCount + subQuery.killedObjectCount + subQuery.failedObjectCount) {
      subQuery.eventHandler.handle(new SubQueryEvent(subQuery.getId(), SubQueryEventType.SQ_SUBQUERY_COMPLETED));
    }
  }
}
/**
 * Transition that stops the subquery's task scheduler and then posts a
 * {@code T_KILL} task event for every query unit of the subquery.
 */
private static class KillTasksTransition implements SingleArcTransition<SubQuery, SubQueryEvent> {

  @Override
  public void transition(SubQuery subQuery, SubQueryEvent subQueryEvent) {
    // Stop the scheduler first, then ask each query unit to stop.
    subQuery.getTaskScheduler().stop();

    for (QueryUnit eachTask : subQuery.getQueryUnits()) {
      TaskEvent killRequest = new TaskEvent(eachTask.getId(), TaskEventType.T_KILL);
      subQuery.eventHandler.handle(killRequest);
    }
  }
}
/**
 * Tears down this subquery's runtime resources by calling
 * {@code stopScheduler()} followed by {@code releaseContainers()}, in that
 * order.
 */
private void cleanup() {
  this.stopScheduler();
  this.releaseContainers();
}
private static class SubQueryCompleteTransition
implements MultipleArcTransition<SubQuery, SubQueryEvent, SubQueryState> {
@Override
public SubQueryState transition(SubQuery subQuery, SubQueryEvent subQueryEvent) {
// TODO - Commit subQuery & do cleanup
// TODO - records succeeded, failed, killed completed task
// TODO - records metrics
try {
LOG.info(String.format("subQuery completed - %s (total=%d, success=%d, killed=%d)",
subQuery.getId().toString(),
subQuery.getTotalScheduledObjectsCount(),
subQuery.getSucceededObjectCount(),
subQuery.killedObjectCount));
if (subQuery.killedObjectCount > 0 || subQuery.failedObjectCount > 0) {
if (subQuery.failedObjectCount > 0) {
subQuery.abort(SubQueryState.FAILED);
return SubQueryState.FAILED;
} else if (subQuery.killedObjectCount > 0) {
subQuery.abort(SubQueryState.KILLED);
return SubQueryState.KILLED;
} else {
LOG.error("Invalid State " + subQuery.getState() + " State");
subQuery.abort(SubQueryState.ERROR);
return SubQueryState.ERROR;
}
} else {
subQuery.complete();
return SubQueryState.SUCCEEDED;
}
} catch (Throwable t) {
LOG.error(t);
subQuery.abort(SubQueryState.ERROR);
return SubQueryState.ERROR;
}
}
}
/**
 * Transition that appends the diagnostic text carried by a
 * {@link SubQueryDiagnosticsUpdateEvent} to the subquery's diagnostics.
 */
private static class DiagnosticsUpdateTransition implements SingleArcTransition<SubQuery, SubQueryEvent> {

  @Override
  public void transition(SubQuery subQuery, SubQueryEvent event) {
    SubQueryDiagnosticsUpdateEvent diagnosticsEvent = (SubQueryDiagnosticsUpdateEvent) event;
    subQuery.addDiagnostic(diagnosticsEvent.getDiagnosticUpdate());
  }
}
/**
 * Transition that aborts the subquery with the {@code ERROR} state. The
 * triggering event itself is not inspected.
 */
private static class InternalErrorTransition implements SingleArcTransition<SubQuery, SubQueryEvent> {

  @Override
  public void transition(final SubQuery subQuery, final SubQueryEvent subQueryEvent) {
    subQuery.abort(SubQueryState.ERROR);
  }
}
}