/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.execution;

import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.diqube.data.column.ColumnShard;
import org.diqube.data.table.TableShard;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.cache.ColumnShardCache;
import org.diqube.queries.QueryUuid.QueryUuidThreadState;
import org.diqube.remote.cluster.thrift.RColOrValue;
import org.diqube.remote.cluster.thrift.RExecutionPlan;
import org.diqube.remote.cluster.thrift.RExecutionPlanStep;
import org.diqube.remote.cluster.thrift.RExecutionPlanStepDataType;
import org.diqube.remote.cluster.thrift.RExecutionPlanStepType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Optimizes a {@link RExecutionPlan} that was received from a query master according to the circumstances a local
 * {@link TableShard} provides.
 *
 * @author Bastian Gloeckle
 */
public class RemoteExecutionPlanOptimizer {
  private static final Logger logger = LoggerFactory.getLogger(RemoteExecutionPlanOptimizer.class);

  /**
   * Optimizes the given plan to be executed on the given {@link ExecutionEnvironment}.
   *
   * <p>
   * Note that when running this, {@link ColumnShard}s from the {@link ColumnShardCache} might already be put into the
   * provided {@link ExecutionEnvironment}.
   *
   * <p>
   * This method must be executed with correct {@link QueryUuidThreadState} set, as it accesses the
   * {@link ExecutionEnvironment}.
   *
   * @param defaultEnv
   *          The {@link ExecutionEnvironment} the resulting plan should be executed on. This is expected to be backed
   *          by a concrete {@link TableShard} (and probably a {@link ColumnShardCache}). The properties of these
   *          objects will be inspected for optimizing the plan - e.g. if a specific column is available in the cache
   *          already, we do not need to execute a ProjectStep that would create the same column, so that ProjectStep
   *          will be removed in the resulting executable plan.
   * @param plan
   *          The source plan as provided by the query master. That plan is basically a plan which we'd have to execute
   *          if there were no caches etc. We are free though to adjust that plan to the circumstances we find in the
   *          local {@link TableShard} and {@link ColumnShardCache} that we'll be executing on. Important is just that
   *          our plan creates the same output as the one that the query master sent. The passed object itself is not
   *          changed, the optimization works on a copy.
   * @return A new {@link RExecutionPlan}, optimized to be executed on the given env.
   */
  public RExecutionPlan optimize(ExecutionEnvironment defaultEnv, RExecutionPlan plan) {
    // Work on a copy so the plan object of the caller stays untouched.
    RExecutionPlan res = new RExecutionPlan(plan);

    removeUnneededColumnCreations(defaultEnv, res);

    if (!res.equals(plan)) {
      logger.info("Optimized plan to {}", res);
    }

    return res;
  }

  /**
   * Checks all {@link RExecutionPlanStepType#PROJECT}, {@link RExecutionPlanStepType#REPEATED_PROJECT} and
   * {@link RExecutionPlanStepType#COLUMN_AGGREGATE} steps and identifies steps that do not need to be executed, because
   * their result column exists already (e.g. in a cache). It will then remove these steps and all steps that would be
   * executed only for their results to be fed into the removed steps (transitively).
   *
   * @param defaultEnv
   *          The environment that is asked whether a column is available already.
   * @param plan
   *          The plan to adjust. It is changed in-place.
   */
  private void removeUnneededColumnCreations(ExecutionEnvironment defaultEnv, RExecutionPlan plan) {
    // output column name -> step creating that column
    Map<String, RExecutionPlanStep> columnCreatingSteps = new HashMap<>();
    // output column name -> distinct columns the creating step reads. This has to be a Set: a step that uses the same
    // input column in more than one argument is still only /one/ follow-up step of that input column, so it must
    // decrement the follow-up counter of that column only once when it is removed.
    Map<String, Set<String>> sourceColumns = new HashMap<>();
    // output column name -> number of steps that are wired to receive COLUMN_BUILT data from the creating step
    Map<String, Integer> numberOfFollowUpSteps = new HashMap<>();

    for (RExecutionPlanStep step : plan.getSteps()) {
      switch (step.getType()) {
      case COLUMN_AGGREGATE:
      case PROJECT:
      case REPEATED_PROJECT:
        String outCol = step.getDetailsFunction().getResultColumn().getColName();
        columnCreatingSteps.put(outCol, step);

        // The wiring is optional (see the isSet check when cleaning up below): no wiring means no follow-up steps.
        int colBuiltFollowUpCount = 0;
        if (step.isSetProvideDataForSteps()) {
          colBuiltFollowUpCount = (int) step.getProvideDataForSteps().values().stream()
              .filter(dataTypes -> dataTypes.contains(RExecutionPlanStepDataType.COLUMN_BUILT)).count();
        }
        numberOfFollowUpSteps.put(outCol, colBuiltFollowUpCount);

        Set<String> inputCols = new HashSet<>();
        for (RColOrValue fnParam : step.getDetailsFunction().getFunctionArguments()) {
          if (fnParam.isSetColumn()) {
            inputCols.add(fnParam.getColumn().getColName());
          }
        }
        sourceColumns.put(outCol, inputCols);
        break;
      default:
        // steps of other types are not inspected here.
      }
    }

    // Work on those steps whose output col is available already. They basically do not need to provide data to anyone
    // any more, as the cols are available already.
    // Note that this will /never/ happen for RepeatedProjectSteps, as their output column has '[*]' appended - that
    // column will never exist. This is because that step will not only create one, but multiple columns (a repeated
    // field). These steps will therefore /always/ run, even when all of their output cols would be in the cache,
    // unless their results are needed only for columns which in turn are already cached. Running the
    // RepeatedProjectStep though is not as bad, as that step itself checks what columns it needs to create and which
    // ones are available.
    Deque<String> emptyStepQueue = new LinkedList<>();
    for (String colName : columnCreatingSteps.keySet()) {
      if (defaultEnv.getColumnShard(colName) != null) {
        logger.trace("Column {} is available already (cache). Will remove the corresponding step from the plan.",
            colName);
        numberOfFollowUpSteps.put(colName, 0);
        emptyStepQueue.add(colName);
      }
    }

    Set<RExecutionPlanStep> stepsToDelete = new HashSet<>();

    // now keep searching the steps which have no output any more, marking the steps with no output for removal.
    Set<String> columnsWorkedOn = new HashSet<>();
    while (!emptyStepQueue.isEmpty()) {
      String colWithNoOutput = emptyStepQueue.poll();
      if (!columnsWorkedOn.add(colWithNoOutput)) {
        // handled before already, do not decrement the counters of its source columns a second time.
        continue;
      }

      stepsToDelete.add(columnCreatingSteps.get(colWithNoOutput));

      for (String sourceCol : sourceColumns.get(colWithNoOutput)) {
        // only work on cols we know of - i.e. not on cols of the TableShard, but only on function-created cols.
        if (sourceColumns.containsKey(sourceCol)) {
          int newCount = numberOfFollowUpSteps.compute(sourceCol, (k, count) -> count - 1);
          if (newCount == 0) {
            emptyStepQueue.add(sourceCol);
          }
        }
      }
    }

    if (!stepsToDelete.isEmpty()) {
      // remove the steps
      plan.getSteps().removeAll(stepsToDelete);

      logger.trace("Removing following steps from plan because their result is not needed: {}", stepsToDelete);

      // If any remaining step provided input to the removed steps, we need to remove that input provider.
      Set<Integer> stepIdsRemoved = stepsToDelete.stream().map(step -> step.getStepId()).collect(Collectors.toSet());
      for (RExecutionPlanStep step : plan.getSteps()) {
        if (step.isSetProvideDataForSteps() && !step.getProvideDataForSteps().isEmpty()) {
          step.getProvideDataForSteps().keySet().removeAll(stepIdsRemoved);
        }
      }
    }
  }
}