/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.planner.logical.partition;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import com.google.common.base.Stopwatch;
import org.apache.calcite.adapter.enumerable.EnumerableTableScan;
import org.apache.calcite.rel.core.Filter;
import org.apache.calcite.rel.core.Project;
import org.apache.calcite.rel.core.TableScan;
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.util.BitSets;
import org.apache.drill.common.expression.ErrorCollectorImpl;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.expr.ExpressionTreeMaterializer;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.expr.fn.interpreter.InterpreterEvaluator;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.OptimizerRulesContext;
import org.apache.drill.exec.physical.base.FileGroupScan;
import org.apache.drill.exec.physical.base.GroupScan;
import org.apache.drill.exec.planner.FileSystemPartitionDescriptor;
import org.apache.drill.exec.planner.PartitionDescriptor;
import org.apache.drill.exec.planner.PartitionLocation;
import org.apache.drill.exec.planner.logical.DrillOptiq;
import org.apache.drill.exec.planner.logical.DrillParseContext;
import org.apache.drill.exec.planner.logical.DrillScanRel;
import org.apache.drill.exec.planner.logical.DrillTable;
import org.apache.drill.exec.planner.logical.DrillTranslatableTable;
import org.apache.drill.exec.planner.logical.RelOptHelper;
import org.apache.drill.exec.planner.physical.PlannerSettings;
import org.apache.drill.exec.planner.physical.PrelUtil;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.store.StoragePluginOptimizerRule;
import org.apache.drill.exec.store.dfs.FormatSelection;
import org.apache.drill.exec.store.dfs.MetadataContext;
import org.apache.drill.exec.store.dfs.MetadataContext.PruneStatus;
import org.apache.drill.exec.vector.NullableBitVector;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.plan.RelOptRuleOperand;
import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.rex.RexNode;
import org.apache.commons.lang3.tuple.Pair;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.drill.exec.vector.ValueVector;
public abstract class PruneScanRule extends StoragePluginOptimizerRule {
static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(PruneScanRule.class);
final OptimizerRulesContext optimizerContext;
public PruneScanRule(RelOptRuleOperand operand, String id, OptimizerRulesContext optimizerContext) {
super(operand, id);
this.optimizerContext = optimizerContext;
}
private static class DirPruneScanFilterOnProjectRule extends PruneScanRule {
public DirPruneScanFilterOnProjectRule(OptimizerRulesContext optimizerRulesContext) {
super(RelOptHelper.some(Filter.class, RelOptHelper.some(Project.class, RelOptHelper.any(TableScan.class))), "DirPruneScanRule:Filter_On_Project", optimizerRulesContext);
}
@Override
public PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel) {
return new FileSystemPartitionDescriptor(settings, scanRel);
}
@Override
public boolean matches(RelOptRuleCall call) {
final TableScan scan = call.rel(2);
return isQualifiedDirPruning(scan);
}
@Override
public void onMatch(RelOptRuleCall call) {
final Filter filterRel = call.rel(0);
final Project projectRel = call.rel(1);
final TableScan scanRel = call.rel(2);
doOnMatch(call, filterRel, projectRel, scanRel);
}
}
private static class DirPruneScanFilterOnScanRule extends PruneScanRule {
public DirPruneScanFilterOnScanRule(OptimizerRulesContext optimizerRulesContext) {
super(RelOptHelper.some(Filter.class, RelOptHelper.any(TableScan.class)), "DirPruneScanRule:Filter_On_Scan", optimizerRulesContext);
}
@Override
public PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel) {
return new FileSystemPartitionDescriptor(settings, scanRel);
}
@Override
public boolean matches(RelOptRuleCall call) {
final TableScan scan = call.rel(1);
return isQualifiedDirPruning(scan);
}
@Override
public void onMatch(RelOptRuleCall call) {
final Filter filterRel = call.rel(0);
final TableScan scanRel = call.rel(1);
doOnMatch(call, filterRel, null, scanRel);
}
}
public static final RelOptRule getDirFilterOnProject(OptimizerRulesContext optimizerRulesContext) {
return new DirPruneScanFilterOnProjectRule(optimizerRulesContext);
}
public static final RelOptRule getDirFilterOnScan(OptimizerRulesContext optimizerRulesContext) {
return new DirPruneScanFilterOnScanRule(optimizerRulesContext);
}
protected void doOnMatch(RelOptRuleCall call, Filter filterRel, Project projectRel, TableScan scanRel) {
final String pruningClassName = getClass().getName();
logger.info("Beginning partition pruning, pruning class: {}", pruningClassName);
Stopwatch totalPruningTime = Stopwatch.createStarted();
final PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());
PartitionDescriptor descriptor = getPartitionDescriptor(settings, scanRel);
final BufferAllocator allocator = optimizerContext.getAllocator();
final Object selection = getDrillTable(scanRel).getSelection();
MetadataContext metaContext = null;
if (selection instanceof FormatSelection) {
metaContext = ((FormatSelection)selection).getSelection().getMetaContext();
}
RexNode condition = null;
if (projectRel == null) {
condition = filterRel.getCondition();
} else {
// get the filter as if it were below the projection.
condition = RelOptUtil.pushFilterPastProject(filterRel.getCondition(), projectRel);
}
RewriteAsBinaryOperators visitor = new RewriteAsBinaryOperators(true, filterRel.getCluster().getRexBuilder());
condition = condition.accept(visitor);
Map<Integer, String> fieldNameMap = Maps.newHashMap();
List<String> fieldNames = scanRel.getRowType().getFieldNames();
BitSet columnBitset = new BitSet();
BitSet partitionColumnBitSet = new BitSet();
Map<Integer, Integer> partitionMap = Maps.newHashMap();
int relColIndex = 0;
for (String field : fieldNames) {
final Integer partitionIndex = descriptor.getIdIfValid(field);
if (partitionIndex != null) {
fieldNameMap.put(partitionIndex, field);
partitionColumnBitSet.set(partitionIndex);
columnBitset.set(relColIndex);
// mapping between the relColIndex and partitionIndex
partitionMap.put(relColIndex, partitionIndex);
}
relColIndex++;
}
if (partitionColumnBitSet.isEmpty()) {
logger.info("No partition columns are projected from the scan..continue. " +
"Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
return;
}
// stop watch to track how long we spend in different phases of pruning
Stopwatch miscTimer = Stopwatch.createUnstarted();
// track how long we spend building the filter tree
miscTimer.start();
FindPartitionConditions c = new FindPartitionConditions(columnBitset, filterRel.getCluster().getRexBuilder());
c.analyze(condition);
RexNode pruneCondition = c.getFinalCondition();
BitSet referencedDirsBitSet = c.getReferencedDirs();
logger.info("Total elapsed time to build and analyze filter tree: {} ms",
miscTimer.elapsed(TimeUnit.MILLISECONDS));
miscTimer.reset();
if (pruneCondition == null) {
logger.info("No conditions were found eligible for partition pruning." +
"Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
return;
}
// set up the partitions
List<PartitionLocation> newPartitions = Lists.newArrayList();
long numTotal = 0; // total number of partitions
int batchIndex = 0;
PartitionLocation firstLocation = null;
LogicalExpression materializedExpr = null;
String[] spInfo = null;
int maxIndex = -1;
BitSet matchBitSet = new BitSet();
// Outer loop: iterate over a list of batches of PartitionLocations
for (List<PartitionLocation> partitions : descriptor) {
numTotal += partitions.size();
logger.debug("Evaluating partition pruning for batch {}", batchIndex);
if (batchIndex == 0) { // save the first location in case everything is pruned
firstLocation = partitions.get(0);
}
final NullableBitVector output = new NullableBitVector(MaterializedField.create("", Types.optional(MinorType.BIT)), allocator);
final VectorContainer container = new VectorContainer();
try {
final ValueVector[] vectors = new ValueVector[descriptor.getMaxHierarchyLevel()];
for (int partitionColumnIndex : BitSets.toIter(partitionColumnBitSet)) {
SchemaPath column = SchemaPath.getSimplePath(fieldNameMap.get(partitionColumnIndex));
MajorType type = descriptor.getVectorType(column, settings);
MaterializedField field = MaterializedField.create(column.getAsUnescapedPath(), type);
ValueVector v = TypeHelper.getNewVector(field, allocator);
v.allocateNew();
vectors[partitionColumnIndex] = v;
container.add(v);
}
// track how long we spend populating partition column vectors
miscTimer.start();
// populate partition vectors.
descriptor.populatePartitionVectors(vectors, partitions, partitionColumnBitSet, fieldNameMap);
logger.info("Elapsed time to populate partitioning column vectors: {} ms within batchIndex: {}",
miscTimer.elapsed(TimeUnit.MILLISECONDS), batchIndex);
miscTimer.reset();
// materialize the expression; only need to do this once
if (batchIndex == 0) {
materializedExpr = materializePruneExpr(pruneCondition, settings, scanRel, container);
if (materializedExpr == null) {
// continue without partition pruning; no need to log anything here since
// materializePruneExpr logs it already
logger.info("Total pruning elapsed time: {} ms",
totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
return;
}
}
output.allocateNew(partitions.size());
// start the timer to evaluate how long we spend in the interpreter evaluation
miscTimer.start();
InterpreterEvaluator.evaluate(partitions.size(), optimizerContext, container, output, materializedExpr);
logger.info("Elapsed time in interpreter evaluation: {} ms within batchIndex: {} with # of partitions : {}",
miscTimer.elapsed(TimeUnit.MILLISECONDS), batchIndex, partitions.size());
miscTimer.reset();
int recordCount = 0;
int qualifiedCount = 0;
if (descriptor.supportsMetadataCachePruning() &&
partitions.get(0).isCompositePartition() /* apply single partition check only for composite partitions */) {
// Inner loop: within each batch iterate over the PartitionLocations
for (PartitionLocation part : partitions) {
assert part.isCompositePartition();
if(!output.getAccessor().isNull(recordCount) && output.getAccessor().get(recordCount) == 1) {
newPartitions.add(part);
// Rather than using the PartitionLocation, get the array of partition values for the directories that are
// referenced by the filter since we are not interested in directory references in other parts of the query.
Pair<String[], Integer> p = composePartition(referencedDirsBitSet, partitionMap, vectors, recordCount);
String[] parts = p.getLeft();
int tmpIndex = p.getRight();
maxIndex = Math.max(maxIndex, tmpIndex);
if (spInfo == null) { // initialization
spInfo = parts;
for (int j = 0; j <= tmpIndex; j++) {
if (parts[j] != null) {
matchBitSet.set(j);
}
}
} else {
// compare the new partition with existing partition
for (int j=0; j <= tmpIndex; j++) {
if (parts[j] == null || spInfo[j] == null) { // nulls don't match
matchBitSet.clear(j);
} else {
if (!parts[j].equals(spInfo[j])) {
matchBitSet.clear(j);
}
}
}
}
qualifiedCount++;
}
recordCount++;
}
} else {
// Inner loop: within each batch iterate over the PartitionLocations
for(PartitionLocation part: partitions){
if(!output.getAccessor().isNull(recordCount) && output.getAccessor().get(recordCount) == 1) {
newPartitions.add(part);
qualifiedCount++;
}
recordCount++;
}
}
logger.debug("Within batch {}: total records: {}, qualified records: {}", batchIndex, recordCount, qualifiedCount);
batchIndex++;
} catch (Exception e) {
logger.warn("Exception while trying to prune partition.", e);
logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
return; // continue without partition pruning
} finally {
container.clear();
if (output != null) {
output.clear();
}
}
}
try {
if (newPartitions.size() == numTotal) {
logger.info("No partitions were eligible for pruning");
return;
}
// handle the case all partitions are filtered out.
boolean canDropFilter = true;
boolean wasAllPartitionsPruned = false;
String cacheFileRoot = null;
if (newPartitions.isEmpty()) {
assert firstLocation != null;
// Add the first non-composite partition location, since execution requires schema.
// In such case, we should not drop filter.
newPartitions.add(firstLocation.getPartitionLocationRecursive().get(0));
canDropFilter = false;
// NOTE: with DRILL-4530, the PruneScanRule may be called with only a list of
// directories first and the non-composite partition location will still return
// directories, not files. So, additional processing is done depending on this flag
wasAllPartitionsPruned = true;
logger.info("All {} partitions were pruned; added back a single partition to allow creating a schema", numTotal);
// set the cacheFileRoot appropriately
if (firstLocation.isCompositePartition()) {
cacheFileRoot = descriptor.getBaseTableLocation() + firstLocation.getCompositePartitionPath();
}
}
logger.info("Pruned {} partitions down to {}", numTotal, newPartitions.size());
List<RexNode> conjuncts = RelOptUtil.conjunctions(condition);
List<RexNode> pruneConjuncts = RelOptUtil.conjunctions(pruneCondition);
conjuncts.removeAll(pruneConjuncts);
RexNode newCondition = RexUtil.composeConjunction(filterRel.getCluster().getRexBuilder(), conjuncts, false);
RewriteCombineBinaryOperators reverseVisitor = new RewriteCombineBinaryOperators(true, filterRel.getCluster().getRexBuilder());
condition = condition.accept(reverseVisitor);
pruneCondition = pruneCondition.accept(reverseVisitor);
if (descriptor.supportsMetadataCachePruning() && !wasAllPartitionsPruned) {
// if metadata cache file could potentially be used, then assign a proper cacheFileRoot
int index = -1;
if (!matchBitSet.isEmpty()) {
String path = "";
index = matchBitSet.length() - 1;
for (int j = 0; j < matchBitSet.length(); j++) {
if (!matchBitSet.get(j)) {
// stop at the first index with no match and use the immediate
// previous index
index = j-1;
break;
}
}
for (int j=0; j <= index; j++) {
path += "/" + spInfo[j];
}
cacheFileRoot = descriptor.getBaseTableLocation() + path;
}
if (index != maxIndex) {
// if multiple partitions are being selected, we should not drop the filter
// since we are reading the cache file at a parent/ancestor level
canDropFilter = false;
}
}
RelNode inputRel = descriptor.supportsMetadataCachePruning() ?
descriptor.createTableScan(newPartitions, cacheFileRoot, wasAllPartitionsPruned, metaContext) :
descriptor.createTableScan(newPartitions, wasAllPartitionsPruned);
if (projectRel != null) {
inputRel = projectRel.copy(projectRel.getTraitSet(), Collections.singletonList(inputRel));
}
if (newCondition.isAlwaysTrue() && canDropFilter) {
call.transformTo(inputRel);
} else {
final RelNode newFilter = filterRel.copy(filterRel.getTraitSet(), Collections.singletonList(inputRel));
call.transformTo(newFilter);
}
setPruneStatus(metaContext, PruneStatus.PRUNED);
} catch (Exception e) {
logger.warn("Exception while using the pruned partitions.", e);
} finally {
logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
}
}
/** Compose the array of partition values for the directories that are referenced by filter:
* e.g suppose the dir hierarchy is year/quarter/month and the query is:
* SELECT * FROM T WHERE dir0=2015 AND dir1 = 'Q1',
* then for 2015/Q1/Feb, this will have ['2015', 'Q1', null]
* If the query filter condition is WHERE dir1 = 'Q2' (i.e no dir0 condition) then the array will
* have [null, 'Q2', null]
*/
private Pair<String[], Integer> composePartition(BitSet referencedDirsBitSet,
Map<Integer, Integer> partitionMap,
ValueVector[] vectors,
int recordCount) {
String[] partition = new String[vectors.length];
int maxIndex = -1;
for (int referencedDirsIndex : BitSets.toIter(referencedDirsBitSet)) {
int partitionColumnIndex = partitionMap.get(referencedDirsIndex);
ValueVector vv = vectors[partitionColumnIndex];
if (vv.getAccessor().getValueCount() > 0 &&
vv.getAccessor().getObject(recordCount) != null) {
String value = vv.getAccessor().getObject(recordCount).toString();
partition[partitionColumnIndex] = value;
maxIndex = Math.max(maxIndex, partitionColumnIndex);
}
}
return Pair.of(partition, maxIndex);
}
protected LogicalExpression materializePruneExpr(RexNode pruneCondition,
PlannerSettings settings,
RelNode scanRel,
VectorContainer container
) {
// materialize the expression
logger.debug("Attempting to prune {}", pruneCondition);
final LogicalExpression expr = DrillOptiq.toDrill(new DrillParseContext(settings), scanRel, pruneCondition);
final ErrorCollectorImpl errors = new ErrorCollectorImpl();
LogicalExpression materializedExpr = ExpressionTreeMaterializer.materialize(expr, container, errors, optimizerContext.getFunctionRegistry());
// Make sure pruneCondition's materialized expression is always of BitType, so that
// it's same as the type of output vector.
if (materializedExpr.getMajorType().getMode() == TypeProtos.DataMode.REQUIRED) {
materializedExpr = ExpressionTreeMaterializer.convertToNullableType(
materializedExpr,
materializedExpr.getMajorType().getMinorType(),
optimizerContext.getFunctionRegistry(),
errors);
}
if (errors.getErrorCount() != 0) {
logger.warn("Failure while materializing expression [{}]. Errors: {}", expr, errors);
return null;
}
return materializedExpr;
}
protected OptimizerRulesContext getOptimizerRulesContext() {
return optimizerContext;
}
public abstract PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel);
private static DrillTable getDrillTable(final TableScan scan) {
DrillTable drillTable;
drillTable = scan.getTable().unwrap(DrillTable.class);
if (drillTable == null) {
drillTable = scan.getTable().unwrap(DrillTranslatableTable.class).getDrillTable();
}
return drillTable;
}
private static boolean isQualifiedDirPruning(final TableScan scan) {
if (scan instanceof EnumerableTableScan) {
final Object selection = getDrillTable(scan).getSelection();
if (selection instanceof FormatSelection
&& ((FormatSelection)selection).supportDirPruning()) {
return true; // Do directory-based pruning in Calcite logical
} else {
return false; // Do not do directory-based pruning in Calcite logical
}
} else if (scan instanceof DrillScanRel) {
final GroupScan groupScan = ((DrillScanRel) scan).getGroupScan();
// this rule is applicable only for dfs based partition pruning in Drill Logical
return groupScan instanceof FileGroupScan && groupScan.supportsPartitionFilterPushdown() && !((DrillScanRel)scan).partitionFilterPushdown();
}
return false;
}
private static void setPruneStatus(MetadataContext metaContext, PruneStatus pruneStatus) {
if (metaContext != null) {
metaContext.setPruneStatus(pruneStatus);
}
}
}