/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.drill.exec.planner.logical.partition; import java.util.BitSet; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import com.google.common.base.Stopwatch; import org.apache.calcite.adapter.enumerable.EnumerableTableScan; import org.apache.calcite.rel.core.Filter; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.TableScan; import org.apache.calcite.rex.RexUtil; import org.apache.calcite.util.BitSets; import org.apache.drill.common.expression.ErrorCollectorImpl; import org.apache.drill.common.expression.LogicalExpression; import org.apache.drill.common.expression.SchemaPath; import org.apache.drill.common.types.TypeProtos; import org.apache.drill.common.types.TypeProtos.MajorType; import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.common.types.Types; import org.apache.drill.exec.expr.ExpressionTreeMaterializer; import org.apache.drill.exec.expr.TypeHelper; import org.apache.drill.exec.expr.fn.interpreter.InterpreterEvaluator; import org.apache.drill.exec.memory.BufferAllocator; import org.apache.drill.exec.ops.OptimizerRulesContext; import org.apache.drill.exec.physical.base.FileGroupScan; import org.apache.drill.exec.physical.base.GroupScan; import org.apache.drill.exec.planner.FileSystemPartitionDescriptor; import org.apache.drill.exec.planner.PartitionDescriptor; import org.apache.drill.exec.planner.PartitionLocation; import org.apache.drill.exec.planner.logical.DrillOptiq; import org.apache.drill.exec.planner.logical.DrillParseContext; import org.apache.drill.exec.planner.logical.DrillScanRel; import org.apache.drill.exec.planner.logical.DrillTable; import org.apache.drill.exec.planner.logical.DrillTranslatableTable; import org.apache.drill.exec.planner.logical.RelOptHelper; import org.apache.drill.exec.planner.physical.PlannerSettings; import org.apache.drill.exec.planner.physical.PrelUtil; import org.apache.drill.exec.record.MaterializedField; import org.apache.drill.exec.record.VectorContainer; import org.apache.drill.exec.store.StoragePluginOptimizerRule; import org.apache.drill.exec.store.dfs.FormatSelection; import org.apache.drill.exec.store.dfs.MetadataContext; import org.apache.drill.exec.store.dfs.MetadataContext.PruneStatus; import org.apache.drill.exec.vector.NullableBitVector; import org.apache.calcite.rel.RelNode; import org.apache.calcite.plan.RelOptRule; import org.apache.calcite.plan.RelOptRuleCall; import org.apache.calcite.plan.RelOptRuleOperand; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.rex.RexNode; import org.apache.commons.lang3.tuple.Pair; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.drill.exec.vector.ValueVector; public abstract class PruneScanRule extends StoragePluginOptimizerRule { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(PruneScanRule.class); final OptimizerRulesContext optimizerContext; public PruneScanRule(RelOptRuleOperand operand, String id, OptimizerRulesContext optimizerContext) { super(operand, id); this.optimizerContext = optimizerContext; } private static class DirPruneScanFilterOnProjectRule extends PruneScanRule { public DirPruneScanFilterOnProjectRule(OptimizerRulesContext optimizerRulesContext) { super(RelOptHelper.some(Filter.class, RelOptHelper.some(Project.class, RelOptHelper.any(TableScan.class))), "DirPruneScanRule:Filter_On_Project", optimizerRulesContext); } @Override public PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel) { return new FileSystemPartitionDescriptor(settings, scanRel); } @Override public boolean matches(RelOptRuleCall call) { final TableScan scan = call.rel(2); return isQualifiedDirPruning(scan); } @Override public void onMatch(RelOptRuleCall call) { final Filter filterRel = call.rel(0); final Project projectRel = call.rel(1); final TableScan scanRel = call.rel(2); doOnMatch(call, filterRel, projectRel, scanRel); } } private static class DirPruneScanFilterOnScanRule extends PruneScanRule { public DirPruneScanFilterOnScanRule(OptimizerRulesContext optimizerRulesContext) { super(RelOptHelper.some(Filter.class, RelOptHelper.any(TableScan.class)), "DirPruneScanRule:Filter_On_Scan", optimizerRulesContext); } @Override public PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel) { return new FileSystemPartitionDescriptor(settings, scanRel); } @Override public boolean matches(RelOptRuleCall call) { final TableScan scan = call.rel(1); return isQualifiedDirPruning(scan); } @Override public void onMatch(RelOptRuleCall call) { final Filter filterRel = call.rel(0); final TableScan scanRel = call.rel(1); doOnMatch(call, filterRel, null, scanRel); } } public static final RelOptRule getDirFilterOnProject(OptimizerRulesContext optimizerRulesContext) { return new DirPruneScanFilterOnProjectRule(optimizerRulesContext); } public static final RelOptRule getDirFilterOnScan(OptimizerRulesContext optimizerRulesContext) { return new DirPruneScanFilterOnScanRule(optimizerRulesContext); } protected void doOnMatch(RelOptRuleCall call, Filter filterRel, Project projectRel, TableScan scanRel) { final String pruningClassName = getClass().getName(); logger.info("Beginning partition pruning, pruning class: {}", pruningClassName); Stopwatch totalPruningTime = Stopwatch.createStarted(); final PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner()); PartitionDescriptor descriptor = getPartitionDescriptor(settings, scanRel); final BufferAllocator allocator = optimizerContext.getAllocator(); final Object selection = getDrillTable(scanRel).getSelection(); MetadataContext metaContext = null; if (selection instanceof FormatSelection) { metaContext = ((FormatSelection)selection).getSelection().getMetaContext(); } RexNode condition = null; if (projectRel == null) { condition = filterRel.getCondition(); } else { // get the filter as if it were below the projection. condition = RelOptUtil.pushFilterPastProject(filterRel.getCondition(), projectRel); } RewriteAsBinaryOperators visitor = new RewriteAsBinaryOperators(true, filterRel.getCluster().getRexBuilder()); condition = condition.accept(visitor); Map<Integer, String> fieldNameMap = Maps.newHashMap(); List<String> fieldNames = scanRel.getRowType().getFieldNames(); BitSet columnBitset = new BitSet(); BitSet partitionColumnBitSet = new BitSet(); Map<Integer, Integer> partitionMap = Maps.newHashMap(); int relColIndex = 0; for (String field : fieldNames) { final Integer partitionIndex = descriptor.getIdIfValid(field); if (partitionIndex != null) { fieldNameMap.put(partitionIndex, field); partitionColumnBitSet.set(partitionIndex); columnBitset.set(relColIndex); // mapping between the relColIndex and partitionIndex partitionMap.put(relColIndex, partitionIndex); } relColIndex++; } if (partitionColumnBitSet.isEmpty()) { logger.info("No partition columns are projected from the scan..continue. " + "Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS)); setPruneStatus(metaContext, PruneStatus.NOT_PRUNED); return; } // stop watch to track how long we spend in different phases of pruning Stopwatch miscTimer = Stopwatch.createUnstarted(); // track how long we spend building the filter tree miscTimer.start(); FindPartitionConditions c = new FindPartitionConditions(columnBitset, filterRel.getCluster().getRexBuilder()); c.analyze(condition); RexNode pruneCondition = c.getFinalCondition(); BitSet referencedDirsBitSet = c.getReferencedDirs(); logger.info("Total elapsed time to build and analyze filter tree: {} ms", miscTimer.elapsed(TimeUnit.MILLISECONDS)); miscTimer.reset(); if (pruneCondition == null) { logger.info("No conditions were found eligible for partition pruning." + "Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS)); setPruneStatus(metaContext, PruneStatus.NOT_PRUNED); return; } // set up the partitions List<PartitionLocation> newPartitions = Lists.newArrayList(); long numTotal = 0; // total number of partitions int batchIndex = 0; PartitionLocation firstLocation = null; LogicalExpression materializedExpr = null; String[] spInfo = null; int maxIndex = -1; BitSet matchBitSet = new BitSet(); // Outer loop: iterate over a list of batches of PartitionLocations for (List<PartitionLocation> partitions : descriptor) { numTotal += partitions.size(); logger.debug("Evaluating partition pruning for batch {}", batchIndex); if (batchIndex == 0) { // save the first location in case everything is pruned firstLocation = partitions.get(0); } final NullableBitVector output = new NullableBitVector(MaterializedField.create("", Types.optional(MinorType.BIT)), allocator); final VectorContainer container = new VectorContainer(); try { final ValueVector[] vectors = new ValueVector[descriptor.getMaxHierarchyLevel()]; for (int partitionColumnIndex : BitSets.toIter(partitionColumnBitSet)) { SchemaPath column = SchemaPath.getSimplePath(fieldNameMap.get(partitionColumnIndex)); MajorType type = descriptor.getVectorType(column, settings); MaterializedField field = MaterializedField.create(column.getAsUnescapedPath(), type); ValueVector v = TypeHelper.getNewVector(field, allocator); v.allocateNew(); vectors[partitionColumnIndex] = v; container.add(v); } // track how long we spend populating partition column vectors miscTimer.start(); // populate partition vectors. descriptor.populatePartitionVectors(vectors, partitions, partitionColumnBitSet, fieldNameMap); logger.info("Elapsed time to populate partitioning column vectors: {} ms within batchIndex: {}", miscTimer.elapsed(TimeUnit.MILLISECONDS), batchIndex); miscTimer.reset(); // materialize the expression; only need to do this once if (batchIndex == 0) { materializedExpr = materializePruneExpr(pruneCondition, settings, scanRel, container); if (materializedExpr == null) { // continue without partition pruning; no need to log anything here since // materializePruneExpr logs it already logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS)); setPruneStatus(metaContext, PruneStatus.NOT_PRUNED); return; } } output.allocateNew(partitions.size()); // start the timer to evaluate how long we spend in the interpreter evaluation miscTimer.start(); InterpreterEvaluator.evaluate(partitions.size(), optimizerContext, container, output, materializedExpr); logger.info("Elapsed time in interpreter evaluation: {} ms within batchIndex: {} with # of partitions : {}", miscTimer.elapsed(TimeUnit.MILLISECONDS), batchIndex, partitions.size()); miscTimer.reset(); int recordCount = 0; int qualifiedCount = 0; if (descriptor.supportsMetadataCachePruning() && partitions.get(0).isCompositePartition() /* apply single partition check only for composite partitions */) { // Inner loop: within each batch iterate over the PartitionLocations for (PartitionLocation part : partitions) { assert part.isCompositePartition(); if(!output.getAccessor().isNull(recordCount) && output.getAccessor().get(recordCount) == 1) { newPartitions.add(part); // Rather than using the PartitionLocation, get the array of partition values for the directories that are // referenced by the filter since we are not interested in directory references in other parts of the query. Pair<String[], Integer> p = composePartition(referencedDirsBitSet, partitionMap, vectors, recordCount); String[] parts = p.getLeft(); int tmpIndex = p.getRight(); maxIndex = Math.max(maxIndex, tmpIndex); if (spInfo == null) { // initialization spInfo = parts; for (int j = 0; j <= tmpIndex; j++) { if (parts[j] != null) { matchBitSet.set(j); } } } else { // compare the new partition with existing partition for (int j=0; j <= tmpIndex; j++) { if (parts[j] == null || spInfo[j] == null) { // nulls don't match matchBitSet.clear(j); } else { if (!parts[j].equals(spInfo[j])) { matchBitSet.clear(j); } } } } qualifiedCount++; } recordCount++; } } else { // Inner loop: within each batch iterate over the PartitionLocations for(PartitionLocation part: partitions){ if(!output.getAccessor().isNull(recordCount) && output.getAccessor().get(recordCount) == 1) { newPartitions.add(part); qualifiedCount++; } recordCount++; } } logger.debug("Within batch {}: total records: {}, qualified records: {}", batchIndex, recordCount, qualifiedCount); batchIndex++; } catch (Exception e) { logger.warn("Exception while trying to prune partition.", e); logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS)); setPruneStatus(metaContext, PruneStatus.NOT_PRUNED); return; // continue without partition pruning } finally { container.clear(); if (output != null) { output.clear(); } } } try { if (newPartitions.size() == numTotal) { logger.info("No partitions were eligible for pruning"); return; } // handle the case all partitions are filtered out. boolean canDropFilter = true; boolean wasAllPartitionsPruned = false; String cacheFileRoot = null; if (newPartitions.isEmpty()) { assert firstLocation != null; // Add the first non-composite partition location, since execution requires schema. // In such case, we should not drop filter. newPartitions.add(firstLocation.getPartitionLocationRecursive().get(0)); canDropFilter = false; // NOTE: with DRILL-4530, the PruneScanRule may be called with only a list of // directories first and the non-composite partition location will still return // directories, not files. So, additional processing is done depending on this flag wasAllPartitionsPruned = true; logger.info("All {} partitions were pruned; added back a single partition to allow creating a schema", numTotal); // set the cacheFileRoot appropriately if (firstLocation.isCompositePartition()) { cacheFileRoot = descriptor.getBaseTableLocation() + firstLocation.getCompositePartitionPath(); } } logger.info("Pruned {} partitions down to {}", numTotal, newPartitions.size()); List<RexNode> conjuncts = RelOptUtil.conjunctions(condition); List<RexNode> pruneConjuncts = RelOptUtil.conjunctions(pruneCondition); conjuncts.removeAll(pruneConjuncts); RexNode newCondition = RexUtil.composeConjunction(filterRel.getCluster().getRexBuilder(), conjuncts, false); RewriteCombineBinaryOperators reverseVisitor = new RewriteCombineBinaryOperators(true, filterRel.getCluster().getRexBuilder()); condition = condition.accept(reverseVisitor); pruneCondition = pruneCondition.accept(reverseVisitor); if (descriptor.supportsMetadataCachePruning() && !wasAllPartitionsPruned) { // if metadata cache file could potentially be used, then assign a proper cacheFileRoot int index = -1; if (!matchBitSet.isEmpty()) { String path = ""; index = matchBitSet.length() - 1; for (int j = 0; j < matchBitSet.length(); j++) { if (!matchBitSet.get(j)) { // stop at the first index with no match and use the immediate // previous index index = j-1; break; } } for (int j=0; j <= index; j++) { path += "/" + spInfo[j]; } cacheFileRoot = descriptor.getBaseTableLocation() + path; } if (index != maxIndex) { // if multiple partitions are being selected, we should not drop the filter // since we are reading the cache file at a parent/ancestor level canDropFilter = false; } } RelNode inputRel = descriptor.supportsMetadataCachePruning() ? descriptor.createTableScan(newPartitions, cacheFileRoot, wasAllPartitionsPruned, metaContext) : descriptor.createTableScan(newPartitions, wasAllPartitionsPruned); if (projectRel != null) { inputRel = projectRel.copy(projectRel.getTraitSet(), Collections.singletonList(inputRel)); } if (newCondition.isAlwaysTrue() && canDropFilter) { call.transformTo(inputRel); } else { final RelNode newFilter = filterRel.copy(filterRel.getTraitSet(), Collections.singletonList(inputRel)); call.transformTo(newFilter); } setPruneStatus(metaContext, PruneStatus.PRUNED); } catch (Exception e) { logger.warn("Exception while using the pruned partitions.", e); } finally { logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS)); } } /** Compose the array of partition values for the directories that are referenced by filter: * e.g suppose the dir hierarchy is year/quarter/month and the query is: * SELECT * FROM T WHERE dir0=2015 AND dir1 = 'Q1', * then for 2015/Q1/Feb, this will have ['2015', 'Q1', null] * If the query filter condition is WHERE dir1 = 'Q2' (i.e no dir0 condition) then the array will * have [null, 'Q2', null] */ private Pair<String[], Integer> composePartition(BitSet referencedDirsBitSet, Map<Integer, Integer> partitionMap, ValueVector[] vectors, int recordCount) { String[] partition = new String[vectors.length]; int maxIndex = -1; for (int referencedDirsIndex : BitSets.toIter(referencedDirsBitSet)) { int partitionColumnIndex = partitionMap.get(referencedDirsIndex); ValueVector vv = vectors[partitionColumnIndex]; if (vv.getAccessor().getValueCount() > 0 && vv.getAccessor().getObject(recordCount) != null) { String value = vv.getAccessor().getObject(recordCount).toString(); partition[partitionColumnIndex] = value; maxIndex = Math.max(maxIndex, partitionColumnIndex); } } return Pair.of(partition, maxIndex); } protected LogicalExpression materializePruneExpr(RexNode pruneCondition, PlannerSettings settings, RelNode scanRel, VectorContainer container ) { // materialize the expression logger.debug("Attempting to prune {}", pruneCondition); final LogicalExpression expr = DrillOptiq.toDrill(new DrillParseContext(settings), scanRel, pruneCondition); final ErrorCollectorImpl errors = new ErrorCollectorImpl(); LogicalExpression materializedExpr = ExpressionTreeMaterializer.materialize(expr, container, errors, optimizerContext.getFunctionRegistry()); // Make sure pruneCondition's materialized expression is always of BitType, so that // it's same as the type of output vector. if (materializedExpr.getMajorType().getMode() == TypeProtos.DataMode.REQUIRED) { materializedExpr = ExpressionTreeMaterializer.convertToNullableType( materializedExpr, materializedExpr.getMajorType().getMinorType(), optimizerContext.getFunctionRegistry(), errors); } if (errors.getErrorCount() != 0) { logger.warn("Failure while materializing expression [{}]. Errors: {}", expr, errors); return null; } return materializedExpr; } protected OptimizerRulesContext getOptimizerRulesContext() { return optimizerContext; } public abstract PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel); private static DrillTable getDrillTable(final TableScan scan) { DrillTable drillTable; drillTable = scan.getTable().unwrap(DrillTable.class); if (drillTable == null) { drillTable = scan.getTable().unwrap(DrillTranslatableTable.class).getDrillTable(); } return drillTable; } private static boolean isQualifiedDirPruning(final TableScan scan) { if (scan instanceof EnumerableTableScan) { final Object selection = getDrillTable(scan).getSelection(); if (selection instanceof FormatSelection && ((FormatSelection)selection).supportDirPruning()) { return true; // Do directory-based pruning in Calcite logical } else { return false; // Do not do directory-based pruning in Calcite logical } } else if (scan instanceof DrillScanRel) { final GroupScan groupScan = ((DrillScanRel) scan).getGroupScan(); // this rule is applicable only for dfs based partition pruning in Drill Logical return groupScan instanceof FileGroupScan && groupScan.supportsPartitionFilterPushdown() && !((DrillScanRel)scan).partitionFilterPushdown(); } return false; } private static void setPruneStatus(MetadataContext metaContext, PruneStatus pruneStatus) { if (metaContext != null) { metaContext.setPruneStatus(pruneStatus); } } }