PruneScanRule.java example

Explorer
drill-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.planner.logical.partition;

import java.util.BitSet;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import com.google.common.base.Stopwatch;

import org.apache.calcite.adapter.enumerable.EnumerableTableScan;
import org.apache.calcite.rel.core.Filter;
import org.apache.calcite.rel.core.Project;
import org.apache.calcite.rel.core.TableScan;
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.util.BitSets;
import org.apache.drill.common.expression.ErrorCollectorImpl;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.expr.ExpressionTreeMaterializer;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.expr.fn.interpreter.InterpreterEvaluator;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.OptimizerRulesContext;
import org.apache.drill.exec.physical.base.FileGroupScan;
import org.apache.drill.exec.physical.base.GroupScan;
import org.apache.drill.exec.planner.FileSystemPartitionDescriptor;
import org.apache.drill.exec.planner.PartitionDescriptor;
import org.apache.drill.exec.planner.PartitionLocation;
import org.apache.drill.exec.planner.logical.DrillOptiq;
import org.apache.drill.exec.planner.logical.DrillParseContext;
import org.apache.drill.exec.planner.logical.DrillScanRel;
import org.apache.drill.exec.planner.logical.DrillTable;
import org.apache.drill.exec.planner.logical.DrillTranslatableTable;
import org.apache.drill.exec.planner.logical.RelOptHelper;
import org.apache.drill.exec.planner.physical.PlannerSettings;
import org.apache.drill.exec.planner.physical.PrelUtil;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.store.StoragePluginOptimizerRule;
import org.apache.drill.exec.store.dfs.FormatSelection;
import org.apache.drill.exec.store.dfs.MetadataContext;
import org.apache.drill.exec.store.dfs.MetadataContext.PruneStatus;
import org.apache.drill.exec.vector.NullableBitVector;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.plan.RelOptRuleOperand;
import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.rex.RexNode;
import org.apache.commons.lang3.tuple.Pair;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import org.apache.drill.exec.vector.ValueVector;


public abstract class PruneScanRule extends StoragePluginOptimizerRule {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(PruneScanRule.class);

  final OptimizerRulesContext optimizerContext;

  public PruneScanRule(RelOptRuleOperand operand, String id, OptimizerRulesContext optimizerContext) {
    super(operand, id);
    this.optimizerContext = optimizerContext;
  }

  private static class DirPruneScanFilterOnProjectRule extends PruneScanRule {
    public DirPruneScanFilterOnProjectRule(OptimizerRulesContext optimizerRulesContext) {
      super(RelOptHelper.some(Filter.class, RelOptHelper.some(Project.class, RelOptHelper.any(TableScan.class))), "DirPruneScanRule:Filter_On_Project", optimizerRulesContext);
    }

    @Override
    public PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel) {
      return new FileSystemPartitionDescriptor(settings, scanRel);
    }

    @Override
    public boolean matches(RelOptRuleCall call) {
      final TableScan scan = call.rel(2);
      return isQualifiedDirPruning(scan);
    }

    @Override
    public void onMatch(RelOptRuleCall call) {
      final Filter filterRel = call.rel(0);
      final Project projectRel = call.rel(1);
      final TableScan scanRel = call.rel(2);
      doOnMatch(call, filterRel, projectRel, scanRel);
    }
  }

  private static class DirPruneScanFilterOnScanRule extends PruneScanRule {
    public DirPruneScanFilterOnScanRule(OptimizerRulesContext optimizerRulesContext) {
      super(RelOptHelper.some(Filter.class, RelOptHelper.any(TableScan.class)), "DirPruneScanRule:Filter_On_Scan", optimizerRulesContext);
    }

    @Override
    public PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel) {
      return new FileSystemPartitionDescriptor(settings, scanRel);
    }

    @Override
    public boolean matches(RelOptRuleCall call) {
      final TableScan scan = call.rel(1);
      return isQualifiedDirPruning(scan);
    }

    @Override
    public void onMatch(RelOptRuleCall call) {
      final Filter filterRel = call.rel(0);
      final TableScan scanRel = call.rel(1);
      doOnMatch(call, filterRel, null, scanRel);
    }
  }

  public static final RelOptRule getDirFilterOnProject(OptimizerRulesContext optimizerRulesContext) {
    return new DirPruneScanFilterOnProjectRule(optimizerRulesContext);
  }

  public static final RelOptRule getDirFilterOnScan(OptimizerRulesContext optimizerRulesContext) {
    return new DirPruneScanFilterOnScanRule(optimizerRulesContext);
  }

  protected void doOnMatch(RelOptRuleCall call, Filter filterRel, Project projectRel, TableScan scanRel) {

    final String pruningClassName = getClass().getName();
    logger.info("Beginning partition pruning, pruning class: {}", pruningClassName);
    Stopwatch totalPruningTime = Stopwatch.createStarted();

    final PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());
    PartitionDescriptor descriptor = getPartitionDescriptor(settings, scanRel);
    final BufferAllocator allocator = optimizerContext.getAllocator();

    final Object selection = getDrillTable(scanRel).getSelection();
    MetadataContext metaContext = null;
    if (selection instanceof FormatSelection) {
         metaContext = ((FormatSelection)selection).getSelection().getMetaContext();
    }

    RexNode condition = null;
    if (projectRel == null) {
      condition = filterRel.getCondition();
    } else {
      // get the filter as if it were below the projection.
      condition = RelOptUtil.pushFilterPastProject(filterRel.getCondition(), projectRel);
    }

    RewriteAsBinaryOperators visitor = new RewriteAsBinaryOperators(true, filterRel.getCluster().getRexBuilder());
    condition = condition.accept(visitor);

    Map<Integer, String> fieldNameMap = Maps.newHashMap();
    List<String> fieldNames = scanRel.getRowType().getFieldNames();
    BitSet columnBitset = new BitSet();
    BitSet partitionColumnBitSet = new BitSet();
    Map<Integer, Integer> partitionMap = Maps.newHashMap();

    int relColIndex = 0;
    for (String field : fieldNames) {
      final Integer partitionIndex = descriptor.getIdIfValid(field);
      if (partitionIndex != null) {
        fieldNameMap.put(partitionIndex, field);
        partitionColumnBitSet.set(partitionIndex);
        columnBitset.set(relColIndex);
        // mapping between the relColIndex and partitionIndex
        partitionMap.put(relColIndex, partitionIndex);
      }
      relColIndex++;
    }

    if (partitionColumnBitSet.isEmpty()) {
      logger.info("No partition columns are projected from the scan..continue. " +
          "Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
      setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
      return;
    }

    // stop watch to track how long we spend in different phases of pruning
    Stopwatch miscTimer = Stopwatch.createUnstarted();

    // track how long we spend building the filter tree
    miscTimer.start();

    FindPartitionConditions c = new FindPartitionConditions(columnBitset, filterRel.getCluster().getRexBuilder());
    c.analyze(condition);
    RexNode pruneCondition = c.getFinalCondition();
    BitSet referencedDirsBitSet = c.getReferencedDirs();

    logger.info("Total elapsed time to build and analyze filter tree: {} ms",
        miscTimer.elapsed(TimeUnit.MILLISECONDS));
    miscTimer.reset();

    if (pruneCondition == null) {
      logger.info("No conditions were found eligible for partition pruning." +
          "Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
      setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
      return;
    }

    // set up the partitions
    List<PartitionLocation> newPartitions = Lists.newArrayList();
    long numTotal = 0; // total number of partitions
    int batchIndex = 0;
    PartitionLocation firstLocation = null;
    LogicalExpression materializedExpr = null;
    String[] spInfo = null;
    int maxIndex = -1;
    BitSet matchBitSet = new BitSet();

    // Outer loop: iterate over a list of batches of PartitionLocations
    for (List<PartitionLocation> partitions : descriptor) {
      numTotal += partitions.size();
      logger.debug("Evaluating partition pruning for batch {}", batchIndex);
      if (batchIndex == 0) { // save the first location in case everything is pruned
        firstLocation = partitions.get(0);
      }
      final NullableBitVector output = new NullableBitVector(MaterializedField.create("", Types.optional(MinorType.BIT)), allocator);
      final VectorContainer container = new VectorContainer();

      try {
        final ValueVector[] vectors = new ValueVector[descriptor.getMaxHierarchyLevel()];
        for (int partitionColumnIndex : BitSets.toIter(partitionColumnBitSet)) {
          SchemaPath column = SchemaPath.getSimplePath(fieldNameMap.get(partitionColumnIndex));
          MajorType type = descriptor.getVectorType(column, settings);
          MaterializedField field = MaterializedField.create(column.getAsUnescapedPath(), type);
          ValueVector v = TypeHelper.getNewVector(field, allocator);
          v.allocateNew();
          vectors[partitionColumnIndex] = v;
          container.add(v);
        }

        // track how long we spend populating partition column vectors
        miscTimer.start();

        // populate partition vectors.
        descriptor.populatePartitionVectors(vectors, partitions, partitionColumnBitSet, fieldNameMap);

        logger.info("Elapsed time to populate partitioning column vectors: {} ms within batchIndex: {}",
            miscTimer.elapsed(TimeUnit.MILLISECONDS), batchIndex);
        miscTimer.reset();

        // materialize the expression; only need to do this once
        if (batchIndex == 0) {
          materializedExpr = materializePruneExpr(pruneCondition, settings, scanRel, container);
          if (materializedExpr == null) {
            // continue without partition pruning; no need to log anything here since
            // materializePruneExpr logs it already
            logger.info("Total pruning elapsed time: {} ms",
                totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
            setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
            return;
          }
        }

        output.allocateNew(partitions.size());

        // start the timer to evaluate how long we spend in the interpreter evaluation
        miscTimer.start();

        InterpreterEvaluator.evaluate(partitions.size(), optimizerContext, container, output, materializedExpr);

        logger.info("Elapsed time in interpreter evaluation: {} ms within batchIndex: {} with # of partitions : {}",
            miscTimer.elapsed(TimeUnit.MILLISECONDS), batchIndex, partitions.size());
        miscTimer.reset();

        int recordCount = 0;
        int qualifiedCount = 0;

        if (descriptor.supportsMetadataCachePruning() &&
            partitions.get(0).isCompositePartition() /* apply single partition check only for composite partitions */) {
          // Inner loop: within each batch iterate over the PartitionLocations
          for (PartitionLocation part : partitions) {
            assert part.isCompositePartition();
            if(!output.getAccessor().isNull(recordCount) && output.getAccessor().get(recordCount) == 1) {
              newPartitions.add(part);
              // Rather than using the PartitionLocation, get the array of partition values for the directories that are
              // referenced by the filter since we are not interested in directory references in other parts of the query.
              Pair<String[], Integer> p = composePartition(referencedDirsBitSet, partitionMap, vectors, recordCount);
              String[] parts = p.getLeft();
              int tmpIndex = p.getRight();
              maxIndex = Math.max(maxIndex, tmpIndex);
              if (spInfo == null) { // initialization
                spInfo = parts;
                for (int j = 0; j <= tmpIndex; j++) {
                  if (parts[j] != null) {
                    matchBitSet.set(j);
                  }
                }
              } else {
                // compare the new partition with existing partition
                for (int j=0; j <= tmpIndex; j++) {
                  if (parts[j] == null || spInfo[j] == null) { // nulls don't match
                    matchBitSet.clear(j);
                  } else {
                    if (!parts[j].equals(spInfo[j])) {
                      matchBitSet.clear(j);
                    }
                  }
                }
              }
              qualifiedCount++;
            }
            recordCount++;
          }
        } else {
          // Inner loop: within each batch iterate over the PartitionLocations
          for(PartitionLocation part: partitions){
            if(!output.getAccessor().isNull(recordCount) && output.getAccessor().get(recordCount) == 1) {
              newPartitions.add(part);
              qualifiedCount++;
            }
            recordCount++;
          }
        }
        logger.debug("Within batch {}: total records: {}, qualified records: {}", batchIndex, recordCount, qualifiedCount);
        batchIndex++;
      } catch (Exception e) {
        logger.warn("Exception while trying to prune partition.", e);
        logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));

        setPruneStatus(metaContext, PruneStatus.NOT_PRUNED);
        return; // continue without partition pruning
      } finally {
        container.clear();
        if (output != null) {
          output.clear();
        }
      }
    }

    try {
      if (newPartitions.size() == numTotal) {
        logger.info("No partitions were eligible for pruning");
        return;
      }

      // handle the case all partitions are filtered out.
      boolean canDropFilter = true;
      boolean wasAllPartitionsPruned = false;
      String cacheFileRoot = null;

      if (newPartitions.isEmpty()) {
        assert firstLocation != null;
        // Add the first non-composite partition location, since execution requires schema.
        // In such case, we should not drop filter.
        newPartitions.add(firstLocation.getPartitionLocationRecursive().get(0));
        canDropFilter = false;
        // NOTE: with DRILL-4530, the PruneScanRule may be called with only a list of
        // directories first and the non-composite partition location will still return
        // directories, not files.  So, additional processing is done depending on this flag
        wasAllPartitionsPruned = true;
        logger.info("All {} partitions were pruned; added back a single partition to allow creating a schema", numTotal);

        // set the cacheFileRoot appropriately
        if (firstLocation.isCompositePartition()) {
          cacheFileRoot = descriptor.getBaseTableLocation() + firstLocation.getCompositePartitionPath();
        }
      }

      logger.info("Pruned {} partitions down to {}", numTotal, newPartitions.size());

      List<RexNode> conjuncts = RelOptUtil.conjunctions(condition);
      List<RexNode> pruneConjuncts = RelOptUtil.conjunctions(pruneCondition);
      conjuncts.removeAll(pruneConjuncts);
      RexNode newCondition = RexUtil.composeConjunction(filterRel.getCluster().getRexBuilder(), conjuncts, false);

      RewriteCombineBinaryOperators reverseVisitor = new RewriteCombineBinaryOperators(true, filterRel.getCluster().getRexBuilder());

      condition = condition.accept(reverseVisitor);
      pruneCondition = pruneCondition.accept(reverseVisitor);

      if (descriptor.supportsMetadataCachePruning() && !wasAllPartitionsPruned) {
        // if metadata cache file could potentially be used, then assign a proper cacheFileRoot
        int index = -1;
        if (!matchBitSet.isEmpty()) {
          String path = "";
          index = matchBitSet.length() - 1;

          for (int j = 0; j < matchBitSet.length(); j++) {
            if (!matchBitSet.get(j)) {
              // stop at the first index with no match and use the immediate
              // previous index
              index = j-1;
              break;
            }
          }
          for (int j=0; j <= index; j++) {
            path += "/" + spInfo[j];
          }
          cacheFileRoot = descriptor.getBaseTableLocation() + path;
        }
        if (index != maxIndex) {
          // if multiple partitions are being selected, we should not drop the filter
          // since we are reading the cache file at a parent/ancestor level
          canDropFilter = false;
        }

      }

      RelNode inputRel = descriptor.supportsMetadataCachePruning() ?
          descriptor.createTableScan(newPartitions, cacheFileRoot, wasAllPartitionsPruned, metaContext) :
            descriptor.createTableScan(newPartitions, wasAllPartitionsPruned);

      if (projectRel != null) {
        inputRel = projectRel.copy(projectRel.getTraitSet(), Collections.singletonList(inputRel));
      }

      if (newCondition.isAlwaysTrue() && canDropFilter) {
        call.transformTo(inputRel);
      } else {
        final RelNode newFilter = filterRel.copy(filterRel.getTraitSet(), Collections.singletonList(inputRel));
        call.transformTo(newFilter);
      }

      setPruneStatus(metaContext, PruneStatus.PRUNED);

    } catch (Exception e) {
      logger.warn("Exception while using the pruned partitions.", e);
    } finally {
      logger.info("Total pruning elapsed time: {} ms", totalPruningTime.elapsed(TimeUnit.MILLISECONDS));
    }
  }

  /** Compose the array of partition values for the directories that are referenced by filter:
   *  e.g suppose the dir hierarchy is year/quarter/month and the query is:
   *     SELECT * FROM T WHERE dir0=2015 AND dir1 = 'Q1',
   * then for 2015/Q1/Feb, this will have ['2015', 'Q1', null]
   * If the query filter condition is WHERE dir1 = 'Q2'  (i.e no dir0 condition) then the array will
   * have [null, 'Q2', null]
   */
  private Pair<String[], Integer> composePartition(BitSet referencedDirsBitSet,
      Map<Integer, Integer> partitionMap,
      ValueVector[] vectors,
      int recordCount) {
    String[] partition = new String[vectors.length];
    int maxIndex = -1;
    for (int referencedDirsIndex : BitSets.toIter(referencedDirsBitSet)) {
      int partitionColumnIndex = partitionMap.get(referencedDirsIndex);
      ValueVector vv = vectors[partitionColumnIndex];
      if (vv.getAccessor().getValueCount() > 0 &&
          vv.getAccessor().getObject(recordCount) != null) {
        String value = vv.getAccessor().getObject(recordCount).toString();
        partition[partitionColumnIndex] = value;
        maxIndex = Math.max(maxIndex, partitionColumnIndex);
      }
    }
    return Pair.of(partition, maxIndex);
  }

  protected LogicalExpression materializePruneExpr(RexNode pruneCondition,
      PlannerSettings settings,
      RelNode scanRel,
      VectorContainer container
      ) {
    // materialize the expression
    logger.debug("Attempting to prune {}", pruneCondition);
    final LogicalExpression expr = DrillOptiq.toDrill(new DrillParseContext(settings), scanRel, pruneCondition);
    final ErrorCollectorImpl errors = new ErrorCollectorImpl();

    LogicalExpression materializedExpr = ExpressionTreeMaterializer.materialize(expr, container, errors, optimizerContext.getFunctionRegistry());
    // Make sure pruneCondition's materialized expression is always of BitType, so that
    // it's same as the type of output vector.
    if (materializedExpr.getMajorType().getMode() == TypeProtos.DataMode.REQUIRED) {
      materializedExpr = ExpressionTreeMaterializer.convertToNullableType(
          materializedExpr,
          materializedExpr.getMajorType().getMinorType(),
          optimizerContext.getFunctionRegistry(),
          errors);
    }

    if (errors.getErrorCount() != 0) {
      logger.warn("Failure while materializing expression [{}].  Errors: {}", expr, errors);
      return null;
    }
    return materializedExpr;
  }

  protected OptimizerRulesContext getOptimizerRulesContext() {
    return optimizerContext;
  }

  public abstract PartitionDescriptor getPartitionDescriptor(PlannerSettings settings, TableScan scanRel);

  private static DrillTable getDrillTable(final TableScan scan) {
    DrillTable drillTable;
    drillTable = scan.getTable().unwrap(DrillTable.class);
    if (drillTable == null) {
      drillTable = scan.getTable().unwrap(DrillTranslatableTable.class).getDrillTable();
    }
    return drillTable;
  }

  private static boolean isQualifiedDirPruning(final TableScan scan) {
    if (scan instanceof EnumerableTableScan) {
      final Object selection = getDrillTable(scan).getSelection();
      if (selection instanceof FormatSelection
          && ((FormatSelection)selection).supportDirPruning()) {
        return true;  // Do directory-based pruning in Calcite logical
      } else {
        return false; // Do not do directory-based pruning in Calcite logical
      }
    } else if (scan instanceof DrillScanRel) {
      final GroupScan groupScan = ((DrillScanRel) scan).getGroupScan();
      // this rule is applicable only for dfs based partition pruning in Drill Logical
      return groupScan instanceof FileGroupScan && groupScan.supportsPartitionFilterPushdown() && !((DrillScanRel)scan).partitionFilterPushdown();
    }
    return false;
  }

  private static void setPruneStatus(MetadataContext metaContext, PruneStatus pruneStatus) {
    if (metaContext != null) {
      metaContext.setPruneStatus(pruneStatus);
    }
  }

}