FilterSelectivityEstimator.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.calcite.stats;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.plan.RelOptUtil.InputReferencedVisitor;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Filter;
import org.apache.calcite.rel.core.Project;
import org.apache.calcite.rel.metadata.RelMetadataQuery;
import org.apache.calcite.rex.RexCall;
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.rex.RexLiteral;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexVisitorImpl;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.SqlOperator;
import org.apache.calcite.sql.type.SqlTypeUtil;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
import org.apache.hadoop.hive.ql.plan.ColStatistics;

public class FilterSelectivityEstimator extends RexVisitorImpl<Double> {
  private final RelNode childRel;
  private final double  childCardinality;

  protected FilterSelectivityEstimator(RelNode childRel) {
    super(true);
    this.childRel = childRel;
    this.childCardinality = RelMetadataQuery.instance().getRowCount(childRel);
  }

  public Double estimateSelectivity(RexNode predicate) {
    return predicate.accept(this);
  }

  public Double visitCall(RexCall call) {
    if (!deep) {
      return 1.0;
    }

    /*
     * Ignore any predicates on partition columns because we have already
     * accounted for these in the Table row count.
     */
    if (isPartitionPredicate(call, this.childRel)) {
      return 1.0;
    }

    Double selectivity = null;
    SqlKind op = getOp(call);

    switch (op) {
    case AND: {
      selectivity = computeConjunctionSelectivity(call);
      break;
    }

    case OR: {
      selectivity = computeDisjunctionSelectivity(call);
      break;
    }

    case NOT:
    case NOT_EQUALS: {
      selectivity = computeNotEqualitySelectivity(call);
      break;
    }

    case IS_NOT_NULL: {
      if (childRel instanceof HiveTableScan) {
        double noOfNulls = getMaxNulls(call, (HiveTableScan) childRel);
        double totalNoOfTuples = childRel.getRows();
        if (totalNoOfTuples >= noOfNulls) {
          selectivity = (totalNoOfTuples - noOfNulls) / Math.max(totalNoOfTuples, 1);
        } else {
          throw new RuntimeException("Invalid Stats number of null > no of tuples");
        }
      } else {
        selectivity = computeNotEqualitySelectivity(call);
      }
      break;
    }

    case LESS_THAN_OR_EQUAL:
    case GREATER_THAN_OR_EQUAL:
    case LESS_THAN:
    case GREATER_THAN: {
      selectivity = ((double) 1 / (double) 3);
      break;
    }

    case IN: {
      // TODO: 1) check for duplicates 2) We assume in clause values to be
      // present in NDV which may not be correct (Range check can find it) 3) We
      // assume values in NDV set is uniformly distributed over col values
      // (account for skewness - histogram).
      selectivity = computeFunctionSelectivity(call) * (call.operands.size() - 1);
      if (selectivity <= 0.0) {
        selectivity = 0.10;
      } else if (selectivity >= 1.0) {
        selectivity = 1.0;
      }
      break;
    }

    default:
      selectivity = computeFunctionSelectivity(call);
    }

    return selectivity;
  }

  /**
   * NDV of "f1(x, y, z) != f2(p, q, r)" ->
   * "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)".
   * <p>
   * 
   * @param call
   * @return
   */
  private Double computeNotEqualitySelectivity(RexCall call) {
    double tmpNDV = getMaxNDV(call);

    if (tmpNDV > 1)
      return (tmpNDV - (double) 1) / tmpNDV;
    else
      return 1.0;
  }

  /**
   * Selectivity of f(X,y,z) -> 1/maxNDV(x,y,z).
   * <p>
   * Note that >, >=, <, <=, = ... are considered generic functions and uses
   * this method to find their selectivity.
   * 
   * @param call
   * @return
   */
  private Double computeFunctionSelectivity(RexCall call) {
    return 1 / getMaxNDV(call);
  }

  /**
   * Disjunction Selectivity -> (1 D(1-m1/n)(1-m2/n)) where n is the total
   * number of tuples from child and m1 and m2 is the expected number of tuples
   * from each part of the disjunction predicate.
   * <p>
   * Note we compute m1. m2.. by applying selectivity of the disjunctive element
   * on the cardinality from child.
   * 
   * @param call
   * @return
   */
  private Double computeDisjunctionSelectivity(RexCall call) {
    Double tmpCardinality;
    Double tmpSelectivity;
    double selectivity = 1;

    for (RexNode dje : call.getOperands()) {
      tmpSelectivity = dje.accept(this);
      if (tmpSelectivity == null) {
        tmpSelectivity = 0.99;
      }
      tmpCardinality = childCardinality * tmpSelectivity;

      if (tmpCardinality > 1 && tmpCardinality < childCardinality) {
        tmpSelectivity = (1 - tmpCardinality / childCardinality);
      } else {
        tmpSelectivity = 1.0;
      }

      selectivity *= tmpSelectivity;
    }

    if (selectivity < 0.0)
      selectivity = 0.0;

    return (1 - selectivity);
  }

  /**
   * Selectivity of conjunctive predicate -> (selectivity of conjunctive
   * element1) * (selectivity of conjunctive element2)...
   * 
   * @param call
   * @return
   */
  private Double computeConjunctionSelectivity(RexCall call) {
    Double tmpSelectivity;
    double selectivity = 1;

    for (RexNode cje : call.getOperands()) {
      tmpSelectivity = cje.accept(this);
      if (tmpSelectivity != null) {
        selectivity *= tmpSelectivity;
      }
    }

    return selectivity;
  }

  /**
   * Given a RexCall & TableScan find max no of nulls. Currently it picks the
   * col with max no of nulls.
   * 
   * TODO: improve this
   * 
   * @param call
   * @param t
   * @return
   */
  private long getMaxNulls(RexCall call, HiveTableScan t) {
    long tmpNoNulls = 0;
    long maxNoNulls = 0;

    Set<Integer> iRefSet = HiveCalciteUtil.getInputRefs(call);
    List<ColStatistics> colStats = t.getColStat(new ArrayList<Integer>(iRefSet));

    for (ColStatistics cs : colStats) {
      tmpNoNulls = cs.getNumNulls();
      if (tmpNoNulls > maxNoNulls) {
        maxNoNulls = tmpNoNulls;
      }
    }

    return maxNoNulls;
  }

  private Double getMaxNDV(RexCall call) {
    double tmpNDV;
    double maxNDV = 1.0;
    InputReferencedVisitor irv;
    RelMetadataQuery mq = RelMetadataQuery.instance();
    for (RexNode op : call.getOperands()) {
      if (op instanceof RexInputRef) {
        tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(this.childRel, mq,
            ((RexInputRef) op).getIndex());
        if (tmpNDV > maxNDV)
          maxNDV = tmpNDV;
      } else {
        irv = new InputReferencedVisitor();
        irv.apply(op);
        for (Integer childProjIndx : irv.inputPosReferenced) {
          tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(this.childRel,
              mq, childProjIndx);
          if (tmpNDV > maxNDV)
            maxNDV = tmpNDV;
        }
      }
    }

    return maxNDV;
  }

  private boolean isPartitionPredicate(RexNode expr, RelNode r) {
    if (r instanceof Project) {
      expr = RelOptUtil.pushFilterPastProject(expr, (Project) r);
      return isPartitionPredicate(expr, ((Project) r).getInput());
    } else if (r instanceof Filter) {
      return isPartitionPredicate(expr, ((Filter) r).getInput());
    } else if (r instanceof HiveTableScan) {
      RelOptHiveTable table = (RelOptHiveTable) ((HiveTableScan) r).getTable();
      ImmutableBitSet cols = RelOptUtil.InputFinder.bits(expr);
      return table.containsPartitionColumnsOnly(cols);
    }
    return false;
  }

  private SqlKind getOp(RexCall call) {
    SqlKind op = call.getKind();

    if (call.getKind().equals(SqlKind.OTHER_FUNCTION)
        && SqlTypeUtil.inBooleanFamily(call.getType())) {
      SqlOperator sqlOp = call.getOperator();
      String opName = (sqlOp != null) ? sqlOp.getName() : "";
      if (opName.equalsIgnoreCase("in")) {
        op = SqlKind.IN;
      }
    }

    return op;
  }

  public Double visitLiteral(RexLiteral literal) {
    if (literal.isAlwaysFalse()) {
      return 0.0;
    } else if (literal.isAlwaysTrue()) {
      return 1.0;
    } else {
      assert false;
    }
    return null;
  }
}