/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer.calcite.stats; import java.util.ArrayList; import java.util.List; import java.util.Set; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.plan.hep.HepRelVertex; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.RelVisitor; import org.apache.calcite.rel.core.Filter; import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.SemiJoin; import org.apache.calcite.rel.core.Sort; import org.apache.calcite.rel.core.TableScan; import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; import org.apache.calcite.rel.metadata.RelMdRowCount; import org.apache.calcite.rel.metadata.RelMetadataProvider; import org.apache.calcite.rel.metadata.RelMetadataQuery; import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; import org.apache.calcite.rex.RexUtil; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.util.BuiltInMethod; import org.apache.calcite.util.ImmutableBitSet; import org.apache.calcite.util.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; public class HiveRelMdRowCount extends RelMdRowCount { protected static final Logger LOG = LoggerFactory.getLogger(HiveRelMdRowCount.class.getName()); public static final RelMetadataProvider SOURCE = ReflectiveRelMetadataProvider .reflectiveSource(BuiltInMethod.ROW_COUNT.method, new HiveRelMdRowCount()); protected HiveRelMdRowCount() { super(); } public Double getRowCount(Join join, RelMetadataQuery mq) { PKFKRelationInfo pkfk = analyzeJoinForPKFK(join, mq); if (pkfk != null) { double selectivity = (pkfk.pkInfo.selectivity * pkfk.ndvScalingFactor); selectivity = Math.min(1.0, selectivity); if (LOG.isDebugEnabled()) { LOG.debug("Identified Primary - Foreign Key relation: {} {}",RelOptUtil.toString(join), pkfk); } return pkfk.fkInfo.rowCount * selectivity; } return join.getRows(); } @Override public Double getRowCount(SemiJoin rel, RelMetadataQuery mq) { PKFKRelationInfo pkfk = analyzeJoinForPKFK(rel, mq); if (pkfk != null) { double selectivity = (pkfk.pkInfo.selectivity * pkfk.ndvScalingFactor); selectivity = Math.min(1.0, selectivity); if (LOG.isDebugEnabled()) { LOG.debug("Identified Primary - Foreign Key relation: {} {}", RelOptUtil.toString(rel), pkfk); } return pkfk.fkInfo.rowCount * selectivity; } return super.getRowCount(rel, mq); } @Override public Double getRowCount(Sort rel, RelMetadataQuery mq) { final Double rowCount = mq.getRowCount(rel.getInput()); if (rowCount != null && rel.fetch != null) { final int offset = rel.offset == null ? 0 : RexLiteral.intValue(rel.offset); final int limit = RexLiteral.intValue(rel.fetch); final Double offsetLimit = new Double(offset + limit); // offsetLimit is smaller than rowCount of the input operator // thus, we return the offsetLimit if (offsetLimit < rowCount) { return offsetLimit; } } return rowCount; } static class PKFKRelationInfo { public final int fkSide; public final double ndvScalingFactor; public final FKSideInfo fkInfo; public final PKSideInfo pkInfo; public final boolean isPKSideSimple; PKFKRelationInfo(int fkSide, FKSideInfo fkInfo, PKSideInfo pkInfo, double ndvScalingFactor, boolean isPKSideSimple) { this.fkSide = fkSide; this.fkInfo = fkInfo; this.pkInfo = pkInfo; this.ndvScalingFactor = ndvScalingFactor; this.isPKSideSimple = isPKSideSimple; } @Override public String toString() { return String.format( "Primary - Foreign Key join:\n\tfkSide = %d\n\tFKInfo:%s\n" + "\tPKInfo:%s\n\tisPKSideSimple:%s\n\tNDV Scaling Factor:%.2f\n", fkSide, fkInfo, pkInfo, isPKSideSimple, ndvScalingFactor); } } static class FKSideInfo { public final double rowCount; public final double distinctCount; public FKSideInfo(double rowCount, double distinctCount) { this.rowCount = rowCount; this.distinctCount = distinctCount; } @Override public String toString() { return String.format("FKInfo(rowCount=%.2f,ndv=%.2f)", rowCount, distinctCount); } } static class PKSideInfo extends FKSideInfo { public final double selectivity; public PKSideInfo(double rowCount, double distinctCount, double selectivity) { super(rowCount, distinctCount); this.selectivity = selectivity; } @Override public String toString() { return String.format("PKInfo(rowCount=%.2f,ndv=%.2f,selectivity=%.2f)", rowCount, distinctCount,selectivity); } } /* * For T1 join T2 on T1.x = T2.y if we identify 'y' s a key of T2 then we can * infer the join cardinality as: rowCount(T1) * selectivity(T2) i.e this is * like a SemiJoin where the T1(Fact side/FK side) is filtered by a factor * based on the Selectivity of the PK/Dim table side. * * 1. If both T1.x and T2.y are keys then use the larger one as the PK side. * 2. In case of outer Joins: a) The FK side should be the Null Preserving * side. It doesn't make sense to apply this heuristic in case of Dim loj Fact * or Fact roj Dim b) The selectivity factor applied on the Fact Table should * be 1. */ public static PKFKRelationInfo analyzeJoinForPKFK(Join joinRel, RelMetadataQuery mq) { RelNode left = joinRel.getInputs().get(0); RelNode right = joinRel.getInputs().get(1); final List<RexNode> initJoinFilters = RelOptUtil.conjunctions(joinRel .getCondition()); /* * No joining condition. */ if (initJoinFilters.isEmpty()) { return null; } List<RexNode> leftFilters = new ArrayList<RexNode>(); List<RexNode> rightFilters = new ArrayList<RexNode>(); List<RexNode> joinFilters = new ArrayList<RexNode>(initJoinFilters); // @todo: remove this. 8/28/14 hb // for now adding because RelOptUtil.classifyFilters has an assertion about // column counts that is not true for semiJoins. if (joinRel instanceof SemiJoin) { return null; } RelOptUtil.classifyFilters(joinRel, joinFilters, joinRel.getJoinType(), false, !joinRel.getJoinType().generatesNullsOnRight(), !joinRel .getJoinType().generatesNullsOnLeft(), joinFilters, leftFilters, rightFilters); Pair<Integer, Integer> joinCols = canHandleJoin(joinRel, leftFilters, rightFilters, joinFilters); if (joinCols == null) { return null; } int leftColIdx = joinCols.left; int rightColIdx = joinCols.right; RexBuilder rexBuilder = joinRel.getCluster().getRexBuilder(); RexNode leftPred = RexUtil .composeConjunction(rexBuilder, leftFilters, true); RexNode rightPred = RexUtil.composeConjunction(rexBuilder, rightFilters, true); ImmutableBitSet lBitSet = ImmutableBitSet.of(leftColIdx); ImmutableBitSet rBitSet = ImmutableBitSet.of(rightColIdx); /* * If the form is Dim loj F or Fact roj Dim or Dim semij Fact then return * null. */ boolean leftIsKey = (joinRel.getJoinType() == JoinRelType.INNER || joinRel .getJoinType() == JoinRelType.RIGHT) && !(joinRel instanceof SemiJoin) && isKey(lBitSet, left, mq); boolean rightIsKey = (joinRel.getJoinType() == JoinRelType.INNER || joinRel .getJoinType() == JoinRelType.LEFT) && isKey(rBitSet, right, mq); if (!leftIsKey && !rightIsKey) { return null; } double leftRowCount = mq.getRowCount(left); double rightRowCount = mq.getRowCount(right); if (leftIsKey && rightIsKey) { if (rightRowCount < leftRowCount) { leftIsKey = false; } } int pkSide = leftIsKey ? 0 : rightIsKey ? 1 : -1; boolean isPKSideSimpleTree = pkSide != -1 ? IsSimpleTreeOnJoinKey.check( pkSide == 0 ? left : right, pkSide == 0 ? leftColIdx : rightColIdx, mq) : false; double leftNDV = isPKSideSimpleTree ? mq.getDistinctRowCount(left, lBitSet, leftPred) : -1; double rightNDV = isPKSideSimpleTree ? mq.getDistinctRowCount(right, rBitSet, rightPred) : -1; /* * If the ndv of the PK - FK side don't match, and the PK side is a filter * on the Key column then scale the NDV on the FK side. * * As described by Peter Boncz: http://databasearchitects.blogspot.com/ * in such cases we can be off by a large margin in the Join cardinality * estimate. The e.g. he provides is on the join of StoreSales and DateDim * on the TPCDS dataset. Since the DateDim is populated for 20 years into * the future, while the StoreSales only has 5 years worth of data, there * are 40 times fewer distinct dates in StoreSales. * * In general it is hard to infer the range for the foreign key on an * arbitrary expression. For e.g. the NDV for DayofWeek is the same * irrespective of NDV on the number of unique days, whereas the * NDV of Quarters has the same ratio as the NDV on the keys. * * But for expressions that apply only on columns that have the same NDV * as the key (implying that they are alternate keys) we can apply the * ratio. So in the case of StoreSales - DateDim joins for predicate on the * d_date column we can apply the scaling factor. */ double ndvScalingFactor = 1.0; if ( isPKSideSimpleTree ) { ndvScalingFactor = pkSide == 0 ? leftNDV/rightNDV : rightNDV / leftNDV; } if (pkSide == 0) { FKSideInfo fkInfo = new FKSideInfo(rightRowCount, rightNDV); double pkSelectivity = pkSelectivity(joinRel, mq, true, left, leftRowCount); PKSideInfo pkInfo = new PKSideInfo(leftRowCount, leftNDV, joinRel.getJoinType().generatesNullsOnRight() ? 1.0 : pkSelectivity); return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree); } if (pkSide == 1) { FKSideInfo fkInfo = new FKSideInfo(leftRowCount, leftNDV); double pkSelectivity = pkSelectivity(joinRel, mq, false, right, rightRowCount); PKSideInfo pkInfo = new PKSideInfo(rightRowCount, rightNDV, joinRel.getJoinType().generatesNullsOnLeft() ? 1.0 : pkSelectivity); return new PKFKRelationInfo(1, fkInfo, pkInfo, ndvScalingFactor, isPKSideSimpleTree); } return null; } private static double pkSelectivity(Join joinRel, RelMetadataQuery mq, boolean leftChild, RelNode child, double childRowCount) { if ((leftChild && joinRel.getJoinType().generatesNullsOnRight()) || (!leftChild && joinRel.getJoinType().generatesNullsOnLeft())) { return 1.0; } else { HiveTableScan tScan = HiveRelMdUniqueKeys.getTableScan(child, true); if (tScan != null) { double tRowCount = mq.getRowCount(tScan); return childRowCount / tRowCount; } else { return 1.0; } } } private static boolean isKey(ImmutableBitSet c, RelNode rel, RelMetadataQuery mq) { boolean isKey = false; Set<ImmutableBitSet> keys = mq.getUniqueKeys(rel); if (keys != null) { for (ImmutableBitSet key : keys) { if (key.equals(c)) { isKey = true; break; } } } return isKey; } /* * 1. Join condition must be an Equality Predicate. * 2. both sides must reference 1 column. * 3. If needed flip the columns. */ private static Pair<Integer, Integer> canHandleJoin(Join joinRel, List<RexNode> leftFilters, List<RexNode> rightFilters, List<RexNode> joinFilters) { /* * If after classifying filters there is more than 1 joining predicate, we * don't handle this. Return null. */ if (joinFilters.size() != 1) { return null; } RexNode joinCond = joinFilters.get(0); int leftColIdx; int rightColIdx; if (!(joinCond instanceof RexCall)) { return null; } if (((RexCall) joinCond).getOperator() != SqlStdOperatorTable.EQUALS) { return null; } ImmutableBitSet leftCols = RelOptUtil.InputFinder.bits(((RexCall) joinCond).getOperands().get(0)); ImmutableBitSet rightCols = RelOptUtil.InputFinder.bits(((RexCall) joinCond).getOperands().get(1)); if (leftCols.cardinality() != 1 || rightCols.cardinality() != 1 ) { return null; } int nFieldsLeft = joinRel.getLeft().getRowType().getFieldList().size(); int nFieldsRight = joinRel.getRight().getRowType().getFieldList().size(); int nSysFields = joinRel.getSystemFieldList().size(); ImmutableBitSet rightFieldsBitSet = ImmutableBitSet.range(nSysFields + nFieldsLeft, nSysFields + nFieldsLeft + nFieldsRight); /* * flip column references if join condition specified in reverse order to * join sources. */ if (rightFieldsBitSet.contains(leftCols)) { ImmutableBitSet t = leftCols; leftCols = rightCols; rightCols = t; } leftColIdx = leftCols.nextSetBit(0) - nSysFields; rightColIdx = rightCols.nextSetBit(0) - (nSysFields + nFieldsLeft); return new Pair<Integer, Integer>(leftColIdx, rightColIdx); } private static class IsSimpleTreeOnJoinKey extends RelVisitor { int joinKey; boolean simpleTree; RelMetadataQuery mq; static boolean check(RelNode r, int joinKey, RelMetadataQuery mq) { IsSimpleTreeOnJoinKey v = new IsSimpleTreeOnJoinKey(joinKey, mq); v.go(r); return v.simpleTree; } IsSimpleTreeOnJoinKey(int joinKey, RelMetadataQuery mq) { super(); this.joinKey = joinKey; this.mq = mq; simpleTree = true; } @Override public void visit(RelNode node, int ordinal, RelNode parent) { if (node instanceof HepRelVertex) { node = ((HepRelVertex) node).getCurrentRel(); } if (node instanceof TableScan) { simpleTree = true; } else if (node instanceof Project) { simpleTree = isSimple((Project) node); } else if (node instanceof Filter) { simpleTree = isSimple((Filter) node, mq); } else { simpleTree = false; } if (simpleTree) { super.visit(node, ordinal, parent); } } private boolean isSimple(Project project) { RexNode r = project.getProjects().get(joinKey); if (r instanceof RexInputRef) { joinKey = ((RexInputRef) r).getIndex(); return true; } return false; } private boolean isSimple(Filter filter, RelMetadataQuery mq) { ImmutableBitSet condBits = RelOptUtil.InputFinder.bits(filter.getCondition()); return isKey(condBits, filter, mq); } } }