// HiveExceptRewriteRule.java example

// Explorer
// hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.calcite.rules;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.calcite.plan.RelOptCluster;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Aggregate;
import org.apache.calcite.rel.core.AggregateCall;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.rex.RexLiteral;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.sql.SqlAggFunction;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.SqlOperator;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter;
import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.calcite.tools.RelBuilder;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.calcite.util.Util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
import com.google.common.collect.Lists;

/**
 * Planner rule that rewrites a
 * {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept}
 * into two levels of group-by over a union, followed by either a filter
 * (EXCEPT DISTINCT) or a UDTF (EXCEPT ALL).
 *
 * <p>Note, we only have 2 branches because of except's semantic.
 * <ul>
 * <li>Given {@code R1 Except(all) R2}, R1 introduces VCol '2' and R2 introduces VCol '1'.</li>
 * <li>{@code R3 = GB(R1 on all keys + VCol + count(VCol) as c)
 * union all GB(R2 on all keys + VCol + count(VCol) as c)}</li>
 * <li>{@code R4 = GB(R3 on all keys + sum(c) as a + sum(VCol*c) as b)}</li>
 * </ul>
 *
 * <p>We have {@code m+n=a} and {@code 2m+n=b}, where m is the number of rows of a key in R1 and
 * n is the number of rows of that key in R2; then {@code m=b-a}, {@code n=2a-b} and
 * {@code m-n=2b-3a}.
 * <ul>
 * <li>For except (distinct): {@code R5 = Fil (b-a>0 && 2a-b=0)}, and R6 selects only the keys
 * from R5.</li>
 * <li>For except all: {@code R5 = Fil (2b-3a>0)}, and {@code R6 = UDTF (R5)}, which explodes the
 * tuples based on {@code 2b-3a}.</li>
 * </ul>
 * Note that NULLs are handled the same as other values. Please refer to the test cases.
 */
public class HiveExceptRewriteRule extends RelOptRule {

  public static final HiveExceptRewriteRule INSTANCE = new HiveExceptRewriteRule();

  // Bound to this rule's own class so its messages show up under the right logger category.
  protected static final Logger LOG = LoggerFactory.getLogger(HiveExceptRewriteRule.class);


  // ~ Constructors -----------------------------------------------------------

  private HiveExceptRewriteRule() {
    super(operand(HiveExcept.class, any()));
  }

  // ~ Methods ----------------------------------------------------------------

  /**
   * Replaces the matched {@link HiveExcept} with the equivalent group-by/union based plan.
   *
   * @param call rule call whose first operand is the {@link HiveExcept} to rewrite
   * @throws RuntimeException wrapping any {@link SemanticException} raised while the
   *         replacement plan is being built
   */
  @Override
  public void onMatch(RelOptRuleCall call) {
    final HiveExcept hiveExcept = call.rel(0);

    final RelOptCluster cluster = hiveExcept.getCluster();
    final RexBuilder rexBuilder = cluster.getRexBuilder();

    try {
      // 1st level GB: create a GB(R on all keys + VCol + count() as c) for each branch
      final RelNode leftGB =
          createFirstGB(hiveExcept.getInputs().get(0), true, cluster, rexBuilder);
      final RelNode rightGB =
          createFirstGB(hiveExcept.getInputs().get(1), false, cluster, rexBuilder);

      // create a union above all the branches
      // the schema of union looks like this
      // all keys + VCol + c
      final HiveRelNode union = new HiveUnion(cluster, TraitsUtil.getDefaultTraitSet(cluster),
          ImmutableList.of(leftGB, rightGB));
      final int unionColumnSize = union.getRowType().getFieldList().size();

      // 2nd level GB: a single GB (all keys + sum(c) as a + sum(VCol*c) as b) on top of the
      // union. Its column count is the same as unionColumnSize.
      final HiveRelNode aggregateRel = createSecondGB(union, unionColumnSize, cluster, rexBuilder);

      if (!hiveExcept.all) {
        call.transformTo(rewriteExceptDistinct(aggregateRel, unionColumnSize, cluster, rexBuilder));
      } else {
        call.transformTo(rewriteExceptAll(aggregateRel, unionColumnSize, cluster, rexBuilder));
      }
    } catch (SemanticException e) {
      // CalciteSemanticException is a SemanticException, so this single handler covers every
      // checked failure of the rewrite; the cause is preserved for the caller.
      LOG.debug(e.toString());
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds the second-level group-by on top of the union of the two first-level group-bys.
   *
   * @param union union whose schema is: all keys + VCol + c
   * @param unionColumnSize number of columns of {@code union}
   * @return aggregate whose schema is: all keys + sum(c) as a + sum(VCol*c) as b
   */
  private HiveRelNode createSecondGB(HiveRelNode union, int unionColumnSize,
      RelOptCluster cluster, RexBuilder rexBuilder) throws CalciteSemanticException {
    // Here we create a project for the following reasons:
    // (1) GBy only accepts arg as a position of the input, however, we need to sum on VCol*c
    // (2) This can better reuse the function createSingleArgAggCall.
    final List<RexNode> projections = new ArrayList<>();
    for (int pos = 0; pos < unionColumnSize; pos++) {
      projections.add(rexBuilder.makeInputRef(union, pos));
    }
    projections.add(multiply(rexBuilder.makeInputRef(union, unionColumnSize - 2),
        rexBuilder.makeInputRef(union, unionColumnSize - 1), cluster, rexBuilder));

    // the project's schema is like this
    // all keys + VCol + c + VCol*c
    final RelNode gbInputRel = HiveProject.create(union, projections, null);

    final RelDataType aggFnRetType = bigintType(cluster);
    final List<AggregateCall> aggregateCalls = new ArrayList<>();
    // sum(c)
    aggregateCalls.add(HiveCalciteUtil.createSingleArgAggCall("sum", cluster,
        TypeInfoFactory.longTypeInfo, unionColumnSize - 1, aggFnRetType));
    // sum(VCol*c)
    aggregateCalls.add(HiveCalciteUtil.createSingleArgAggCall("sum", cluster,
        TypeInfoFactory.longTypeInfo, unionColumnSize, aggFnRetType));

    // group on every column except the last 2 of the union, which are VCol and c
    final ImmutableBitSet groupSet = ImmutableBitSet.range(0, unionColumnSize - 2);
    return new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), gbInputRel,
        false, groupSet, null, aggregateCalls);
  }

  /**
   * Finishes the rewrite for except distinct: adds the filter (b-a&gt;0 and 2a-b=0), i.e.,
   * a &gt; 0 and 2a = b, then a project that keeps only the keys.
   */
  private RelNode rewriteExceptDistinct(HiveRelNode aggregateRel, int unionColumnSize,
      RelOptCluster cluster, RexBuilder rexBuilder) throws CalciteSemanticException {
    final RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
        aggregateRel,
        makeFilterExprForExceptDistinct(aggregateRel, unionColumnSize, cluster, rexBuilder));

    // finally add a project to project out the last 2 columns (a and b)
    final int filterColumnSize = filterRel.getRowType().getFieldList().size();
    final Set<Integer> projectOutColumnPositions = new HashSet<>();
    projectOutColumnPositions.add(filterColumnSize - 2);
    projectOutColumnPositions.add(filterColumnSize - 1);
    return HiveCalciteUtil.createProjectWithoutColumn(filterRel, projectOutColumnPositions);
  }

  /**
   * Finishes the rewrite for except all: adds a project that changes the schema to
   * (2b-3a) + all keys, then the UDTF, then a project that drops the leading (2b-3a) column.
   */
  private RelNode rewriteExceptAll(HiveRelNode aggregateRel, int unionColumnSize,
      RelOptCluster cluster, RexBuilder rexBuilder) throws SemanticException {
    final List<RexNode> projections = new ArrayList<>();
    projections.add(makeExprForExceptAll(aggregateRel, unionColumnSize, cluster, rexBuilder));

    // copy every key column; the last 2 columns (a and b) are not carried over
    final List<RelDataTypeField> aggregateFields = aggregateRel.getRowType().getFieldList();
    for (int i = 0; i < aggregateFields.size() - 2; i++) {
      final RelDataTypeField field = aggregateFields.get(i);
      projections.add(new RexInputRef(field.getIndex(), field.getType()));
    }

    final RelNode srcRel = HiveProject.create(aggregateRel, projections, null);
    final HiveTableFunctionScan udtf = HiveCalciteUtil.createUDTFForSetOp(cluster, srcRel);

    // finally add a project to project out the 1st column
    final Set<Integer> projectOutColumnPositions = new HashSet<>();
    projectOutColumnPositions.add(0);
    return HiveCalciteUtil.createProjectWithoutColumn(udtf, projectOutColumnPositions);
  }

  /**
   * Builds the first-level group-by for one branch of the except.
   *
   * @param input the branch to aggregate
   * @param left {@code true} for the first branch (VCol literal 2), {@code false} for the
   *        second branch (VCol literal 1)
   * @return aggregate whose schema is: all keys + VCol + count(VCol) as c
   */
  private RelNode createFirstGB(RelNode input, boolean left, RelOptCluster cluster,
      RexBuilder rexBuilder) throws CalciteSemanticException {
    final int inputColumnSize = input.getRowType().getFieldList().size();

    final List<RexNode> projections = new ArrayList<>();
    for (int pos = 0; pos < inputColumnSize; pos++) {
      projections.add(rexBuilder.makeInputRef(input, pos));
    }
    // append the VCol that tags which branch a row came from
    projections.add(rexBuilder.makeBigintLiteral(BigDecimal.valueOf(left ? 2 : 1)));

    // create the project before GB
    final RelNode gbInputRel = HiveProject.create(input, projections, null);

    // the group set includes all the positions: every input column plus the trailing VCol
    final ImmutableBitSet groupSet = ImmutableBitSet.range(0, inputColumnSize + 1);

    // count(VCol); VCol sits right after the input columns
    final List<AggregateCall> aggregateCalls = new ArrayList<>();
    aggregateCalls.add(HiveCalciteUtil.createSingleArgAggCall("count", cluster,
        TypeInfoFactory.longTypeInfo, inputColumnSize, bigintType(cluster)));

    return new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), gbInputRel,
        false, groupSet, null, aggregateCalls);
  }

  /** Returns the expression {@code r1 * r2}. */
  private RexNode multiply(RexNode r1, RexNode r2, RelOptCluster cluster, RexBuilder rexBuilder)
      throws CalciteSemanticException {
    return makeBinaryCall("*", r1, r2, true, cluster, rexBuilder);
  }

  /**
   * Builds the except distinct predicate {@code a>0 && 2a=b}.
   *
   * @param input relation in which a and b are the last 2 columns
   * @param columnSize number of columns of {@code input}
   */
  private RexNode makeFilterExprForExceptDistinct(HiveRelNode input, int columnSize,
      RelOptCluster cluster, RexBuilder rexBuilder) throws CalciteSemanticException {
    final RexInputRef a = rexBuilder.makeInputRef(input, columnSize - 2);
    final RexInputRef b = rexBuilder.makeInputRef(input, columnSize - 1);
    final RexLiteral zero = rexBuilder.makeBigintLiteral(BigDecimal.valueOf(0));
    final RexLiteral two = rexBuilder.makeBigintLiteral(BigDecimal.valueOf(2));

    // a>0
    final RexNode aMorethanZero = makeBinaryCall(">", a, zero, false, cluster, rexBuilder);
    // 2*a
    final RexNode twoa = makeBinaryCall("*", a, two, false, cluster, rexBuilder);
    // 2a=b
    final RexNode twoaEqualTob = makeBinaryCall("=", twoa, b, false, cluster, rexBuilder);
    // a>0 && 2a=b
    return makeBinaryCall("and", aMorethanZero, twoaEqualTob, false, cluster, rexBuilder);
  }

  /**
   * Builds the except all expression {@code 2b-3a}.
   *
   * @param input relation in which a and b are the last 2 columns
   * @param columnSize number of columns of {@code input}
   */
  private RexNode makeExprForExceptAll(HiveRelNode input, int columnSize, RelOptCluster cluster,
      RexBuilder rexBuilder) throws CalciteSemanticException {
    final RexInputRef a = rexBuilder.makeInputRef(input, columnSize - 2);
    final RexInputRef b = rexBuilder.makeInputRef(input, columnSize - 1);
    final RexLiteral three = rexBuilder.makeBigintLiteral(BigDecimal.valueOf(3));
    final RexLiteral two = rexBuilder.makeBigintLiteral(BigDecimal.valueOf(2));

    // 3*a
    final RexNode threea = makeBinaryCall("*", three, a, false, cluster, rexBuilder);
    // 2*b
    final RexNode twob = makeBinaryCall("*", two, b, false, cluster, rexBuilder);
    // 2b-3a
    return makeBinaryCall("-", twob, threea, false, cluster, rexBuilder);
  }

  /**
   * Resolves the Hive function {@code hiveFn} through {@link SqlFunctionConverter} and applies
   * it to {@code left} and {@code right}. The lookup is always made with two bigint argument
   * types and a bigint return type, exactly as every caller in this rule did before this helper
   * was factored out.
   */
  private RexNode makeBinaryCall(String hiveFn, RexNode left, RexNode right,
      boolean deterministic, RelOptCluster cluster, RexBuilder rexBuilder)
      throws CalciteSemanticException {
    final RelDataType bigint = bigintType(cluster);
    return rexBuilder.makeCall(
        SqlFunctionConverter.getCalciteFn(hiveFn, ImmutableList.of(bigint, bigint), bigint,
            deterministic),
        ImmutableList.of(left, right));
  }

  /** Returns the Calcite type that corresponds to Hive's long (bigint) type. */
  private static RelDataType bigintType(RelOptCluster cluster) {
    return TypeConverter.convert(TypeInfoFactory.longTypeInfo, cluster.getTypeFactory());
  }
}