/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hadoop.hive.metastore.hbase;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.hbase.PartitionKeyComparator.Operator;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.LeafNode;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.TreeNode;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.TreeVisitor;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
/**
* Utility functions for generating the HBase partition filtering plan representation
* from an ExpressionTree.
* Optimizations to be done -
* - Case where all partition keys are specified. Should use a get
*
* {@link PartitionFilterGenerator} is a visitor on the given filter expression tree. After
* walking the tree it produces the HBase execution plan represented by {@link FilterPlan}. See
* their javadocs for more details.
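*
* A minimal usage sketch (exprTree, dbName, tableName and partCols are assumed to be in scope;
* partCols is the table's {@code List<FieldSchema>} of partition keys):
* <pre>{@code
*   PlanResult result = HBaseFilterPlanUtil.getFilterPlan(exprTree, partCols);
*   for (ScanPlan sp : result.plan.getPlans()) {
*     byte[] startSuffix = sp.getStartRowSuffix(dbName, tableName, partCols);
*     byte[] endSuffix = sp.getEndRowSuffix(dbName, tableName, partCols);
*     Filter rowFilter = sp.getFilter(partCols);
*     // issue one HBase Scan per ScanPlan and take the union of the results
*   }
* }</pre>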
*/
class HBaseFilterPlanUtil {
/**
* Compare two byte arrays.
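*
* <p>For example, {@code compare(new byte[]{1, 2}, new byte[]{1, 2, 3})} returns -1
* (the arrays are equal up to ar1's length and ar2 is longer), while
* {@code compare(new byte[]{1, 3}, new byte[]{1, 2})} returns 1.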
*
* @param ar1
* first byte array
* @param ar2
* second byte array
* @return -1 if ar1 < ar2, 0 if equal, 1 if ar1 > ar2
*/
static int compare(byte[] ar1, byte[] ar2) {
// null check is not needed, nulls are not passed here
for (int i = 0; i < ar1.length; i++) {
if (i == ar2.length) {
return 1;
} else {
if (ar1[i] == ar2[i]) {
continue;
} else if (ar1[i] > ar2[i]) {
return 1;
} else {
return -1;
}
}
}
// ar1 and ar2 are equal up to ar1's length
if (ar1.length == ar2.length) {
return 0;
}
// ar2 has more bytes
return -1;
}
/**
* Represents the execution plan for HBase to find the set of partitions that
* match the given filter expression.
* If you have an AND or OR of two expressions, you can determine the FilterPlan for each
* child and then call lhs.and(rhs) or lhs.or(rhs) respectively
* to generate a new plan for the combined expression.
*
* The execution plan has one or more ScanPlan objects. To get the results, the union of the
* results of all ScanPlan objects needs to be taken.
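*
* A sketch of combining two already-built plans (the operand plans below are placeholders;
* real plans are produced by {@link PartitionFilterGenerator}):
* <pre>{@code
*   FilterPlan yearPlan = ...;   // plan for "year = 2015"
*   FilterPlan statePlan = ...;  // plan for "state = 'CA'"
*   FilterPlan combined = yearPlan.and(statePlan);
* }</pre>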
*/
public static abstract class FilterPlan {
abstract FilterPlan and(FilterPlan other);
abstract FilterPlan or(FilterPlan other);
abstract List<ScanPlan> getPlans();
@Override
public String toString() {
return getPlans().toString();
}
}
/**
* Represents a union/OR of single scan plans (ScanPlan).
*/
public static class MultiScanPlan extends FilterPlan {
final ImmutableList<ScanPlan> scanPlans;
public MultiScanPlan(List<ScanPlan> scanPlans){
this.scanPlans = ImmutableList.copyOf(scanPlans);
}
@Override
public FilterPlan and(FilterPlan other) {
// Convert to disjunctive normal form (DNF), ie OR of ANDs
// First get a new set of FilterPlans by doing an AND
// on each ScanPlan in this one with the other FilterPlan
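// e.g. (A OR B) AND C  =>  (A AND C) OR (B AND C)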
List<FilterPlan> newFPlans = new ArrayList<FilterPlan>();
for (ScanPlan splan : getPlans()) {
newFPlans.add(splan.and(other));
}
//now combine scanPlans in multiple new FilterPlans into one
// MultiScanPlan
List<ScanPlan> newScanPlans = new ArrayList<ScanPlan>();
for (FilterPlan fp : newFPlans) {
newScanPlans.addAll(fp.getPlans());
}
return new MultiScanPlan(newScanPlans);
}
@Override
public FilterPlan or(FilterPlan other) {
// just combine the ScanPlans
List<ScanPlan> newScanPlans = new ArrayList<ScanPlan>(this.getPlans());
newScanPlans.addAll(other.getPlans());
return new MultiScanPlan(newScanPlans);
}
@Override
public List<ScanPlan> getPlans() {
return scanPlans;
}
}
/**
* Represents a single HBase Scan API call.
*/
public static class ScanPlan extends FilterPlan {
public static class ScanMarker {
final String value;
/**
* If isInclusive is true, the marker value itself is part of the scan range.
* If it is false, the scan range starts immediately after this value
* (for a start marker) or ends immediately before it (for an end marker).
*/
final boolean isInclusive;
final String type;
ScanMarker(String value, boolean isInclusive, String type) {
this.value = value;
this.isInclusive = isInclusive;
this.type = type;
}
@Override
public String toString() {
return "ScanMarker [" + "value=" + value.toString() + ", isInclusive=" + isInclusive +
", type=" + type + "]";
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + value.hashCode();
result = prime * result + (isInclusive ? 1231 : 1237);
result = prime * result + type.hashCode();
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ScanMarker other = (ScanMarker) obj;
if (!value.equals(other.value))
return false;
if (isInclusive != other.isInclusive)
return false;
if (!type.equals(other.type))
return false;
return true;
}
}
public static class ScanMarkerPair {
public ScanMarkerPair(ScanMarker startMarker, ScanMarker endMarker) {
this.startMarker = startMarker;
this.endMarker = endMarker;
}
ScanMarker startMarker;
ScanMarker endMarker;
}
// scan markers per partition key: partition key name -> (start marker, end marker)
Map<String, ScanMarkerPair> markers = new HashMap<String, ScanMarkerPair>();
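// predicates (LIKE, !=) that cannot become scan markers; evaluated by PartitionKeyComparator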
List<Operator> ops = new ArrayList<Operator>();
// Get the number of leading partition keys ("major" parts) that can be used in the scan range.
// For example, if the partition keys are (year, month, state):
// 1. year = 2015 and month >= 1 and month < 5
// year + month can be used in scan range, majorParts = 2
// 2. year = 2015 and state = 'CA'
// only year can be used in scan range, majorParts = 1
// 3. month = 10 and state = 'CA'
// nothing can be used in scan range, majorParts = 0
private int getMajorPartsCount(List<FieldSchema> parts) {
int majorPartsCount = 0;
while (majorPartsCount<parts.size() && markers.containsKey(parts.get(majorPartsCount).getName())) {
ScanMarkerPair pair = markers.get(parts.get(majorPartsCount).getName());
majorPartsCount++;
if (pair.startMarker!=null && pair.endMarker!=null && pair.startMarker.value.equals(pair
.endMarker.value) && pair.startMarker.isInclusive && pair.endMarker.isInclusive) {
// is equal
continue;
} else {
break;
}
}
return majorPartsCount;
}
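/**
* Builds the row-level {@link Filter} for partition keys that cannot be folded into the
* scan range (the non-"major" keys) and for the extra operators (LIKE, !=).
* @return the filter, or null when no row-level filtering is needed
*/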
public Filter getFilter(List<FieldSchema> parts) {
int majorPartsCount = getMajorPartsCount(parts);
Set<String> majorKeys = new HashSet<String>();
for (int i=0;i<majorPartsCount;i++) {
majorKeys.add(parts.get(i).getName());
}
List<String> names = HBaseUtils.getPartitionNames(parts);
List<PartitionKeyComparator.Range> ranges = new ArrayList<PartitionKeyComparator.Range>();
for (Map.Entry<String, ScanMarkerPair> entry : markers.entrySet()) {
if (names.contains(entry.getKey()) && !majorKeys.contains(entry.getKey())) {
PartitionKeyComparator.Mark startMark = null;
if (entry.getValue().startMarker != null) {
startMark = new PartitionKeyComparator.Mark(entry.getValue().startMarker.value,
entry.getValue().startMarker.isInclusive);
}
PartitionKeyComparator.Mark endMark = null;
if (entry.getValue().endMarker != null) {
endMark = new PartitionKeyComparator.Mark(entry.getValue().endMarker.value,
entry.getValue().endMarker.isInclusive);
}
PartitionKeyComparator.Range range = new PartitionKeyComparator.Range(
entry.getKey(), startMark, endMark);
ranges.add(range);
}
}
if (ranges.isEmpty() && ops.isEmpty()) {
return null;
} else {
return new RowFilter(CompareFilter.CompareOp.EQUAL, new PartitionKeyComparator(
StringUtils.join(names, ","), StringUtils.join(HBaseUtils.getPartitionKeyTypes(parts), ","),
ranges, ops));
}
}
public void setStartMarker(String keyName, String keyType, String start, boolean isInclusive) {
if (markers.containsKey(keyName)) {
markers.get(keyName).startMarker = new ScanMarker(start, isInclusive, keyType);
} else {
ScanMarkerPair marker = new ScanMarkerPair(new ScanMarker(start, isInclusive, keyType), null);
markers.put(keyName, marker);
}
}
public ScanMarker getStartMarker(String keyName) {
if (markers.containsKey(keyName)) {
return markers.get(keyName).startMarker;
} else {
return null;
}
}
public void setEndMarker(String keyName, String keyType, String end, boolean isInclusive) {
if (markers.containsKey(keyName)) {
markers.get(keyName).endMarker = new ScanMarker(end, isInclusive, keyType);
} else {
ScanMarkerPair marker = new ScanMarkerPair(null, new ScanMarker(end, isInclusive, keyType));
markers.put(keyName, marker);
}
}
public ScanMarker getEndMarker(String keyName) {
if (markers.containsKey(keyName)) {
return markers.get(keyName).endMarker;
} else {
return null;
}
}
@Override
public FilterPlan and(FilterPlan other) {
List<ScanPlan> newSPlans = new ArrayList<ScanPlan>();
for (ScanPlan otherSPlan : other.getPlans()) {
newSPlans.add(this.and(otherSPlan));
}
return new MultiScanPlan(newSPlans);
}
private ScanPlan and(ScanPlan other) {
// create combined FilterPlan based on existing lhs and rhs plan
ScanPlan newPlan = new ScanPlan();
newPlan.markers.putAll(markers);
for (String keyName : other.markers.keySet()) {
if (newPlan.markers.containsKey(keyName)) {
// create new scan start
ScanMarker greaterStartMarker = getComparedMarker(this.getStartMarker(keyName),
other.getStartMarker(keyName), true);
if (greaterStartMarker != null) {
newPlan.setStartMarker(keyName, greaterStartMarker.type, greaterStartMarker.value, greaterStartMarker.isInclusive);
}
// create new scan end
ScanMarker lesserEndMarker = getComparedMarker(this.getEndMarker(keyName), other.getEndMarker(keyName),
false);
if (lesserEndMarker != null) {
newPlan.setEndMarker(keyName, lesserEndMarker.type, lesserEndMarker.value, lesserEndMarker.isInclusive);
}
} else {
newPlan.markers.put(keyName, other.markers.get(keyName));
}
}
newPlan.ops.addAll(ops);
newPlan.ops.addAll(other.ops);
return newPlan;
}
/**
* Return the greater or lesser of the two markers; a null marker means "no restriction".
* @param lStartMarker left-hand marker, may be null
* @param rStartMarker right-hand marker, may be null
* @param getGreater if true return the greater marker, else return the lesser one
* @return the greater/lesser marker depending on the value of getGreater
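* For example, for two markers of type "int" with values "5" (inclusive) and "3" (exclusive),
* getGreater=true returns the "5" marker and getGreater=false returns the "3" marker.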
*/
@VisibleForTesting
static ScanMarker getComparedMarker(ScanMarker lStartMarker, ScanMarker rStartMarker,
boolean getGreater) {
// if one of the markers is null, just return the other
if(lStartMarker == null) {
return rStartMarker;
} else if (rStartMarker == null) {
return lStartMarker;
}
TypeInfo expectedType =
TypeInfoUtils.getTypeInfoFromTypeString(lStartMarker.type);
ObjectInspector outputOI =
TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(expectedType);
Converter lConverter = ObjectInspectorConverters.getConverter(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, outputOI);
Converter rConverter = ObjectInspectorConverters.getConverter(
PrimitiveObjectInspectorFactory.javaStringObjectInspector, outputOI);
Comparable lValue = (Comparable)lConverter.convert(lStartMarker.value);
Comparable rValue = (Comparable)rConverter.convert(rStartMarker.value);
int compareRes = lValue.compareTo(rValue);
if (compareRes == 0) {
// values are equal, now compare the isInclusive flags
if (lStartMarker.isInclusive == rStartMarker.isInclusive) {
// actually equal, so return any one
return lStartMarker;
}
boolean isInclusive = true;
// the marker that does not include the current value is the greater one
if (getGreater) {
isInclusive = false;
}
// else
return new ScanMarker(lStartMarker.value, isInclusive, lStartMarker.type);
}
if (getGreater) {
return compareRes > 0 ? lStartMarker : rStartMarker;
}
// else
return compareRes < 0 ? lStartMarker : rStartMarker;
}
@Override
public FilterPlan or(FilterPlan other) {
List<ScanPlan> plans = new ArrayList<ScanPlan>(getPlans());
plans.addAll(other.getPlans());
return new MultiScanPlan(plans);
}
@Override
public List<ScanPlan> getPlans() {
return Arrays.asList(this);
}
/**
* @return row suffix - This is appended to db + table, to generate start row for the Scan
*/
public byte[] getStartRowSuffix(String dbName, String tableName, List<FieldSchema> parts) {
int majorPartsCount = getMajorPartsCount(parts);
List<String> majorPartTypes = new ArrayList<String>();
List<String> components = new ArrayList<String>();
boolean endPrefix = false;
for (int i=0;i<majorPartsCount;i++) {
majorPartTypes.add(parts.get(i).getType());
ScanMarker marker = markers.get(parts.get(i).getName()).startMarker;
if (marker != null) {
components.add(marker.value);
if (i==majorPartsCount-1) {
endPrefix = !marker.isInclusive;
}
} else {
components.add(null);
if (i==majorPartsCount-1) {
endPrefix = false;
}
}
}
byte[] bytes = HBaseUtils.buildPartitionKey(dbName, tableName, majorPartTypes, components, endPrefix);
return bytes;
}
/**
* @return row suffix - This is appended to db + table, to generate end row for the Scan
*/
public byte[] getEndRowSuffix(String dbName, String tableName, List<FieldSchema> parts) {
int majorPartsCount = getMajorPartsCount(parts);
List<String> majorPartTypes = new ArrayList<String>();
List<String> components = new ArrayList<String>();
boolean endPrefix = false;
for (int i=0;i<majorPartsCount;i++) {
majorPartTypes.add(parts.get(i).getType());
ScanMarker marker = markers.get(parts.get(i).getName()).endMarker;
if (marker != null) {
components.add(marker.value);
if (i==majorPartsCount-1) {
endPrefix = marker.isInclusive;
}
} else {
components.add(null);
if (i==majorPartsCount-1) {
endPrefix = true;
}
}
}
byte[] bytes = HBaseUtils.buildPartitionKey(dbName, tableName, majorPartTypes, components, endPrefix);
if (components.isEmpty()) {
bytes[bytes.length-1]++;
}
return bytes;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("ScanPlan:\n");
for (Map.Entry<String, ScanMarkerPair> entry : markers.entrySet()) {
sb.append("key=" + entry.getKey() + "[startMarker=" + entry.getValue().startMarker
+ ", endMarker=" + entry.getValue().endMarker + "]");
}
return sb.toString();
}
}
/**
* Visitor for ExpressionTree.
* It first generates the ScanPlan for the leaf nodes. The higher level nodes are
* either AND or OR operations. It then calls FilterPlan.and and FilterPlan.or with
* the child nodes to generate the plans for higher level nodes.
*/
@VisibleForTesting
static class PartitionFilterGenerator extends TreeVisitor {
private FilterPlan curPlan;
// this tells us if there is a condition that did not get included in the plan;
// such a condition is treated as if it evaluated to TRUE
private boolean hasUnsupportedCondition = false;
// Need to cache the left plans for the TreeNodes. Use IdentityHashMap here
// as we don't want to dedupe two TreeNodes that are otherwise considered equal
Map<TreeNode, FilterPlan> leftPlans = new IdentityHashMap<TreeNode, FilterPlan>();
// temporary holder for the right-hand side plan while combining an AND/OR node's children
private FilterPlan rPlan;
private Map<String, String> nameToType = new HashMap<String, String>();
public PartitionFilterGenerator(List<FieldSchema> parts) {
for (FieldSchema part : parts) {
nameToType.put(part.getName(), part.getType());
}
}
FilterPlan getPlan() {
return curPlan;
}
@Override
protected void beginTreeNode(TreeNode node) throws MetaException {
// reset the params
curPlan = rPlan = null;
}
@Override
protected void midTreeNode(TreeNode node) throws MetaException {
leftPlans.put(node, curPlan);
curPlan = null;
}
@Override
protected void endTreeNode(TreeNode node) throws MetaException {
rPlan = curPlan;
FilterPlan lPlan = leftPlans.get(node);
leftPlans.remove(node);
switch (node.getAndOr()) {
case AND:
curPlan = lPlan.and(rPlan);
break;
case OR:
curPlan = lPlan.or(rPlan);
break;
default:
throw new AssertionError("Unexpected logical operation " + node.getAndOr());
}
}
@Override
public void visit(LeafNode node) throws MetaException {
ScanPlan leafPlan = new ScanPlan();
curPlan = leafPlan;
// a condition on a partition column; if it is on a leading (major) key it may
// influence the start and end of the scan
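// e.g. "year >= 2015" becomes an inclusive start marker on key "year", while
// "state LIKE 'C%'" becomes a LIKE Operator that is later evaluated by PartitionKeyComparator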
final boolean INCLUSIVE = true;
switch (node.operator) {
case EQUALS:
leafPlan.setStartMarker(node.keyName, nameToType.get(node.keyName), node.value.toString(), INCLUSIVE);
leafPlan.setEndMarker(node.keyName, nameToType.get(node.keyName), node.value.toString(), INCLUSIVE);
break;
case GREATERTHAN:
leafPlan.setStartMarker(node.keyName, nameToType.get(node.keyName), node.value.toString(), !INCLUSIVE);
break;
case GREATERTHANOREQUALTO:
leafPlan.setStartMarker(node.keyName, nameToType.get(node.keyName), node.value.toString(), INCLUSIVE);
break;
case LESSTHAN:
leafPlan.setEndMarker(node.keyName, nameToType.get(node.keyName), node.value.toString(), !INCLUSIVE);
break;
case LESSTHANOREQUALTO:
leafPlan.setEndMarker(node.keyName, nameToType.get(node.keyName), node.value.toString(), INCLUSIVE);
break;
case LIKE:
leafPlan.ops.add(new Operator(Operator.Type.LIKE, node.keyName, node.value.toString()));
break;
case NOTEQUALS:
case NOTEQUALS2:
leafPlan.ops.add(new Operator(Operator.Type.NOTEQUALS, node.keyName, node.value.toString()));
break;
}
}
private boolean hasUnsupportedCondition() {
return hasUnsupportedCondition;
}
}
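/**
* Result of plan generation: the generated {@link FilterPlan} plus a flag that is true when
* some condition could not be included in the plan (such conditions are treated as TRUE, so
* the plan may return a superset of the matching partitions).
*/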
public static class PlanResult {
public final FilterPlan plan;
public final boolean hasUnsupportedCondition;
PlanResult(FilterPlan plan, boolean hasUnsupportedCondition) {
this.plan = plan;
this.hasUnsupportedCondition = hasUnsupportedCondition;
}
}
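/**
* Generate the HBase filter plan for the given filter expression.
* @param exprTree the parsed partition filter expression, may be null
* @param parts the partition keys of the table
* @return the plan together with a flag telling whether any condition could not be translated
* @throws MetaException if walking the expression tree fails
*/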
public static PlanResult getFilterPlan(ExpressionTree exprTree, List<FieldSchema> parts) throws MetaException {
if (exprTree == null) {
// TODO: if exprTree is null, we should do what ObjectStore does. See HIVE-10102
return new PlanResult(new ScanPlan(), true);
}
PartitionFilterGenerator pGenerator = new PartitionFilterGenerator(parts);
exprTree.accept(pGenerator);
return new PlanResult(pGenerator.getPlan(), pGenerator.hasUnsupportedCondition());
}
}