/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.accumulo.predicate;

import org.apache.accumulo.core.data.Range;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.accumulo.serde.AccumuloIndexParameters;
import org.apache.hadoop.hive.accumulo.AccumuloIndexScanner;
import org.apache.hadoop.hive.accumulo.AccumuloIndexScannerException;
import org.apache.hadoop.hive.accumulo.AccumuloIndexLexicoder;
import org.apache.hadoop.hive.accumulo.columns.HiveAccumuloRowIdColumnMapping;
import org.apache.hadoop.hive.accumulo.predicate.compare.CompareOp;
import org.apache.hadoop.hive.accumulo.predicate.compare.Equal;
import org.apache.hadoop.hive.accumulo.predicate.compare.GreaterThan;
import org.apache.hadoop.hive.accumulo.predicate.compare.GreaterThanOrEqual;
import org.apache.hadoop.hive.accumulo.predicate.compare.LessThan;
import org.apache.hadoop.hive.accumulo.predicate.compare.LessThanOrEqual;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Generates Accumulo {@link Range}s from a Hive filter expression tree so that only the
 * relevant portion of the table (by rowid, or via an index table) needs to be scanned.
 */
public class AccumuloRangeGenerator implements NodeProcessor {
  private static final Logger LOG = LoggerFactory.getLogger(AccumuloRangeGenerator.class);

  private final AccumuloPredicateHandler predicateHandler;
  private final HiveAccumuloRowIdColumnMapping rowIdMapping;
  private final String hiveRowIdColumnName;
  private AccumuloIndexScanner indexScanner;

  public AccumuloRangeGenerator(Configuration conf, AccumuloPredicateHandler predicateHandler,
      HiveAccumuloRowIdColumnMapping rowIdMapping, String hiveRowIdColumnName) {
    this.predicateHandler = predicateHandler;
    this.rowIdMapping = rowIdMapping;
    this.hiveRowIdColumnName = hiveRowIdColumnName;
    try {
      this.indexScanner = new AccumuloIndexParameters(conf).createScanner();
    } catch (AccumuloIndexScannerException e) {
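      // Index support is optional: when the scanner cannot be created, fall back to
      // generating ranges against the rowid mapping only (see processExpression).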
      LOG.error(e.getLocalizedMessage(), e);
      this.indexScanner = null;
    }
  }

  public AccumuloIndexScanner getIndexScanner() {
    return indexScanner;
  }

  public void setIndexScanner(AccumuloIndexScanner indexScanner) {
    this.indexScanner = indexScanner;
  }

  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    // If it's not some operator, pass it back
    if (!(nd instanceof ExprNodeGenericFuncDesc)) {
      return nd;
    }

    ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) nd;

    // 'and' nodes need to be intersected
    if (FunctionRegistry.isOpAnd(func)) {
      return processAndOpNode(nd, nodeOutputs);
      // 'or' nodes need to be merged
    } else if (FunctionRegistry.isOpOr(func)) {
      return processOrOpNode(nd, nodeOutputs);
    } else if (FunctionRegistry.isOpNot(func)) {
      // TODO handle negations
      throw new IllegalArgumentException("Negations not yet implemented");
    } else {
      return processExpression(func, nodeOutputs);
    }
  }

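  /**
   * Intersects the Ranges produced by the children of an AND node. For example, a filter such as
   * {@code rowid >= 'f' AND rowid < 'm'} yields the child ranges [f, +inf) and (-inf, m), which
   * clip down to the single range [f, m). Disjoint children clip to null and are discarded, so a
   * contradiction such as {@code rowid < 'b' AND rowid > 'x'} produces an empty list (nothing to
   * scan).
   */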
  protected Object processAndOpNode(Node nd, Object[] nodeOutputs) {
    // We might have multiple ranges coming from children
    List<Range> andRanges = null;
    for (Object nodeOutput : nodeOutputs) {
      // null signifies nodes that are irrelevant to the generation
      // of Accumulo Ranges
      if (null == nodeOutput) {
        continue;
      }

      // When an AND has no children (some conjunction over a field that isn't the column
      // mapped to the Accumulo rowid) and when a conjunction generates Ranges which are empty
      // (the children of the conjunction are disjoint), these two cases need to be kept separate.
      //
      // A null `andRanges` implies that ranges couldn't be computed, while an empty List
      // of Ranges implies that there are no possible Ranges to lookup.
      if (null == andRanges) {
        andRanges = new ArrayList<Range>();
      }

      // The child is a single Range
      if (nodeOutput instanceof Range) {
        Range childRange = (Range) nodeOutput;

        // No existing ranges, just accept the current
        if (andRanges.isEmpty()) {
          andRanges.add(childRange);
        } else {
          // For each range we have, intersect it with the child. If they don't overlap,
          // the range can be discarded
          List<Range> newRanges = new ArrayList<Range>();
          for (Range andRange : andRanges) {
            Range intersectedRange = andRange.clip(childRange, true);
            if (null != intersectedRange) {
              newRanges.add(intersectedRange);
            }
          }

          // Set the newly-constructed ranges as the current state
          andRanges = newRanges;
        }
      } else if (nodeOutput instanceof List) {
        @SuppressWarnings("unchecked")
        List<Range> childRanges = (List<Range>) nodeOutput;

        // No ranges yet, use the ranges from the child
        if (andRanges.isEmpty()) {
          andRanges.addAll(childRanges);
        } else {
          List<Range> newRanges = new ArrayList<Range>();

          // Cartesian product of our ranges with the child ranges
          for (Range andRange : andRanges) {
            for (Range childRange : childRanges) {
              Range intersectedRange = andRange.clip(childRange, true);

              // Retain only valid intersections (discard disjoint ranges)
              if (null != intersectedRange) {
                newRanges.add(intersectedRange);
              }
            }
          }

          // Set the newly-constructed ranges as the current state
          andRanges = newRanges;
        }
      } else {
        LOG.error("Expected Range from {} but got {}", nd, nodeOutput);
        throw new IllegalArgumentException("Expected Range but got "
            + nodeOutput.getClass().getName());
      }
    }

    return andRanges;
  }

  protected Object processOrOpNode(Node nd, Object[] nodeOutputs) {
    List<Range> orRanges = new ArrayList<Range>(nodeOutputs.length);
    for (Object nodeOutput : nodeOutputs) {
      if (nodeOutput instanceof Range) {
        orRanges.add((Range) nodeOutput);
      } else if (nodeOutput instanceof List) {
        @SuppressWarnings("unchecked")
        List<Range> childRanges = (List<Range>) nodeOutput;
        orRanges.addAll(childRanges);
      } else {
        LOG.error("Expected Range from {} but got {}", nd, nodeOutput);
        throw new IllegalArgumentException("Expected Range but got "
            + nodeOutput.getClass().getName());
      }
    }

    // Try to merge multiple ranges together
    if (orRanges.size() > 1) {
      return Range.mergeOverlapping(orRanges);
    } else if (1 == orRanges.size()) {
      // Return just the single Range
      return orRanges.get(0);
    } else {
      // No ranges, just return the empty list
      return orRanges;
    }
  }

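  /**
   * Handles a single comparison between a constant and a column. When the column is the Hive
   * column mapped to the Accumulo rowid, the comparison is converted directly into a Range over
   * the row space; when the column is covered by an index table instead, the matching rowid
   * ranges are fetched through the index scanner. Returns null when no Range can be derived
   * (no constant or no column in the comparison, or the column is neither the rowid nor indexed).
   */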
  protected Object processExpression(ExprNodeGenericFuncDesc func, Object[] nodeOutputs)
      throws SemanticException {
    // a binary operator (gt, lt, ge, le, eq, ne)
    GenericUDF genericUdf = func.getGenericUDF();

    // Find the argument to the operator which is a constant
    ExprNodeConstantDesc constantDesc = null;
    ExprNodeColumnDesc columnDesc = null;
    ExprNodeDesc leftHandNode = null;
    for (Object nodeOutput : nodeOutputs) {
      if (nodeOutput instanceof ExprNodeConstantDesc) {
        // Ordering of constant and column in the expression is important for correct range generation
        if (null == leftHandNode) {
          leftHandNode = (ExprNodeDesc) nodeOutput;
        }
        constantDesc = (ExprNodeConstantDesc) nodeOutput;
      } else if (nodeOutput instanceof ExprNodeColumnDesc) {
        // Ordering of constant and column in the expression is important for correct range generation
        if (null == leftHandNode) {
          leftHandNode = (ExprNodeDesc) nodeOutput;
        }
        columnDesc = (ExprNodeColumnDesc) nodeOutput;
      }
    }

    // If it's constant = constant or column = column, we can't fetch any ranges
    // TODO We can try to be smarter and push up the value to some node which
    // we can generate ranges from, e.g. rowid > (4 + 5)
    if (null == constantDesc || null == columnDesc) {
      return null;
    }

    ConstantObjectInspector objInspector = constantDesc.getWritableObjectInspector();

    // Reject any clauses that are against a column that isn't the rowId mapping or indexed
    if (!this.hiveRowIdColumnName.equals(columnDesc.getColumn())) {
      if (this.indexScanner != null && this.indexScanner.isIndexed(columnDesc.getColumn())) {
        return getIndexedRowIds(genericUdf, leftHandNode, columnDesc.getColumn(), objInspector);
      }
      return null;
    }

    Text constText = getConstantText(objInspector);
    return getRange(genericUdf, leftHandNode, constText);
  }

  private Range getRange(GenericUDF genericUdf, ExprNodeDesc leftHandNode, Text constText) {
    Class<? extends CompareOp> opClz;
    try {
      opClz = predicateHandler.getCompareOpClass(genericUdf.getUdfName());
    } catch (NoSuchCompareOpException e) {
      throw new IllegalArgumentException("Unhandled UDF class: " + genericUdf.getUdfName());
    }

    if (leftHandNode instanceof ExprNodeConstantDesc) {
      return getConstantOpColumnRange(opClz, constText);
    } else if (leftHandNode instanceof ExprNodeColumnDesc) {
      return getColumnOpConstantRange(opClz, constText);
    } else {
      throw new IllegalStateException("Expected column or constant on LHS of expression");
    }
  }

  private Text getConstantText(ConstantObjectInspector objInspector) throws SemanticException {
    Text constText;
    switch (rowIdMapping.getEncoding()) {
      case STRING:
        constText = getUtf8Value(objInspector);
        break;
      case BINARY:
        try {
          constText = getBinaryValue(objInspector);
        } catch (IOException e) {
          throw new SemanticException(e);
        }
        break;
      default:
        throw new SemanticException("Unable to parse unknown encoding: "
            + rowIdMapping.getEncoding());
    }
    return constText;
  }

  protected Range getConstantOpColumnRange(Class<? extends CompareOp> opClz, Text constText) {
    if (opClz.equals(Equal.class)) {
      // 100 == x
      return new Range(constText); // single row
    } else if (opClz.equals(GreaterThanOrEqual.class)) {
      // 100 >= x
      return new Range(null, constText); // neg-infinity to end inclusive
    } else if (opClz.equals(GreaterThan.class)) {
      // 100 > x
      return new Range(null, false, constText, false); // neg-infinity to end exclusive
    } else if (opClz.equals(LessThanOrEqual.class)) {
      // 100 <= x
      return new Range(constText, true, null, false); // start inclusive to infinity
    } else if (opClz.equals(LessThan.class)) {
      // 100 < x
      return new Range(constText, false, null, false); // start exclusive to infinity
    } else {
      throw new IllegalArgumentException("Could not process " + opClz);
    }
  }

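  /**
   * Builds the Range for a {@code column op constant} comparison, e.g. {@code rowid <= 'm'}
   * becomes the range (-inf, m] and {@code rowid > 'f'} becomes (f, +inf).
   */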
  protected Range getColumnOpConstantRange(Class<? extends CompareOp> opClz, Text constText) {
    if (opClz.equals(Equal.class)) {
      return new Range(constText); // start inclusive to end inclusive
    } else if (opClz.equals(GreaterThanOrEqual.class)) {
      return new Range(constText, null); // start inclusive to infinity inclusive
    } else if (opClz.equals(GreaterThan.class)) {
      return new Range(constText, false, null, false); // start exclusive to infinity inclusive
    } else if (opClz.equals(LessThanOrEqual.class)) {
      return new Range(null, false, constText, true); // neg-infinity to start inclusive
    } else if (opClz.equals(LessThan.class)) {
      return new Range(null, false, constText, false); // neg-infinity to start exclusive
    } else {
      throw new IllegalArgumentException("Could not process " + opClz);
    }
  }

  protected Object getIndexedRowIds(GenericUDF genericUdf, ExprNodeDesc leftHandNode,
      String columnName, ConstantObjectInspector objInspector) throws SemanticException {
    Text constText = getConstantText(objInspector);
    byte[] value = constText.toString().getBytes(UTF_8);
    byte[] encoded = AccumuloIndexLexicoder.encodeValue(value, leftHandNode.getTypeString(), true);
    Range range = getRange(genericUdf, leftHandNode, new Text(encoded));
    if (indexScanner != null) {
      return indexScanner.getIndexRowRanges(columnName, range);
    }
    return null;
  }

  protected Text getUtf8Value(ConstantObjectInspector objInspector) {
    // TODO is there a more correct way to get the literal value for the Object?
    return new Text(objInspector.getWritableConstantValue().toString());
  }

  /**
   * Attempts to construct the binary value from the given inspector. Falls back to UTF8 encoding
   * when the value cannot be coerced into binary.
   *
   * @return Binary value when possible, utf8 otherwise
   * @throws IOException
   */
  protected Text getBinaryValue(ConstantObjectInspector objInspector) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    if (objInspector instanceof PrimitiveObjectInspector) {
      LazyUtils.writePrimitive(out, objInspector.getWritableConstantValue(),
          (PrimitiveObjectInspector) objInspector);
    } else {
      return getUtf8Value(objInspector);
    }
    out.close();

    return new Text(out.toByteArray());
  }
}