/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.PTFOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.UDTFOperator;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.RowResolver;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PTFDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef;
import org.apache.hadoop.hive.ql.plan.ptf.PTFInputDef;
import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
import org.apache.hadoop.hive.ql.plan.ptf.ShapeDetails;
import org.apache.hadoop.hive.ql.plan.ptf.WindowFunctionDef;
import org.apache.hadoop.hive.ql.plan.ptf.WindowTableFunctionDef;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.fromColumnNames;
import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.lookupColumn;
import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.mergeFieldNodesWithDesc;
import static org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx.toColumnNames;
import static org.apache.hadoop.hive.ql.optimizer.FieldNode.mergeFieldNodes;
/**
* Factory for generating the different node processors used by ColumnPruner.
*/
public final class ColumnPrunerProcFactory {
protected static final Logger LOG = LoggerFactory.getLogger(ColumnPrunerProcFactory.class.getName());
private ColumnPrunerProcFactory() {
// prevent instantiation
}
/**
* Node Processor for Column Pruning on Filter Operators.
*/
public static class ColumnPrunerFilterProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
FilterOperator op = (FilterOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
ExprNodeDesc condn = op.getConf().getPredicate();
List<FieldNode> filterOpPrunedColLists = mergeFieldNodesWithDesc(cppCtx.genColLists(op), condn);
List<FieldNode> filterOpPrunedColListsOrderPreserved = preserveColumnOrder(op,
filterOpPrunedColLists);
cppCtx.getPrunedColLists().put(op,
filterOpPrunedColListsOrderPreserved);
pruneOperator(cppCtx, op, cppCtx.getPrunedColLists().get(op));
cppCtx.handleFilterUnionChildren(op);
return null;
}
}
/**
* Factory method to get the ColumnPrunerFilterProc class.
*
* @return ColumnPrunerFilterProc
*/
public static ColumnPrunerFilterProc getFilterProc() {
return new ColumnPrunerFilterProc();
}
/**
* Node Processor for Column Pruning on Group By Operators.
*/
public static class ColumnPrunerGroupByProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
GroupByOperator gbOp = (GroupByOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
List<FieldNode> colLists = new ArrayList<>();
GroupByDesc conf = gbOp.getConf();
ArrayList<ExprNodeDesc> keys = conf.getKeys();
for (ExprNodeDesc key : keys) {
colLists = mergeFieldNodesWithDesc(colLists, key);
}
ArrayList<AggregationDesc> aggrs = conf.getAggregators();
for (AggregationDesc aggr : aggrs) {
ArrayList<ExprNodeDesc> params = aggr.getParameters();
for (ExprNodeDesc param : params) {
colLists = mergeFieldNodesWithDesc(colLists, param);
}
}
int groupingSetPosition = conf.getGroupingSetPosition();
if (groupingSetPosition >= 0) {
List<FieldNode> neededCols = cppCtx.genColLists(gbOp);
String groupingColumn = conf.getOutputColumnNames().get(groupingSetPosition);
if (lookupColumn(neededCols, groupingColumn) == null) {
conf.getOutputColumnNames().remove(groupingSetPosition);
if (gbOp.getSchema() != null) {
gbOp.getSchema().getSignature().remove(groupingSetPosition);
}
}
}
// If the child has a different schema, we create a Project operator between them both,
// as we cannot prune the columns in the GroupBy operator
for (Operator<?> child : gbOp.getChildOperators()) {
if (child instanceof SelectOperator || child instanceof ReduceSinkOperator) {
continue;
}
List<FieldNode> colList = cppCtx.genColLists(gbOp, child);
Set<FieldNode> neededCols = new HashSet<>();
if (colList != null) {
neededCols.addAll(colList);
} else {
// colList will be null for FS operators.
continue;
}
if (neededCols.size() < gbOp.getSchema().getSignature().size()) {
ArrayList<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
ArrayList<String> outputColNames = new ArrayList<String>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ArrayList<ColumnInfo> outputRS = new ArrayList<ColumnInfo>();
for (ColumnInfo colInfo : gbOp.getSchema().getSignature()) {
if (lookupColumn(neededCols, colInfo.getInternalName()) == null) {
continue;
}
ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(),
colInfo.getInternalName(), colInfo.getTabAlias(), colInfo.getIsVirtualCol());
exprs.add(colDesc);
outputColNames.add(colInfo.getInternalName());
ColumnInfo newCol = new ColumnInfo(colInfo.getInternalName(), colInfo.getType(),
colInfo.getTabAlias(), colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol());
newCol.setAlias(colInfo.getAlias());
outputRS.add(newCol);
colExprMap.put(colInfo.getInternalName(), colDesc);
}
SelectDesc select = new SelectDesc(exprs, outputColNames, false);
gbOp.removeChild(child);
SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(
select, new RowSchema(outputRS), gbOp);
OperatorFactory.makeChild(sel, child);
sel.setColumnExprMap(colExprMap);
}
}
cppCtx.getPrunedColLists().put(gbOp, colLists);
return null;
}
}
/**
* Factory method to get the ColumnPrunerGroupByProc class.
*
* @return ColumnPrunerGroupByProc
*/
public static ColumnPrunerGroupByProc getGroupByProc() {
return new ColumnPrunerGroupByProc();
}
public static class ColumnPrunerScriptProc implements NodeProcessor {
@Override
@SuppressWarnings("unchecked")
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
RowSchema inputRS = op.getSchema();
List<FieldNode> prunedCols = cppCtx.getPrunedColList(op.getChildOperators()
.get(0));
Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
RowSchema parentRS = parent.getSchema();
List<ColumnInfo> sig = parentRS.getSignature();
List<FieldNode> colList = new ArrayList<>();
for (ColumnInfo cI : sig) {
colList.add(new FieldNode(cI.getInternalName()));
}
if (prunedCols.size() != inputRS.getSignature().size()
&& !(op.getChildOperators().get(0) instanceof SelectOperator)) {
ArrayList<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
ArrayList<String> outputs = new ArrayList<String>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ArrayList<ColumnInfo> outputRS = new ArrayList<ColumnInfo>();
for (FieldNode internalCol: prunedCols) {
String internalName = internalCol.getFieldName();
ColumnInfo valueInfo = inputRS.getColumnInfo(internalName);
ExprNodeDesc colDesc = new ExprNodeColumnDesc(valueInfo.getType(),
valueInfo.getInternalName(), valueInfo.getTabAlias(), valueInfo.getIsVirtualCol());
exprs.add(colDesc);
outputs.add(internalName);
ColumnInfo newCol = new ColumnInfo(internalName, valueInfo.getType(), valueInfo.getTabAlias(),
valueInfo.getIsVirtualCol(), valueInfo.isHiddenVirtualCol());
newCol.setAlias(valueInfo.getAlias());
outputRS.add(newCol);
colExprMap.put(internalName, colDesc);
}
SelectDesc select = new SelectDesc(exprs, outputs, false);
Operator<? extends OperatorDesc> child = op.getChildOperators().get(0);
op.removeChild(child);
SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(
select, new RowSchema(outputRS), op);
OperatorFactory.makeChild(sel, child);
sel.setColumnExprMap(colExprMap);
}
cppCtx.getPrunedColLists().put(op, colList);
return null;
}
}
public static class ColumnPrunerLimitProc extends ColumnPrunerDefaultProc {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
super.process(nd, stack, ctx, nodeOutputs);
List<FieldNode> cols = ((ColumnPrunerProcCtx) ctx).getPrunedColLists().get(nd);
if (null != cols) {
pruneOperator(ctx, (LimitOperator) nd, cols);
}
return null;
}
}
public static ColumnPrunerLimitProc getLimitProc() {
return new ColumnPrunerLimitProc();
}
public static ColumnPrunerScriptProc getScriptProc() {
return new ColumnPrunerScriptProc();
}
/**
* - Pruning can only be done for Windowing. PTFs are black boxes,
* we assume all columns are needed.
* - add column names referenced in WindowFn args and in WindowFn expressions
* to the pruned list of the child Select Op.
* - finally we set the prunedColList on the ColumnPrunerContx;
* and update the RR & signature on the PTFOp.
*/
public static class ColumnPrunerPTFProc extends ColumnPrunerScriptProc {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
PTFOperator op = (PTFOperator) nd;
PTFDesc conf = op.getConf();
//Since we cannot know what columns will be needed by a PTF chain,
//we do not prune columns on PTFOperator for PTF chains.
PartitionedTableFunctionDef funcDef = conf.getFuncDef();
List<String> referencedColumns = funcDef.getReferencedColumns();
if (!conf.forWindowing() && !conf.forNoop() && referencedColumns == null) {
return super.process(nd, stack, cppCtx, nodeOutputs);
}
List<FieldNode> prunedCols = cppCtx.getPrunedColList(op.getChildOperators().get(0));
if (conf.forWindowing()) {
WindowTableFunctionDef def = (WindowTableFunctionDef) funcDef;
prunedCols = mergeFieldNodes(prunedCols, getWindowFunctionColumns(def));
} else if (conf.forNoop()) {
prunedCols = new ArrayList(cppCtx.getPrunedColList(op.getChildOperators().get(0)));
} else {
prunedCols = fromColumnNames(referencedColumns);
}
List<ColumnInfo> newRS = prunedColumnsList(prunedCols, op.getSchema(), funcDef);
op.getSchema().setSignature(new ArrayList<ColumnInfo>(newRS));
ShapeDetails outputShape = funcDef.getStartOfChain().getInput().getOutputShape();
cppCtx.getPrunedColLists().put(op, fromColumnNames(outputShape.getColumnNames()));
return null;
}
private List<ColumnInfo> buildPrunedRS(List<FieldNode> prunedCols, RowSchema oldRS)
throws SemanticException {
ArrayList<ColumnInfo> sig = new ArrayList<ColumnInfo>();
HashSet<FieldNode> prunedColsSet = new HashSet<>(prunedCols);
for (ColumnInfo cInfo : oldRS.getSignature()) {
if (lookupColumn(prunedColsSet, cInfo.getInternalName()) != null) {
sig.add(cInfo);
}
}
return sig;
}
// always should be in this order (see PTFDeserializer#initializeWindowing)
private List<FieldNode> getWindowFunctionColumns(WindowTableFunctionDef tDef) {
List<FieldNode> columns = new ArrayList<>();
if (tDef.getWindowFunctions() != null) {
for (WindowFunctionDef wDef : tDef.getWindowFunctions()) {
columns.add(new FieldNode(wDef.getAlias()));
}
}
return columns;
}
private RowResolver buildPrunedRR(List<FieldNode> prunedCols, RowSchema oldRS) throws SemanticException {
RowResolver resolver = new RowResolver();
HashSet<FieldNode> prunedColsSet = new HashSet<>(prunedCols);
for (ColumnInfo cInfo : oldRS.getSignature()) {
if (lookupColumn(prunedColsSet, cInfo.getInternalName()) != null) {
resolver.put(cInfo.getTabAlias(), cInfo.getAlias(), cInfo);
}
}
return resolver;
}
/*
* add any input columns referenced in WindowFn args or expressions.
*/
private List<ColumnInfo> prunedColumnsList(List<FieldNode> prunedCols, RowSchema oldRS,
PartitionedTableFunctionDef pDef) throws SemanticException {
pDef.getOutputShape().setRr(null);
pDef.getOutputShape().setColumnNames(null);
if (pDef instanceof WindowTableFunctionDef) {
WindowTableFunctionDef tDef = (WindowTableFunctionDef) pDef;
if (tDef.getWindowFunctions() != null) {
for (WindowFunctionDef wDef : tDef.getWindowFunctions()) {
if (wDef.getArgs() == null) {
continue;
}
for (PTFExpressionDef arg : wDef.getArgs()) {
ExprNodeDesc exprNode = arg.getExprNode();
prunedCols = mergeFieldNodesWithDesc(prunedCols, exprNode);
}
}
}
if (tDef.getPartition() != null) {
for (PTFExpressionDef col : tDef.getPartition().getExpressions()) {
ExprNodeDesc exprNode = col.getExprNode();
prunedCols = mergeFieldNodesWithDesc(prunedCols, exprNode);
}
}
if (tDef.getOrder() != null) {
for (PTFExpressionDef col : tDef.getOrder().getExpressions()) {
ExprNodeDesc exprNode = col.getExprNode();
prunedCols = mergeFieldNodesWithDesc(prunedCols, exprNode);
}
}
} else {
pDef.getOutputShape().setRr(buildPrunedRR(prunedCols, oldRS));
}
PTFInputDef input = pDef.getInput();
if (input instanceof PartitionedTableFunctionDef) {
return prunedColumnsList(prunedCols, oldRS, (PartitionedTableFunctionDef)input);
}
ArrayList<FieldNode> inputColumns = prunedInputList(prunedCols, input);
input.getOutputShape().setRr(buildPrunedRR(inputColumns, oldRS));
input.getOutputShape().setColumnNames(toColumnNames(inputColumns));
return buildPrunedRS(prunedCols, oldRS);
}
/*
* from the prunedCols list filter out columns that refer to WindowFns or WindowExprs
* the returned list is set as the prunedList needed by the PTFOp.
*/
private ArrayList<FieldNode> prunedInputList(List<FieldNode> prunedCols, PTFInputDef tDef) {
ArrayList<FieldNode> prunedInputCols = new ArrayList<>();
StructObjectInspector OI = tDef.getOutputShape().getOI();
for(StructField f : OI.getAllStructFieldRefs()) {
String fName = f.getFieldName();
FieldNode fn = lookupColumn(prunedCols, fName);
if (fn != null) {
prunedInputCols.add(fn);
}
}
return prunedInputCols;
}
}
/**
* Factory method to get the ColumnPrunerGroupByProc class.
*
* @return ColumnPrunerGroupByProc
*/
public static ColumnPrunerPTFProc getPTFProc() {
return new ColumnPrunerPTFProc();
}
/**
* The Default Node Processor for Column Pruning.
*/
public static class ColumnPrunerDefaultProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
cppCtx.getPrunedColLists().put((Operator<? extends OperatorDesc>) nd,
cppCtx.genColLists((Operator<? extends OperatorDesc>) nd));
return null;
}
}
/**
* Factory method to get the ColumnPrunerDefaultProc class.
*
* @return ColumnPrunerDefaultProc
*/
public static ColumnPrunerDefaultProc getDefaultProc() {
return new ColumnPrunerDefaultProc();
}
/**
* The Node Processor for Column Pruning on Table Scan Operators. It will
* store needed columns in tableScanDesc.
*/
public static class ColumnPrunerTableScanProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
TableScanOperator scanOp = (TableScanOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
List<FieldNode> cols = cppCtx
.genColLists((Operator<? extends OperatorDesc>) nd);
if (cols == null && !scanOp.getConf().isGatherStats() ) {
scanOp.setNeededColumnIDs(null);
return null;
}
cols = cols == null ? new ArrayList<FieldNode>() : cols;
cppCtx.getPrunedColLists().put((Operator<? extends OperatorDesc>) nd, cols);
RowSchema inputRS = scanOp.getSchema();
setupNeededColumns(scanOp, inputRS, cols);
return null;
}
}
/** Sets up needed columns for TSOP. Mainly, transfers column names from input
* RowSchema as well as the needed virtual columns, into TableScanDesc.
*/
public static void setupNeededColumns(TableScanOperator scanOp, RowSchema inputRS,
List<FieldNode> cols) throws SemanticException {
List<Integer> neededColumnIds = new ArrayList<Integer>();
List<String> neededColumnNames = new ArrayList<String>();
List<String> neededNestedColumnPaths = new ArrayList<>();
List<String> referencedColumnNames = new ArrayList<String>();
TableScanDesc desc = scanOp.getConf();
List<VirtualColumn> virtualCols = desc.getVirtualCols();
List<VirtualColumn> newVirtualCols = new ArrayList<VirtualColumn>();
// add virtual columns for ANALYZE TABLE
if(scanOp.getConf().isGatherStats()) {
cols.add(new FieldNode(VirtualColumn.RAWDATASIZE.getName()));
}
for (FieldNode fn : cols) {
String column = fn.getFieldName();
ColumnInfo colInfo = inputRS.getColumnInfo(column);
if (colInfo == null) {
continue;
}
referencedColumnNames.add(column);
if (colInfo.getIsVirtualCol()) {
// part is also a virtual column, but part col should not in this
// list.
for (int j = 0; j < virtualCols.size(); j++) {
VirtualColumn vc = virtualCols.get(j);
if (vc.getName().equals(colInfo.getInternalName())) {
newVirtualCols.add(vc);
}
}
//no need to pass virtual columns to reader.
continue;
}
int position = inputRS.getPosition(column);
if (position >= 0) {
// get the needed columns by id and name
neededColumnIds.add(position);
neededColumnNames.add(column);
neededNestedColumnPaths.addAll(fn.toPaths());
}
}
desc.setVirtualCols(newVirtualCols);
scanOp.setNeededColumnIDs(neededColumnIds);
scanOp.setNeededColumns(neededColumnNames);
scanOp.setNeededNestedColumnPaths(neededNestedColumnPaths);
scanOp.setReferencedColumns(referencedColumnNames);
}
/**
* Factory method to get the ColumnPrunerDefaultProc class.
*
* @return ColumnPrunerTableScanProc
*/
public static ColumnPrunerTableScanProc getTableScanProc() {
return new ColumnPrunerTableScanProc();
}
/**
* The Node Processor for Column Pruning on Reduce Sink Operators.
*/
public static class ColumnPrunerReduceSinkProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
ReduceSinkOperator op = (ReduceSinkOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
ReduceSinkDesc conf = op.getConf();
List<FieldNode> colLists = new ArrayList<>();
ArrayList<ExprNodeDesc> keys = conf.getKeyCols();
LOG.debug("Reduce Sink Operator " + op.getIdentifier() + " key:" + keys);
for (ExprNodeDesc key : keys) {
colLists = mergeFieldNodesWithDesc(colLists, key);
}
for (ExprNodeDesc key : conf.getPartitionCols()) {
colLists = mergeFieldNodesWithDesc(colLists, key);
}
assert op.getNumChild() == 1;
Operator<? extends OperatorDesc> child = op.getChildOperators().get(0);
List<FieldNode> childCols = null;
if (child instanceof CommonJoinOperator) {
childCols = cppCtx.getJoinPrunedColLists().get(child) == null
? null : cppCtx.getJoinPrunedColLists().get(child)
.get((byte) conf.getTag());
} else {
childCols = cppCtx.getPrunedColList(child);
}
List<ExprNodeDesc> valCols = conf.getValueCols();
List<String> valColNames = conf.getOutputValueColumnNames();
if (childCols != null) {
boolean[] flags = new boolean[valCols.size()];
for (FieldNode childCol : childCols) {
int index = valColNames.indexOf(Utilities.removeValueTag(childCol.getFieldName()));
if (index < 0) {
continue;
}
flags[index] = true;
colLists = mergeFieldNodesWithDesc(colLists, valCols.get(index));
}
Collections.sort(colLists, new Comparator<FieldNode>() {
@Override
public int compare(FieldNode o1, FieldNode o2) {
return o1.getFieldName().compareTo(o2.getFieldName());
}
});
pruneReduceSinkOperator(flags, op, cppCtx);
cppCtx.getPrunedColLists().put(op, colLists);
return null;
}
// Reduce Sink contains the columns needed - no need to aggregate from
// children
for (ExprNodeDesc val : valCols) {
colLists = mergeFieldNodesWithDesc(colLists, val);
}
cppCtx.getPrunedColLists().put(op, colLists);
return null;
}
}
/**
* The Factory method to get ColumnPrunerReduceSinkProc class.
*
* @return ColumnPrunerReduceSinkProc
*/
public static ColumnPrunerReduceSinkProc getReduceSinkProc() {
return new ColumnPrunerReduceSinkProc();
}
/**
* The Node Processor for Column Pruning on Lateral View Join Operators.
*/
public static class ColumnPrunerLateralViewJoinProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
LateralViewJoinOperator op = (LateralViewJoinOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
List<FieldNode> cols = cppCtx.genColLists(op);
if (cols == null) {
return null;
}
Map<String, ExprNodeDesc> colExprMap = op.getColumnExprMap();
// As columns go down the DAG, the LVJ will transform internal column
// names from something like 'key' to '_col0'. Because of this, we need
// to undo this transformation using the column expression map as the
// column names propagate up the DAG.
// this is SEL(*) cols + UDTF cols
List<String> outputCols = op.getConf().getOutputInternalColNames();
// cause we cannot prune columns from UDTF branch currently, extract
// columns from SEL(*) branch only and append all columns from UDTF branch to it
int numSelColumns = op.getConf().getNumSelColumns();
List<FieldNode> colsAfterReplacement = new ArrayList<>();
List<FieldNode> newCols = new ArrayList<>();
for (int index = 0; index < numSelColumns; index++) {
String colName = outputCols.get(index);
FieldNode col = lookupColumn(cols, colName);
// colExprMap.size() == size of cols from SEL(*) branch
if (col != null) {
ExprNodeDesc transformed = colExprMap.get(col.getFieldName());
colsAfterReplacement = mergeFieldNodesWithDesc(colsAfterReplacement, transformed);
newCols.add(col);
}
}
// update number of columns from sel(*)
op.getConf().setNumSelColumns(newCols.size());
// add all UDTF columns
// following SEL will do CP for columns from UDTF, not adding SEL in here
newCols.addAll(fromColumnNames(outputCols.subList(numSelColumns, outputCols.size())));
op.getConf().setOutputInternalColNames(toColumnNames(newCols));
pruneOperator(ctx, op, newCols);
cppCtx.getPrunedColLists().put(op, colsAfterReplacement);
return null;
}
}
/**
* The Node Processor for Column Pruning on Lateral View Forward Operators.
*/
public static class ColumnPrunerLateralViewForwardProc extends ColumnPrunerDefaultProc {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
super.process(nd, stack, ctx, nodeOutputs);
LateralViewForwardOperator op = (LateralViewForwardOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
// get the SEL(*) branch
Operator<?> select = op.getChildOperators().get(LateralViewJoinOperator.SELECT_TAG);
// Update the info of SEL operator based on the pruned reordered columns
// these are from ColumnPrunerSelectProc
List<FieldNode> cols = cppCtx.getPrunedColList(select);
RowSchema rs = op.getSchema();
ArrayList<ExprNodeDesc> colList = new ArrayList<>();
List<FieldNode> outputCols = new ArrayList<>();
for (ColumnInfo colInfo : rs.getSignature()) {
FieldNode col = lookupColumn(cols, colInfo.getInternalName());
if (col != null) {
// revert output cols of SEL(*) to ExprNodeColumnDesc
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(colInfo);
colList.add(colExpr);
outputCols.add(col);
}
}
// replace SEL(*) to SEL(exprs)
((SelectDesc)select.getConf()).setSelStarNoCompute(false);
((SelectDesc)select.getConf()).setColList(colList);
((SelectDesc)select.getConf()).setOutputColumnNames(toColumnNames(outputCols));
pruneOperator(ctx, select, outputCols);
Operator<?> udtfPath = op.getChildOperators().get(LateralViewJoinOperator.UDTF_TAG);
List<FieldNode> lvFCols = new ArrayList<>(cppCtx.getPrunedColLists().get(udtfPath));
lvFCols = mergeFieldNodes(lvFCols, outputCols);
pruneOperator(ctx, op, lvFCols);
return null;
}
}
/**
* The Node Processor for Column Pruning on Select Operators.
*/
public static class ColumnPrunerSelectProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
SelectOperator op = (SelectOperator) nd;
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
if (op.getChildOperators() != null) {
for (Operator<? extends OperatorDesc> child : op.getChildOperators()) {
// UDTF is not handled yet, so the parent SelectOp of UDTF should just assume
// all columns.
if ((child instanceof UDTFOperator)) {
cppCtx.getPrunedColLists()
.put(op, cppCtx.getColsFromSelectExpr(op));
return null;
}
}
}
LateralViewJoinOperator lvJoin = null;
if (op.getConf().isSelStarNoCompute()) {
assert op.getNumChild() == 1;
Operator<? extends OperatorDesc> child = op.getChildOperators().get(0);
if (child instanceof LateralViewJoinOperator) { // this SEL is SEL(*)
// for LV
lvJoin = (LateralViewJoinOperator) child;
}
}
List<FieldNode> cols = cppCtx.genColLists(op);
SelectDesc conf = op.getConf();
if (lvJoin != null) {
// get columns for SEL(*) from LVJ
if (cols != null) {
RowSchema rs = op.getSchema();
cppCtx.getPrunedColLists().put(op,
cppCtx.getSelectColsFromLVJoin(rs, cols));
}
return null;
}
// The input to the select does not matter. Go over the expressions
// and return the ones which have a marked column
cppCtx.getPrunedColLists().put(op,
cppCtx.getSelectColsFromChildren(op, cols));
if (cols == null || conf.isSelStarNoCompute()) {
return null;
}
// do we need to prune the select operator?
List<ExprNodeDesc> originalColList = op.getConf().getColList();
// by now, 'prunedCols' are columns used by child operators, and 'columns'
// are columns used by this select operator.
List<String> originalOutputColumnNames = conf.getOutputColumnNames();
// get view column authorization.
if (cppCtx.getParseContext().getColumnAccessInfo() != null
&& cppCtx.getParseContext().getViewProjectToTableSchema() != null
&& cppCtx.getParseContext().getViewProjectToTableSchema().containsKey(op)) {
for (FieldNode col : cols) {
int index = originalOutputColumnNames.indexOf(col.getFieldName());
Table tab = cppCtx.getParseContext().getViewProjectToTableSchema().get(op);
cppCtx.getParseContext().getColumnAccessInfo()
.add(tab.getCompleteName(), tab.getCols().get(index).getName());
}
}
if (cols.size() < originalOutputColumnNames.size()) {
ArrayList<ExprNodeDesc> newColList = new ArrayList<ExprNodeDesc>();
ArrayList<String> newOutputColumnNames = new ArrayList<String>();
ArrayList<ColumnInfo> rs_oldsignature = op.getSchema().getSignature();
ArrayList<ColumnInfo> rs_newsignature = new ArrayList<ColumnInfo>();
// The pruning needs to preserve the order of columns in the input schema
Set<String> colNames = new HashSet<String>();
for (FieldNode col : cols) {
colNames.add(col.getFieldName());
}
for (int i = 0; i < originalOutputColumnNames.size(); i++) {
String colName = originalOutputColumnNames.get(i);
if (colNames.contains(colName)) {
newOutputColumnNames.add(colName);
newColList.add(originalColList.get(i));
rs_newsignature.add(rs_oldsignature.get(i));
}
}
op.getSchema().setSignature(rs_newsignature);
conf.setColList(newColList);
conf.setOutputColumnNames(newOutputColumnNames);
handleChildren(op, toColumnNames(cols), cppCtx);
}
return null;
}
/**
* since we pruned the select operator, we should let its children operator
* know that. ReduceSinkOperator may send out every output columns of its
* parent select. When the select operator is pruned, its child reduce
* sink(direct child) operator should also be pruned.
*
* @param op
* @param retainedSelOutputCols
* @throws SemanticException
*/
private void handleChildren(SelectOperator op,
List<String> retainedSelOutputCols, ColumnPrunerProcCtx cppCtx) throws SemanticException {
for (Operator<? extends OperatorDesc> child : op.getChildOperators()) {
if (child instanceof ReduceSinkOperator) {
boolean[] flags = getPruneReduceSinkOpRetainFlags(
retainedSelOutputCols, (ReduceSinkOperator) child);
pruneReduceSinkOperator(flags, (ReduceSinkOperator) child, cppCtx);
} else if (child instanceof FilterOperator) {
// filter operator has the same output columns as its parent
for (Operator<? extends OperatorDesc> filterChild : child
.getChildOperators()) {
if (filterChild instanceof ReduceSinkOperator) {
boolean[] flags = getPruneReduceSinkOpRetainFlags(
retainedSelOutputCols, (ReduceSinkOperator) filterChild);
pruneReduceSinkOperator(flags, (ReduceSinkOperator) filterChild,
cppCtx);
}
}
}
}
}
}
private static boolean[] getPruneReduceSinkOpRetainFlags(
List<String> retainedParentOpOutputCols, ReduceSinkOperator reduce) {
ReduceSinkDesc reduceConf = reduce.getConf();
java.util.ArrayList<ExprNodeDesc> originalValueEval = reduceConf
.getValueCols();
boolean[] flags = new boolean[originalValueEval.size()];
for (int i = 0; i < originalValueEval.size(); i++) {
flags[i] = false;
List<String> current = originalValueEval.get(i).getCols();
if (current == null || current.size() == 0) {
flags[i] = true;
} else {
for (int j = 0; j < current.size(); j++) {
if (retainedParentOpOutputCols.contains(current.get(j))) {
flags[i] = true;
break;
}
}
}
}
return flags;
}
private static void pruneReduceSinkOperator(boolean[] retainFlags,
ReduceSinkOperator reduce, ColumnPrunerProcCtx cppCtx) throws SemanticException {
ReduceSinkDesc reduceConf = reduce.getConf();
Map<String, ExprNodeDesc> oldMap = reduce.getColumnExprMap();
LOG.info("RS " + reduce.getIdentifier() + " oldColExprMap: " + oldMap);
RowSchema oldRS = reduce.getSchema();
ArrayList<ColumnInfo> old_signature = oldRS.getSignature();
ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>(old_signature);
List<String> valueColNames = reduceConf.getOutputValueColumnNames();
ArrayList<String> newValueColNames = new ArrayList<String>();
List<ExprNodeDesc> keyExprs = reduceConf.getKeyCols();
List<ExprNodeDesc> valueExprs = reduceConf.getValueCols();
ArrayList<ExprNodeDesc> newValueExprs = new ArrayList<ExprNodeDesc>();
for (int i = 0; i < retainFlags.length; i++) {
String outputCol = valueColNames.get(i);
ExprNodeDesc outputColExpr = valueExprs.get(i);
if (!retainFlags[i]) {
ColumnInfo colInfo = oldRS.getColumnInfo(outputCol);
if (colInfo == null) {
outputCol = Utilities.ReduceField.VALUE.toString() + "." + outputCol;
colInfo = oldRS.getColumnInfo(outputCol);
}
// In case there are multiple columns referenced to the same column name, we won't
// do row resolve once more because the ColumnInfo in row resolver is already removed
if (colInfo == null) {
continue;
}
// Only remove information of a column if it is not a key,
// i.e. this column is not appearing in keyExprs of the RS
if (ExprNodeDescUtils.indexOf(outputColExpr, keyExprs) == -1) {
oldMap.remove(outputCol);
signature.remove(colInfo);
}
} else {
newValueColNames.add(outputCol);
newValueExprs.add(outputColExpr);
}
}
oldRS.setSignature(signature);
reduce.getSchema().setSignature(signature);
reduceConf.setOutputValueColumnNames(newValueColNames);
reduceConf.setValueCols(newValueExprs);
TableDesc newValueTable = PlanUtils.getReduceValueTableDesc(PlanUtils
.getFieldSchemasFromColumnList(reduceConf.getValueCols(),
newValueColNames, 0, ""));
reduceConf.setValueSerializeInfo(newValueTable);
LOG.info("RS " + reduce.getIdentifier() + " newColExprMap: " + oldMap);
}
/**
* The Factory method to get the ColumnPrunerSelectProc class.
*
* @return ColumnPrunerSelectProc
*/
public static ColumnPrunerSelectProc getSelectProc() {
return new ColumnPrunerSelectProc();
}
public static ColumnPrunerLateralViewJoinProc getLateralViewJoinProc() {
return new ColumnPrunerLateralViewJoinProc();
}
public static ColumnPrunerLateralViewForwardProc getLateralViewForwardProc() {
return new ColumnPrunerLateralViewForwardProc();
}
/**
* The Node Processor for Column Pruning on Join Operators.
*/
public static class ColumnPrunerJoinProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
JoinOperator op = (JoinOperator) nd;
pruneJoinOperator(ctx, op, op.getConf(), op.getColumnExprMap(), null,
false);
return null;
}
}
/**
* The Factory method to get ColumnJoinProc class.
*
* @return ColumnPrunerJoinProc
*/
public static ColumnPrunerJoinProc getJoinProc() {
return new ColumnPrunerJoinProc();
}
/**
* The Node Processor for Column Pruning on Map Join Operators.
*/
public static class ColumnPrunerMapJoinProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
Object... nodeOutputs) throws SemanticException {
AbstractMapJoinOperator<MapJoinDesc> op = (AbstractMapJoinOperator<MapJoinDesc>) nd;
pruneJoinOperator(ctx, op, op.getConf(), op.getColumnExprMap(), op
.getConf().getRetainList(), true);
return null;
}
}
/**
* The Factory method to get UnionProc class.
*
* @return UnionProc
*/
public static ColumnPrunerUnionProc getUnionProc() {
return new ColumnPrunerUnionProc();
}
/**
* The Node Processor for Column Pruning on Union Operators.
*/
public static class ColumnPrunerUnionProc implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs)
throws SemanticException {
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
UnionOperator op = (UnionOperator) nd;
List<FieldNode> childColLists = cppCtx.genColLists(op);
if (childColLists == null) {
return null;
}
RowSchema inputSchema = op.getSchema();
if (inputSchema != null) {
List<FieldNode> prunedCols = new ArrayList<>();
for (int index = 0; index < inputSchema.getSignature().size(); index++) {
ColumnInfo colInfo = inputSchema.getSignature().get(index);
FieldNode fn = lookupColumn(childColLists, colInfo.getInternalName());
if (fn != null) {
prunedCols.add(fn);
}
}
cppCtx.getPrunedColLists().put(op, prunedCols);
}
return null;
}
}
private static void pruneOperator(NodeProcessorCtx ctx,
Operator<? extends OperatorDesc> op,
List<FieldNode> cols)
throws SemanticException {
// the pruning needs to preserve the order of columns in the input schema
RowSchema inputSchema = op.getSchema();
if (inputSchema != null) {
ArrayList<ColumnInfo> rs = new ArrayList<ColumnInfo>();
RowSchema oldRS = op.getSchema();
for(ColumnInfo i : oldRS.getSignature()) {
if (lookupColumn(cols, i.getInternalName()) != null) {
rs.add(i);
}
}
op.getSchema().setSignature(rs);
}
}
/**
* The pruning needs to preserve the order of columns in the input schema
* @param op
* @param cols
* @return
* @throws SemanticException
*/
private static List<FieldNode> preserveColumnOrder(Operator<? extends OperatorDesc> op,
List<FieldNode> cols)
throws SemanticException {
RowSchema inputSchema = op.getSchema();
if (inputSchema != null) {
ArrayList<FieldNode> rs = new ArrayList<>();
ArrayList<ColumnInfo> inputCols = inputSchema.getSignature();
for (ColumnInfo i: inputCols) {
FieldNode fn = lookupColumn(cols, i.getInternalName());
if (fn != null) {
rs.add(fn);
}
}
return rs;
} else {
return cols;
}
}
private static void pruneJoinOperator(NodeProcessorCtx ctx,
CommonJoinOperator op, JoinDesc conf,
Map<String, ExprNodeDesc> columnExprMap,
Map<Byte, List<Integer>> retainMap, boolean mapJoin) throws SemanticException {
ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
List<Operator<? extends OperatorDesc>> childOperators = op
.getChildOperators();
LOG.info("JOIN " + op.getIdentifier() + " oldExprs: " + conf.getExprs());
if (cppCtx.genColLists(op) == null) {
return;
}
List<FieldNode> neededColList = new ArrayList<>(cppCtx.genColLists(op));
Map<Byte, List<FieldNode>> prunedColLists = new HashMap<>();
for (byte tag : conf.getTagOrder()) {
prunedColLists.put(tag, new ArrayList<FieldNode>());
}
//add the columns in join filters
Set<Map.Entry<Byte, List<ExprNodeDesc>>> filters =
conf.getFilters().entrySet();
Iterator<Map.Entry<Byte, List<ExprNodeDesc>>> iter = filters.iterator();
while (iter.hasNext()) {
Map.Entry<Byte, List<ExprNodeDesc>> entry = iter.next();
Byte tag = entry.getKey();
for (ExprNodeDesc desc : entry.getValue()) {
List<FieldNode> cols = prunedColLists.get(tag);
cols = mergeFieldNodesWithDesc(cols, desc);
prunedColLists.put(tag, cols);
}
}
//add the columns in residual filters
if (conf.getResidualFilterExprs() != null) {
for (ExprNodeDesc desc : conf.getResidualFilterExprs()) {
neededColList = mergeFieldNodesWithDesc(neededColList, desc);
}
}
RowSchema joinRS = op.getSchema();
ArrayList<String> outputCols = new ArrayList<String>();
ArrayList<ColumnInfo> rs = new ArrayList<ColumnInfo>();
Map<String, ExprNodeDesc> newColExprMap = new HashMap<String, ExprNodeDesc>();
for (int i = 0; i < conf.getOutputColumnNames().size(); i++) {
String internalName = conf.getOutputColumnNames().get(i);
ExprNodeDesc desc = columnExprMap.get(internalName);
Byte tag = conf.getReversedExprs().get(internalName);
if (lookupColumn(neededColList, internalName) == null) {
int index = conf.getExprs().get(tag).indexOf(desc);
if (index < 0) {
continue;
}
conf.getExprs().get(tag).remove(desc);
if (retainMap != null) {
retainMap.get(tag).remove(index);
}
} else {
List<FieldNode> prunedRSList = prunedColLists.get(tag);
if (prunedRSList == null) {
prunedRSList = new ArrayList<>();
prunedColLists.put(tag, prunedRSList);
}
prunedColLists.put(tag, mergeFieldNodesWithDesc(prunedRSList, desc));
outputCols.add(internalName);
newColExprMap.put(internalName, desc);
}
}
if (mapJoin) {
// regenerate the valueTableDesc
List<TableDesc> valueTableDescs = new ArrayList<TableDesc>();
for (int pos = 0; pos < op.getParentOperators().size(); pos++) {
List<ExprNodeDesc> valueCols = conf.getExprs()
.get(Byte.valueOf((byte) pos));
StringBuilder keyOrder = new StringBuilder();
for (int i = 0; i < valueCols.size(); i++) {
keyOrder.append("+");
}
TableDesc valueTableDesc = PlanUtils.getMapJoinValueTableDesc(PlanUtils
.getFieldSchemasFromColumnList(valueCols, "mapjoinvalue"));
valueTableDescs.add(valueTableDesc);
}
((MapJoinDesc) conf).setValueTblDescs(valueTableDescs);
Set<Map.Entry<Byte, List<ExprNodeDesc>>> exprs = ((MapJoinDesc) conf)
.getKeys().entrySet();
Iterator<Map.Entry<Byte, List<ExprNodeDesc>>> iters = exprs.iterator();
while (iters.hasNext()) {
Map.Entry<Byte, List<ExprNodeDesc>> entry = iters.next();
List<ExprNodeDesc> lists = entry.getValue();
for (int j = 0; j < lists.size(); j++) {
ExprNodeDesc desc = lists.get(j);
Byte tag = entry.getKey();
List<FieldNode> cols = prunedColLists.get(tag);
cols = mergeFieldNodesWithDesc(cols, desc);
prunedColLists.put(tag, cols);
}
}
}
for (Operator<? extends OperatorDesc> child : childOperators) {
if (child instanceof ReduceSinkOperator) {
boolean[] flags = getPruneReduceSinkOpRetainFlags(toColumnNames(neededColList),
(ReduceSinkOperator) child);
pruneReduceSinkOperator(flags, (ReduceSinkOperator) child, cppCtx);
}
}
for (int i = 0; i < outputCols.size(); i++) {
String internalName = outputCols.get(i);
ColumnInfo col = joinRS.getColumnInfo(internalName);
rs.add(col);
}
LOG.info("JOIN " + op.getIdentifier() + " newExprs: " + conf.getExprs());
op.setColumnExprMap(newColExprMap);
conf.setOutputColumnNames(outputCols);
op.getSchema().setSignature(rs);
cppCtx.getJoinPrunedColLists().put(op, prunedColLists);
}
/**
* The Factory method to get ColumnMapJoinProc class.
*
* @return ColumnPrunerMapJoinProc
*/
public static ColumnPrunerMapJoinProc getMapJoinProc() {
return new ColumnPrunerMapJoinProc();
}
}