/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.lineage;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyType;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.Predicate;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* Expression processor factory for lineage. Each processor is responsible to
* create the leaf level column info objects that the expression depends upon
* and also generates a string representation of the expression.
*/
public class ExprProcFactory {
/**
* Processor for column expressions.
*/
/**
 * Processor for column expressions. Resolves the referenced column in the
 * input operator's row schema and returns the dependency already recorded
 * for that column in the lineage index.
 */
public static class ColumnExprProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) nd;
    ExprProcCtx ctx = (ExprProcCtx) procCtx;

    // Table scans have no expressions attached, so the input operator of an
    // expression being walked must always be present.
    Operator<? extends OperatorDesc> inputOp = ctx.getInputOperator();
    assert (inputOp != null);

    // Look the column up in the input schema. Columns coming out of a
    // reduce sink may carry a value tag that must be stripped before the
    // lookup can succeed.
    RowSchema rowSchema = ctx.getSchema();
    ColumnInfo colInfo = rowSchema.getColumnInfo(colDesc.getColumn());
    if (colInfo == null && inputOp instanceof ReduceSinkOperator) {
      colInfo = rowSchema.getColumnInfo(Utilities.removeValueTag(colDesc.getColumn()));
    }

    // Forward the dependency recorded for (inputOp, colInfo) in the index.
    return ctx.getLineageCtx().getIndex().getDependency(inputOp, colInfo);
  }
}
/**
* Processor for any function or field expression.
*/
/**
 * Processor for any function or field expression. The produced dependency is
 * the union of all child base columns, with the dependency type folded over
 * the children starting from EXPRESSION.
 */
public static class GenericExprProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    assert (nd instanceof ExprNodeGenericFuncDesc || nd instanceof ExprNodeFieldDesc);

    // Accumulate the base columns and combined type of every non-null child.
    LinkedHashSet<BaseColumnInfo> mergedBaseCols = new LinkedHashSet<BaseColumnInfo>();
    LineageInfo.DependencyType mergedType = LineageInfo.DependencyType.EXPRESSION;
    for (Object childOutput : nodeOutputs) {
      if (childOutput == null) {
        continue;
      }
      Dependency childDep = (Dependency) childOutput;
      mergedType = LineageCtx.getNewDependencyType(childDep.getType(), mergedType);
      mergedBaseCols.addAll(childDep.getBaseCols());
    }

    Dependency result = new Dependency();
    result.setBaseCols(mergedBaseCols);
    result.setType(mergedType);
    return result;
  }
}
/**
* Processor for constants and null expressions. For such expressions the
* processor simply returns a null dependency vector.
*/
/**
 * Processor for constants and null expressions. Such expressions depend on
 * no base columns, so a SIMPLE dependency with an empty base-column set is
 * returned.
 */
public static class DefaultExprProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    assert (nd instanceof ExprNodeConstantDesc);

    // A constant contributes no lineage of its own.
    Dependency constDep = new Dependency();
    constDep.setType(LineageInfo.DependencyType.SIMPLE);
    constDep.setBaseCols(new LinkedHashSet<BaseColumnInfo>());
    return constDep;
  }
}
/** Returns the processor used for constant and null expressions. */
public static NodeProcessor getDefaultExprProcessor() {
return new DefaultExprProcessor();
}
/** Returns the processor used for generic function expressions. */
public static NodeProcessor getGenericFuncProcessor() {
return new GenericExprProcessor();
}
/** Returns the processor used for field expressions (shares the generic function processor). */
public static NodeProcessor getFieldProcessor() {
return new GenericExprProcessor();
}
/** Returns the processor used for column reference expressions. */
public static NodeProcessor getColumnProcessor() {
return new ColumnExprProcessor();
}
/**
 * Searches the query's table scans for a source column matching the given
 * table alias (or table name) and column alias. On a match the corresponding
 * base column is added to the predicate's base-column set.
 *
 * @param lctx lineage context providing access to the parse context.
 * @param cond predicate whose base columns are extended on a match.
 * @param tabAlias table alias or table name to match against each scan.
 * @param alias column name to match within the matched table.
 * @return true iff a matching source column was found and recorded.
 */
private static boolean findSourceColumn(
    LineageCtx lctx, Predicate cond, String tabAlias, String alias) {
  for (Map.Entry<String, TableScanOperator> entry : lctx.getParseCtx().getTopOps().entrySet()) {
    TableScanOperator scanOp = entry.getValue();
    Table table = scanOp.getConf().getTableMetadata();

    // Accept either the physical table name or the scan's alias.
    boolean tableMatches = table.getTableName().equals(tabAlias)
        || tabAlias.equals(scanOp.getConf().getAlias());
    if (!tableMatches) {
      continue;
    }

    for (FieldSchema field : table.getCols()) {
      if (!field.getName().equals(alias)) {
        continue;
      }
      // Record the resolved (table, column) pair on the predicate.
      TableAliasInfo aliasInfo = new TableAliasInfo();
      aliasInfo.setTable(table.getTTable());
      aliasInfo.setAlias(tabAlias);
      BaseColumnInfo baseCol = new BaseColumnInfo();
      baseCol.setColumn(field);
      baseCol.setTabAlias(aliasInfo);
      cond.getBaseCols().add(baseCol);
      return true;
    }
  }
  return false;
}
/**
* Get the expression string of an expression node.
*/
/**
 * Get the expression string of an expression node.
 *
 * For column references this resolves the best human-readable alias by
 * consulting, in order: the operator's row schema, the lineage index (for
 * internal/system aliases), and finally the input operator's schema. When a
 * predicate is supplied, the base columns the expression depends on are also
 * collected into it. Function expressions recurse over their children and
 * render through the UDF's display string; any other expression falls back
 * to its own string form.
 *
 * @param rs row schema used to translate internal names to aliases.
 * @param expr the expression to render.
 * @param lctx lineage context providing the dependency index.
 * @param inpOp the operator the expression's columns resolve against.
 * @param cond optional predicate collecting the expression's base columns;
 *     may be null when no predicate lineage is being tracked.
 */
public static String getExprString(RowSchema rs, ExprNodeDesc expr,
LineageCtx lctx, Operator<? extends OperatorDesc> inpOp, Predicate cond) {
if (expr instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc col = (ExprNodeColumnDesc) expr;
String internalName = col.getColumn();
String alias = internalName;
String tabAlias = col.getTabAlias();
// Prefer the user-visible alias/table alias recorded in the row schema.
ColumnInfo ci = rs.getColumnInfo(internalName);
if (ci != null) {
if (ci.getAlias() != null) {
alias = ci.getAlias();
}
if (ci.getTabAlias() != null) {
tabAlias = ci.getTabAlias();
}
}
Dependency dep = lctx.getIndex().getDependency(inpOp, internalName);
// Internal aliases ("_..."/"$...") are meaningless to users; for a SIMPLE
// dependency, substitute the first base column's real table/column name.
if ((tabAlias == null || tabAlias.startsWith("_") || tabAlias.startsWith("$"))
&& (dep != null && dep.getType() == DependencyType.SIMPLE)) {
Set<BaseColumnInfo> baseCols = dep.getBaseCols();
if (baseCols != null && !baseCols.isEmpty()) {
BaseColumnInfo baseCol = baseCols.iterator().next();
tabAlias = baseCol.getTabAlias().getAlias();
alias = baseCol.getColumn().getName();
}
}
// A real (non-internal) table alias: render "tab.col". If the pair cannot
// be traced to a source column, fall back to the dependency's base cols.
if (tabAlias != null && tabAlias.length() > 0
&& !tabAlias.startsWith("_") && !tabAlias.startsWith("$")) {
if (cond != null && !findSourceColumn(lctx, cond, tabAlias, alias) && dep != null) {
cond.getBaseCols().addAll(dep.getBaseCols());
}
return tabAlias + "." + alias;
}
if (dep != null) {
if (cond != null) {
cond.getBaseCols().addAll(dep.getBaseCols());
}
// The index may already carry a rendered expression for this column.
if (dep.getExpr() != null) {
return dep.getExpr();
}
}
// Last resort: try the input operator's own schema for a readable alias.
if (alias.startsWith("_")) {
ci = inpOp.getSchema().getColumnInfo(internalName);
if (ci != null && ci.getAlias() != null) {
alias = ci.getAlias();
}
}
return alias;
} else if (expr instanceof ExprNodeGenericFuncDesc) {
// Render children recursively, then let the UDF format the call.
ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) expr;
List<ExprNodeDesc> children = func.getChildren();
String[] childrenExprStrings = new String[children.size()];
for (int i = 0; i < childrenExprStrings.length; i++) {
childrenExprStrings[i] = getExprString(rs, children.get(i), lctx, inpOp, cond);
}
return func.getGenericUDF().getDisplayString(childrenExprStrings);
}
// Constants, fields, etc.: the expression's own string form suffices.
return expr.getExprString();
}
/**
* Gets the expression dependencies for the expression.
*
* @param lctx
* The lineage context containing the input operators dependencies.
* @param inpOp
* The input operator to the current operator.
* @param expr
* The expression that is being processed.
* @throws SemanticException
*/
/**
 * Gets the expression dependencies for the expression.
 *
 * Walks the expression tree depth-first, dispatching column, field and
 * generic-function nodes to their dedicated processors and everything else
 * to the default (constant) processor, then returns the dependency computed
 * for the root expression.
 *
 * @param lctx
 *          The lineage context containing the input operators dependencies.
 * @param inpOp
 *          The input operator to the current operator.
 * @param expr
 *          The expression that is being processed.
 * @throws SemanticException
 */
public static Dependency getExprDependency(LineageCtx lctx,
    Operator<? extends OperatorDesc> inpOp, ExprNodeDesc expr)
    throws SemanticException {
  // Shared processing context for all expression processors.
  ExprProcCtx exprCtx = new ExprProcCtx(lctx, inpOp);

  // Rule table: one rule per interesting expression node type. Anything
  // that matches none of these falls through to the default processor.
  Map<Rule, NodeProcessor> exprRules = new LinkedHashMap<Rule, NodeProcessor>();
  exprRules.put(new RuleRegExp("R1", ExprNodeColumnDesc.class.getName() + "%"),
      getColumnProcessor());
  exprRules.put(new RuleRegExp("R2", ExprNodeFieldDesc.class.getName() + "%"),
      getFieldProcessor());
  exprRules.put(new RuleRegExp("R3", ExprNodeGenericFuncDesc.class.getName() + "%"),
      getGenericFuncProcessor());

  // The dispatcher fires the processor of the closest matching rule; the
  // walker visits the tree in DFS order while maintaining the node stack.
  Dispatcher dispatcher = new DefaultRuleDispatcher(getDefaultExprProcessor(),
      exprRules, exprCtx);
  GraphWalker walker = new DefaultGraphWalker(dispatcher);

  List<Node> startNodes = new ArrayList<Node>();
  startNodes.add(expr);
  HashMap<Node, Object> nodeOutputs = new HashMap<Node, Object>();
  walker.startWalking(startNodes, nodeOutputs);
  return (Dependency) nodeOutputs.get(expr);
}
}