/* * (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. */ package com.linkedin.cubert.analyzer.physical; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.node.ArrayNode; import org.codehaus.jackson.node.ObjectNode; import com.linkedin.cubert.analyzer.physical.SemanticAnalyzer; import com.linkedin.cubert.analyzer.physical.LineageGraph.*; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.block.ColumnType; import com.linkedin.cubert.operator.PostCondition; import com.linkedin.cubert.operator.aggregate.AggregationType; import com.linkedin.cubert.utils.CommonUtils; import com.linkedin.cubert.utils.JsonUtils; import com.linkedin.cubert.utils.Pair; import com.linkedin.cubert.utils.RewriteUtils; public class Lineage { public static class LineageException extends Exception { public LineageException(String mesg) { super(mesg); } } private static final int LEFT = 0; private static final int RIGHT = 0; private SemanticAnalyzer.Node nodeInformation; private LineageBuilder lineageInfo = null; private JsonNode programNode = null; public Lineage(SemanticAnalyzer.Node nodeInfo) { this.nodeInformation = nodeInfo; } public Lineage() { } public LineageBuilder getLineage() { return lineageInfo; } // 0423: cleanup starts here // A bunch of methods that walk operators. public String[] getSchemaOutputColumns(ObjectNode opNode) { BlockSchema bschema = new BlockSchema(opNode.get("schema")); return bschema.getColumnNames(); } public boolean isBlockgenByIndex(ObjectNode bgNode) { // LineageHelper.trace("BGI check invoked for " + bgNode); String bgType = bgNode.get("blockgenType").getTextValue(); return (bgType.equals("BY_INDEX") || bgType.equals("BY_BLOCK")); } public static String getDatedPathRoot(ArrayNode pathArray) { for (JsonNode pathNode : pathArray) { if (pathNode instanceof ObjectNode && ((ObjectNode) pathNode.get("startDate") != null)) return pathNode.get("root").getTextValue(); } return null; } public static List<String> getPaths(JsonNode pathNode) { return LineageHelper.getPaths(pathNode); } public static interface OperatorVisitor { public boolean inspect(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode); } public static void visitOperators(ObjectNode programNode, ObjectNode jobNode, OperatorVisitor tracerObj) { visitOperators(programNode, jobNode, tracerObj, false); } private static int increment(boolean reverse) { return reverse ? -1 : 1; } public static void visitOperators(ObjectNode programNode, ObjectNode jobNode, OperatorVisitor tracerObj, boolean reverse) { ArrayNode jobs = (ArrayNode) programNode.get("jobs"); int si = (reverse ? jobs.size() - 1 : 0); int ei = (reverse ? -1 : jobs.size()); for (int i = si; i != ei; i = i + increment(reverse)) { if (jobNode != null && jobNode != jobs.get(i)) continue; if (!visitOperatorsInJob(programNode, (ObjectNode) jobs.get(i), tracerObj, reverse)) return; } } private static boolean visitOperatorsInJob(ObjectNode programNode, ObjectNode jobNode, OperatorVisitor visitorObj, boolean reverse) { if (!visitMappers(jobNode, visitorObj, reverse)) return false; if (!visitReducers(jobNode, visitorObj, reverse)) return false; return true; } private static boolean visitReducers(ObjectNode jobNode, OperatorVisitor visitorObj, boolean reverse) { ArrayNode reduceOperators; if (jobNode.get("reduce") != null && !jobNode.get("reduce").isNull()) reduceOperators = (ArrayNode) jobNode.get("reduce"); else return true; if (reverse) { if (!visitorObj.inspect(jobNode, (JsonNode) reduceOperators, (ObjectNode) jobNode.get("output"))) return false; if (!visitOperatorsInArray(jobNode, reduceOperators, reduceOperators, visitorObj, reverse)) return false; } else { if (!visitOperatorsInArray(jobNode, reduceOperators, reduceOperators, visitorObj, reverse)) return false; if (!visitorObj.inspect(jobNode, (JsonNode) reduceOperators, (ObjectNode) jobNode.get("output"))) return false; } return true; } private static boolean visitMappers(ObjectNode jobNode, OperatorVisitor visitorObj, boolean reverse) { ArrayNode mappers = (ArrayNode) jobNode.get("map"); int si = (reverse ? mappers.size() - 1 : 0); int ei = (reverse ? -1 : mappers.size()); // TODO Auto-generated method stub for (int i = si; i != ei; i += increment(reverse)) { if (!visitMapNode(jobNode, mappers.get(i), visitorObj, reverse)) return false; } return true; } private static boolean visitMapNode(ObjectNode jobNode, JsonNode mapNode, OperatorVisitor visitorObj, boolean reverse) { if (reverse) { if (!visitOperatorsInArray(jobNode, mapNode, (ArrayNode) mapNode.get("operators"), visitorObj, reverse)) return false; // LineageHelper.trace("Visiting map node input node reverse"); if (!visitorObj.inspect(jobNode, (JsonNode) mapNode, (ObjectNode) mapNode.get("input"))) return false; } else { // LineageHelper.trace("Visiting mapNode input forward"); if (!visitorObj.inspect(jobNode, (JsonNode) mapNode, (ObjectNode) mapNode.get("input"))) return false; if (!visitOperatorsInArray(jobNode, mapNode, (ArrayNode) mapNode.get("operators"), visitorObj, reverse)) return false; } return true; } private static boolean visitOperatorsInArray(ObjectNode jobNode, JsonNode phaseNode, ArrayNode operatorArray, OperatorVisitor visitorObj, boolean reverse) { int si = reverse ? operatorArray.size() - 1 : 0; int ei = reverse ? -1 : operatorArray.size(); for (int i = si; i != ei; i += increment(reverse)) { if (!visitorObj.inspect(jobNode, phaseNode, (ObjectNode) operatorArray.get(i))) return false; } return true; } private static boolean isAvroLoad(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode) { if (operatorNode.get("operator") != null || !operatorNode.get("type").getTextValue().equalsIgnoreCase("AVRO")) return false; return true; } // captures the notion of an output column which is a (opNode, columnName) public static class OutputColumn { @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((columnName == null) ? 0 : columnName.hashCode()); result = prime * result + (opNode == null ? 0 : System.identityHashCode(opNode)); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; OutputColumn other = (OutputColumn) obj; if (columnName == null) { if (other.columnName != null) return false; } else if (!columnName.equals(other.columnName)) return false; if (opNode == null) { if (other.opNode != null) return false; } else if (opNode != other.opNode) return false; return true; } public OutputColumn(ObjectNode opNode, String columnName) { this.opNode = opNode; this.columnName = columnName; } public String toString() { return ("opNode=" + opNode.toString() + " columnName = " + columnName.toString()); } public ObjectNode opNode; public String columnName; } // Given a column referenced in the output of the specified operator // (destinationOperator) where did the column // actually get loaded public List<OutputColumn> traceLoadColumn(ObjectNode programNode, OutputColumn destColumn) throws LineageException { List<OutputColumn> loadColumns = new ArrayList<OutputColumn>(); ColumnLineage columnNode = this.lineageInfo.getColumnLineageNode(destColumn); List<LineageGraphVertex> list1 = LineageGraph.traceTerminalNodes(columnNode, new String[] { "LOAD" }, false); List<LineageGraphVertex> list2 = LineageGraph.traceTerminalNodes(columnNode, new String[] { "LOAD-BLOCK" }, false); list1.addAll(list2); for (LineageGraphVertex graphNode : list1) { ColumnLineage colInfo = (ColumnLineage) graphNode; Pair<ObjectNode, JsonNode> phaseInfo = this.getLineage().getPreLineage().getJobPhase(colInfo.node.opNode); if (LineageHelper.isLoadOperator(phaseInfo.getFirst(), phaseInfo.getSecond(), colInfo.node.opNode)) loadColumns.add(colInfo.node); } return loadColumns; } public static String getOperatorType(ObjectNode jobNode, JsonNode phaseNode, ObjectNode opNode) { if (opNode.get("operator") != null) return opNode.get("operator").getTextValue(); if (LineageHelper.isStoreCommand(jobNode, phaseNode, opNode)) return "STORE"; if (LineageHelper.isLoadOperator(jobNode, phaseNode, opNode)) return "LOAD"; // LineageHelper.trace("NULL operator type for " + opNode.toString() + "\n**********"); // LineageHelper.trace("phaseNode = " + phaseNode.toString() + "\n*************"); // LineageHelper.trace("jobNode = " + jobNode.toString() + "\n****************"); throw new RuntimeException("Cannot find operatorType"); } public static int getColumnIndexFromOutSchema(ObjectNode opNode, String outputColumnName) { String[] outColNames = new BlockSchema(opNode.get("schema")).getColumnNames(); int colid; for (colid = 0; colid < outColNames.length; colid++) { if (outColNames[colid].equals(outputColumnName)) return colid; } return -1; } public static class LineageBuilder implements OperatorVisitor { public HashMap<Integer, OperatorLineage> opLineageMap = new HashMap<Integer, OperatorLineage>(); public HashMap<OutputColumn, ColumnLineage> columnLineageMap = new HashMap<OutputColumn, ColumnLineage>(); private LineageHelper preLineageInfo; public boolean exception = false; public LineageBuilder(LineageHelper preLineageInfo) { this.preLineageInfo = preLineageInfo; } public LineageHelper getPreLineage() { return this.preLineageInfo; } private <T> void operatorMapPut(HashMap<Integer, T> operatorMap, ObjectNode operatorNode, T valueObj) { Integer opSequence = this.preLineageInfo.getOpSequence(operatorNode); operatorMap.put(opSequence, valueObj); } private <T> T operatorMapGet(HashMap<Integer, T> operatorMap, ObjectNode operatorNode) { Integer opSequence = this.preLineageInfo.getOpSequence(operatorNode); return operatorMap.get(opSequence); } public boolean inspect(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode) { ArrayList<ObjectNode> sourceOperators = preLineageInfo.findAllOperatorSources(jobNode, phaseNode, operatorNode); if (sourceOperators != null) { for (ObjectNode sourceNode : sourceOperators) addOperatorLineage(sourceNode, operatorNode); } Pair<ObjectNode, JsonNode> jobPhase = preLineageInfo.getJobPhase(operatorNode); if (jobPhase.getSecond() != phaseNode) throw new RuntimeException("mis-matched phaseNode stored in phaseMap for \nopNode= " + operatorNode.toString() + "\nphaseNode = " + jobPhase.getSecond().toString() + "\noriginal phaseNode = " + phaseNode.toString()); LineageHelper.trace("Lineage Visitor visiting " + operatorNode.toString()); // Now capture columnLineage for all output columns at this node BlockSchema outSchema = new BlockSchema(operatorNode.get("schema")); for (String colName : outSchema.getColumnNames()) { OutputColumn destColumn = new OutputColumn(operatorNode, colName); List<OutputColumn> sourceColumns = getSourceColumns(operatorNode, colName); if (sourceColumns != null) { for (OutputColumn sourceColumn : sourceColumns) addColumnLineage(sourceColumn, destColumn); } } return true; } private void addColumnLineage(OutputColumn sourceColumn, OutputColumn destColumn) { // TODO Auto-generated method stub ColumnLineage sourceLineage = getColumnLineageNode(sourceColumn); ColumnLineage destLineage = getColumnLineageNode(destColumn); if (CommonUtils.indexOfByRef(destLineage.parentNodes, sourceLineage) == -1) destLineage.parentNodes.add(sourceLineage); if (CommonUtils.indexOfByRef(sourceLineage.childNodes, destLineage) == -1) { sourceLineage.childNodes.add(destLineage); if (true) { LineageHelper.trace("Adding columnLineage "); LineageHelper.trace("****************\n Source = " + sourceLineage); LineageHelper.trace("****************\n Dest =" + destLineage); } } } private void addOperatorLineage(ObjectNode sourceNode, ObjectNode operatorNode) { if (sourceNode == null) throw new RuntimeException("Null source node for operator " + operatorNode); OperatorLineage sourceLineage = getOperatorLineageNode(sourceNode); OperatorLineage destLineage = getOperatorLineageNode(operatorNode); if (CommonUtils.indexOfByRef(destLineage.parentNodes, sourceLineage) == -1) destLineage.parentNodes.add(sourceLineage); if (CommonUtils.indexOfByRef(sourceLineage.childNodes, destLineage) == -1) { sourceLineage.childNodes.add(destLineage); if (true) { LineageHelper.trace("adding operator lineage\n*********** \nsourceNode\n*********************\n" + sourceNode.toString()); LineageHelper.trace("\ndestNode\n*************\n" + operatorNode.toString()); } } } public OperatorLineage getOperatorLineageNode(ObjectNode sourceNode) { OperatorLineage result = this.operatorMapGet(opLineageMap, sourceNode); if (result != null) return result; result = new OperatorLineage(sourceNode, getNodeType(sourceNode)); operatorMapPut(opLineageMap, sourceNode, result); return result; } private String getNodeType(ObjectNode sourceNode) { Pair<ObjectNode, JsonNode> jobPhase = this.preLineageInfo.getJobPhase(sourceNode); if (jobPhase == null) { throw new RuntimeException("PhaseInformation missing for " + sourceNode.toString() + "\n*********"); } return getOperatorType(jobPhase.getFirst(), jobPhase.getSecond(), sourceNode); } public ColumnLineage getColumnLineageNode(OutputColumn sourceColumn) { ColumnLineage result = this.columnLineageMap.get(sourceColumn); if (result != null) return result; result = new ColumnLineage(sourceColumn, getNodeType(sourceColumn.opNode)); this.columnLineageMap.put(sourceColumn, result); return result; } /* Main module that traces the lineage of a column (specified in outputColumnName) across an operator specified in opNode. */ public List<OutputColumn> getSourceColumns(ObjectNode opNode, String outputColumnName) { List<OutputColumn> sourceColumns = new ArrayList<OutputColumn>(); Pair<ObjectNode, JsonNode> jobPhase = this.preLineageInfo.getJobPhase(opNode); String opType = getOperatorType(jobPhase.getFirst(), jobPhase.getSecond(), opNode); if (opType.equalsIgnoreCase("LOAD") || opType.equalsIgnoreCase("LOAD_BLOCK")) { List<ObjectNode> storeNodes = this.preLineageInfo.findAllParentStores(jobPhase.getFirst(), jobPhase.getSecond(), opNode); if (storeNodes == null || storeNodes.size() == 0) { LineageHelper.trace("Cannot find matching parent STORE for " + opNode.toString()); return sourceColumns; } // LineageHelper.trace("Found matching parent store for opNode " + // opNode.toString() + " storeNodes = " + // CommonUtils.listAsString(storeNodes)); for (ObjectNode storeNode : storeNodes) { int colid = getColumnIndexFromOutSchema(opNode, outputColumnName); if (colid == -1) throw new RuntimeException("Cannot find column " + outputColumnName + " in operator " + opNode.toString()); String inputColName = new BlockSchema(storeNode.get("schema")).getColumnNames()[colid]; sourceColumns.add(new OutputColumn(storeNode, inputColName)); } } else if (opType.equals("STORE")) { List<ObjectNode> sourceOps = preLineageInfo.findOperatorInputSources(opNode, opNode.get("name") .getTextValue()); for (ObjectNode sourceOp : sourceOps) sourceColumns.add(new OutputColumn(sourceOp, outputColumnName)); } else if (opType.equalsIgnoreCase("GENERATE")) { List<ObjectNode> sourceOps = preLineageInfo.findOperatorInputSources(opNode, opNode.get("input") .getTextValue()); for (ObjectNode sourceOp : sourceOps) { for (JsonNode exprNode : (ArrayNode) (opNode.get("outputTuple"))) { if (exprNode.get("col_name") .getTextValue() .equals(outputColumnName)) { List<String> inputColumns = getExpressionColumns(sourceOp, exprNode.get("expression")); if (inputColumns == null) continue; for (String icol : inputColumns) sourceColumns.add(new OutputColumn(sourceOp, icol)); } } } } else if (opType.equalsIgnoreCase("HASHJOIN") || opType.equalsIgnoreCase("JOIN")) { String[] splits = outputColumnName.split("___"); List<ObjectNode> sourceOps = preLineageInfo.findOperatorInputSources(opNode, splits[0]); for (ObjectNode sourceOp : sourceOps) sourceColumns.add(new OutputColumn(sourceOp, splits[1])); // TODO : for join key add lineage to both parents. addJoinKeyLineageToOtherInput(opNode, sourceColumns, splits[0], splits[1]); } else if (opType.equalsIgnoreCase("GROUP_BY") || opType.equalsIgnoreCase("CUBE")) { // add lineage from each input measure to group by output List<ObjectNode> sourceOps = preLineageInfo.findOperatorInputSources(opNode, opNode.get("input") .getTextValue()); for (ObjectNode sourceOp : sourceOps) { ArrayNode aggregates = (ArrayNode) opNode.get("aggregates"); for (JsonNode aggNode : aggregates) { String[] inputCols = JsonUtils.asArray(aggNode, "input"); String dest = ((ObjectNode) aggNode).get("output").getTextValue(); if (dest.equals(outputColumnName)){ for (String inputCol: inputCols) sourceColumns.add(new OutputColumn(sourceOp, inputCol)); } } // add lineage from gbyCols or dimensions String[] gbyCols = JsonUtils.asArray(opType.equals("GROUP_BY") ? opNode.get("groupBy") : opNode.get("dimensions")); if (opNode.get("innerDimensions") != null) gbyCols = CommonUtils.concat(gbyCols, JsonUtils.asArray(opNode.get("innerDimensions"))); for (String gbyCol: gbyCols) { if (gbyCol.equals(outputColumnName)) sourceColumns.add(new OutputColumn(sourceOp, gbyCol)); } } } else if (opType.equals("FLATTEN")){ // TODO. this.exception = true; } // handle exception cases else if (opType.equals("USER_DEFINED_TUPLE_OPERATOR") || opType.equals("USER_DEFINED_BLOCK_OPERATOR") || opType.equals("RANK")) { this.exception = true; } else if (opNode.get("input") != null) { String[] inputRelations = RewriteUtils.getInputRelations(opNode); for (String inputRelation : inputRelations) { List<ObjectNode> sourceOps = preLineageInfo.findOperatorInputSources(opNode, inputRelation); for (ObjectNode sourceOp : sourceOps) sourceColumns.add(new OutputColumn(sourceOp, outputColumnName)); } } return sourceColumns; } private void addJoinKeyLineageToOtherInput(ObjectNode opNode, List<OutputColumn> sourceCols, String relationName, String colName) { // nothing to do for outer joins if (opNode.get("joinType") != null) return; String[] inputRelations = JsonUtils.asArray(opNode.get("input")); int relpos = inputRelations[0].equals(relationName) ? LEFT: RIGHT; String[] joinKeys = getJoinKeys(opNode,relpos); List<String> joinKeysList = Arrays.asList(joinKeys); int colIdx = joinKeysList.indexOf(colName); if (colIdx == -1) return; String[] otherJoinKeys = getJoinKeys(opNode, (relpos == LEFT ? RIGHT: LEFT)); String otherColName = otherJoinKeys[colIdx]; String otherRelationName = inputRelations[(relpos==LEFT? 1: 0)]; List<ObjectNode> sourceOps = preLineageInfo.findOperatorInputSources(opNode, otherRelationName); for (ObjectNode sourceOp: sourceOps) sourceCols.add(new OutputColumn(sourceOp, otherColName)); } // Find top level column names private List<String> getExpressionColumns(ObjectNode sourceOp, JsonNode exprNode) { List<String> resultList = new ArrayList<String>(); getExpressionColumns(sourceOp, exprNode, resultList); if (resultList.size() == 0 && false) LineageHelper.trace("Get expression columns returning empty set\n SourceOp = " + sourceOp.toString()); return resultList; } private void getExpressionColumns(ObjectNode sourceOp, JsonNode exprNode, List<String> resultList) { String topColumn = null; if ((topColumn = checkTopLevelColumn(sourceOp, exprNode)) != null) { resultList.add(topColumn); return; } ArrayNode argsNode = (ArrayNode) exprNode.get("arguments"); if (argsNode == null) return; for (JsonNode argNode : argsNode) getExpressionColumns(sourceOp, argNode, resultList); } private static String checkTopLevelColumn(ObjectNode sourceOp, JsonNode genExprNode) { String topColName = null; if (!(genExprNode instanceof ObjectNode)) return null; ObjectNode opNode = (ObjectNode) genExprNode; if (opNode.get("function") == null || !opNode.get("function").getTextValue().equals("INPUT_PROJECTION")) return null; ArrayNode argsNode = (ArrayNode) opNode.get("arguments"); if (argsNode.size() != 1 || ((topColName = findNamedColumn(argsNode)) == null) && (topColName = findIndexedColumn(sourceOp, argsNode)) == null) return null; return topColName; } private static String findIndexedColumn(ObjectNode sourceOp, ArrayNode argsNode) { BlockSchema sourceSchema = new BlockSchema(sourceOp.get("schema")); for (JsonNode arg : argsNode) { if (!arg.isNumber()) return null; String colName = sourceSchema.getColumnNames()[arg.getIntValue()]; return colName; } return null; } private static String findNamedColumn(ArrayNode argsNode) { for (JsonNode arg : argsNode) { if (!arg.isTextual()) continue; return arg.getTextValue(); } return null; } } public void buildLineage(ObjectNode programNode) throws LineageException { LineageHelper preLineageInfo = new LineageHelper(); visitOperators(programNode, null, preLineageInfo); LineageBuilder lineageInfo = new LineageBuilder(preLineageInfo); visitOperators(programNode, null, lineageInfo); if (lineageInfo.exception) throw new LineageException("Cannot trace lineage"); this.lineageInfo = lineageInfo; this.programNode = programNode; } public static boolean isJoinOperator(ObjectNode operatorNode) { if (operatorNode.get("operator") != null && (operatorNode.get("operator").getTextValue().equalsIgnoreCase("JOIN") || operatorNode.get("operator") .getTextValue() .equalsIgnoreCase("HASHJOIN"))) return true; return false; } public boolean isDistinctGroupBy(ObjectNode gbyNode) { if (!JsonUtils.getText(gbyNode, "operator").equals("GROUP_BY")) return false; if (gbyNode.get("aggregates") != null) return false; ObjectNode inpNode = this.lineageInfo.preLineageInfo.findOperatorSource(gbyNode, gbyNode.get("input") .getTextValue()); String[] inColumns = getSchemaOutputColumns(inpNode); String[] outColumns = getSchemaOutputColumns(gbyNode); if (inColumns.length != outColumns.length) return false; for (int i = 0; i < inColumns.length; i++) { if (inColumns[i].equals(outColumns[i])) return false; } return true; } public List<ObjectNode> traceColumnJoins(ObjectNode programNode, OutputColumn inputColumn) throws LineageException { LineageGraphVertex startVertex = this.lineageInfo.getOperatorLineageNode(inputColumn.opNode); List<LineageGraphVertex> allJoinDescendants = LineageGraph.traceTerminalNodes(startVertex, new String[] { "JOIN", "HASHJOIN" }, true); List<ObjectNode> columnJoins = new ArrayList<ObjectNode>(); LineageGraphVertex columnVertex = this.lineageInfo.getColumnLineageNode(inputColumn); List<OutputColumn> colDescendants = traceStraightLineColumnDescendants(inputColumn); for (LineageGraphVertex vertex : allJoinDescendants) { OperatorLineage opl = (OperatorLineage) vertex; boolean columnJoin = false; for (OutputColumn colDesc : colDescendants) { if (onlyJoinKey(opl, colDesc)) columnJoin = true; } if (columnJoin) columnJoins.add(opl.node); } return columnJoins; } private static String[] getJoinKeys(ObjectNode joinNode, int leftRight) { if (leftRight == LEFT) return (joinNode.get("operator").getTextValue().equals("JOIN") ? JsonUtils.asArray(joinNode.get("leftCubeColumns")) : JsonUtils.asArray(joinNode.get("leftJoinKeys"))); else return (joinNode.get("operator").getTextValue().equals("JOIN") ? JsonUtils.asArray(joinNode.get("rightCubeColumns")) : JsonUtils.asArray(joinNode.get("rightJoinKeys"))); } private boolean onlyJoinKey(OperatorLineage opl, OutputColumn colDesc) { List<LineageGraphVertex> opInputs = opl.getParentVertices(); boolean columnIsInput = false; for (LineageGraphVertex opInput : opInputs) { if (((OperatorLineage) opInput).node == colDesc.opNode) columnIsInput = true; } if (!columnIsInput) return false; Pair<ObjectNode, JsonNode> jobPhase = this.lineageInfo.preLineageInfo.getJobPhase(colDesc.opNode); String inRelation = LineageHelper.getOperatorOutput(jobPhase.getFirst(), jobPhase.getSecond(), colDesc.opNode); String[] joinKeys = null; if (inRelation.equals(opl.node.get("leftBlock").getTextValue())) joinKeys = getJoinKeys(opl.node, LEFT); else joinKeys = getJoinKeys(opl.node, RIGHT); if (joinKeys.length == 1 && joinKeys[0].equals(colDesc.columnName)) return true; return false; } public LineagePath tracePath(ObjectNode startOp, ObjectNode endOp) { return LineageGraph.tracePath(lineageInfo.getOperatorLineageNode(startOp), lineageInfo.getOperatorLineageNode(endOp)); } private static class StraightLineColumnDescendantVisitor implements LineageGraphVisitor { public List<OutputColumn> columnDescendants = new ArrayList<OutputColumn>(); @Override public boolean visit(LineageGraphVertex graphNode) { ColumnLineage columnLineage = (ColumnLineage) graphNode; if (columnLineage.isExpressionOutput()) return false; columnDescendants.add(columnLineage.node); return true; } @Override public void finishSubtree(LineageGraphVertex graphNode) { } } private List<OutputColumn> traceStraightLineColumnDescendants(OutputColumn inputColumn) { StraightLineColumnDescendantVisitor visitorObj = new StraightLineColumnDescendantVisitor(); LineageGraphVertex startVertex = this.lineageInfo.getColumnLineageNode(inputColumn); LineageGraph.visitLineageGraph(startVertex, visitorObj, true); return visitorObj.columnDescendants; } public List<ObjectNode> getOperatorSources(ObjectNode opNode) { Pair<ObjectNode, JsonNode> jobPhase = this.lineageInfo.preLineageInfo.getJobPhase(opNode); return this.lineageInfo.preLineageInfo.findAllOperatorSources(jobPhase.getFirst(), jobPhase.getSecond(), opNode); } public ArrayNode getPhaseOperators(JsonNode phaseNode) { return (ArrayNode) (LineageHelper.isReducePhase(phaseNode) ? (ArrayNode) phaseNode : ((ObjectNode) phaseNode).get("operators")); } public static void setPhaseOperators(ObjectNode jobNode, JsonNode phaseNode, ArrayNode phaseOps) { if (LineageHelper.isReducePhase(phaseNode)) jobNode.put("reduce", phaseOps); else ((ObjectNode) phaseNode).put("operators", phaseOps); } public JsonNode getPhase(ObjectNode opNode) { return this.lineageInfo.preLineageInfo.getJobPhase(opNode).getSecond(); } public Pair<ObjectNode, JsonNode> getJobPhase(ObjectNode opNode) { return this.lineageInfo.preLineageInfo.getJobPhase(opNode); } public static String traceIndexPath(ObjectNode jobNode, String indexName) { ArrayNode cacheIndexNode = (ArrayNode) (jobNode.get("cacheIndex")); for (JsonNode cacheIndex : cacheIndexNode) { if (((ObjectNode) cacheIndex).get("name").getTextValue().equals(indexName)) return ((ObjectNode) cacheIndex).get("path").getTextValue(); } return null; } public String[] getBlockgenPartitionKeys(ObjectNode bgNode) { ArrayNode pkeysNode = (isBlockgenByIndex(bgNode) ? (ArrayNode) bgNode.get("originalPartitionKeys") : (ArrayNode) bgNode.get("partitionKeys")); if (pkeysNode == null) return null; return JsonUtils.asArray(pkeysNode); } public String getBlockgenStorePath(ObjectNode programNode, ObjectNode bgNode) { List<ObjectNode> storeNodes = new ArrayList<ObjectNode>(); List<ObjectNode> bgNodeDescents = traceOperatorDescendants(bgNode); Pair<ObjectNode, JsonNode> bgPhase = this.lineageInfo.preLineageInfo.getJobPhase(bgNode); for (ObjectNode descNode : bgNodeDescents) { if (getPhase(descNode) == bgPhase.getSecond() && LineageHelper.isStoreCommand(bgPhase.getFirst(), bgPhase.getSecond(), descNode)) storeNodes.add(descNode); } if (storeNodes.size() > 1 || storeNodes.size() == 0) return null; return storeNodes.get(0).get("path").getTextValue(); } public ObjectNode getMatchingLoadInJob(ObjectNode jobNode, String pathName) { for (Integer opNodeSequence : this.lineageInfo.preLineageInfo.loadPathsMap.keySet()) { ObjectNode opNode = this.lineageInfo.preLineageInfo.operatorList.get(opNodeSequence); if (getJobPhase(opNode).getFirst() != jobNode) continue; List<String> pathNames = this.lineageInfo.preLineageInfo.operatorMapGet(this.lineageInfo.preLineageInfo.loadPathsMap, opNode); if (pathNames.indexOf(pathName) != -1) return opNode; } return null; } public List<ObjectNode> traceOperatorDescendants(ObjectNode opNode) { LineageGraphVertex startVertex = this.getLineage().getOperatorLineageNode(opNode); List<LineageGraphVertex> opDescents = LineageGraph.traceAllReachable(startVertex, true); List<ObjectNode> result = new ArrayList<ObjectNode>(); for (LineageGraphVertex gv : opDescents) result.add(((OperatorLineage) gv).node); return result; } public static boolean isCountDistinctAggregate(ObjectNode operatorNode) { if (operatorNode.get("operator") == null) return false; String type = operatorNode.get("operator").getTextValue(); if (!type.equals("GROUP_BY") && !type.equals("CUBE")) return false; if (!operatorNode.has("aggregates")) return false; for (JsonNode aggregateJson : operatorNode.path("aggregates")) { // Create the aggregator object JsonNode typeNode = aggregateJson.get("type"); // Group by case if (typeNode.isTextual()){ AggregationType aggType = AggregationType.valueOf(JsonUtils.getText(aggregateJson, "type")); String measureColumn = JsonUtils.getText(aggregateJson, "input"); if (aggType != AggregationType.COUNT_DISTINCT) return false; } else if (typeNode instanceof ArrayNode){ String[] typeArray = JsonUtils.asArray(aggregateJson, "type"); if (!typeArray[0].equals("SUM") || !typeArray[1].equals("COUNT_TO_ONE")) return false; } } return true; } public ObjectNode getOperatorJobNode(ObjectNode opNode) { Pair<ObjectNode, JsonNode> jobPhase = this.lineageInfo.preLineageInfo.getJobPhase(opNode); return jobPhase.getFirst(); } public List<ObjectNode> computeJoinsInJob(ObjectNode programNode, ObjectNode jobNode) { List<ObjectNode> joinsList = new ArrayList<ObjectNode>(); for (ObjectNode opNode : this.lineageInfo.preLineageInfo.operatorList) { if (getOperatorJobNode(opNode) != jobNode || !isJoinOperator(opNode)) continue; joinsList.add(opNode); } return joinsList; } public boolean isParentOperator(ObjectNode node1, ObjectNode node2) { LineageGraphVertex v1 = this.lineageInfo.getOperatorLineageNode(node1); LineageGraphVertex v2 = this.lineageInfo.getOperatorLineageNode(node2); if (CommonUtils.indexOfByRef(v1.getChildVertices(), v2) != -1) return true; return false; } public boolean isDescendantOf(ObjectNode ancestorNode, ObjectNode descNode) { List<ObjectNode> descList = this.traceOperatorDescendants(ancestorNode); if (CommonUtils.indexOfByRef(descList, descNode) != -1) return true; return false; } public int getDescendantInputIndex(String[] inputRelations, ObjectNode opNode, ObjectNode factNode) { ObjectNode[] inputSources = new ObjectNode[inputRelations.length]; List<ObjectNode> factDescents = traceOperatorDescendants(factNode); for (int i = 0; i < inputRelations.length; i++) { inputSources[i] = this.getLineage() .getPreLineage() .findOperatorSource(opNode, inputRelations[i]); if (CommonUtils.indexOfByRef(factDescents, inputSources[i]) != -1) return i; } return -1; } public List<LineagePath> traceMatchingPaths(ObjectNode startNode, ArrayList<String> nodeTypes, ObjectNode terminalNode, boolean isForward) { LineageGraphVertex terminalOpLineage = (terminalNode != null ? this.lineageInfo.operatorMapGet(this.lineageInfo.opLineageMap, terminalNode) : null); LineageGraphVertex startOpLineage = this.lineageInfo.operatorMapGet(this.lineageInfo.opLineageMap, startNode); return LineageGraph.traceMatchingPaths(startOpLineage, nodeTypes, terminalOpLineage, isForward); } }