/* * (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this * file except in compliance with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. */ package com.linkedin.cubert.analyzer.physical; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.node.ArrayNode; import org.codehaus.jackson.node.ObjectNode; import com.linkedin.cubert.analyzer.physical.SemanticAnalyzer; import com.linkedin.cubert.analyzer.physical.Lineage.*; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.block.ColumnType; import com.linkedin.cubert.operator.PostCondition; import com.linkedin.cubert.operator.aggregate.AggregationType; import com.linkedin.cubert.utils.CommonUtils; import com.linkedin.cubert.utils.JsonUtils; import com.linkedin.cubert.utils.Pair; import com.linkedin.cubert.utils.RewriteUtils; public class LineageHelper implements OperatorVisitor { private static final boolean TRACE_ON = false; public static void trace(String traceMsg) { if (TRACE_ON) System.out.println(traceMsg); } private int curSequence = 0; public ArrayList<ObjectNode> operatorList = new ArrayList<ObjectNode>(); public HashMap<Integer, Pair<ObjectNode, JsonNode>> operatorPhaseMap = new HashMap<Integer, Pair<ObjectNode, JsonNode>>(); public HashMap<Integer, List<String>> loadPathsMap = new HashMap<Integer, List<String>>(); public HashMap<Integer, List<String>> storePathsMap = new HashMap<Integer, List<String>>(); public HashMap<Integer, String> outputRelations = new HashMap<Integer, String>(); public boolean inspect(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode) { int opSequence = operatorList.size(); operatorPhaseMap.put(opSequence, new Pair(jobNode, phaseNode)); operatorList.add(operatorNode); curSequence++; if (isLoadOperator(jobNode, phaseNode, operatorNode)) { List<String> loadPaths = getPaths(operatorNode.get("path")); // trace("Load operator visited " + operatorNode.toString()); operatorMapPut(loadPathsMap, operatorNode, loadPaths); } if (isStoreCommand(jobNode, phaseNode, operatorNode)) { List<String> storePaths = getPaths(operatorNode.get("path")); operatorMapPut(storePathsMap, operatorNode, storePaths); // trace("Pre Lineage Visitor visited store command = " + // operatorNode.toString()); } operatorMapPut(outputRelations, operatorNode, getOperatorOutput(jobNode, phaseNode, operatorNode)); return true; } public <T> void operatorMapPut(HashMap<Integer, T> operatorMap, ObjectNode operatorNode, T valueObj) { Integer opSequence = this.getOpSequence(operatorNode); operatorMap.put(opSequence, valueObj); } public <T> T operatorMapGet(HashMap<Integer, T> operatorMap, ObjectNode operatorNode) { Integer opSequence = this.getOpSequence(operatorNode); return operatorMap.get(opSequence); } public List<ObjectNode> findAllParentStores(ObjectNode jobNode, JsonNode phaseNode, ObjectNode opNode) { if (!isLoadOperator(jobNode, phaseNode, opNode)) throw new RuntimeException("Cannot find parent store for non-LOAD " + opNode.toString()); List<ObjectNode> result = new ArrayList<ObjectNode>(); List<String> loadPaths = operatorMapGet(loadPathsMap, opNode); for (String loadPath : loadPaths) { ObjectNode storeNode = findPrecedingStore(opNode, loadPath); if (storeNode != null) result.add(storeNode); } return result; } public ArrayList<ObjectNode> findAllOperatorSources(ObjectNode jobNode, JsonNode phaseNode, ObjectNode opNode) { ArrayList<ObjectNode> sourceNodes = new ArrayList<ObjectNode>(); if (isLoadOperator(jobNode, phaseNode, opNode)) { List<String> loadPaths = operatorMapGet(loadPathsMap, opNode); for (String loadPath : loadPaths) { ObjectNode storeNode = findPrecedingStore(opNode, loadPath); if (storeNode != null) sourceNodes.add(storeNode); } return sourceNodes; } JsonNode inputsNode = (isStoreCommand(jobNode, phaseNode, opNode) ? opNode.get("name") : opNode.get("input")); if (inputsNode == null) { // trace("Getting sources for " + opNode.toString() + " ?"); return null; } if (!(inputsNode instanceof ArrayNode)) sourceNodes.addAll(findOperatorInputSources(opNode, inputsNode.getTextValue())); else { for (JsonNode inputNode : (ArrayNode) inputsNode) sourceNodes.addAll(findOperatorInputSources(opNode, inputNode.getTextValue())); } return sourceNodes; } public ObjectNode findPrecedingStore(ObjectNode opNode, String loadPath) { int opSequence = this.getOpSequence(opNode); // trace("findPrecedingStore called for opNode " + opNode + // " loadPath= " + loadPath); for (int i = opSequence; i >= 0; i--) { ObjectNode candidateOp = operatorList.get(i); Pair<ObjectNode, JsonNode> phaseInfo = getJobPhase(candidateOp); if (!isStoreCommand(phaseInfo.getFirst(), phaseInfo.getSecond(), candidateOp)) continue; // trace("Examining store command " + candidateOp); List<String> storePaths = operatorMapGet(storePathsMap, candidateOp); if (storePaths.indexOf(loadPath) == -1) continue; return candidateOp; } return null; } public ObjectNode findOperatorSource(ObjectNode opNode, String inputRelation) { int opSequence = this.getOpSequence(opNode); JsonNode phaseNode = this.getJobPhase(opNode).getSecond(); ObjectNode sourceOp = findOperatorSourcePrior(opSequence - 1, inputRelation); if (sourceOp == null) throw new RuntimeException("Cannot find source for opNode " + opNode.toString() + "for relation " + inputRelation); return sourceOp; } // Find the most recent operator source from a list of operators, given // an input relation. private ObjectNode getOperatorSourceInPhase(ObjectNode jobNode, JsonNode phaseNode, ObjectNode opNode, String inputRelation) { ArrayNode opsNode = isReducePhase(phaseNode) ? (ArrayNode) phaseNode : (ArrayNode) (((ObjectNode) phaseNode).get("operators")); boolean reachedDest = false; // don't expect to find store in the list of oeprators. if (isReducePhase(phaseNode) && isStoreCommand(jobNode, phaseNode, opNode)) reachedDest = true; for (int i = opsNode.size() - 1; i >= 0; i--) { ObjectNode candidateOp = (ObjectNode) (opsNode.get(i)); if (candidateOp == opNode) { reachedDest = true; continue; } if (!reachedDest) continue; if (isOutputOf(candidateOp, inputRelation)) return candidateOp; } return null; } public List<ObjectNode> findOperatorInputSources(ObjectNode opNode, String inputRelation) { int opSequence = this.getOpSequence(opNode); ObjectNode jobNode = (ObjectNode) (this.getJobPhase(opNode).getFirst()); JsonNode phaseNode = this.getJobPhase(opNode).getSecond(); List<ObjectNode> result = new ArrayList<ObjectNode>(); if (isReducePhase(phaseNode) && (opSequence == 0 || getOperatorSourceInPhase(jobNode, (ArrayNode) phaseNode, opNode, inputRelation) == null)) { // if either first operator in reduce phase or a matching source within // the same phase cannot be found, // look inside all the map jobs. ArrayNode mapsArray = (ArrayNode) getJobPhase(opNode).getFirst().get("map"); for (JsonNode mapNode : mapsArray) { ArrayNode mapOps = (ArrayNode) ((ObjectNode) mapNode).get("operators"); if (mapOps == null || mapOps.size() == 0) continue; ObjectNode lastOp = (ObjectNode) mapOps.get(mapOps.size() - 1); result.add(findOperatorSourcePrior(getOpSequence(lastOp), inputRelation)); } } else result.add(findOperatorSource(opNode, inputRelation)); return result; } public ObjectNode findOperatorSourcePrior(int startSequence, String inputRelation) { for (int i = startSequence; i >= 0; i--) { ObjectNode candidateOp = operatorList.get(i); if (isOutputOf(candidateOp, inputRelation)) return candidateOp; } return null; } public boolean isOutputOf(ObjectNode candidateOp, String inputRelation) { Pair<ObjectNode, JsonNode> jobPhase = this.getJobPhase(candidateOp); JsonNode phaseNode = jobPhase.getSecond(); String outRelation = getOperatorOutput(jobPhase.getFirst(), phaseNode, candidateOp); if (outRelation != null && outRelation.equals(inputRelation)) return true; // if this is an INPUT load, then match against "input" if (!isReducePhase(phaseNode) && phaseNode.get("input") == candidateOp && candidateOp.get("name").getTextValue().equals(inputRelation)) return true; return false; } public Pair<ObjectNode, JsonNode> getJobPhase(ObjectNode opNode) { return operatorMapGet(operatorPhaseMap, (opNode)); } public int getOpSequence(ObjectNode opNode) { int result = CommonUtils.indexOfByRef(this.operatorList, opNode); if (result == -1) throw new RuntimeException("No operatorList reference found \n opNode = " + opNode.toString()); return result; } public static boolean isLoadOperator(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode) { if ((operatorNode.get("operator") == null && !isReducePhase(phaseNode) && ((ObjectNode) (phaseNode.get("input"))) == operatorNode) || operatorNode.get("operator") != null && operatorNode.get("operator") .getTextValue() .equalsIgnoreCase("LOAD_BLOCK")) return true; // LineageHelper.trace("operator-txt= " + operatorNode.get("operator") + // " phase type= " + (isReducePhase(phaseNode) ? "reduce" : "Map") + // " ,inputToPhase = " + (phaseNode.get("input") == operatorNode ? "true" : // "false" )); return false; } public static boolean isReducePhase(JsonNode phaseNode) { return phaseNode instanceof ArrayNode; } public static boolean isStoreCommand(ObjectNode jobNode, JsonNode phaseNode, ObjectNode opNode) { return (jobNode.get("output") == opNode || opNode.get("operator") != null && opNode.get("operator").getTextValue().equals("TEE") ? true : false); } public static String getPathRoot(JsonNode pathNode) { if (pathNode instanceof ObjectNode && ((ObjectNode) pathNode).get("startDate") != null) return (pathNode.get("root").getTextValue()); else return (pathNode.getTextValue()); } public static List<String> getPaths(JsonNode pathNode) { List<String> resultPaths = new ArrayList<String>(); if (!(pathNode instanceof ArrayNode)) { resultPaths.add(getPathRoot(pathNode)); return resultPaths; } ArrayNode pathArray = (ArrayNode) pathNode; for (JsonNode pathElement : pathArray) { resultPaths.add(getPathRoot(pathElement)); } return resultPaths; } public static String getOperatorOutput(ObjectNode jobNode, JsonNode phaseNode, ObjectNode operatorNode) { if (operatorNode.get("output") != null) { return operatorNode.get("output").getTextValue(); } if (LineageHelper.isStoreCommand(jobNode, phaseNode, operatorNode)) return operatorNode.get("name").getTextValue(); return null; } }