/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.analyzer.physical; import com.linkedin.cubert.io.rubix.RubixFile; import com.linkedin.cubert.operator.OperatorType; import com.linkedin.cubert.utils.JsonUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.pig.data.Tuple; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.node.ObjectNode; import java.io.IOException; import java.util.*; import static com.linkedin.cubert.utils.JsonUtils.getText; /** * Analyzes the lineage of blockgens across the different CubertStore datasets. * * @author Maneesh Varshney * */ public class BlockgenLineageAnalyzer extends PhysicalPlanVisitor implements PlanRewriter { private Configuration conf; private final Map<String, String> blockgenIdMap = new HashMap<String, String>(); private String currentBlockgenId = null; @Override public JsonNode rewrite(JsonNode plan, Set<String> namesUsed, boolean debugMode, boolean revisit) throws IOException { conf = new JobConf(); // first get the blockgen id from global input cubert files JsonNode inputs = plan.get("input"); if (inputs != null && !inputs.isNull()) { Iterator<String> inputsIt = inputs.getFieldNames(); while (inputsIt.hasNext()) { String input = inputsIt.next(); JsonNode json = inputs.get(input); String type = getText(json, "type"); if (type.equalsIgnoreCase("RUBIX")) { try { blockgenIdMap.put(input, getBlockgenId(input)); } catch (ClassNotFoundException e) { throw new PlanRewriteException(e); } } } } new PhysicalPlanWalker(plan, this).walk(); return plan; } private String getBlockgenId(String input) throws IOException, ClassNotFoundException { Path path = new Path(input); Path afile = RubixFile.getARubixFile(conf, path); RubixFile<Tuple, Object> rubixFile = new RubixFile<Tuple, Object>(conf, afile); return rubixFile.getBlockgenId(); } @Override public void enterJob(JsonNode json) { currentBlockgenId = null; } @Override public void visitInput(JsonNode json) { JsonNode pathJson = json.get("path"); String path; if (pathJson.isArray()) path = JsonUtils.encodePath(pathJson.get(0)); else path = JsonUtils.encodePath(pathJson); // blockgenId related if (getText(json, "type").equalsIgnoreCase("RUBIX")) { currentBlockgenId = this.blockgenIdMap.get(path); if (currentBlockgenId == null) error(json, "Attempting to load a rubix file that was not created by BLOCKGEN or BLOCKGEN BY INDEX"); } } @Override public void visitOperator(JsonNode json, boolean isMapper) { // special cases for individual operators OperatorType type = OperatorType.valueOf(getText(json, "operator")); switch (type) { case LOAD_BLOCK: { String path = getText(json, "path"); String blockgenId = this.blockgenIdMap.get(path); if (blockgenId == null) error(json, "Attempting to load a rubix block that was not BLOCKGEN or BLOCKGEN BY INDEX"); if (currentBlockgenId == null) error(json, "Attempting to load a rubix block without reference to a valid MATCHING rubix block."); if (!currentBlockgenId.equals(blockgenId)) error(json, "Attempting to load rubix block that is inconsistently partitioned as the MATCHING rubix block."); break; } case CREATE_BLOCK: { boolean isIndexed = getText(json, "blockgenType").equalsIgnoreCase("BY_INDEX"); if (isIndexed) { String parentPath = getText(json, "indexPath"); currentBlockgenId = this.blockgenIdMap.get(parentPath); if (currentBlockgenId == null) error(json, "Attempting to create rubix block BY INDEX from an invalid rubix file."); } else { currentBlockgenId = UUID.randomUUID().toString(); } break; } } } @Override public void visitShuffle(JsonNode json) { currentBlockgenId = null; } @Override public void visitOutput(JsonNode json) { // blockgen Id related if (getText(json, "type").equalsIgnoreCase("RUBIX")) { if (currentBlockgenId == null) error(json, "Attempting to write to rubix file data that is not BLOCKGEN or BLOCKGEN BY INDEX"); this.blockgenIdMap.put(getText(json, "path"), currentBlockgenId); } } @Override public void exitJob(JsonNode json) { JsonNode outputJson = json.get("output"); boolean isRubixStorage = outputJson.has("type") && getText(outputJson, "type").equalsIgnoreCase("rubix"); if (isRubixStorage) { ((ObjectNode) outputJson).put("blockgenId", currentBlockgenId); } currentBlockgenId = null; } private void error(JsonNode json, String format, Object... args) { error(null, json, format, args); } private void error(Exception e, JsonNode json, String format, Object... args) { // if (debugMode && e != null) // e.printStackTrace(); // // hasErrors = true; System.err.println(String.format("ERROR: " + format, args)); if (json != null) { System.err.print("At:\t"); if (json.has("line")) System.err.println(json.get("line").getTextValue()); else System.err.println(json.toString()); } if (e != null) throw new PlanRewriteException(e); else throw new PlanRewriteException(); } }