/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PlanPrinter;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.pen.util.ExampleTuple;

/**
 * Physical operator marking a point in the plan whose input may be cached.
 * It does not transform tuples itself; its main job in this class is to derive
 * a cache key from the chain of operators feeding it (see {@link #computeCacheKey()}).
 */
public class POCache extends PhysicalOperator {

    private static final Log LOG = LogFactory.getLog(POCache.class);
    private static final long serialVersionUID = 1L;

    // The expression plan. Transient, so it is not written by Java serialization
    // and may be null on a deserialized instance.
    transient PhysicalPlan plan;

    // Memoized result of computeCacheKey(); null until a key was successfully computed.
    String key;

    public POCache(OperatorKey k, PhysicalPlan plan) {
        super(k);
        this.plan = plan;
    }

    /**
     * Returns whatever {@code processInput()} yields; this operator adds no
     * processing of its own on top of the inherited input handling.
     */
    @Override
    public Result getNextTuple() throws ExecException {
        return processInput();
    }

    @Override
    public String name() {
        return getAliasString() + "Cache - " + mKey.toString();
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    @Override
    public boolean supportsMultipleOutputs() {
        return false;
    }

    @Override
    public void visit(PhyPlanVisitor v) throws VisitorException {
        v.visitCache(this);
    }

    /**
     * Creates a copy with a fresh operator key in the same scope and a cloned
     * expression plan. The inputs list is shared with this instance, not copied.
     * The memoized cache key is not carried over; the copy recomputes it on demand.
     */
    @Override
    public POCache clone() throws CloneNotSupportedException {
        // The plan is transient and therefore may legitimately be null; do not NPE on it.
        PhysicalPlan clonedPlan = (this.plan == null) ? null : this.plan.clone();
        OperatorKey clonedKey = new OperatorKey(this.mKey.scope,
                NodeIdGenerator.getGenerator().getNextNodeId(this.mKey.scope));
        POCache newCache = new POCache(clonedKey, clonedPlan);
        newCache.setInputs(inputs);
        return newCache;
    }

    /**
     * When an illustrator is attached, records the input tuple in the equivalence
     * class at {@code eqClassIndex} and adds it to the illustrator's data.
     *
     * @return the input tuple, unchanged
     */
    @Override
    public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
        if (illustrator != null) {
            ExampleTuple tIn = (ExampleTuple) in;
            illustrator.getEquivalenceClasses().get(eqClassIndex).add(tIn);
            illustrator.addData((Tuple) in);
        }
        return (Tuple) in;
    }

    /**
     * Get a cache key for this operator's inputs, or null if we don't know how to handle the type
     * of one of the operators feeding it (directly or transitively) and therefore want to not
     * cache this subplan at all.
     *
     * Only a fixed set of operator types is supported. Unless we figure out a nice way to turn
     * the PO plan into a string or compare two PO plans, we'll probably have to handle each type
     * of physical operator recursively to generate a cache key.
     *
     * The key is computed once and memoized; a null result is not memoized and is recomputed on
     * the next call.
     *
     * @return a name-based UUID string derived from the raw key, or null if the subplan
     *         cannot be keyed
     * @throws IOException if an inner plan cannot be printed
     */
    public String computeCacheKey() throws IOException {
        if (key == null) {
            key = computeRawCacheKey(inputs);
            if (key != null) {
                // TODO deal with collisions!!
                // Encode with a fixed charset so the same plan yields the same key
                // regardless of the JVM's default charset.
                key = UUID.nameUUIDFromBytes(key.getBytes("UTF-8")).toString();
            }
        }
        return key;
    }

    /**
     * Builds the un-hashed key text for the given operators and, recursively, their inputs.
     *
     * @param preds operators to describe; null is treated as "no inputs"
     * @return the key text (empty for null input), or null as soon as any operator in the
     *         chain is of a type this method does not know how to describe
     * @throws IOException if an inner plan cannot be printed
     */
    private String computeRawCacheKey(List<PhysicalOperator> preds) throws IOException {
        if (preds == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (PhysicalOperator operator : preds) {
            // ForEach and Filter keys have always embedded their input key twice.
            // That format is kept so keys of already-cacheable plans do not change.
            boolean repeatInputKey = false;

            if (operator instanceof POLoad) {
                // Load operators are equivalent if the file is the same
                // and the loader is the same
                // Potential problems down the line:
                // * not checking LoadFunc arguments
                POLoad load = (POLoad) operator;
                sb.append("LOAD: " + load.getLFile().getFileName()
                        + load.getLoadFunc().getClass().getName());
            } else if (operator instanceof POForEach) {
                // We consider ForEach operators to be equivalent if their inner plans
                // have the same explain plan after dropping scope markers.
                // Potential problems downstream:
                // * not checking for Nondeterministic UDFs
                // * jars / class defs changing under us
                for (PhysicalPlan innerPlan : ((POForEach) operator).getInputPlans()) {
                    sb.append(innerPlanKey(innerPlan));
                }
                repeatInputKey = true;
            } else if (operator instanceof POFilter) {
                // Similar to foreach.
                sb.append(innerPlanKey(((POFilter) operator).getPlan()));
                repeatInputKey = true;
            } else if (operator instanceof POLocalRearrange) {
                POLocalRearrange localRearrange = (POLocalRearrange) operator;
                sb.append("LocRearrange");
                sb.append("ProjCol");
                for (Map.Entry<Integer, Integer> entry : localRearrange.getProjectedColsMap().entrySet()) {
                    sb.append(entry.getKey() + "+" + entry.getValue());
                }
                sb.append("SecProjCol");
                for (Map.Entry<Integer, Integer> entry
                        : localRearrange.getSecondaryProjectedColsMap().entrySet()) {
                    sb.append(entry.getKey() + "+" + entry.getValue());
                }
                sb.append(localRearrange.getIndex());
                sb.append(localRearrange.getKeyType());
                for (PhysicalPlan rearrangePlan : localRearrange.getPlans()) {
                    sb.append(innerPlanKey(rearrangePlan));
                }
            } else if (operator instanceof POGlobalRearrange) {
                sb.append("POGLOBALREARRANGE");
            } else if (operator instanceof POPackage) {
                POPackage pkg = (POPackage) operator;
                sb.append("POPakage");
                for (Map.Entry<Integer, Pair<Boolean, Map<Integer, Integer>>> entry
                        : pkg.getKeyInfo().entrySet()) {
                    sb.append(entry.getKey()).append("-").append(entry.getValue().first);
                    sb.append("->");
                    for (Map.Entry<Integer, Integer> valentry : entry.getValue().second.entrySet()) {
                        sb.append(valentry.getKey()).append("-").append(valentry.getValue());
                    }
                    sb.append(".");
                }
            } else {
                LOG.info("Don't know how to generate cache key for " + operator.getClass() + "; not caching");
                return null;
            }

            // Describe the operator's own inputs exactly once. A null here means some
            // upstream operator is unsupported; propagate it instead of letting
            // StringBuilder turn it into the text "null" and producing a bogus key.
            String inputKey = computeRawCacheKey(operator.getInputs());
            if (inputKey == null) {
                return null;
            }
            sb.append(inputKey);
            if (operator instanceof POForEach) {
                LOG.info("Input key: " + inputKey);
            }
            if (repeatInputKey) {
                sb.append(inputKey);
            }
        }
        return sb.toString();
    }

    /**
     * Renders an inner plan with {@link PlanPrinter} and strips the
     * {@code scope-<digits>} markers so that otherwise identical plans created
     * under different scope ids produce the same text.
     */
    private String innerPlanKey(PhysicalPlan plan) throws VisitorException, IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PlanPrinter<PhysicalOperator, PhysicalPlan> pp =
                new PlanPrinter<PhysicalOperator, PhysicalPlan>(plan);
        pp.print(baos);
        String explained = baos.toString();
        // get rid of scope numbers in these inner plans.
        return explained.replaceAll("scope-\\d+", "");
    }
}