/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer; import java.io.IOException; import java.util.List; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.pig.LoadCaster; import org.apache.pig.LoadFunc; import org.apache.pig.OrderedLoadFunc; import org.apache.pig.PigException; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.PigContext; import org.apache.pig.impl.plan.NodeIdGenerator; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.util.ObjectSerializer; /** Merge Join indexer is used to generate on the fly index for doing Merge Join efficiently. * It samples first record from every block of right side input. * and returns tuple in the following format : * (key0, key1,...,position,splitIndex) * These tuples are then sorted before being written out to index file on HDFS. */ public class MergeJoinIndexer extends LoadFunc{ private boolean firstRec = true; private transient TupleFactory mTupleFactory; private POLocalRearrange lr; private PhysicalPlan precedingPhyPlan; private int keysCnt; private PhysicalOperator rightPipelineLeaf; private PhysicalOperator rightPipelineRoot; private LoadFunc loader; private PigSplit pigSplit = null; private boolean ignoreNullKeys; /** @param funcSpec : Loader specification. * @param innerPlan : This is serialized version of LR plan. We * want to keep only keys in our index file and not the whole tuple. So, we need LR and thus its plan * to get keys out of the sampled tuple. * @param serializedPhyPlan Serialized physical plan on right side. * @throws ExecException */ @SuppressWarnings("unchecked") public MergeJoinIndexer(String funcSpec, String innerPlan, String serializedPhyPlan, String udfCntxtSignature, String scope, String ignoreNulls) throws ExecException{ loader = (LoadFunc)PigContext.instantiateFuncFromSpec(funcSpec); loader.setUDFContextSignature(udfCntxtSignature); this.ignoreNullKeys = Boolean.parseBoolean(ignoreNulls); try { List<PhysicalPlan> innerPlans = (List<PhysicalPlan>)ObjectSerializer.deserialize(innerPlan); lr = new POLocalRearrange(new OperatorKey(scope,NodeIdGenerator.getGenerator().getNextNodeId(scope))); lr.setPlans(innerPlans); keysCnt = innerPlans.size(); precedingPhyPlan = (PhysicalPlan)ObjectSerializer.deserialize(serializedPhyPlan); if(precedingPhyPlan != null){ if(precedingPhyPlan.getLeaves().size() != 1 || precedingPhyPlan.getRoots().size() != 1){ int errCode = 2168; String errMsg = "Expected physical plan with exactly one root and one leaf."; throw new ExecException(errMsg,errCode,PigException.BUG); } this.rightPipelineLeaf = precedingPhyPlan.getLeaves().get(0); this.rightPipelineRoot = precedingPhyPlan.getRoots().get(0); this.rightPipelineRoot.setInputs(null); } } catch (IOException e) { int errCode = 2094; String msg = "Unable to deserialize plans in Indexer."; throw new ExecException(msg,errCode,e); } mTupleFactory = TupleFactory.getInstance(); } @Override public Tuple getNext() throws IOException { if(!firstRec) // We sample only one record per block. return null; WritableComparable<?> position = ((OrderedLoadFunc)loader).getSplitComparable(pigSplit.getWrappedSplit()); Object key = null; Tuple wrapperTuple = mTupleFactory.newTuple(keysCnt+2); while(true){ Tuple readTuple = loader.getNext(); if(null == readTuple){ // We hit the end. for(int i =0; i < keysCnt; i++) wrapperTuple.set(i, null); wrapperTuple.set(keysCnt, position); firstRec = false; return wrapperTuple; } if (null == precedingPhyPlan){ lr.attachInput(readTuple); key = ((Tuple)lr.getNextTuple().result).get(1); lr.detachInput(); if ( null == key && ignoreNullKeys) // Tuple with null key. Drop it. continue; break; } // There is a physical plan. rightPipelineRoot.attachInput(readTuple); boolean fetchNewTup; while(true){ Result res = rightPipelineLeaf.getNextTuple(); switch(res.returnStatus){ case POStatus.STATUS_OK: lr.attachInput((Tuple)res.result); key = ((Tuple)lr.getNextTuple().result).get(1); lr.detachInput(); if ( null == key && ignoreNullKeys) // Tuple with null key. Drop it. continue; fetchNewTup = false; break; case POStatus.STATUS_EOP: fetchNewTup = true; break; default: int errCode = 2164; String errMsg = "Expected EOP/OK as return status. Found: "+res.returnStatus; throw new ExecException(errMsg,errCode); } break; } if (!fetchNewTup) break; } if(key instanceof Tuple){ Tuple tupKey = (Tuple)key; for(int i =0; i < tupKey.size(); i++) wrapperTuple.set(i, tupKey.get(i)); } else wrapperTuple.set(0, key); wrapperTuple.set(keysCnt, position); wrapperTuple.set(keysCnt+1, pigSplit.getSplitIndex()); firstRec = false; return wrapperTuple; } /* (non-Javadoc) * @see org.apache.pig.LoadFunc#getInputFormat() */ @Override public InputFormat getInputFormat() throws IOException { return loader.getInputFormat(); } /* (non-Javadoc) * @see org.apache.pig.LoadFunc#getLoadCaster() */ @Override public LoadCaster getLoadCaster() throws IOException { return loader.getLoadCaster(); } /* (non-Javadoc) * @see org.apache.pig.LoadFunc#prepareToRead(org.apache.hadoop.mapreduce.RecordReader, org.apache.hadoop.mapreduce.InputSplit) */ @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { loader.prepareToRead(reader, split); pigSplit = split; } /* (non-Javadoc) * @see org.apache.pig.LoadFunc#setLocation(java.lang.String, org.apache.hadoop.mapreduce.Job) */ @Override public void setLocation(String location, Job job) throws IOException { loader.setLocation(location, job); } }