/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.ExecType;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.ConstantExpression;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.VisitorException;
/**
* The operator models the join keys using the Local Rearrange operators which
* are configured with the plan specified by the user. It also sets up
* one Hashtable per replicated input which maps the Key(k) stored as a Tuple
* to a DataBag which holds all the values in the input having the same key(k)
* The getNext() reads an input from its predecessor and separates them into
* key & value. It configures a foreach operator with the databags obtained from
* each Hashtable for the key and also with the value for the fragment input.
* It then returns tuples returned by this foreach operator.
*/
public class POFRJoin extends PhysicalOperator {
    private static final long serialVersionUID = 1L;

    // Static and final: commons-logging Log implementations are not required
    // to be Serializable, so the previous non-transient instance field could
    // break Java serialization of this operator (PhysicalOperator is
    // Serializable). Static fields are never serialized.
    private static final Log log = LogFactory.getLog(POFRJoin.class);

    // The index in the input list which denotes the fragmented
    // (streamed, non-replicated) input
    private int fragment;

    // There can be n inputs, each being a List<PhysicalPlan>
    // Ex. join A by ($0+$1,$0-$1), B by ($0*$1,$0/$1);
    private List<List<PhysicalPlan>> phyPlanLists;

    // The key type for each Local Rearrange operator
    private List<Byte> keyTypes;

    // The Local Rearrange operators modeling the join key, one per input
    private POLocalRearrange[] LRs;

    // The set of files that represent the replicated inputs
    private FileSpec[] replFiles;

    // Constant expressions used to feed the foreach operator with the
    // current value tuple (fragment) and the matching bags (replicated inputs)
    private ConstantExpression[] constExps;

    // Foreach operator that produces the cross product of the matched bags
    private POForEach fe;

    // The array of hash tables, one per replicated input;
    // replicates[fragment] is always null (the fragment is streamed)
    private Map<Tuple, List<Tuple>>[] replicates;

    // Whether we are currently returning tuples from the foreach operator
    private boolean processingPlan;

    // A dummy one-field tuple used to trigger the foreach operator
    private Tuple dumTup = TupleFactory.getInstance().newTuple(1);

    // Tuple/bag factories; transient and re-created in readObject()
    private transient TupleFactory mTupleFactory;
    private transient BagFactory mBagFactory;

    // Whether the replicated hash tables have been built yet
    // (done lazily on the first getNext() call)
    private boolean setUp;
public POFRJoin(OperatorKey k) throws PlanException, ExecException {
this(k,-1,null, null, null, null, -1);
}
public POFRJoin(OperatorKey k, int rp) throws PlanException, ExecException {
this(k, rp, null, null, null, null, -1);
}
public POFRJoin(OperatorKey k, List<PhysicalOperator> inp) throws PlanException, ExecException {
this(k, -1, inp, null, null, null, -1);
}
public POFRJoin(OperatorKey k, int rp, List<PhysicalOperator> inp) throws PlanException, ExecException {
this(k,rp,inp,null, null, null, -1);
}
public POFRJoin(OperatorKey k, int rp, List<PhysicalOperator> inp, List<List<PhysicalPlan>> ppLists, List<Byte> keyTypes, FileSpec[] replFiles, int fragment) throws ExecException{
super(k,rp,inp);
phyPlanLists = ppLists;
this.fragment = fragment;
this.keyTypes = keyTypes;
this.replFiles = replFiles;
replicates = new Map[ppLists.size()];
LRs = new POLocalRearrange[ppLists.size()];
constExps = new ConstantExpression[ppLists.size()];
createJoinPlans(k);
processingPlan = false;
mTupleFactory = TupleFactory.getInstance();
mBagFactory = BagFactory.getInstance();
}
/**
 * @return the per-input lists of join-key plans this operator was
 *         configured with
 */
public List<List<PhysicalPlan>> getJoinPlans() {
    return this.phyPlanLists;
}
/**
 * Generates a fresh OperatorKey in the same scope as {@code old} for the
 * internal operators (local rearranges, constants, foreach) built here.
 */
private OperatorKey genKey(OperatorKey old) {
    String scope = old.scope;
    long nextId = NodeIdGenerator.getGenerator().getNextNodeId(scope);
    return new OperatorKey(scope, nextId);
}
/**
 * Configures the Local Rearrange operators and the foreach operator.
 *
 * One POLocalRearrange is built per input to extract its join key. The
 * foreach is fed one constant-expression plan per input — a tuple for the
 * fragment, a bag for every replicated input — and flattens them all to
 * produce the join's cross product.
 *
 * @param old operator key whose scope is reused for generated operators
 * @throws ExecException if a local rearrange cannot be configured
 */
private void createJoinPlans(OperatorKey old) throws ExecException {
    List<PhysicalPlan> fePlans = new ArrayList<PhysicalPlan>();
    List<Boolean> flatList = new ArrayList<Boolean>();
    for (int idx = 0; idx < phyPlanLists.size(); idx++) {
        List<PhysicalPlan> keyPlans = phyPlanLists.get(idx);
        POLocalRearrange lr = new POLocalRearrange(genKey(old));
        lr.setIndex(idx);
        lr.setResultType(DataType.TUPLE);
        lr.setKeyType(keyTypes.get(idx));
        try {
            lr.setPlans(keyPlans);
        } catch (PlanException pe) {
            int errCode = 2071;
            String msg = "Problem with setting up local rearrange's plans.";
            throw new ExecException(msg, errCode, PigException.BUG, pe);
        }
        LRs[idx] = lr;
        // The fragment's value flows through as a tuple; every replicated
        // input contributes a bag of tuples matching the current key.
        ConstantExpression ce = new ConstantExpression(genKey(old));
        ce.setResultType(idx == fragment ? DataType.TUPLE : DataType.BAG);
        constExps[idx] = ce;
        PhysicalPlan plan = new PhysicalPlan();
        plan.add(ce);
        fePlans.add(plan);
        flatList.add(true);
    }
    fe = new POForEach(genKey(old), -1, fePlans, flatList);
}
/**
 * Standard visitor hook; dispatches to the FRJoin-specific visit method.
 */
@Override
public void visit(PhyPlanVisitor v) throws VisitorException {
v.visitFRJoin(this);
}
/**
 * Human-readable operator name shown in explain/log output, e.g.
 * {@code FRJoin[tuple] - scope-42}.
 */
@Override
public String name() {
    StringBuilder sb = new StringBuilder("FRJoin[");
    sb.append(DataType.findTypeName(resultType));
    sb.append("] - ");
    sb.append(mKey.toString());
    return sb.toString();
}
/**
 * A join has multiple predecessors: the fragmented input plus the
 * replicated inputs.
 */
@Override
public boolean supportsMultipleInputs() {
return true;
}
/**
 * The join feeds a single successor.
 */
@Override
public boolean supportsMultipleOutputs() {
return false;
}
/**
 * Produces the next join output tuple.
 *
 * On the very first call the replicated inputs are read and hashed
 * (setUpHashMap). After that, each call either (a) drains the foreach
 * operator if a cross product for the current key is in progress
 * (processingPlan == true), or (b) reads the next tuple from the
 * fragmented input, extracts its key via the fragment's local rearrange,
 * looks the key up in every replicated hash table and — if every input
 * matches — primes the foreach with the value tuple and the matching
 * bags, then recurses to start draining it.
 */
@Override
public Result getNext(Tuple t) throws ExecException {
Result res = null;
Result inp = null;
// Build the replicated hash tables lazily on first use.
if(!setUp){
setUpHashMap();
setUp = true;
}
if(processingPlan){
//Return tuples from the for each operator
//Assumes that it is configured appropriately with
//the bags for the current key.
while(true) {
res = fe.getNext(dummyTuple);
if(res.returnStatus==POStatus.STATUS_OK){
return res;
}
// Cross product for the current key is exhausted: fall through
// to fetch the next tuple from the fragmented input.
if(res.returnStatus==POStatus.STATUS_EOP){
processingPlan = false;
break;
}
if(res.returnStatus==POStatus.STATUS_ERR) {
return res;
}
if(res.returnStatus==POStatus.STATUS_NULL) {
continue;
}
}
}
while (true) {
//Process the current input
inp = processInput();
if (inp.returnStatus == POStatus.STATUS_EOP
|| inp.returnStatus == POStatus.STATUS_ERR)
return inp;
if (inp.returnStatus == POStatus.STATUS_NULL) {
continue;
}
//Separate Key & Value using the fragment's LR operator
POLocalRearrange lr = LRs[fragment];
lr.attachInput((Tuple)inp.result);
Result lrOut = lr.getNext(dummyTuple);
if(lrOut.returnStatus!=POStatus.STATUS_OK) {
log.error("LocalRearrange isn't configured right or is not working");
// NOTE(review): a freshly constructed Result is returned here;
// confirm that Result's default return status signals an error
// to the caller.
return new Result();
}
// LR output has the shape (index, key, value); wrap the key in a
// one-field tuple so it matches the keys stored in the hash tables
// by setUpHashMap().
Tuple lrOutTuple = (Tuple) lrOut.result;
Tuple key = TupleFactory.getInstance().newTuple(1);
key.set(0,lrOutTuple.get(1));
Tuple value = getValueTuple(lr, lrOutTuple);
//Configure the for each operator with the relevant bags
int i=-1;
boolean noMatch = false;
for (ConstantExpression ce : constExps) {
++i;
if(i==fragment){
// The fragment's own value is fed through as a tuple.
ce.setValue(value);
continue;
}
Map<Tuple, List<Tuple>> replicate = replicates[i];
// Inner-join semantics: if any replicated input has no entry for
// this key, the fragment tuple produces no output at all.
if(!replicate.containsKey(key)){
noMatch = true;
break;
}
ce.setValue(mBagFactory.newDefaultBag(replicate.get(key)));
}
if(noMatch)
continue;
// Trigger the foreach with a dummy tuple; its real inputs come from
// the constant expressions configured above.
fe.attachInput(dumTup);
processingPlan = true;
// Recurse so the processingPlan branch above drains the foreach.
Result gn = getNext(dummyTuple);
return gn;
}
}
/**
 * Builds the in-memory hash tables by reading each replicated input from
 * the DFS using a Load operator.
 *
 * For every non-fragment input, a POLoad reads the corresponding
 * replFiles entry and that input's local rearrange splits each tuple
 * into (index, key, value); values are accumulated per key into
 * replicates[i]. The fragment's own slot is left null because the
 * fragmented input is streamed through getNext(), not preloaded.
 *
 * @throws ExecException if loading or rearranging a replicated input fails
 */
private void setUpHashMap() throws ExecException {
int i=-1;
long time1 = System.currentTimeMillis();
for (FileSpec replFile : replFiles) {
++i;
// The fragmented input is never materialized in memory.
if(i==fragment){
replicates[i] = null;
continue;
}
// NOTE(review): a fixed OperatorKey and a fresh PigContext are
// created (and connected) once per replicated file inside this loop;
// verify this is intended rather than hoisting one context out.
POLoad ld = new POLoad(new OperatorKey("Repl File Loader", 1L), replFile, false);
PigContext pc = new PigContext(ExecType.MAPREDUCE,ConfigurationUtil.toProperties(PigMapReduce.sJobConf));
pc.connect();
ld.setPc(pc);
// Feed the loader into this input's local rearrange so keys are
// extracted exactly the same way as for the fragment at join time.
POLocalRearrange lr = LRs[i];
lr.setInputs(Arrays.asList((PhysicalOperator)ld));
Map<Tuple, List<Tuple>> replicate = new HashMap<Tuple, List<Tuple>>(1000);
log.debug("Completed setup. Trying to build replication hash table");
int cnt = 0;
for(Result res=lr.getNext(dummyTuple);res.returnStatus!=POStatus.STATUS_EOP;res=lr.getNext(dummyTuple)){
++cnt;
// Keep the task alive while building a potentially large table.
if(reporter!=null) reporter.progress();
Tuple tuple = (Tuple) res.result;
// Wrap the key in a one-field tuple to match the lookups in getNext().
Tuple key = mTupleFactory.newTuple(1);
key.set(0,tuple.get(1));
Tuple value = getValueTuple(lr, tuple);
if(!replicate.containsKey(key))
replicate.put(key, new ArrayList<Tuple>());
replicate.get(key).add(value);
}
replicates[i] = replicate;
}
long time2 = System.currentTimeMillis();
log.debug("Hash Table built. Time taken: " + (time2-time1));
}
/**
 * Custom deserialization hook: restores the transient tuple/bag
 * factories. The replicated hash tables are deliberately NOT rebuilt
 * here — they are built lazily on the first getNext() call (see the
 * setUp flag), when the job configuration is available.
 */
private void readObject(ObjectInputStream is) throws IOException, ClassNotFoundException, ExecException{
is.defaultReadObject();
mTupleFactory = TupleFactory.getInstance();
mBagFactory = BagFactory.getInstance();
}
/*
 * Reconstructs the original input tuple (the "value") from a local
 * rearrange output of the form (index, key, value). Columns that the LR
 * operator moved into the key are pulled back out of it; the remaining
 * columns come from the value part.
 */
private Tuple getValueTuple(POLocalRearrange lr, Tuple tuple) throws ExecException {
    Tuple valuePart = (Tuple) tuple.get(2);
    Map<Integer, Integer> keyCols = lr.getProjectedColsMap();
    int numKeyCols = keyCols.size();
    Object keyPart = tuple.get(1);
    boolean keyIsTuple = lr.isKeyTuple();
    Tuple keyTuple = keyIsTuple ? (Tuple) keyPart : null;

    if (numKeyCols > 0) {
        // Some of the value's columns live in the key: interleave them
        // back into their original positions.
        Tuple rebuilt = mTupleFactory.newTuple();
        int totalCols = numKeyCols + valuePart.size();
        int nextValCol = 0; // next column to consume from the value part
        for (int col = 0; col < totalCols; col++) {
            Integer keyIdx = keyCols.get(col);
            if (keyIdx == null) {
                // This column was not projected into the key, so it is
                // still present in the value part.
                rebuilt.append(valuePart.get(nextValCol));
                nextValCol++;
            } else if (keyIsTuple) {
                // Pull the column back out of the composite key tuple.
                rebuilt.append(keyTuple.get(keyIdx));
            } else {
                // The key is a single atomic value.
                rebuilt.append(keyPart);
            }
        }
        return rebuilt;
    }

    if (lr.isProjectStar()) {
        // The entire value was projected into the key.
        return mTupleFactory.newTuple(keyTuple.getAll());
    }

    // No overlap between key and value: just copy the value part.
    return mTupleFactory.newTuple(valuePart.getAll());
}
/** @return index of the fragmented (non-replicated) input */
public int getFragment() {
    return this.fragment;
}

/** @param fragment index of the fragmented (non-replicated) input */
public void setFragment(int fragment) {
    this.fragment = fragment;
}

/** @return the files backing the replicated inputs */
public FileSpec[] getReplFiles() {
    return this.replFiles;
}

/** @param replFiles the files backing the replicated inputs */
public void setReplFiles(FileSpec[] replFiles) {
    this.replFiles = replFiles;
}
}