/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.data.AccumulativeBag;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.InternalCachedBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.io.PigNullableWritable;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.IdentityHashSet;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.pen.util.ExampleTuple;
import org.apache.pig.pen.util.LineageTracer;
/**
* The package operator that packages
* the globally rearranged tuples into
* output format as required by co-group.
* This is last stage of processing co-group.
* This operator has a slightly different
* format than other operators in that, it
* takes two things as input. The key being
* worked on and the iterator of bags that
* contain indexed tuples that just need to
* be packaged into their appropriate output
* bags based on the index.
*/
public class POPackage extends PhysicalOperator {
/**
*
*/
private static final long serialVersionUID = 1L;
public static enum PackageType { GROUP, JOIN };
//The iterator of indexed Tuples
//that is typically provided by
//Hadoop
transient Iterator<NullableTuple> tupIter;
//The key being worked on
Object key;
// marker to indicate if key is a tuple
protected boolean isKeyTuple = false;
// marker to indicate if the tuple key is compound in nature
protected boolean isKeyCompound = false;
// key as a Tuple object (if the key is a tuple)
protected Tuple keyAsTuple;
//key's type
byte keyType;
//The number of inputs to this
//co-group. 0 indicates a distinct, which means there will only be a
//key, no value.
int numInputs;
// If the attaching map-reduce plan use secondary sort key
boolean useSecondaryKey = false;
//Denotes if inner is specified
//on a particular input
boolean[] inner;
// flag to denote whether there is a distinct
// leading to this package
protected boolean distinct = false;
// A mapping of input index to key information got from LORearrange
// for that index. The Key information is a pair of boolean, Map.
// The boolean indicates whether there is a lone project(*) in the
// cogroup by. If not, the Map has a mapping of column numbers in the
// "value" to column numbers in the "key" which contain the fields in
// the "value"
protected Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo;
protected static final BagFactory mBagFactory = BagFactory.getInstance();
protected static final TupleFactory mTupleFactory = TupleFactory.getInstance();
private boolean firstTime = true;
private boolean useDefaultBag = false;
private PackageType pkgType;
public POPackage(OperatorKey k) {
this(k, -1, null);
}
public POPackage(OperatorKey k, int rp) {
this(k, rp, null);
}
public POPackage(OperatorKey k, List<PhysicalOperator> inp) {
this(k, -1, inp);
}
public POPackage(OperatorKey k, int rp, List<PhysicalOperator> inp) {
super(k, rp, inp);
numInputs = -1;
keyInfo = new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>();
}
@Override
public String name() {
return getAliasString() + "Package" + "["
+ DataType.findTypeName(resultType) + "]" + "{"
+ DataType.findTypeName(keyType) + "}" + " - "
+ mKey.toString();
}
@Override
public boolean supportsMultipleInputs() {
return false;
}
@Override
public void visit(PhyPlanVisitor v) throws VisitorException {
v.visitPackage(this);
}
@Override
public boolean supportsMultipleOutputs() {
return false;
}
/**
* Attaches the required inputs
* @param k - the key being worked on
* @param inp - iterator of indexed tuples typically
* obtained from Hadoop
*/
public void attachInput(PigNullableWritable k, Iterator<NullableTuple> inp) {
try {
tupIter = inp;
key = k.getValueAsPigType();
if (useSecondaryKey) {
key = ((Tuple)key).get(0);
}
if(isKeyTuple) {
// key is a tuple, cache the key as a
// tuple for use in the getNext()
keyAsTuple = (Tuple)key;
}
} catch (Exception e) {
throw new RuntimeException(
"Error attaching input for key " + k +
" in " + name() + " at location " + getOriginalLocations(), e);
}
}
/**
* attachInput's better half!
*/
public void detachInput() {
tupIter = null;
key = null;
}
public int getNumInps() {
return numInputs;
}
public void setNumInps(int numInps) {
this.numInputs = numInps;
}
public boolean[] getInner() {
return inner;
}
public void setInner(boolean[] inner) {
this.inner = inner;
}
/**
* From the inputs, constructs the output tuple
* for this co-group in the required format which
* is (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...)
*/
@Override
public Result getNextTuple() throws ExecException {
Tuple res;
if(firstTime){
firstTime = false;
if (PigMapReduce.sJobConfInternal.get() != null) {
String bagType = PigMapReduce.sJobConfInternal.get().get("pig.cachedbag.type");
if (bagType != null && bagType.equalsIgnoreCase("default")) {
useDefaultBag = true;
}
}
}
if(distinct) {
// only set the key which has the whole
// tuple
res = mTupleFactory.newTuple(1);
res.set(0, key);
} else {
//Create numInputs bags
DataBag[] dbs = null;
dbs = new DataBag[numInputs];
if (isAccumulative()) {
// create bag wrapper to pull tuples in many batches
// all bags have reference to the sample tuples buffer
// which contains tuples from one batch
POPackageTupleBuffer buffer = new POPackageTupleBuffer();
for (int i = 0; i < numInputs; i++) {
dbs[i] = new AccumulativeBag(buffer, i);
}
} else {
// create bag to pull all tuples out of iterator
for (int i = 0; i < numInputs; i++) {
dbs[i] = useDefaultBag ? BagFactory.getInstance().newDefaultBag()
// In a very rare case if there is a POStream after this
// POPackage in the pipeline and is also blocking the pipeline;
// constructor argument should be 2 * numInputs. But for one obscure
// case we don't want to pay the penalty all the time.
: new InternalCachedBag(numInputs);
}
//For each indexed tup in the inp, sort them
//into their corresponding bags based
//on the index
while (tupIter.hasNext()) {
NullableTuple ntup = tupIter.next();
int index = ntup.getIndex();
Tuple copy = getValueTuple(ntup, index);
if (numInputs == 1) {
// this is for multi-query merge where
// the numInputs is always 1, but the index
// (the position of the inner plan in the
// enclosed operator) may not be 1.
dbs[0].add(copy);
} else {
dbs[index].add(copy);
}
if(getReporter()!=null) {
getReporter().progress();
}
}
}
//Construct the output tuple by appending
//the key and all the above constructed bags
//and return it.
res = mTupleFactory.newTuple(numInputs+1);
res.set(0,key);
int i=-1;
for (DataBag bag : dbs) {
i++;
if(inner[i] && !isAccumulative()){
if(bag.size()==0){
detachInput();
Result r = new Result();
r.returnStatus = POStatus.STATUS_NULL;
return r;
}
}
res.set(i+1,bag);
}
}
Result r = new Result();
r.returnStatus = POStatus.STATUS_OK;
if (!isAccumulative())
r.result = illustratorMarkup(null, res, 0);
else
r.result = res;
detachInput();
return r;
}
protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
// Need to make a copy of the value, as hadoop uses the same ntup
// to represent each value.
Tuple val = (Tuple)ntup.getValueAsPigType();
Tuple copy = null;
// The "value (val)" that we just got may not
// be the complete "value". It may have some portions
// in the "key" (look in POLocalRearrange for more comments)
// If this is the case we need to stitch
// the "value" together.
Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
if (lrKeyInfo == null) {
throw new RuntimeException("No keyInfo for index "+index+" in "+keyInfo);
}
boolean isProjectStar = lrKeyInfo.first;
Map<Integer, Integer> keyLookup = lrKeyInfo.second;
int keyLookupSize = keyLookup.size();
if( keyLookupSize > 0) {
// we have some fields of the "value" in the
// "key".
int finalValueSize = keyLookupSize + val.size();
copy = mTupleFactory.newTuple(finalValueSize);
int valIndex = 0; // an index for accessing elements from
// the value (val) that we have currently
for(int i = 0; i < finalValueSize; i++) {
Integer keyIndex = keyLookup.get(i);
if(keyIndex == null) {
// the field for this index is not in the
// key - so just take it from the "value"
// we were handed
copy.set(i, val.get(valIndex));
valIndex++;
} else {
// the field for this index is in the key
if(isKeyTuple && isKeyCompound) {
// the key is a tuple, extract the
// field out of the tuple
copy.set(i, keyAsTuple.get(keyIndex));
} else {
copy.set(i, key);
}
}
}
copy = illustratorMarkup2(val, copy);
} else if (isProjectStar) {
// the whole "value" is present in the "key"
copy = mTupleFactory.newTuple(keyAsTuple.getAll());
copy = illustratorMarkup2(keyAsTuple, copy);
} else {
// there is no field of the "value" in the
// "key" - so just make a copy of what we got
// as the "value"
copy = mTupleFactory.newTuple(val.getAll());
copy = illustratorMarkup2(val, copy);
}
return copy;
}
public byte getKeyType() {
return keyType;
}
public void setKeyType(byte keyType) {
this.keyType = keyType;
}
/**
* Make a deep copy of this operator.
* @throws CloneNotSupportedException
*/
@Override
public POPackage clone() throws CloneNotSupportedException {
POPackage clone = (POPackage)super.clone();
clone.mKey = new OperatorKey(mKey.scope, NodeIdGenerator.getGenerator().getNextNodeId(mKey.scope));
clone.requestedParallelism = requestedParallelism;
clone.resultType = resultType;
clone.keyType = keyType;
clone.numInputs = numInputs;
if (inner!=null)
{
clone.inner = new boolean[inner.length];
for (int i = 0; i < inner.length; i++) {
clone.inner[i] = inner[i];
}
}
else
clone.inner = null;
return clone;
}
/**
* @param keyInfo the keyInfo to set
*/
public void setKeyInfo(Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo) {
this.keyInfo = keyInfo;
}
/**
* @param keyTuple the keyTuple to set
*/
public void setKeyTuple(boolean keyTuple) {
this.isKeyTuple = keyTuple;
}
/**
* @param keyCompound the keyCompound to set
*/
public void setKeyCompound(boolean keyCompound) {
this.isKeyCompound = keyCompound;
}
/**
* @return the keyInfo
*/
public Map<Integer, Pair<Boolean, Map<Integer, Integer>>> getKeyInfo() {
return keyInfo;
}
/**
* @return the distinct
*/
public boolean isDistinct() {
return distinct;
}
/**
* @param distinct the distinct to set
*/
public void setDistinct(boolean distinct) {
this.distinct = distinct;
}
public void setUseSecondaryKey(boolean useSecondaryKey) {
this.useSecondaryKey = useSecondaryKey;
}
public void setPackageType(PackageType type) {
this.pkgType = type;
}
public PackageType getPackageType() {
return pkgType;
}
class POPackageTupleBuffer implements AccumulativeTupleBuffer {
private List<Tuple>[] bags;
private Iterator<NullableTuple> iter;
private int batchSize;
private Object currKey;
@SuppressWarnings("unchecked")
public POPackageTupleBuffer() {
batchSize = 20000;
if (PigMapReduce.sJobConfInternal.get() != null) {
String size = PigMapReduce.sJobConfInternal.get().get("pig.accumulative.batchsize");
if (size != null) {
batchSize = Integer.parseInt(size);
}
}
this.bags = new List[numInputs];
for(int i=0; i<numInputs; i++) {
this.bags[i] = new ArrayList<Tuple>();
}
this.iter = tupIter;
this.currKey = key;
}
@Override
public boolean hasNextBatch() {
return iter.hasNext();
}
@Override
public void nextBatch() throws IOException {
for(int i=0; i<bags.length; i++) {
bags[i].clear();
}
key = currKey;
for(int i=0; i<batchSize; i++) {
if (iter.hasNext()) {
NullableTuple ntup = iter.next();
int index = ntup.getIndex();
Tuple copy = getValueTuple(ntup, index);
if (numInputs == 1) {
// this is for multi-query merge where
// the numInputs is always 1, but the index
// (the position of the inner plan in the
// enclosed operator) may not be 1.
bags[0].add(copy);
} else {
bags[index].add(copy);
}
}else{
break;
}
}
}
public void clear() {
for(int i=0; i<bags.length; i++) {
bags[i].clear();
}
iter = null;
}
public Iterator<Tuple> getTuples(int index) {
return bags[index].iterator();
}
public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
return POPackage.this.illustratorMarkup(in, out, eqClassIndex);
}
};
private Tuple illustratorMarkup2(Object in, Object out) {
if(illustrator != null) {
ExampleTuple tOut = new ExampleTuple((Tuple) out);
illustrator.getLineage().insert(tOut);
tOut.synthetic = ((ExampleTuple) in).synthetic;
illustrator.getLineage().union(tOut, (Tuple) in);
return tOut;
} else
return (Tuple) out;
}
@Override
public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
if(illustrator != null) {
ExampleTuple tOut = new ExampleTuple((Tuple) out);
LineageTracer lineageTracer = illustrator.getLineage();
lineageTracer.insert(tOut);
Tuple tmp;
boolean synthetic = false;
if (illustrator.getEquivalenceClasses() == null) {
LinkedList<IdentityHashSet<Tuple>> equivalenceClasses = new LinkedList<IdentityHashSet<Tuple>>();
for (int i = 0; i < numInputs; ++i) {
IdentityHashSet<Tuple> equivalenceClass = new IdentityHashSet<Tuple>();
equivalenceClasses.add(equivalenceClass);
}
illustrator.setEquivalenceClasses(equivalenceClasses, this);
}
if (distinct) {
int count;
for (count = 0; tupIter.hasNext(); ++count) {
NullableTuple ntp = tupIter.next();
tmp = (Tuple) ntp.getValueAsPigType();
if (!tmp.equals(tOut))
lineageTracer.union(tOut, tmp);
}
if (count > 1) // only non-distinct tuples are inserted into the equivalence class
illustrator.getEquivalenceClasses().get(eqClassIndex).add(tOut);
illustrator.addData((Tuple) tOut);
return (Tuple) tOut;
}
boolean outInEqClass = true;
try {
for (int i = 1; i < numInputs+1; i++)
{
DataBag dbs = (DataBag) ((Tuple) out).get(i);
Iterator<Tuple> iter = dbs.iterator();
if (dbs.size() <= 1 && outInEqClass) // all inputs have >= 2 records
outInEqClass = false;
while (iter.hasNext()) {
tmp = iter.next();
// any of synthetic data in bags causes the output tuple to be synthetic
if (!synthetic && ((ExampleTuple)tmp).synthetic)
synthetic = true;
lineageTracer.union(tOut, tmp);
}
}
} catch (ExecException e) {
// TODO better exception handling
throw new RuntimeException("Illustrator exception :"+e.getMessage());
}
if (outInEqClass)
illustrator.getEquivalenceClasses().get(eqClassIndex).add(tOut);
tOut.synthetic = synthetic;
illustrator.addData((Tuple) tOut);
return tOut;
} else
return (Tuple) out;
}
}