/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.pen.physicalOperators;

import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.SortedDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.pen.util.ExampleTuple;

/**
 * This is a local implementation of Cogroup.
 * The inputs need to be connected to LocalRearranges possibly by the
 * logical to physical translator.
 *
 * This is a blocking operator. The outputs of LRs are put into
 * SortedDataBags. They are sorted on the keys. We then start pulling
 * tuples out of these bags and start constructing output.
 *
 * Each buffered input tuple is read positionally by this class:
 * field 0 is the input index (a Byte), field 1 is the group key and
 * field 2 is the value tuple that ends up in the output bag.
 */
//We intentionally skip type checking in backend for performance reasons
@SuppressWarnings("unchecked")
public class POCogroup extends PhysicalOperator {

    private static final long serialVersionUID = 1L;

    /** Current head tuple of every input; null once that input is exhausted. */
    Tuple[] data = null;

    /** One iterator per input over its key-sorted buffer; null means "not accumulated yet". */
    transient Iterator<Tuple>[] its = null;

    /** Per-input inner flag; may be null, in which case no input is treated as inner. */
    boolean[] inner;

    public POCogroup(OperatorKey k) {
        super(k);
    }

    public POCogroup(OperatorKey k, int rp) {
        super(k, rp);
    }

    public POCogroup(OperatorKey k, List<PhysicalOperator> inp) {
        super(k, inp);
    }

    public POCogroup(OperatorKey k, int rp, List<PhysicalOperator> inp) {
        super(k, rp, inp);
    }

    /**
     * Sets the per-input inner flags consulted by {@link #getNext(Tuple)}.
     *
     * @param inner one flag per input; the array is stored as given, not copied
     */
    public void setInner(boolean[] inner) {
        this.inner = inner;
    }

    @Override
    public void visit(PhyPlanVisitor v) throws VisitorException {
        v.visitPenCogroup(this);
    }

    @Override
    public String name() {
        return getAliasString() + "POCogroup"+ ": POCogroup" + "[" + DataType.findTypeName(resultType) + "]" + " - " + mKey.toString();
    }

    /**
     * Produces the next cogrouped tuple.
     *
     * On the first call (and again after an end-of-processing result has
     * been returned) all inputs are drained and buffered. Each call then
     * picks the smallest key among the current heads of the inputs and
     * collects, per input, every value tuple carrying that key.
     *
     * @param t not read by this implementation
     * @return a result whose status is STATUS_EOP when every input is
     *         exhausted, STATUS_NULL when an input flagged as inner
     *         contributed an empty bag for this key, and STATUS_OK
     *         otherwise; for the latter two the result holds a tuple of
     *         the key followed by one bag per input (wrapped in an
     *         ExampleTuple when a lineage tracer is set)
     * @throws ExecException if reading the inputs or the tuples fails
     */
    @Override
    public Result getNext(Tuple t) throws ExecException {
        if (its == null) {
            accumulateData();
        }

        Result res = new Result();

        boolean done = true;
        for (int i = 0; i < data.length; i++) {
            done &= (data[i] == null);
        }
        if (done) {
            res.returnStatus = POStatus.STATUS_EOP;
            // Dropping the iterators makes the next call accumulate the inputs again.
            its = null;
            return res;
        }

        Tuple smallestTuple = getSmallest(data);
        Comparator<Tuple> comp = new groupComparator();

        // Output layout: [key, bag for input 0, bag for input 1, ...].
        int size = data.length;
        Tuple output = TupleFactory.getInstance().newTuple(size + 1);
        output.set(0, smallestTuple.get(1));
        for (int i = 1; i < size + 1; i++) {
            output.set(i, BagFactory.getInstance().newDefaultBag());
        }

        ExampleTuple tOut = null;
        if (lineageTracer != null) {
            tOut = new ExampleTuple(output);
            lineageTracer.insert(tOut);
        }

        // Keep sweeping over the inputs until a full pass moves nothing, i.e. no
        // input head compares equal to the chosen smallest tuple any more.
        boolean loop = true;
        while (loop) {
            loop = false;
            for (int i = 0; i < size; i++) {
                if (data[i] != null && comp.compare(data[i], smallestTuple) == 0) {
                    loop = true;
                    DataBag bag = (DataBag) output.get(i + 1);
                    Tuple temp = (Tuple) data[i].get(2);

                    // update lineage if it exists
                    if (lineageTracer != null) {
                        if (((ExampleTuple) temp).synthetic) {
                            tOut.synthetic = true;
                        }
                        lineageTracer.union(temp, tOut);
                    }

                    bag.add(temp);

                    // Advance this input, marking it exhausted when it runs dry.
                    if (its[i].hasNext()) {
                        data[i] = its[i].next();
                    } else {
                        data[i] = null;
                    }
                }
            }
        }

        if (lineageTracer != null) {
            res.result = tOut;
        } else {
            res.result = output;
        }
        res.returnStatus = POStatus.STATUS_OK;

        // An inner input with an empty bag for this key downgrades the status to NULL.
        for (int i = 0; i < size; i++) {
            if (inner != null && inner[i] && ((DataBag) output.get(i + 1)).size() == 0) {
                res.returnStatus = POStatus.STATUS_NULL;
                break;
            }
        }

        return res;
    }

    /**
     * Drains every input into its own key-sorted bag and positions
     * {@link #data} on the first tuple of each bag.
     *
     * Results with STATUS_NULL are skipped. An input that contributed no
     * tuples gets a null head, which marks it as exhausted from the start.
     *
     * @throws ExecException if any input reports STATUS_ERR
     */
    private void accumulateData() throws ExecException {
        int size = inputs.size();
        its = new Iterator[size];
        data = new Tuple[size];
        for (int i = 0; i < size; i++) {
            DataBag bag = new SortedDataBag(new groupComparator());
            for (Result input = inputs.get(i).getNext(dummyTuple);
                    input.returnStatus != POStatus.STATUS_EOP;
                    input = inputs.get(i).getNext(dummyTuple)) {
                if (input.returnStatus == POStatus.STATUS_ERR) {
                    throw new ExecException("Error accumulating output at local Cogroup operator");
                }
                if (input.returnStatus == POStatus.STATUS_NULL) {
                    continue;
                }
                bag.add((Tuple) input.result);
            }
            its[i] = bag.iterator();
            // Guard with hasNext(), exactly as getNext does when advancing, so an
            // empty input does not depend on how the iterator behaves past its end.
            data[i] = its[i].hasNext() ? its[i].next() : null;
        }
    }

    /**
     * Returns the smallest non-null tuple according to the group comparator.
     *
     * @param data current head tuple of every input; null entries are skipped
     * @return the smallest head, or null when every entry is null
     */
    private Tuple getSmallest(Tuple[] data) {
        Tuple t = null;
        Comparator<Tuple> comp = new groupComparator();
        for (int i = 0; i < data.length; i++) {
            if (data[i] == null) {
                continue;
            }
            if (t == null) {
                // First non-null head seen so far: nothing to compare it against yet.
                t = data[i];
                continue;
            }
            if (comp.compare(t, data[i]) > 0) {
                t = data[i];
            }
        }
        return t;
    }

    /** This operator accepts more than one input. */
    @Override
    public boolean supportsMultipleInputs() {
        return true;
    }

    /** This operator does not support more than one output. */
    @Override
    public boolean supportsMultipleOutputs() {
        return false;
    }

    /**
     * Orders tuples by their key (field 1) only.
     *
     * Null keys, and tuple keys containing a null field, are never reported
     * as equal across different inputs: in those cases the difference of the
     * input indexes (field 0) is returned instead.
     */
    private static class groupComparator implements Comparator<Tuple> {

        public int compare(Tuple o1, Tuple o2) {
            //We want to make it as efficient as possible by only comparing the keys
            Object t1 = null;
            Object t2 = null;
            try {
                // get the keys
                t1 = o1.get(1);
                t2 = o2.get(1);
                if (t1 == t2 && t1 == null) {
                    // null keys from different inputs
                    // are not treated as equals
                    int firstInputIndex = (Byte) (o1.get(0));
                    int secondInputIndex = (Byte) (o2.get(0));
                    return firstInputIndex - secondInputIndex;
                }
            } catch (ExecException e) {
                // Keep the original failure attached so it is not lost to the caller.
                throw new RuntimeException("Error comparing tuples", e);
            }

            int result = DataType.compare(t1, t2);

            // Further check if any field is null
            // See PIG-927
            if (result == 0 && t1 instanceof Tuple && t2 instanceof Tuple) {
                try {
                    int firstInputIndex = (Byte) (o1.get(0));
                    int secondInputIndex = (Byte) (o2.get(0));
                    for (int i = 0; i < ((Tuple) t1).size(); i++) {
                        if (((Tuple) t1).get(i) == null) {
                            return firstInputIndex - secondInputIndex;
                        }
                    }
                } catch (ExecException e) {
                    throw new RuntimeException("Error comparing tuple fields", e);
                }
            }
            return result;
        }
    }
}