/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.pig.PigException; import org.apache.pig.PigWarning; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataType; import org.apache.pig.data.SingleTupleBag; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.plan.NodeIdGenerator; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.plan.VisitorException; /** * Implements the overloaded form of the project operator. * Projects the specified column from the input tuple. * However, if asked for tuples when the input is a bag, * the overloaded form is invoked and the project streams * the tuples through instead of the bag. */ public class POProject extends ExpressionOperator { /** * */ private static final long serialVersionUID = 1L; private static TupleFactory tupleFactory = TupleFactory.getInstance(); protected static final BagFactory bagFactory = BagFactory.getInstance(); private boolean resultSingleTupleBag = false; //The column to project protected ArrayList<Integer> columns; //True if we are in the middle of streaming tuples //in a bag boolean processingBagOfTuples = false; //The bag iterator used while straeming tuple transient Iterator<Tuple> bagIterator = null; //Represents the fact that this instance of POProject //is overloaded to stream tuples in the bag rather //than passing the entire bag. It is the responsibility //of the translator to set this. boolean overloaded = false; boolean star = false; public POProject(OperatorKey k) { this(k,-1,0); } public POProject(OperatorKey k, int rp) { this(k, rp, 0); } public POProject(OperatorKey k, int rp, int col) { super(k, rp); columns = new ArrayList<Integer>(); columns.add(col); } public POProject(OperatorKey k, int rp, ArrayList<Integer> cols) { super(k, rp); columns = cols; } @Override public String name() { return "Project" + "[" + DataType.findTypeName(resultType) + "]" + ((star) ? "[*]" : columns) + " - " + mKey.toString(); } @Override public boolean supportsMultipleInputs() { return false; } @Override public boolean supportsMultipleOutputs() { return false; } @Override public void visit(PhyPlanVisitor v) throws VisitorException { v.visitProject(this); } /** * Overridden since the attachment of the new input * should cause the old processing to end. */ @Override public void attachInput(Tuple t) { super.attachInput(t); processingBagOfTuples = false; } /** * Fetches the input tuple and returns the requested * column * @return next value. * @throws ExecException */ public Result getNext() throws ExecException{ Result res = processInput(); Tuple inpValue = (Tuple)res.result; Object ret; if(res.returnStatus != POStatus.STATUS_OK){ return res; } if (star) { return res; } else if(columns.size() == 1) { try { ret = inpValue.get(columns.get(0)); } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } res.returnStatus = POStatus.STATUS_OK; ret = null; } catch (NullPointerException npe) { // the tuple is null, so a dereference should also produce a null // there is a slight danger here that the Tuple implementation // may have given the exception for a different reason but if we // don't catch it, we will die and the most common case for the // exception would be because the tuple is null res.returnStatus = POStatus.STATUS_OK; ret = null; } } else { ArrayList<Object> objList = new ArrayList<Object>(columns.size()); for(int i: columns) { try { objList.add(inpValue.get(i)); } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + i + " which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } objList.add(null); } catch (NullPointerException npe) { // the tuple is null, so a dereference should also produce a null // there is a slight danger here that the Tuple implementation // may have given the exception for a different reason but if we // don't catch it, we will die and the most common case for the // exception would be because the tuple is null objList.add(null); } } ret = tupleFactory.newTuple(objList); } res.result = ret; return res; } @Override public Result getNext(DataBag db) throws ExecException { Result res = processInputBag(); if(res.returnStatus!=POStatus.STATUS_OK) return res; return(consumeInputBag(res)); } /** * @param input * @throws ExecException */ protected Result consumeInputBag(Result input) throws ExecException { DataBag inpBag = (DataBag) input.result; Result retVal = new Result(); if(isInputAttached() || star){ retVal.result = inpBag; retVal.returnStatus = POStatus.STATUS_OK; detachInput(); return retVal; } DataBag outBag; if(resultSingleTupleBag) { // we have only one tuple in a bag - so create // A SingleTupleBag for the result and fill it // appropriately from the input bag Tuple tuple = inpBag.iterator().next(); Tuple tmpTuple = tupleFactory.newTuple(columns.size()); for (int i = 0; i < columns.size(); i++) { try { tmpTuple.set(i, tuple.get(columns.get(i))); } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } tmpTuple.set(i, null); } catch (NullPointerException npe) { // the tuple is null, so a dereference should also produce a null // there is a slight danger here that the Tuple implementation // may have given the exception for a different reason but if we // don't catch it, we will die and the most common case for the // exception would be because the tuple is null tmpTuple.set(i, null); } } outBag = new SingleTupleBag(tmpTuple); } else { outBag = bagFactory.newDefaultBag(); for (Tuple tuple : inpBag) { Tuple tmpTuple = tupleFactory.newTuple(columns.size()); for (int i = 0; i < columns.size(); i++) { try { tmpTuple.set(i, tuple.get(columns.get(i))); } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } tmpTuple.set(i, null); } catch (NullPointerException npe) { // the tuple is null, so a dereference should also produce a null // there is a slight danger here that the Tuple implementation // may have given the exception for a different reason but if we // don't catch it, we will die and the most common case for the // exception would be because the tuple is null tmpTuple.set(i, null); } } outBag.add(tmpTuple); } } retVal.result = outBag; retVal.returnStatus = POStatus.STATUS_OK; return retVal; } @Override public Result getNext(DataByteArray ba) throws ExecException { return getNext(); } @Override public Result getNext(Double d) throws ExecException { return getNext(); } @Override public Result getNext(Float f) throws ExecException { return getNext(); } @Override public Result getNext(Integer i) throws ExecException { return getNext(); } @Override public Result getNext(Long l) throws ExecException { return getNext(); } @Override public Result getNext(Boolean b) throws ExecException { return getNext(); } @Override public Result getNext(Map m) throws ExecException { return getNext(); } @Override public Result getNext(String s) throws ExecException { return getNext(); } /** * Asked for Tuples. Check if the input is a bag. * If so, stream the tuples in the bag instead of * the entire bag. */ @Override public Result getNext(Tuple t) throws ExecException { Result res = new Result(); if(!processingBagOfTuples){ Tuple inpValue = null; res = processInput(); if(res.returnStatus!=POStatus.STATUS_OK) return res; if(star) return res; inpValue = (Tuple)res.result; res.result = null; Object ret; if(columns.size() == 1) { try{ ret = inpValue.get(columns.get(0)); } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } ret = null; } catch (NullPointerException npe) { // the tuple is null, so a dereference should also produce a null // there is a slight danger here that the Tuple implementation // may have given the exception for a different reason but if we // don't catch it, we will die and the most common case for the // exception would be because the tuple is null ret = null; } } else { ArrayList<Object> objList = new ArrayList<Object>(columns.size()); for(int i: columns) { try { objList.add(inpValue.get(i)); } catch (IndexOutOfBoundsException ie) { if(pigLogger != null) { pigLogger.warn(this,"Attempt to access field " + "which was not found in the input", PigWarning.ACCESSING_NON_EXISTENT_FIELD); } objList.add(null); } catch (NullPointerException npe) { // the tuple is null, so a dereference should also produce a null // there is a slight danger here that the Tuple implementation // may have given the exception for a different reason but if we // don't catch it, we will die and the most common case for the // exception would be because the tuple is null objList.add(null); } } ret = tupleFactory.newTuple(objList); res.result = (Tuple)ret; return res; } if(overloaded){ DataBag retBag = (DataBag)ret; bagIterator = retBag.iterator(); if(bagIterator.hasNext()){ processingBagOfTuples = true; res.result = bagIterator.next(); } // If the bag contains no tuple, set the returnStatus to STATUS_EOP if (!processingBagOfTuples) res.returnStatus = POStatus.STATUS_EOP; } else { res.result = (Tuple)ret; } return res; } if(bagIterator.hasNext()){ res.result = bagIterator.next(); res.returnStatus = POStatus.STATUS_OK; return res; } else{ //done processing the bag of tuples processingBagOfTuples = false; return getNext(t); } } public ArrayList<Integer> getColumns() { return columns; } public int getColumn() throws ExecException { if(columns.size() != 1) { int errCode = 2068; String msg = "Internal error. Improper use of method getColumn() in " + POProject.class.getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG); } return columns.get(0); } public void setColumns(ArrayList<Integer> cols) { this.columns = cols; } public void setColumn(int col) { if(null == columns) { columns = new ArrayList<Integer>(); } else { columns.clear(); } columns.add(col); } public boolean isOverloaded() { return overloaded; } public void setOverloaded(boolean overloaded) { this.overloaded = overloaded; } public boolean isStar() { return star; } public void setStar(boolean star) { this.star = star; } @Override public POProject clone() throws CloneNotSupportedException { ArrayList<Integer> cols = new ArrayList<Integer>(columns.size()); // Can resuse the same Integer objects, as they are immutable for (Integer i : columns) { cols.add(i); } POProject clone = new POProject(new OperatorKey(mKey.scope, NodeIdGenerator.getGenerator().getNextNodeId(mKey.scope)), requestedParallelism, cols); clone.cloneHelper(this); clone.star = star; clone.overloaded = overloaded; return clone; } protected Result processInputBag() throws ExecException { Result res = new Result(); if (input==null && (inputs == null || inputs.size()==0)) { // log.warn("No inputs found. Signaling End of Processing."); res.returnStatus = POStatus.STATUS_EOP; return res; } //Should be removed once the model is clear if(reporter!=null) reporter.progress(); if(!isInputAttached()) return inputs.get(0).getNext(dummyBag); else{ res.result = (DataBag)input.get(columns.get(0)); res.returnStatus = POStatus.STATUS_OK; return res; } } public void setResultSingleTupleBag(boolean resultSingleTupleBag) { this.resultSingleTupleBag = resultSingleTupleBag; } @Override public List<ExpressionOperator> getChildExpressions() { return null; } }