/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.backend.stratosphere.executionengine.pactLayer.expressionOperators; import java.io.IOException; import java.io.ObjectInputStream; import java.lang.reflect.Type; import java.util.List; import java.util.Map; import org.apache.pig.Accumulator; import org.apache.pig.Algebraic; import org.apache.pig.EvalFunc; import org.apache.pig.FuncSpec; import org.apache.pig.PigException; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.MonitoredUDFExecutor; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.PactOperator; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.Result; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.SOStatus; import org.apache.pig.backend.stratosphere.executionengine.pactLayer.plans.PactPlanVisitor; import org.apache.pig.builtin.MonitoredUDF; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.PigContext; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.plan.NodeIdGenerator; import org.apache.pig.impl.plan.OperatorKey; import org.apache.pig.impl.plan.VisitorException; public class POUserFunc extends ExpressionOperator { /** * */ private static final long serialVersionUID = 1L; transient EvalFunc func; transient private String[] cacheFiles = null; FuncSpec funcSpec; FuncSpec origFSpec; public static final byte INITIAL = 0; public static final byte INTERMEDIATE = 1; public static final byte FINAL = 2; private boolean initialized = false; private MonitoredUDFExecutor executor = null; private PactOperator referencedOperator = null; private boolean isAccumulationDone; public PactOperator getReferencedOperator() { return referencedOperator; } public void setReferencedOperator(PactOperator referencedOperator) { this.referencedOperator = referencedOperator; } public POUserFunc(OperatorKey k, int rp, List<PactOperator> inp) { super(k, rp); inputs = inp; } public POUserFunc( OperatorKey k, int rp, List<PactOperator> inp, FuncSpec funcSpec) { this(k, rp, inp, funcSpec, null); } public POUserFunc( OperatorKey k, int rp, List<PactOperator> inp, FuncSpec funcSpec, EvalFunc func) { super(k, rp); super.setInputs(inp); this.funcSpec = funcSpec; this.origFSpec = funcSpec; this.func = func; instantiateFunc(funcSpec); } private void instantiateFunc(FuncSpec fSpec) { this.func = (EvalFunc) PigContext.instantiateFuncFromSpec(fSpec); if (func.getClass().isAnnotationPresent(MonitoredUDF.class)) { executor = new MonitoredUDFExecutor(func); } //the next couple of initializations do not work as intended for the following reasons //the reporter and pigLogger are member variables of PactOperator //when instanitateFunc is invoked at deserialization time, both //reporter and pigLogger are null. They are set during map and reduce calls, //making the initializations here basically useless. Look at the processInput //method where these variables are re-initialized. At that point, the PactOperator //is set up correctly with the reporter and pigLogger references this.func.setReporter(reporter); this.func.setPigLogger(pigLogger); } @Override public Result processInput() throws ExecException { // Make sure the reporter is set, because it isn't getting carried // across in the serialization (don't know why). I suspect it's as // cheap to call the setReporter call everytime as to check whether I // have (hopefully java will inline it). if(!initialized) { func.setReporter(reporter); func.setPigLogger(pigLogger); initialized = true; } Result res = new Result(); Tuple inpValue = null; if (input == null && (inputs == null || inputs.size()==0)) { // log.warn("No inputs found. Signaling End of Processing."); res.returnStatus = SOStatus.STATUS_EOP; return res; } //Should be removed once the model is clear if(reporter!=null) { reporter.progress(); } if(isInputAttached()) { res.result = input; res.returnStatus = SOStatus.STATUS_OK; detachInput(); return res; } else { res.result = TupleFactory.getInstance().newTuple(); Result temp = null; for(PactOperator op : inputs) { temp = op.getNext(getDummy(op.getResultType()), op.getResultType()); if(temp.returnStatus!=SOStatus.STATUS_OK) { return temp; } if(op instanceof POProject && op.getResultType() == DataType.TUPLE){ POProject projOp = (POProject)op; if(projOp.isProjectToEnd()){ Tuple trslt = (Tuple) temp.result; Tuple rslt = (Tuple) res.result; for(int i=0;i<trslt.size();i++) { rslt.append(trslt.get(i)); } continue; } } ((Tuple)res.result).append(temp.result); } res.returnStatus = temp.returnStatus; return res; } } private Result getNext() throws ExecException { Result result = processInput(); String errMsg = ""; try { if(result.returnStatus == SOStatus.STATUS_OK) { if (isAccumulative()) { if (isAccumStarted()) { ((Accumulator)func).accumulate((Tuple)result.result); result.returnStatus = SOStatus.STATUS_BATCH_OK; result.result = null; isAccumulationDone = false; }else{ if(isAccumulationDone){ //PORelationToExprProject does not return STATUS_EOP // so that udf gets called both when isAccumStarted // is first true and then set to false, even //when the input relation is empty. // so the STATUS_EOP has to be sent from POUserFunc, // after the results have been sent. result.result = null; result.returnStatus = SOStatus.STATUS_EOP; } else{ result.result = ((Accumulator)func).getValue(); result.returnStatus = SOStatus.STATUS_OK; ((Accumulator)func).cleanup(); isAccumulationDone = true; } } } else { if (executor != null) { result.result = executor.monitorExec((Tuple) result.result); } else { result.result = func.exec((Tuple) result.result); } } return result; } return result; } catch (ExecException ee) { throw ee; } catch (IOException ioe) { int errCode = 2078; String msg = "Caught error from UDF: " + funcSpec.getClassName(); String footer = " [" + ioe.getMessage() + "]"; if(ioe instanceof PigException) { int udfErrorCode = ((PigException)ioe).getErrorCode(); if(udfErrorCode != 0) { errCode = udfErrorCode; msg = ((PigException)ioe).getMessage(); } else { msg += " [" + ((PigException)ioe).getMessage() + " ]"; } } else { msg += footer; } throw new ExecException(msg, errCode, PigException.BUG, ioe); } catch (IndexOutOfBoundsException ie) { int errCode = 2078; String msg = "Caught error from UDF: " + funcSpec.getClassName() + ", Out of bounds access [" + ie.getMessage() + "]"; throw new ExecException(msg, errCode, PigException.BUG, ie); } } @Override public Result getNext(Tuple tIn) throws ExecException { return getNext(); } @Override public Result getNext(DataBag db) throws ExecException { return getNext(); } @Override public Result getNext(Integer i) throws ExecException { return getNext(); } @Override public Result getNext(Boolean b) throws ExecException { return getNext(); } @Override public Result getNext(DataByteArray ba) throws ExecException { return getNext(); } @Override public Result getNext(Double d) throws ExecException { return getNext(); } @Override public Result getNext(Float f) throws ExecException { return getNext(); } @Override public Result getNext(Long l) throws ExecException { return getNext(); } @Override public Result getNext(Map m) throws ExecException { return getNext(); } @Override public Result getNext(String s) throws ExecException { return getNext(); } public void setAlgebraicFunction(byte Function) throws ExecException { // This will only be used by the optimizer for putting correct functions // in the mapper, // combiner and reduce. This helps in maintaining the physical plan as // is without the // optimiser having to replace any operators. // You wouldn't be able to make two calls to this function on the same // algebraic EvalFunc as // func is being changed. switch (Function) { case INITIAL: funcSpec = new FuncSpec(getInitial()); break; case INTERMEDIATE: funcSpec = new FuncSpec(getIntermed()); break; case FINAL: funcSpec = new FuncSpec(getFinal()); break; } funcSpec.setCtorArgs(origFSpec.getCtorArgs()); instantiateFunc(funcSpec); setResultType(DataType.findType(((EvalFunc<?>) func).getReturnType())); } public String getInitial() throws ExecException { instantiateFunc(origFSpec); if (func instanceof Algebraic) { return ((Algebraic) func).getInitial(); } else { int errCode = 2072; String msg = "Attempt to run a non-algebraic function" + " as an algebraic function"; throw new ExecException(msg, errCode, PigException.BUG); } } public String getIntermed() throws ExecException { instantiateFunc(origFSpec); if (func instanceof Algebraic) { return ((Algebraic) func).getIntermed(); } else { int errCode = 2072; String msg = "Attempt to run a non-algebraic function" + " as an algebraic function"; throw new ExecException(msg, errCode, PigException.BUG); } } public String getFinal() throws ExecException { instantiateFunc(origFSpec); if (func instanceof Algebraic) { return ((Algebraic) func).getFinal(); } else { int errCode = 2072; String msg = "Attempt to run a non-algebraic function" + " as an algebraic function"; throw new ExecException(msg, errCode, PigException.BUG); } } public Type getReturnType() { return func.getReturnType(); } public void finish() { func.finish(); if (executor != null) { executor.terminate(); } } public Schema outputSchema(Schema input) { return func.outputSchema(input); } public Boolean isAsynchronous() { return func.isAsynchronous(); } @Override public String name() { return "POUserFunc" + "(" + func.getClass().getName() + ")" + "[" + DataType.findTypeName(resultType) + "]" + " - " + mKey.toString(); } @Override public boolean supportsMultipleInputs() { return true; } @Override public boolean supportsMultipleOutputs() { return false; } @Override public void visit(PactPlanVisitor v) throws VisitorException { v.visitUserFunc(this); } public FuncSpec getFuncSpec() { return funcSpec; } public String[] getCacheFiles() { return cacheFiles; } public void setCacheFiles(String[] cf) { cacheFiles = cf; } public boolean combinable() { return (func instanceof Algebraic); } @Override public POUserFunc clone() throws CloneNotSupportedException { // Inputs will be patched up later by PhysicalPlan.clone() POUserFunc clone = new POUserFunc(new OperatorKey(mKey.scope, NodeIdGenerator.getGenerator().getNextNodeId(mKey.scope)), requestedParallelism, null, funcSpec.clone()); clone.setResultType(resultType); return clone; } private void readObject(ObjectInputStream is) throws IOException, ClassNotFoundException{ is.defaultReadObject(); instantiateFunc(funcSpec); } /** * Get child expression of this expression */ @Override public List<ExpressionOperator> getChildExpressions() { return null; } @SuppressWarnings("unchecked") @Override public void setAccumStart() { if (isAccumulative() && !isAccumStarted()) { super.setAccumStart(); ((Accumulator)func).cleanup(); } } @Override public void setResultType(byte resultType) { this.resultType = resultType; } @Override public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) { return (Tuple) out; } public EvalFunc getFunc() { return func; } }