/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.piggybank.evaluation; import java.io.IOException; import java.util.Iterator; import org.apache.pig.Algebraic; import org.apache.pig.EvalFunc; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PigProgressable; import org.apache.pig.data.*; import org.apache.pig.impl.logicalLayer.schema.Schema; /** * MaxTupleBy1stField UDF returns a tuple with max value of the first field in a * given bag. * * Caveat: first field assumed to have type 'long'. You may need to enforece this * via schema when loading data, as sown in sample usage below. * * Sample usage: * * A = load 'test.tsv' as (first: long, second, third); * B = GROUP A by second; * C = FOREACH B GENERATE group, MaxTupleBy1stField(A); * * @author Vadim Zaliva <lord@codemindes.com> */ public class MaxTupleBy1stField extends EvalFunc<Tuple> implements Algebraic { /** * Indicates once for how many items progress hartbeat should be sent. */ private static final int PROGRESS_FREQUENCY = 10; static public class Initial extends EvalFunc<Tuple> { //TODO: private static TupleFactory tfact = TupleFactory.getInstance(); @Override public Tuple exec(Tuple input) throws IOException { try { // input is a bag with one tuple containing // the column we are trying to max on DataBag bg = (DataBag) input.get(0); Tuple tp = bg.iterator().next(); return tp; //TODO: copy? } catch(ExecException ee) { IOException oughtToBeEE = new IOException(); oughtToBeEE.initCause(ee); throw oughtToBeEE; } } } public Schema outputSchema(Schema input) { return input; } static public class Intermediate extends EvalFunc<Tuple> { //TODO: private static TupleFactory tfact = TupleFactory.getInstance(); @Override public Tuple exec(Tuple input) throws IOException { try { return max(input, reporter); } catch(ExecException ee) { IOException oughtToBeEE = new IOException(); oughtToBeEE.initCause(ee); throw oughtToBeEE; } } } static public class Final extends EvalFunc<Tuple> { @Override public Tuple exec(Tuple input) throws IOException { try { return max(input, reporter); } catch(ExecException ee) { IOException oughtToBeEE = new IOException(); oughtToBeEE.initCause(ee); throw oughtToBeEE; } } } @Override public Tuple exec(Tuple input) throws IOException { try { return max(input, reporter); } catch(ExecException ee) { IOException oughtToBeEE = new IOException(); oughtToBeEE.initCause(ee); throw oughtToBeEE; } } protected static Tuple max(Tuple input, PigProgressable reporter) throws ExecException { DataBag values = (DataBag) input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if(values.size() == 0) return null; long curMax = 0; Tuple curMaxTuple = null; int n=0; for(Iterator<Tuple> it = values.iterator(); it.hasNext();) { if(reporter!=null && ++n%PROGRESS_FREQUENCY==0) reporter.progress(); Tuple t = it.next(); try { long d = (Long) t.get(0); if(curMaxTuple == null || d > curMax) { curMax = d; curMaxTuple = t; } } catch(RuntimeException exp) { ExecException newE = new ExecException("Error processing: " + t.toString() + exp.getMessage()); newE.initCause(exp); throw newE; } } return curMaxTuple; } @Override public String getInitial() { return Initial.class.getName(); } @Override public String getIntermed() { return Intermediate.class.getName(); } @Override public String getFinal() { return Final.class.getName(); } }