/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.builtin;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.pig.ComparisonFunc;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.partitioners.CountingMap;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.InternalMap;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;

public class FindQuantiles extends EvalFunc<Map<String, Object>> {
    // keys for the weightedParts Map
    public static final String QUANTILES_LIST = "quantiles.list";
    public static final String WEIGHTED_PARTS = "weighted.parts";

    BagFactory mBagFactory = BagFactory.getInstance();
    TupleFactory mTupleFactory = TupleFactory.getInstance();

    boolean[] mAsc;
    enum State { ALL_ASC, ALL_DESC, MIXED };
    State mState;

    private class SortComparator implements Comparator<Tuple> {
        @Override
        @SuppressWarnings("unchecked")
        public int compare(Tuple t1, Tuple t2) {
            switch (mState) {
            case ALL_ASC:
                return t1.compareTo(t2);
            case ALL_DESC:
                return t2.compareTo(t1);
            case MIXED:
                // Have to break the tuple down and compare it field to field.
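                // Shorter tuples sort before longer ones; otherwise the tuples are
                // compared field by field, flipping the sign of the result for
                // fields whose sort order is descending.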
                int sz1 = t1.size();
                int sz2 = t2.size();
                if (sz2 < sz1) {
                    return 1;
                } else if (sz2 > sz1) {
                    return -1;
                } else {
                    for (int i = 0; i < sz1; i++) {
                        try {
                            int c = DataType.compare(t1.get(i), t2.get(i));
                            if (c != 0) {
                                if (!mAsc[i]) c *= -1;
                                return c;
                            }
                        } catch (ExecException e) {
                            throw new RuntimeException("Unable to compare tuples", e);
                        }
                    }
                    return 0;
                }
            }
            return -1; // keep the compiler happy
        }
    }

    private Comparator<Tuple> mComparator = new SortComparator();
    private FuncSpec mUserComparisonFuncSpec;
    private ComparisonFunc mUserComparisonFunc;

    @SuppressWarnings("unchecked")
    private void instantiateFunc() {
        if (mUserComparisonFunc != null) {
            this.mUserComparisonFunc = (ComparisonFunc) PigContext.instantiateFuncFromSpec(this.mUserComparisonFuncSpec);
            this.mUserComparisonFunc.setReporter(reporter);
            this.mComparator = mUserComparisonFunc;
        }
    }

    // We need to instantiate any user defined comparison function
    // on the backend when the FindQuantiles udf is deserialized.
    private void readObject(ObjectInputStream is) throws IOException, ClassNotFoundException {
        is.defaultReadObject();
        instantiateFunc();
    }

    public FindQuantiles() {
        mState = State.ALL_ASC;
    }

    public FindQuantiles(String[] args) {
        int startIndex = 0;
        int ascFlagsLength = args.length;
        // The first argument may be the information about a user defined
        // comparison function, if one was specified.
        if (args[0].startsWith(MRCompiler.USER_COMPARATOR_MARKER)) {
            mUserComparisonFuncSpec = new FuncSpec(
                    args[0].substring(MRCompiler.USER_COMPARATOR_MARKER.length()));
            // skip the first argument now that we used it
            startIndex++;
            ascFlagsLength--;
        }
        mAsc = new boolean[ascFlagsLength];
        boolean sawAsc = false;
        boolean sawDesc = false;
        for (int i = startIndex; i < ascFlagsLength; i++) {
            mAsc[i] = Boolean.parseBoolean(args[i]);
            if (mAsc[i]) sawAsc = true;
            else sawDesc = true;
        }
        if (sawAsc && sawDesc) mState = State.MIXED;
        else if (sawDesc) mState = State.ALL_DESC;
        else mState = State.ALL_ASC; // In case they gave us no args, this
                                     // defaults to all ascending.
    }

    /**
     * The first field in the input tuple is the number of quantiles to generate;
     * the second field is the *sorted* bag of samples.
     */
    @Override
    public Map<String, Object> exec(Tuple in) throws IOException {
        Map<String, Object> output = new HashMap<String, Object>();
        if (in == null || in.size() == 0)
            return null;
        Integer numQuantiles = null;
        DataBag samples = null;
        ArrayList<Tuple> quantilesList = new ArrayList<Tuple>();
        InternalMap weightedParts = new InternalMap();
        // The sample file contains a tuple of the form:
        // (numQuantiles, bag of samples)
        // where numQuantiles is the reduce parallelism.
        try {
            numQuantiles = (Integer) in.get(0);
            samples = (DataBag) in.get(1);
            long numSamples = samples.size();
            long toSkip = numSamples / numQuantiles;
            if (toSkip == 0) {
                // numSamples is < numQuantiles;
                // set numQuantiles to numSamples
                numQuantiles = (int) numSamples;
                toSkip = 1;
            }

            long ind = 0, j = -1, nextQuantile = toSkip - 1;
            for (Tuple it : samples) {
                if (ind == nextQuantile) {
                    ++j;
                    quantilesList.add(it);
                    nextQuantile += toSkip;
                    if (j == numQuantiles - 1) break;
                }
                ind++;
                if (ind % 1000 == 0) progress();
            }

            long i = -1;
            Map<Tuple, CountingMap<Integer>> contribs = new HashMap<Tuple, CountingMap<Integer>>();
            for (Tuple it : samples) {
                ++i;
                if (i % 1000 == 0) progress();
                int partInd = (int) (i / toSkip); // which partition
                if (partInd == numQuantiles) break;
                // quantilesList holds the element from the sample which is the
                // last element for a given partition.
                // For example: if numQuantiles is 5 and the number of samples is 100,
                // then toSkip = 20:
                //   quantilesList[0] = sample[19]  // the 20th element
                //   quantilesList[1] = sample[39]  // the 40th element
                //   and so on.
                // For any element in the sample between 0 and 19, partInd will be 0.
                // We want to check whether a sample element between 0 and 19 is also
                // the 19th element (quantilesList[0]); such an element might spread
                // over the 0th and 1st partitions. We are looking for contributions
                // to a partition from such elements.

                // First, we only check sample elements in partitions other than the
                // last one, i.e. partInd < numQuantiles - 1 (partInd is 0-indexed).
                if (partInd < numQuantiles - 1 && areEqual(it, quantilesList.get(partInd))) {
                    if (!contribs.containsKey(it)) {
                        CountingMap<Integer> cm = new CountingMap<Integer>();
                        cm.put(partInd, 1);
                        contribs.put(it, cm);
                    } else {
                        contribs.get(it).put(partInd, 1);
                    }
                } else {
                    // We are either in the last partition (last quantile)
                    // OR the sample element we are currently processing is not
                    // the same as the element in quantilesList for this partition.
                    // If we haven't seen this sample item earlier, it is not an
                    // element which crosses partitions - so ignore it.
                    if (!contribs.containsKey(it)) {
                        continue;
                    } else {
                        // We have seen this sample before (in a previous partInd), so
                        // add to the contribution associated with this sample. If we
                        // had not seen this sample in a previous partInd, it would not
                        // be in the contribs map (because of the if above). This
                        // "key" (represented by the sample item) can go either to the
                        // previous partInd or to this partInd in the final sort reduce
                        // stage. That is where the amount of contribution to each
                        // partInd will matter and influence the choice.
                        contribs.get(it).put(partInd, 1);
                    }
                }
            }

            int k = 0;
            for (Entry<Tuple, CountingMap<Integer>> ent : contribs.entrySet()) {
                if (k % 1000 == 0) progress();
                Tuple key = ent.getKey(); // sample item which repeats

                // This map holds the contributions of the sample item to the
                // different partitions.
                CountingMap<Integer> value = ent.getValue();
                long total = value.getTotalCount();
                Tuple probVec = mTupleFactory.newTuple(numQuantiles.intValue());
                // Initialize all contribution fractions for the different
                // partitions to 0.0.
                for (int l = 0; l < numQuantiles; l++) {
                    probVec.set(l, new Float(0.0));
                }
                // For each partition that this sample item is present in, compute
                // its fraction of the total occurrences; this is the probability
                // with which we will pick that partition for this sample item in
                // the final sort reduce job. For example, if a boundary key occurs
                // three times in partition 0 and once in partition 1, its probVec
                // is (0.75, 0.25, 0.0, ...).
                for (Entry<Integer, Integer> valEnt : value.entrySet()) {
                    probVec.set(valEnt.getKey(), (float) valEnt.getValue() / total);
                }
                weightedParts.put(key, probVec);
            }
            output.put(QUANTILES_LIST, new NonSpillableDataBag(quantilesList));
            output.put(WEIGHTED_PARTS, weightedParts);
            return output;
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    private boolean areEqual(Tuple it, Tuple tuple) {
        return mComparator.compare(it, tuple) == 0;
    }
}
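
/*
 * A minimal usage sketch, not part of the original file: the helper class name
 * (FindQuantilesExample) and the sample data are hypothetical and purely
 * illustrative. In production this UDF is driven by Pig's order-by sampling
 * job rather than called directly; the sketch assumes the Pig jars are on the
 * classpath, builds the (numQuantiles, sorted sample bag) input that exec()
 * expects, and prints the two entries of the result map.
 */
class FindQuantilesExample {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        BagFactory bf = BagFactory.getInstance();

        // A small, already-sorted bag of single-field sample tuples: 1..10.
        DataBag samples = bf.newDefaultBag();
        for (int i = 1; i <= 10; i++) {
            samples.add(tf.newTuple((Object) Integer.valueOf(i)));
        }

        // Input tuple: (numQuantiles, sorted bag of samples);
        // numQuantiles corresponds to the reduce parallelism.
        Tuple in = tf.newTuple(2);
        in.set(0, Integer.valueOf(3));
        in.set(1, samples);

        // One ascending sort key.
        FindQuantiles fq = new FindQuantiles(new String[] { "true" });
        Map<String, Object> out = fq.exec(in);

        // With 10 samples and 3 quantiles, toSkip is 3, so the samples at
        // indices 2, 5 and 8 (values 3, 6, 9) become the partition boundaries.
        System.out.println(out.get(FindQuantiles.QUANTILES_LIST));
        // No sample value straddles a partition boundary here, so the
        // weighted-parts map is empty.
        System.out.println(out.get(FindQuantiles.WEIGHTED_PARTS));
    }
}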