/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.WeakHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.PigConfiguration;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.ExpressionOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.SelfSpillBag.MemoryLimits;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.Spillable;
import org.apache.pig.impl.util.SpillableMemoryManager;

import com.google.common.collect.Maps;

/**
 * Does partial aggregation in the map plan. Inputs are buffered in a
 * hashmap until a threshold is reached; then the combiner functions
 * are fed these buffered inputs, and the results are stored in a
 * secondary map. Once that map fills up, or all input has been seen,
 * results are piped out into the next operator (the caller of getNext()).
 */
public class POPartialAgg extends PhysicalOperator implements Spillable {
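    // Illustrative usage (an assumption about the surrounding optimizer, not
    // taken from this file): the combiner optimizer substitutes this operator
    // into the map plan when map-side aggregation is enabled
    // (pig.exec.mapPartAgg=true) and the aggregate functions are algebraic,
    // as in:
    //
    //   A = LOAD 'input' AS (k:chararray, v:int);
    //   B = GROUP A BY k;
    //   C = FOREACH B GENERATE group, SUM(A.v);
    //
    // Map-side, input tuples are buffered per key, and the aggregate's
    // Initial/Intermediate plans are run over the buffered bags before rows
    // are emitted downstream.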
    private static final Log LOG = LogFactory.getLog(POPartialAgg.class);
    private static final long serialVersionUID = 1L;

    private static final Result ERR_RESULT = new Result();
    private static final Result EOP_RESULT = new Result(POStatus.STATUS_EOP, null);

    // number of records to sample in order to estimate the average size used by
    // each entry in the hash map and the average reduction seen
    private static final int NUM_RECS_TO_SAMPLE = 10000;

    // We want to avoid massive ArrayList copies as they get big.
    // ArrayLists grow by prevSize + prevSize/2. Given the default initial size
    // of 10, 9369 is the capacity of the array after 17 such resizings. This
    // seems like a sufficiently large value to trigger spilling/aggregation
    // instead of paying for yet another data copy.
    private static final int MAX_LIST_SIZE = 9368;

    private static final int DEFAULT_MIN_REDUCTION = 10;

    // TODO: these are temporary. The real thing should be using memory usage estimation.
    private static final int FIRST_TIER_THRESHOLD = 20000;
    private static final int SECOND_TIER_THRESHOLD = FIRST_TIER_THRESHOLD / DEFAULT_MIN_REDUCTION;

    private static final WeakHashMap<POPartialAgg, Byte> ALL_POPARTS = new WeakHashMap<POPartialAgg, Byte>();

    private static final TupleFactory TF = TupleFactory.getInstance();
    private static final BagFactory BG = BagFactory.getInstance();

    private PhysicalPlan keyPlan;
    private ExpressionOperator keyLeaf;
    private List<PhysicalPlan> valuePlans;
    private List<ExpressionOperator> valueLeaves;

    private int numRecsInRawMap = 0;
    private int numRecsInProcessedMap = 0;

    private Map<Object, List<Tuple>> rawInputMap = Maps.newHashMap();
    private Map<Object, List<Tuple>> processedInputMap = Maps.newHashMap();

    private boolean disableMapAgg = false;
    private boolean sizeReductionChecked = false;
    private boolean inputsExhausted = false;
    private volatile boolean doSpill = false;
    private transient MemoryLimits memLimits;

    private transient boolean initialized = false;
    private int firstTierThreshold = FIRST_TIER_THRESHOLD;
    private int secondTierThreshold = SECOND_TIER_THRESHOLD;
    private int sizeReduction = 1;
    private int avgTupleSize = 0;
    private Iterator<Entry<Object, List<Tuple>>> spillingIterator;
    private boolean estimatedMemThresholds = false;

    public POPartialAgg(OperatorKey k) {
        super(k);
    }

    private void init() throws ExecException {
        ALL_POPARTS.put(this, null);
        float percent = getPercentUsageFromProp();
        if (percent <= 0) {
            LOG.info("No memory allocated to intermediate memory buffers. Turning off partial aggregation.");
            disableMapAgg();
        }
        initialized = true;
        SpillableMemoryManager.getInstance().registerSpillable(this);
    }
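    // Configuration touch points, summarized from the getters further down.
    // Both property keys are referenced via PigConfiguration in this class:
    //   - PigConfiguration.PARTAGG_MINREDUCTION: the minimum observed
    //     reduction factor required to keep in-memory aggregation on
    //     (default DEFAULT_MIN_REDUCTION = 10; read in
    //     getMinOutputReductionFromProp()).
    //   - PigConfiguration.PROP_CACHEDBAG_MEMUSAGE: the fraction of the heap
    //     shared by spillable containers (defaults to 0.2 here; read in
    //     getPercentUsageFromProp()). A value <= 0 disables partial
    //     aggregation entirely (see init()).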
    @Override
    public Result getNextTuple() throws ExecException {
        // Accumulate tuples from processInput in rawInputMap.
        // When the maps grow to the mem limit, go over each item in the map and
        // call the combiner aggs on each collection.
        // Store the results into processedInputMap. Clear out rawInputMap.
        // Mem usage is updated every time we modify either of the maps.
        // When processedInputMap is >= 20% of allotted memory, run aggs on it,
        // and output the results as returns of successive calls of this method.
        // Then reset processedInputMap.
        // The fact that we are in the latter stage is communicated via the doSpill
        // flag.

        if (!initialized && !ALL_POPARTS.containsKey(this)) {
            init();
        }

        while (true) {
            if (!sizeReductionChecked && numRecsInRawMap >= NUM_RECS_TO_SAMPLE) {
                checkSizeReduction();
            }
            if (!estimatedMemThresholds && numRecsInRawMap >= NUM_RECS_TO_SAMPLE) {
                estimateMemThresholds();
            }
            if (doSpill) {
                startSpill();
                Result result = spillResult();
                if (result == EOP_RESULT) {
                    doSpill = false;
                }
                if (result != EOP_RESULT || inputsExhausted) {
                    return result;
                }
            }
            if (mapAggDisabled()) {
                // disableMapAgg() sets doSpill, so we cannot get here while there
                // are still contents in the buffered maps. If we get to this
                // point, everything is flushed, so we can simply return the raw
                // tuples from now on.
                return processInput();
            } else {
                Result inp = processInput();
                if (inp.returnStatus == POStatus.STATUS_ERR) {
                    return inp;
                } else if (inp.returnStatus == POStatus.STATUS_EOP) {
                    if (parentPlan.endOfAllInput) {
                        // Parent input is over. Flush what we have.
                        inputsExhausted = true;
                        startSpill();
                        LOG.info("Spilling last bits.");
                        continue;
                    } else {
                        return EOP_RESULT;
                    }
                } else if (inp.returnStatus == POStatus.STATUS_NULL) {
                    continue;
                } else {
                    // add this input to the map.
                    Tuple inpTuple = (Tuple) inp.result;
                    keyPlan.attachInput(inpTuple);

                    // evaluate the key
                    Result keyRes = getResult(keyLeaf);
                    if (keyRes == ERR_RESULT) {
                        return ERR_RESULT;
                    }
                    Object key = keyRes.result;
                    keyPlan.detachInput();
                    numRecsInRawMap += 1;
                    addKeyValToMap(rawInputMap, key, inpTuple);

                    if (shouldAggregateFirstLevel()) {
                        aggregateFirstLevel();
                    }
                    if (shouldAggregateSecondLevel()) {
                        aggregateSecondLevel();
                    }
                    if (shouldSpill()) {
                        LOG.info("Starting spill.");
                        startSpill(); // next time around, we'll start emitting.
                    }
                }
            }
        }
    }

    private void estimateMemThresholds() {
        if (!mapAggDisabled()) {
            LOG.info("Getting mem limits; considering " + ALL_POPARTS.size()
                    + " POPartialAgg objects.");
            float percent = getPercentUsageFromProp();
            memLimits = new MemoryLimits(ALL_POPARTS.size(), percent);
            int estTotalMem = 0;
            int estTuples = 0;
            for (Map.Entry<Object, List<Tuple>> entry : rawInputMap.entrySet()) {
                for (Tuple t : entry.getValue()) {
                    estTuples += 1;
                    int mem = (int) t.getMemorySize();
                    estTotalMem += mem;
                    memLimits.addNewObjSize(mem);
                }
            }
            avgTupleSize = estTotalMem / estTuples;
            int totalTuples = memLimits.getCacheLimit();
            LOG.info("Estimated total tuples to buffer, based on " + estTuples
                    + " tuples that took up " + estTotalMem + " bytes: " + totalTuples);
            firstTierThreshold = (int) (0.5 + totalTuples * (1f - (1f / sizeReduction)));
            secondTierThreshold = (int) (0.5 + totalTuples * (1f / sizeReduction));
            LOG.info("Setting thresholds. Primary: " + firstTierThreshold
                    + ". Secondary: " + secondTierThreshold);
        }
        estimatedMemThresholds = true;
    }
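    // Worked example of the threshold math above (the numbers are
    // illustrative, not from a real run): if memLimits.getCacheLimit()
    // reports that 100000 tuples fit in this operator's share of memory and
    // the sampled sizeReduction is 10, then
    //   firstTierThreshold  = round(100000 * (1 - 1/10)) = 90000
    //   secondTierThreshold = round(100000 * (1/10))     = 10000
    // i.e. the raw map gets 90% of the budget and the (roughly 10x smaller)
    // processed map gets the remaining 10%.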
    private void checkSizeReduction() throws ExecException {
        int numBeforeReduction = numRecsInProcessedMap + numRecsInRawMap;
        aggregateFirstLevel();
        aggregateSecondLevel();
        int numAfterReduction = numRecsInProcessedMap + numRecsInRawMap;
        LOG.info("After reduction, processed map: " + numRecsInProcessedMap
                + "; raw map: " + numRecsInRawMap);
        int minReduction = getMinOutputReductionFromProp();
        LOG.info("Observed reduction factor: from " + numBeforeReduction
                + " to " + numAfterReduction + " => "
                + numBeforeReduction / numAfterReduction + ".");
        if (numBeforeReduction / numAfterReduction < minReduction) {
            LOG.info("Disabling in-memory aggregation, since observed reduction is less than "
                    + minReduction);
            disableMapAgg();
        }
        sizeReduction = numBeforeReduction / numAfterReduction;
        sizeReductionChecked = true;
    }

    private void disableMapAgg() throws ExecException {
        startSpill();
        disableMapAgg = true;
    }

    private boolean mapAggDisabled() {
        return disableMapAgg;
    }

    private boolean shouldAggregateFirstLevel() {
        if (LOG.isInfoEnabled() && numRecsInRawMap > firstTierThreshold) {
            LOG.info("Aggregating " + numRecsInRawMap + " raw records.");
        }
        return (numRecsInRawMap > firstTierThreshold);
    }

    private boolean shouldAggregateSecondLevel() {
        if (LOG.isInfoEnabled() && numRecsInProcessedMap > secondTierThreshold) {
            LOG.info("Aggregating " + numRecsInProcessedMap + " secondary records.");
        }
        return (numRecsInProcessedMap > secondTierThreshold);
    }

    private boolean shouldSpill() {
        // TODO: is this always the same as shouldAggregateSecondLevel()?
        return shouldAggregateSecondLevel();
    }

    private void addKeyValToMap(Map<Object, List<Tuple>> map, Object key, Tuple inpTuple)
            throws ExecException {
        List<Tuple> value = map.get(key);
        if (value == null) {
            value = new ArrayList<Tuple>();
            map.put(key, value);
        }
        value.add(inpTuple);
        if (value.size() >= MAX_LIST_SIZE) {
            boolean isFirst = (map == rawInputMap);
            if (LOG.isDebugEnabled()) {
                LOG.debug("The cache for key " + key + " has grown too large. Aggregating "
                        + ((isFirst) ? "first level." : "second level."));
            }
            if (isFirst) {
                aggregateRawRow(key);
            } else {
                aggregateSecondLevel();
            }
        }
    }

    private void startSpill() throws ExecException {
        // If spillingIterator is not null, we are already spilling and don't need to set up.
        if (spillingIterator != null) return;
        if (!rawInputMap.isEmpty()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("In startSpill(), aggregating raw inputs. " + numRecsInRawMap + " tuples.");
            }
            aggregateFirstLevel();
            if (LOG.isInfoEnabled()) {
                LOG.info("processed inputs: " + numRecsInProcessedMap + " tuples.");
            }
        }
        if (!processedInputMap.isEmpty()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("In startSpill(), aggregating processed inputs. " + numRecsInProcessedMap + " tuples.");
            }
            aggregateSecondLevel();
            if (LOG.isInfoEnabled()) {
                LOG.info("processed inputs: " + numRecsInProcessedMap + " tuples.");
            }
        }
        doSpill = true;
        spillingIterator = processedInputMap.entrySet().iterator();
    }

    private Result spillResult() throws ExecException {
        // if there is no more to spill, return EOP_RESULT.
        if (processedInputMap.isEmpty()) {
            spillingIterator = null;
            LOG.info("In spillResult(), processed map is empty -- done spilling.");
            return EOP_RESULT;
        } else {
            Map.Entry<Object, List<Tuple>> entry = spillingIterator.next();
            Tuple valueTuple = createValueTuple(entry.getKey(), entry.getValue());
            numRecsInProcessedMap -= entry.getValue().size();
            spillingIterator.remove();
            Result res = getOutput(entry.getKey(), valueTuple);
            return res;
        }
    }
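    // Spill lifecycle, as implemented by startSpill()/spillResult():
    // startSpill() folds both maps down to one aggregated row per key and
    // parks an iterator over processedInputMap; each subsequent
    // getNextTuple() call drains one entry via spillResult() until the map
    // is empty, at which point EOP_RESULT is returned and getNextTuple()
    // clears the doSpill flag.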
    private void aggregateRawRow(Object key) throws ExecException {
        List<Tuple> value = rawInputMap.get(key);
        Tuple valueTuple = createValueTuple(key, value);
        Result res = getOutput(key, valueTuple);
        rawInputMap.remove(key);
        addKeyValToMap(processedInputMap, key, getAggResultTuple(res.result));
        numRecsInProcessedMap += valueTuple.size() - 1;
    }

    /**
     * For each entry in fromMap, feeds the list of tuples into the aggregator
     * funcs and adds the results to toMap. Removes the entries from fromMap
     * as it goes.
     * @throws ExecException
     */
    private int aggregate(Map<Object, List<Tuple>> fromMap, Map<Object, List<Tuple>> toMap,
            int numEntriesInTarget) throws ExecException {
        Iterator<Map.Entry<Object, List<Tuple>>> iter = fromMap.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry<Object, List<Tuple>> entry = iter.next();
            Tuple valueTuple = createValueTuple(entry.getKey(), entry.getValue());
            Result res = getOutput(entry.getKey(), valueTuple);
            iter.remove();
            addKeyValToMap(toMap, entry.getKey(), getAggResultTuple(res.result));
            numEntriesInTarget += valueTuple.size() - 1;
        }
        return numEntriesInTarget;
    }

    private void aggregateFirstLevel() throws ExecException {
        numRecsInProcessedMap = aggregate(rawInputMap, processedInputMap, numRecsInProcessedMap);
        numRecsInRawMap = 0;
    }

    private void aggregateSecondLevel() throws ExecException {
        Map<Object, List<Tuple>> newMap = Maps.newHashMapWithExpectedSize(processedInputMap.size());
        numRecsInProcessedMap = aggregate(processedInputMap, newMap, 0);
        processedInputMap = newMap;
    }

    private Tuple createValueTuple(Object key, List<Tuple> inpTuples) throws ExecException {
        Tuple valueTuple = TF.newTuple(valuePlans.size() + 1);
        valueTuple.set(0, key);
        for (int i = 0; i < valuePlans.size(); i++) {
            DataBag bag = BG.newDefaultBag();
            valueTuple.set(i + 1, bag);
        }
        for (Tuple t : inpTuples) {
            for (int i = 1; i < t.size(); i++) {
                DataBag bag = (DataBag) valueTuple.get(i);
                bag.add((Tuple) t.get(i));
            }
        }
        return valueTuple;
    }
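    // Shape of the tuple built by createValueTuple(), shown for a key "k" and
    // two value plans (each bag collects the i-th field of every buffered
    // tuple for that key; the field names are illustrative):
    //   (k, {(v1a), (v1b), ...}, {(v2a), (v2b), ...})
    // getOutput() then attaches this tuple to each value plan, so each plan's
    // leaf sees the bag it is supposed to aggregate.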
    private Tuple getAggResultTuple(Object result) throws ExecException {
        try {
            return (Tuple) result;
        } catch (ClassCastException ex) {
            throw new ExecException("Intermediate Algebraic "
                    + "functions must implement EvalFunc<Tuple>");
        }
    }

    @Override
    public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
        // The combiner optimizer does not get invoked if the plan is being
        // executed under illustrate, so POPartialAgg should not get used in
        // that case.
        throw new UnsupportedOperationException();
    }

    @Override
    public void visit(PhyPlanVisitor v) throws VisitorException {
        v.visitPartialAgg(this);
    }

    private int getMinOutputReductionFromProp() {
        int minReduction = PigMapReduce.sJobConfInternal.get().getInt(
                PigConfiguration.PARTAGG_MINREDUCTION, DEFAULT_MIN_REDUCTION);
        if (minReduction <= 0) {
            LOG.info("Specified reduction is <= 0 (" + minReduction + "). Using default "
                    + DEFAULT_MIN_REDUCTION);
            minReduction = DEFAULT_MIN_REDUCTION;
        }
        return minReduction;
    }

    private float getPercentUsageFromProp() {
        float percent = 0.2F;
        if (PigMapReduce.sJobConfInternal.get() != null) {
            String usage = PigMapReduce.sJobConfInternal.get().get(
                    PigConfiguration.PROP_CACHEDBAG_MEMUSAGE);
            if (usage != null) {
                percent = Float.parseFloat(usage);
            }
        }
        return percent;
    }

    private Result getResult(ExpressionOperator op) throws ExecException {
        Result res = ERR_RESULT;
        switch (op.getResultType()) {
        case DataType.BAG:
        case DataType.BOOLEAN:
        case DataType.BYTEARRAY:
        case DataType.CHARARRAY:
        case DataType.DOUBLE:
        case DataType.FLOAT:
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.BIGINTEGER:
        case DataType.BIGDECIMAL:
        case DataType.DATETIME:
        case DataType.MAP:
        case DataType.TUPLE:
            res = op.getNext(op.getResultType());
            break;
        default:
            String msg = "Invalid result type: "
                    + DataType.findType(op.getResultType());
            throw new ExecException(msg, 2270, PigException.BUG);
        }

        // allow null as group by key
        if (res.returnStatus == POStatus.STATUS_OK
                || res.returnStatus == POStatus.STATUS_NULL) {
            return res;
        }
        return ERR_RESULT;
    }

    /**
     * Runs the provided key-value pair through the aggregator plans.
     * @param key
     * @param value
     * @return Result, containing a tuple of form (key, tupleReturnedByPlan1, tupleReturnedByPlan2, ...)
     * @throws ExecException
     */
    private Result getOutput(Object key, Tuple value) throws ExecException {
        Tuple output = TF.newTuple(valuePlans.size() + 1);
        output.set(0, key);
        for (int i = 0; i < valuePlans.size(); i++) {
            valuePlans.get(i).attachInput(value);
            Result valRes = getResult(valueLeaves.get(i));
            if (valRes == ERR_RESULT) {
                return ERR_RESULT;
            }
            output.set(i + 1, valRes.result);
        }
        return new Result(POStatus.STATUS_OK, output);
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    @Override
    public boolean supportsMultipleOutputs() {
        return false;
    }

    @Override
    public String name() {
        return getAliasString() + "Partial Agg" + "["
                + DataType.findTypeName(resultType) + "]" + mKey.toString();
    }

    public PhysicalPlan getKeyPlan() {
        return keyPlan;
    }

    public void setKeyPlan(PhysicalPlan keyPlan) {
        this.keyPlan = keyPlan;
        keyLeaf = (ExpressionOperator) keyPlan.getLeaves().get(0);
    }

    public List<PhysicalPlan> getValuePlans() {
        return valuePlans;
    }

    public void setValuePlans(List<PhysicalPlan> valuePlans) {
        this.valuePlans = valuePlans;
        valueLeaves = new ArrayList<ExpressionOperator>();
        for (PhysicalPlan plan : valuePlans) {
            valueLeaves.add((ExpressionOperator) plan.getLeaves().get(0));
        }
    }

    @Override
    public long spill() {
        LOG.info("Spill triggered by SpillableMemoryManager");
        doSpill = true;
        return 0;
    }

    @Override
    public long getMemorySize() {
        // Widen before multiplying so a large buffer doesn't overflow int.
        return (long) avgTupleSize * (numRecsInProcessedMap + numRecsInRawMap);
    }
}