/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.WeakHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.PigConfiguration;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.ExpressionOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.InternalCachedBag;
import org.apache.pig.data.SelfSpillBag.MemoryLimits;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.GroupingSpillable;
import org.apache.pig.impl.util.Spillable;
import org.apache.pig.impl.util.SpillableMemoryManager;

import com.google.common.collect.Maps;

/**
 * Do partial aggregation in the map plan. Inputs are buffered up in
 * a hashmap until a threshold is reached; then the combiner functions
 * are fed these buffered inputs, and the results are stored in a secondary
 * map. Once that map fills up, or all input has been seen, results are
 * piped out into the next operator (the caller of getNext()).
 */
public class POPartialAgg extends PhysicalOperator implements Spillable, GroupingSpillable {
    private static final Log LOG = LogFactory.getLog(POPartialAgg.class);
    private static final long serialVersionUID = 1L;

    private static final Result EOP_RESULT = new Result(POStatus.STATUS_EOP, null);

    // number of records to sample to determine average size used by each
    // entry in hash map and average seen reduction
    private static final int NUM_RECS_TO_SAMPLE = 10000;

    // We want to avoid massive ArrayList copies as they get big.
    // ArrayLists grow by prevSize + prevSize/2. Given the default initial size of 10,
    // 9369 is the capacity of the array after 17 such resizings. This seems like a
    // sufficiently large value to trigger spilling/aggregation instead of paying for
    // yet another data copy.
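    // Under that growth policy, the capacity sequence starting from 10 runs:
    // 10, 15, 22, 33, 49, 73, 109, 163, 244, 366, 549, 823, 1234, 1851,
    // 2776, 4164, 6246, 9369.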
    private static final int MAX_LIST_SIZE = 9368;

    private static final int DEFAULT_MIN_REDUCTION = 10;

    // TODO: these are temporary. The real thing should be using memory usage estimation.
    private static final int FIRST_TIER_THRESHOLD = 20000;
    private static final int SECOND_TIER_THRESHOLD = FIRST_TIER_THRESHOLD / DEFAULT_MIN_REDUCTION;

    private static final WeakHashMap<POPartialAgg, Byte> ALL_POPARTS =
            new WeakHashMap<POPartialAgg, Byte>();

    private static final TupleFactory TF = TupleFactory.getInstance();

    private PhysicalPlan keyPlan;
    private ExpressionOperator keyLeaf;

    private List<PhysicalPlan> valuePlans;
    private List<ExpressionOperator> valueLeaves;

    private transient int numRecsInRawMap;
    private transient int numRecsInProcessedMap;

    private transient Map<Object, List<Tuple>> rawInputMap;
    private transient Map<Object, List<Tuple>> processedInputMap;

    // Transient booleans always initialize to false
    private transient boolean initialized;
    private transient boolean disableMapAgg;
    private transient boolean sizeReductionChecked;
    private transient boolean inputsExhausted;
    private transient boolean estimatedMemThresholds;

    // The doSpill flag is set when spilling is running or needs to run. It is set
    // by POPartialAgg when its buffers are full after having run aggregations and
    // the records have to be emitted to the map output.
    // The doContingentSpill flag is set when the SpillableMemoryManager is notified
    // by GC that the runtime is low on memory and the SpillableMemoryManager identifies
    // this particular buffer as a good spill candidate because it is large. The
    // contingent spill logic tries to satisfy the memory manager's request for freeing
    // memory by aggregating data rather than just spilling records to disk.
    private transient volatile boolean doSpill;
    private transient volatile boolean doContingentSpill;
    private transient volatile Object spillLock;

    private transient int minOutputReduction;
    private transient float percentUsage;
    private transient int numRecordsToSample;
    private transient int firstTierThreshold;
    private transient int secondTierThreshold;
    private transient int sizeReduction;
    private transient int avgTupleSize;
    private transient Iterator<Entry<Object, List<Tuple>>> spillingIterator;

    public POPartialAgg(OperatorKey k) {
        super(k);
    }

    private void init() throws ExecException {
        ALL_POPARTS.put(this, null);
        numRecsInRawMap = 0;
        numRecsInProcessedMap = 0;
        rawInputMap = Maps.newHashMap();
        processedInputMap = Maps.newHashMap();
        minOutputReduction = DEFAULT_MIN_REDUCTION;
        numRecordsToSample = NUM_RECS_TO_SAMPLE;
        firstTierThreshold = FIRST_TIER_THRESHOLD;
        secondTierThreshold = SECOND_TIER_THRESHOLD;
        sizeReduction = 1;
        avgTupleSize = 0;
        percentUsage = 0.2F;
        spillLock = new Object();
        if (PigMapReduce.sJobConfInternal.get() != null) {
            String usage = PigMapReduce.sJobConfInternal.get().get(
                    PigConfiguration.PIG_CACHEDBAG_MEMUSAGE);
            if (usage != null) {
                percentUsage = Float.parseFloat(usage);
            }
            minOutputReduction = PigMapReduce.sJobConfInternal.get().getInt(
                    PigConfiguration.PIG_EXEC_MAP_PARTAGG_MINREDUCTION, DEFAULT_MIN_REDUCTION);
            if (minOutputReduction <= 0) {
                LOG.info("Specified reduction is <= 0 (" + minOutputReduction
                        + "). Using default " + DEFAULT_MIN_REDUCTION);
                minOutputReduction = DEFAULT_MIN_REDUCTION;
            }
        }
        if (percentUsage <= 0) {
            LOG.info("No memory allocated to intermediate memory buffers."
Turning off partial aggregation."); disableMapAgg(); // Set them to true instead of adding another check for !disableMapAgg sizeReductionChecked = true; estimatedMemThresholds = true; } initialized = true; SpillableMemoryManager.getInstance().registerSpillable(this); } @Override public Result getNextTuple() throws ExecException { // accumulate tuples from processInput in rawInputMap. // when the maps grow to mem limit, go over each item in map, and call // combiner aggs on each collection. // Store the results into processedInputMap. Clear out rawInputMap. // Mem usage is updated every time we modify either of the maps. // When processedInputMap is >= 20% of allotted memory, run aggs on it, // and output the results as returns of successive calls of this method. // Then reset processedInputMap. // The fact that we are in the latter stage is communicated via the doSpill // flag. if (!initialized && !ALL_POPARTS.containsKey(this)) { init(); } while (true) { if (!sizeReductionChecked && numRecsInRawMap >= numRecordsToSample) { checkSizeReduction(); if (doContingentSpill && !doSpill) { LOG.info("Avoided emitting records during spill memory call."); doContingentSpill = false; } } if (!estimatedMemThresholds && numRecsInRawMap >= numRecordsToSample) { estimateMemThresholds(); } if (doContingentSpill) { // Don't aggregate if spilling. Avoid concurrent update of spilling iterator. if (doSpill == false) { // SpillableMemoryManager requested a spill to reduce memory // consumption. See if we can avoid it. aggregateBothLevels(false, false); if (shouldSpill()) { startSpill(false); } else { LOG.info("Avoided emitting records during spill memory call."); doContingentSpill = false; } } } if (doSpill) { startSpill(true); Result result = spillResult(); if (result.returnStatus == POStatus.STATUS_EOP) { doSpill = false; doContingentSpill = false; } if (result.returnStatus != POStatus.STATUS_EOP || inputsExhausted) { return result; } } if (mapAggDisabled()) { // disableMapAgg() sets doSpill, so we can't get here while there is still contents in the buffered maps. // if we get to this point, everything is flushed, so we can simply return the raw tuples from now on. return processInput(); } else { Result inp = processInput(); if (inp.returnStatus == POStatus.STATUS_ERR) { return inp; } else if (inp.returnStatus == POStatus.STATUS_EOP) { if (parentPlan.endOfAllInput) { // parent input is over. flush what we have. inputsExhausted = true; LOG.info("Spilling last bits."); startSpill(true); continue; } else { return EOP_RESULT; } } else if (inp.returnStatus == POStatus.STATUS_NULL) { continue; } else { // add this input to map. Tuple inpTuple = (Tuple) inp.result; keyPlan.attachInput(inpTuple); // evaluate the key Result keyRes = getResult(keyLeaf); if (keyRes.returnStatus != POStatus.STATUS_OK) { return keyRes; } Object key = keyRes.result; keyPlan.detachInput(); numRecsInRawMap += 1; addKeyValToMap(rawInputMap, key, inpTuple); aggregateBothLevels(true, true); if (shouldSpill()) { startSpill(false); // next time around, we'll start emitting. } } } } } private void estimateMemThresholds() { if (!mapAggDisabled()) { LOG.info("Getting mem limits; considering " + ALL_POPARTS.size() + " POPArtialAgg objects." 
+ " with memory percentage " + percentUsage); MemoryLimits memLimits = new MemoryLimits(ALL_POPARTS.size(), percentUsage); int estTotalMem = 0; int estTuples = 0; for (Map.Entry<Object, List<Tuple>> entry : rawInputMap.entrySet()) { for (Tuple t : entry.getValue()) { estTuples += 1; int mem = (int) t.getMemorySize(); estTotalMem += mem; memLimits.addNewObjSize(mem); } } avgTupleSize = estTotalMem / estTuples; long totalTuples = memLimits.getCacheLimit(); LOG.info("Estimated total tuples to buffer, based on " + estTuples + " tuples that took up " + estTotalMem + " bytes: " + totalTuples); firstTierThreshold = (int) (0.5 + totalTuples * (1f - (1f / sizeReduction))); secondTierThreshold = (int) (0.5 + totalTuples * (1f / sizeReduction)); LOG.info("Setting thresholds. Primary: " + firstTierThreshold + ". Secondary: " + secondTierThreshold); // The second tier should at least allow one tuple before it tries to aggregate. // This code retains the total number of tuples in the buffer while guaranteeing // the second tier has at least one tuple. if (secondTierThreshold == 0) { secondTierThreshold += 1; firstTierThreshold -= 1; } } estimatedMemThresholds = true; } private void checkSizeReduction() throws ExecException { if (!mapAggDisabled()) { int numBeforeReduction = numRecsInProcessedMap + numRecsInRawMap; aggregateBothLevels(false, false); int numAfterReduction = numRecsInProcessedMap + numRecsInRawMap; LOG.info("After reduction, processed map: " + numRecsInProcessedMap + "; raw map: " + numRecsInRawMap); LOG.info("Observed reduction factor: from " + numBeforeReduction + " to " + numAfterReduction + " => " + numBeforeReduction / numAfterReduction + "."); if ( numBeforeReduction / numAfterReduction < minOutputReduction) { LOG.info("Disabling in-memory aggregation, since observed reduction is less than " + minOutputReduction); disableMapAgg(); } sizeReduction = numBeforeReduction / numAfterReduction; sizeReductionChecked = true; } } private void disableMapAgg() throws ExecException { // Do not aggregate as when disableMapAgg is called aggregation is // called and size reduction checked startSpill(false); disableMapAgg = true; } private boolean mapAggDisabled() { return disableMapAgg; } private boolean shouldAggregateFirstLevel() { return (numRecsInRawMap > firstTierThreshold); } private boolean shouldAggregateSecondLevel() { return (numRecsInProcessedMap > secondTierThreshold); } private boolean shouldSpill() { // is this always the same as shouldAgg? return shouldAggregateSecondLevel(); } private void addKeyValToMap(Map<Object, List<Tuple>> map, Object key, Tuple inpTuple) throws ExecException { List<Tuple> value = map.get(key); if (value == null) { value = new ArrayList<Tuple>(); map.put(key, value); } value.add(inpTuple); if (value.size() >= MAX_LIST_SIZE) { boolean isFirst = (map == rawInputMap); if (LOG.isDebugEnabled()){ LOG.debug("The cache for key " + key + " has grown too large. Aggregating " + ((isFirst) ? "first level." : "second level.")); } if (isFirst) { aggregateRawRow(key); } else { aggregateSecondLevel(); } } } private void startSpill(boolean aggregate) throws ExecException { // If spillingIterator is null, we are already spilling and don't need to set up. if (spillingIterator != null) return; LOG.info("Starting spill."); if (aggregate) { aggregateBothLevels(false, true); } doSpill = true; spillingIterator = processedInputMap.entrySet().iterator(); } private Result spillResult() throws ExecException { // if no more to spill, return EOP_RESULT. 
        if (processedInputMap.isEmpty()) {
            spillingIterator = null;
            LOG.info("In spillResult(), processed map is empty -- done spilling.");
            return EOP_RESULT;
        } else {
            Map.Entry<Object, List<Tuple>> entry = spillingIterator.next();
            Tuple valueTuple = createValueTuple(entry.getKey(), entry.getValue());
            numRecsInProcessedMap -= entry.getValue().size();
            spillingIterator.remove();
            Result res = getOutput(entry.getKey(), valueTuple);
            return res;
        }
    }

    private void aggregateRawRow(Object key) throws ExecException {
        List<Tuple> value = rawInputMap.get(key);
        Tuple valueTuple = createValueTuple(key, value);
        Result res = getOutput(key, valueTuple);
        rawInputMap.remove(key);
        addKeyValToMap(processedInputMap, key, getAggResultTuple(res.result));
        numRecsInProcessedMap++;
    }

    /**
     * For each entry in fromMap, feeds the list of tuples into the aggregator funcs
     * and adds the results to toMap. Removes the entries from fromMap as it goes.
     * @throws ExecException
     */
    private int aggregate(Map<Object, List<Tuple>> fromMap, Map<Object, List<Tuple>> toMap,
            int numEntriesInTarget) throws ExecException {
        Iterator<Map.Entry<Object, List<Tuple>>> iter = fromMap.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry<Object, List<Tuple>> entry = iter.next();
            Tuple valueTuple = createValueTuple(entry.getKey(), entry.getValue());
            Result res = getOutput(entry.getKey(), valueTuple);
            iter.remove();
            addKeyValToMap(toMap, entry.getKey(), getAggResultTuple(res.result));
            numEntriesInTarget++;
        }
        return numEntriesInTarget;
    }

    private void aggregateBothLevels(boolean checkThresholdForFirst,
            boolean checkThresholdForSecond) throws ExecException {
        // When the processed map is initially empty, just aggregate the first level;
        // aggregating the second level immediately would not yield anything.
        boolean aggregateSecondLevel = !processedInputMap.isEmpty();
        if (!checkThresholdForFirst || shouldAggregateFirstLevel()) {
            aggregateFirstLevel();
        }
        if (aggregateSecondLevel && (!checkThresholdForSecond || shouldAggregateSecondLevel())) {
            aggregateSecondLevel();
        }
    }

    private void aggregateFirstLevel() throws ExecException {
        if (rawInputMap.isEmpty()) {
            return;
        }
        int rawTuples = numRecsInRawMap;
        int processedTuples = numRecsInProcessedMap;
        numRecsInProcessedMap = aggregate(rawInputMap, processedInputMap, numRecsInProcessedMap);
        numRecsInRawMap = 0;
        LOG.info("Aggregated " + rawTuples + " raw tuples."
+ " Processed tuples before aggregation = " + processedTuples + ", after aggregation = " + numRecsInProcessedMap); } private void aggregateSecondLevel() throws ExecException { if (processedInputMap.isEmpty()) { return; } int processedTuples = numRecsInProcessedMap; Map<Object, List<Tuple>> newMap = Maps.newHashMapWithExpectedSize(processedInputMap.size()); numRecsInProcessedMap = aggregate(processedInputMap, newMap, 0); processedInputMap = newMap; LOG.info("Aggregated " + processedTuples + " processed tuples to " + numRecsInProcessedMap + " tuples"); } private Tuple createValueTuple(Object key, List<Tuple> inpTuples) throws ExecException { Tuple valueTuple = TF.newTuple(valuePlans.size() + 1); valueTuple.set(0, key); for (int i = 0; i < valuePlans.size(); i++) { DataBag bag = null; if (doContingentSpill) { // Don't use additional memory since we already have memory stress bag = new InternalCachedBag(); } else { // Take 10% of memory, need fine tune later bag = new InternalCachedBag(1, 0.1F); } valueTuple.set(i + 1, bag); } for (Tuple t : inpTuples) { for (int i = 1; i < t.size(); i++) { DataBag bag = (DataBag) valueTuple.get(i); bag.add((Tuple) t.get(i)); } } return valueTuple; } private Tuple getAggResultTuple(Object result) throws ExecException { try { return (Tuple) result; } catch (ClassCastException ex) { throw new ExecException("Intermediate Algebraic " + "functions must implement EvalFunc<Tuple>"); } } @Override public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) { // combiner optimizer does not get invoked if the plan is being executed // under illustrate, so POPartialAgg should not get used in that case throw new UnsupportedOperationException(); } @Override public void visit(PhyPlanVisitor v) throws VisitorException { v.visitPartialAgg(this); } private Result getResult(ExpressionOperator op) throws ExecException { Result res; switch (op.getResultType()) { case DataType.BAG: case DataType.BOOLEAN: case DataType.BYTEARRAY: case DataType.CHARARRAY: case DataType.DOUBLE: case DataType.FLOAT: case DataType.INTEGER: case DataType.LONG: case DataType.BIGINTEGER: case DataType.BIGDECIMAL: case DataType.DATETIME: case DataType.MAP: case DataType.TUPLE: res = op.getNext(op.getResultType()); break; default: String msg = "Invalid result type: " + DataType.findType(op.getResultType()); throw new ExecException(msg, 2270, PigException.BUG); } return res; } /** * Runs the provided key-value pair through the aggregator plans. * @param key * @param value * @return Result, containing a tuple of form (key, tupleReturnedByPlan1, tupleReturnedByPlan2, ...) 
     * @throws ExecException
     */
    private Result getOutput(Object key, Tuple value) throws ExecException {
        Tuple output = TF.newTuple(valuePlans.size() + 1);
        output.set(0, key);

        for (int i = 0; i < valuePlans.size(); i++) {
            valuePlans.get(i).attachInput(value);
            Result valRes = getResult(valueLeaves.get(i));
            if (valRes.returnStatus == POStatus.STATUS_ERR) {
                return valRes;
            }
            output.set(i + 1, valRes.result);
        }
        return new Result(POStatus.STATUS_OK, output);
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    @Override
    public boolean supportsMultipleOutputs() {
        return false;
    }

    @Override
    public String name() {
        return getAliasString() + "Partial Agg" + "["
                + DataType.findTypeName(resultType) + "]" + mKey.toString();
    }

    public PhysicalPlan getKeyPlan() {
        return keyPlan;
    }

    public void setKeyPlan(PhysicalPlan keyPlan) {
        this.keyPlan = keyPlan;
        keyLeaf = (ExpressionOperator) keyPlan.getLeaves().get(0);
    }

    public List<PhysicalPlan> getValuePlans() {
        return valuePlans;
    }

    public void setValuePlans(List<PhysicalPlan> valuePlans) {
        this.valuePlans = valuePlans;
        valueLeaves = new ArrayList<ExpressionOperator>();
        for (PhysicalPlan plan : valuePlans) {
            valueLeaves.add((ExpressionOperator) plan.getLeaves().get(0));
        }
    }

    @Override
    public long spill() {
        if (mapAggDisabled()) {
            return 0;
        } else {
            LOG.info("Spill triggered by SpillableMemoryManager");
            doContingentSpill = true;
            synchronized (spillLock) {
                if (!sizeReductionChecked) {
                    numRecordsToSample = numRecsInRawMap;
                }
                try {
                    // Wait for the operator thread to service the contingent spill
                    // request; it clears doContingentSpill when done.
                    while (doContingentSpill) {
                        Thread.sleep(50); // Keeping it on the lower side for now. Tune later.
                    }
                } catch (InterruptedException e) {
                    LOG.warn("Interrupted exception while waiting for spill to finish", e);
                }
                LOG.info("Finished spill for SpillableMemoryManager call");
                return 1;
            }
        }
    }

    @Override
    public long getMemorySize() {
        return avgTupleSize * (numRecsInProcessedMap + numRecsInRawMap);
    }
}