/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.impl.xsort.managed;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import org.apache.drill.common.AutoCloseables;
import org.apache.drill.common.config.DrillConfig;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.MetricDef;
import org.apache.drill.exec.physical.config.ExternalSort;
import org.apache.drill.exec.physical.impl.sort.RecordBatchData;
import org.apache.drill.exec.physical.impl.spill.RecordBatchSizer;
import org.apache.drill.exec.physical.impl.spill.SpillSet;
import org.apache.drill.exec.physical.impl.xsort.MSortTemplate;
import org.apache.drill.exec.physical.impl.xsort.SingleBatchSorter;
import org.apache.drill.exec.physical.impl.xsort.managed.BatchGroup.InputBatch;
import org.apache.drill.exec.physical.impl.xsort.managed.BatchGroup.SpilledRun;
import org.apache.drill.exec.record.AbstractRecordBatch;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.SchemaUtil;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.WritableBatch;
import org.apache.drill.exec.record.selection.SelectionVector2;
import org.apache.drill.exec.record.selection.SelectionVector4;
import org.apache.drill.exec.testing.ControlsInjector;
import org.apache.drill.exec.testing.ControlsInjectorFactory;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.AbstractContainerVector;
import com.google.common.collect.Lists;
/**
* External sort batch: a sort batch which can spill to disk in
* order to operate within a defined memory footprint.
* <p>
* <h4>Basic Operation</h4>
* The operator has three key phases:
* <p>
* <ul>
* <li>The load phase in which batches are read from upstream.</li>
* <li>The merge phase in which spilled batches are combined to
* reduce the number of files below the configured limit. (Best
* practice is to configure the system to avoid this phase.)</li>
* <li>The delivery phase in which batches are combined to produce
* the final output.</li>
* </ul>
* During the load phase:
* <p>
* <ul>
* <li>The incoming (upstream) operator provides a series of batches.</li>
* <li>This operator sorts each batch, and accumulates them in an in-memory
* buffer.</li>
* <li>If the in-memory buffer becomes too large, this operator selects
* a subset of the buffered batches to spill.</li>
* <li>Each spill set is merged to create a new, sorted collection of
* batches, and each is spilled to disk.</li>
* <li>To allow the use of multiple disk storage, each spill group is written
* round-robin to a set of spill directories.</li>
* </ul>
* <p>
* Data is spilled to disk as a "run". A run consists of one or more (typically
* many) batches, each of which is itself a sorted run of records.
* <p>
* During the sort/merge phase:
* <p>
* <ul>
* <li>When the input operator is complete, this operator merges the accumulated
* batches (which may be all in memory or partially on disk), and returns
* them to the output (downstream) operator in chunks of no more than
* 64K records.</li>
* <li>The final merge must combine a collection of in-memory and spilled
* batches. Several limits apply to the maximum "width" of this merge. For
* example, each open spill run consumes a file handle, and we may wish
* to limit the number of file handles. Further, memory must hold one batch
* from each run, so we may need to reduce the number of runs so that the
* remaining runs can fit into memory. A consolidation phase combines
* in-memory and spilled batches prior to the final merge to control final
* merge width.</li>
* <li>A special case occurs if no batches were spilled. In this case, the input
* batches are sorted in memory without merging.</li>
* </ul>
* <p>
* Many complex details are involved in doing the above; the details are explained
* in the methods of this class.
* <p>
* <h4>Configuration Options</h4>
* <dl>
* <dt>drill.exec.sort.external.spill.fs</dt>
* <dd>The file system (file://, hdfs://, etc.) of the spill directory.</dd>
* <dt>drill.exec.sort.external.spill.directories</dt>
* <dd>The comma-delimited list of directories, on the above file
* system, to which to spill files in round-robin fashion. The query will
* fail if any one of the directories becomes full.</dd>
* <dt>drill.exec.sort.external.spill.file_size</dt>
* <dd>Target size for first-generation spill files. Set this large
* enough to get nice long writes, but not so large that spill directories
* are overwhelmed.</dd>
* <dt>drill.exec.sort.external.mem_limit</dt>
* <dd>Maximum memory to use for the in-memory buffer. (Primarily for testing.)</dd>
* <dt>drill.exec.sort.external.batch_limit</dt>
* <dd>Maximum number of batches to hold in memory. (Primarily for testing.)</dd>
* <dt>drill.exec.sort.external.spill.max_count</dt>
* <dd>Maximum number of batches to add to “first generation” files.
* Defaults to 0 (no limit). (Primarily for testing.)</dd>
* <dt>drill.exec.sort.external.spill.min_count</dt>
* <dd>Minimum number of batches to add to “first generation” files.
* Defaults to 0 (no limit). (Primarily for testing.)</dd>
* <dt>drill.exec.sort.external.merge_limit</dt>
* <dd>Sets the maximum number of runs to be merged in a single pass (limits
* the number of open files.)</dd>
* </dl>
* <p>
* The memory limit observed by this operator is the lesser of:
* <ul>
* <li>The maximum allocation allowed the allocator assigned to this batch
* as set by the Foreman, or</li>
* <li>The maximum limit configured in the mem_limit parameter above. (Primarily for
* testing.)</li>
* </ul>
* <h4>Output</h4>
* It is helpful to note that the sort operator will produce one of two kinds of
* output batches.
* <ul>
* <li>A large output with sv4 if data is sorted in memory. The sv4 addresses
* the entire in-memory sort set. A selection vector remover will copy results
* into new batches of a size determined by that operator.</li>
* <li>A series of batches, without a selection vector, if the sort spills to
* disk. In this case, the downstream operator will still be a selection vector
* remover, but there is nothing for that operator to remove. Each batch is
* of the size set by {@link #MAX_MERGED_BATCH_SIZE}.</li>
* </ul>
* Note that, even in the in-memory sort case, this operator could do the copying
* to eliminate the extra selection vector remover. That is left as an exercise
* for another time.
* <h4>Logging</h4>
* Logging in this operator serves two purposes:
* <ul>
* <li>Normal diagnostic information.</li>
* <li>Capturing the essence of the operator functionality for analysis in unit
* tests.</li>
* </ul>
* Test logging is designed to capture key events and timings. Take care
* when changing or removing log messages as you may need to adjust unit tests
* accordingly.
*/
public class ExternalSortBatch extends AbstractRecordBatch<ExternalSort> {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ExternalSortBatch.class);
protected static final ControlsInjector injector = ControlsInjectorFactory.getInjector(ExternalSortBatch.class);
/**
* Smallest allowed output batch size. The smallest output batch
* created even under constrained memory conditions.
*/
private static final int MIN_MERGED_BATCH_SIZE = 256 * 1024;
/**
* In the bizarre case where the user gave us an unrealistically low
* spill file size, set a floor at some bare minimum size. (Note that,
* at this size, big queries will create a huge number of files, which
* is why the configuration default is on the order of hundreds of MB.)
*/
private static final long MIN_SPILL_FILE_SIZE = 1 * 1024 * 1024;
public static final String INTERRUPTION_AFTER_SORT = "after-sort";
public static final String INTERRUPTION_AFTER_SETUP = "after-setup";
public static final String INTERRUPTION_WHILE_SPILLING = "spilling";
public static final String INTERRUPTION_WHILE_MERGING = "merging";
public static final long DEFAULT_SPILL_BATCH_SIZE = 8L * 1024 * 1024;
public static final long MIN_SPILL_BATCH_SIZE = 256 * 1024;
private final RecordBatch incoming;
/**
* Memory allocator for this operator itself. Incoming batches are
* transferred into this allocator. Intermediate batches used during
* merge also reside here.
*/
private final BufferAllocator allocator;
/**
* Schema of batches that this operator produces.
*/
private BatchSchema schema;
/**
* Incoming batches buffered in memory prior to spilling
* or an in-memory merge.
*/
private LinkedList<BatchGroup.InputBatch> bufferedBatches = Lists.newLinkedList();
private LinkedList<BatchGroup.SpilledRun> spilledRuns = Lists.newLinkedList();
private SelectionVector4 sv4;
/**
* The number of records to add to each output batch sent to the
* downstream operator or spilled to disk.
*/
private int mergeBatchRowCount;
private int peakNumBatches = -1;
/**
* Maximum memory this operator may use. Usually comes from the
* operator definition, but may be overridden by a configuration
* parameter for unit testing.
*/
private long memoryLimit;
/**
* Iterates over the final, sorted results.
*/
private SortResults resultsIterator;
/**
* Manages the set of spill directories and files.
*/
private final SpillSet spillSet;
/**
* Manages the copier used to merge a collection of batches into
* a new set of batches.
*/
private final CopierHolder copierHolder;
private enum SortState { START, LOAD, DELIVER, DONE }
private SortState sortState = SortState.START;
private int inputRecordCount = 0;
private int inputBatchCount = 0; // total number of batches received so far
private final OperatorCodeGenerator opCodeGen;
/**
* Estimated size of the records for this query, updated on each
* new batch received from upstream.
*/
private int estimatedRowWidth;
/**
* Size of the merge batches that this operator produces. Generally
* the same as the merge batch size, unless low memory forces a smaller
* value.
*/
private long targetMergeBatchSize;
/**
* Estimate of the input batch size based on the largest batch seen
* thus far.
*/
private long estimatedInputBatchSize;
/**
* Maximum number of spilled runs that can be merged in a single pass.
*/
private int mergeLimit;
/**
* Target size of the first-generation spill files.
*/
private long spillFileSize;
/**
* Tracks the minimum amount of remaining memory for use
* in populating an operator metric.
*/
private long minimumBufferSpace;
/**
* Maximum memory level before spilling occurs. That is, we can buffer input
* batches in memory until we reach the level given by the buffer memory pool.
*/
private long bufferMemoryPool;
/**
* Maximum memory that can hold batches during the merge
* phase.
*/
private long mergeMemoryPool;
/**
* The target size for merge batches sent downstream.
*/
private long preferredMergeBatchSize;
/**
* Sum of the total number of bytes read from upstream.
* This is the raw memory bytes, not actual data bytes.
*/
private long totalInputBytes;
/**
 * The configured size, in bytes, for each spill batch. Read from the
 * configuration and then clamped in {@link #configure(DrillConfig)}.
 * Held as a primitive: the value is only ever used in arithmetic, so a
 * boxed {@code Long} would just add boxing/unboxing on every use.
 */
private long preferredSpillBatchSize;
/**
* Tracks the maximum density of input batches. Density is
* the amount of actual data / amount of memory consumed.
* Low density batches indicate an EOF or something wrong in
* an upstream operator because a low-density batch wastes
* memory.
*/
private int maxDensity;
private int lastDensity = -1;
/**
* Estimated number of rows that fit into a single spill batch.
*/
private int spillBatchRowCount;
/**
* The estimated actual spill batch size which depends on the
* details of the data rows for any particular query.
*/
private int targetSpillBatchSize;
// WARNING: The enum here is used within this class. But, the members of
// this enum MUST match those in the (unmanaged) ExternalSortBatch since
// that is the enum used in the UI to display metrics for the query profile.
/**
 * Operator metrics published by this sort. The position of each member
 * is significant because {@link #metricId()} returns the ordinal: do not
 * reorder, insert or remove members (note the {@code RETIRED1} placeholder
 * that keeps a retired slot occupied).
 */
public enum Metric implements MetricDef {
SPILL_COUNT, // number of times operator spilled to disk
RETIRED1, // Was: peak value for totalSizeInMemory
// But operator already provides this value
PEAK_BATCHES_IN_MEMORY, // maximum number of batches kept in memory
MERGE_COUNT, // Number of second+ generation merges
MIN_BUFFER, // Minimum memory level observed in operation.
SPILL_MB; // Number of MB of data spilled to disk. This
// amount is first written, then later re-read.
// So, disk I/O is twice this amount.
/**
 * @return the id of this metric, which is simply its declaration
 * position (ordinal) within this enum
 */
@Override
public int metricId() {
return ordinal();
}
}
/**
* Iterates over the final sorted results. Implemented differently
* depending on whether the results are in-memory or spilled to
* disk.
*/
public interface SortResults {
// Advances to the next output batch. This operator returns OK downstream
// when the result is true and treats false as the end of the delivery phase.
boolean next();
// Releases whatever the iterator still holds. This operator calls it as
// soon as next() reports no more data, and then drops its reference.
void close();
// Number of batches returned so far (logged when delivery completes).
int getBatchCount();
// Number of records returned so far (logged when delivery completes).
int getRecordCount();
}
/**
 * Creates the sort operator over the given upstream batch.
 * <p>
 * Initialization order matters: the allocator must be assigned before
 * the copier holder (which receives it) and before {@code configure()}
 * (which reads its limit); the code generator must exist before the
 * copier holder that is handed a reference to it.
 *
 * @param popConfig the physical operator definition for this sort
 * @param context the fragment context in which the operator runs
 * @param incoming the upstream batch that supplies the rows to sort
 */
public ExternalSortBatch(ExternalSort popConfig, FragmentContext context, RecordBatch incoming) {
  super(popConfig, context, true);
  this.incoming = incoming;
  this.allocator = oContext.getAllocator();
  this.opCodeGen = new OperatorCodeGenerator(context, popConfig);
  this.spillSet = new SpillSet(context, popConfig, "sort", "run");
  this.copierHolder = new CopierHolder(context, this.allocator, this.opCodeGen);
  configure(context.getConfig());
}
/**
 * Derives the operator's memory and spill settings from the allocator
 * limit and the Drill configuration, clamping each value to a workable
 * range, and logs the resulting values at debug level.
 *
 * @param config the Drill configuration to read the sort options from
 */
private void configure(DrillConfig config) {

  // Start from the allocator's limit (set from the operator definition),
  // then apply the optional configured cap, which is typically used only
  // for testing. A non-positive configured value means "no cap".
  memoryLimit = allocator.getLimit();
  final long configuredMemoryCap = config.getBytes(ExecConstants.EXTERNAL_SORT_MAX_MEMORY);
  if (configuredMemoryCap > 0) {
    memoryLimit = Math.min(memoryLimit, configuredMemoryCap);
  }

  // Cap on the number of spilled runs merged in one pass (bounds the
  // number of open file handles). No configured value means unlimited;
  // a configured value is raised to at least two, since merging fewer
  // than two runs makes no progress.
  mergeLimit = getConfigLimit(config, ExecConstants.EXTERNAL_SORT_MERGE_LIMIT, Integer.MAX_VALUE, 2);

  // Target size of first-generation spill files, never below the floor.
  spillFileSize = Math.max(
      config.getBytes(ExecConstants.EXTERNAL_SORT_SPILL_FILE_SIZE),
      MIN_SPILL_FILE_SIZE);

  // Spill batch size: performance critical. Too large and too few
  // batches fit in memory; too small and disk seeks dominate. Use at
  // most a quarter of memory per spill batch so that a merge remains
  // possible, but never drop below the minimum to avoid thrashing.
  final long configuredSpillBatch = config.getBytes(ExecConstants.EXTERNAL_SORT_SPILL_BATCH_SIZE);
  preferredSpillBatchSize = Math.max(
      Math.min(configuredSpillBatch, memoryLimit / 4),
      MIN_SPILL_BATCH_SIZE);

  // Output (merge) batch size: the configured size, limited to the memory
  // that remains once two spill batches are set aside, and never smaller
  // than the minimum merged batch size.
  final long configuredMergeBatch = config.getBytes(ExecConstants.EXTERNAL_SORT_MERGE_BATCH_SIZE);
  final long mergeBatchCeiling = memoryLimit - 2 * preferredSpillBatchSize;
  preferredMergeBatchSize = Math.max(
      Math.min(mergeBatchCeiling, configuredMergeBatch),
      MIN_MERGED_BATCH_SIZE);

  logger.debug("Config: memory limit = {}, " +
      "spill file size = {}, spill batch size = {}, merge limit = {}, merge batch size = {}",
      memoryLimit, spillFileSize, preferredSpillBatchSize, mergeLimit,
      preferredMergeBatchSize);
}
/**
 * Reads an integer limit from the configuration.
 *
 * @param config the configuration to read from
 * @param paramName the name of the configuration option
 * @param valueIfZero the value to return when the configured value is
 * zero or negative (that is, "not set")
 * @param minValue the smallest value a positive configured limit may take
 * @return {@code valueIfZero} if the option is not positive, otherwise
 * the configured value raised to at least {@code minValue}
 */
private int getConfigLimit(DrillConfig config, String paramName, int valueIfZero, int minValue) {
  final int configured = config.getInt(paramName);
  if (configured <= 0) {
    return valueIfZero;
  }
  return Math.max(configured, minValue);
}
/**
 * @return the record count of the selection vector when one is present,
 * otherwise the record count of the output container
 */
@Override
public int getRecordCount() {
  return (sv4 == null) ? container.getRecordCount() : sv4.getCount();
}
/**
 * @return the four-byte selection vector currently held by this
 * operator, or {@code null} if none has been set
 */
@Override
public SelectionVector4 getSelectionVector4() {
  return this.sv4;
}
/**
 * Closes every batch group in the collection, continuing past failures
 * so that all groups get a chance to release their resources.
 *
 * @param groups the batch groups to close
 */
private void closeBatchGroups(Collection<? extends BatchGroup> groups) {
  for (BatchGroup batchGroup : groups) {
    try {
      batchGroup.close();
    } catch (Exception closeFailure) {
      // Report the failure to the fragment context rather than throwing:
      // throwing would abandon the groups not yet closed, whereas
      // context.fail() records this error and lets the loop carry on,
      // also capturing any further errors raised during cleanup.
      context.fail(closeFailure);
    }
  }
}
/**
 * Fast-path schema setup, called by {@link AbstractRecordBatch} so the
 * schema can be returned to the client quickly. Fetches the first batch
 * from upstream; that batch is left waiting for the first call to
 * {@link #innerNext()}. On a data outcome, the output container is
 * populated with an (empty) vector per incoming column; on a terminal
 * outcome, the batch state is updated to match.
 */
@Override
public void buildSchema() {
  final IterOutcome firstOutcome = next(incoming);
  switch (firstOutcome) {
    // Terminal outcomes: just record the corresponding batch state.
    case NONE:
      state = BatchState.DONE;
      break;
    case STOP:
      state = BatchState.STOP;
      break;
    case OUT_OF_MEMORY:
      state = BatchState.OUT_OF_MEMORY;
      break;

    // Data outcomes: mirror the incoming columns in the output container.
    case OK:
    case OK_NEW_SCHEMA:
      for (VectorWrapper<?> incomingColumn : incoming) {
        @SuppressWarnings("resource")
        ValueVector outputVector = container.addOrGet(incomingColumn.getField());
        if (outputVector instanceof AbstractContainerVector) {
          incomingColumn.getValueVector().makeTransferPair(outputVector); // Can we remove this hack?
          outputVector.clear();
        }
        outputVector.allocateNew(); // Can we remove this? - SVR fails with NPE (TODO)
      }
      container.buildSchema(SelectionVectorMode.NONE);
      container.setRecordCount(0);
      break;

    default:
      throw new IllegalStateException("Unexpected iter outcome: " + firstOutcome);
  }
}
/**
 * Handles each request for a batch by dispatching on the sort state.
 * While in the start or load state the call reads all incoming batches
 * and sorts them (spilling as needed); in the deliver state it returns
 * the next batch of sorted results; once done it reports no more data.
 *
 * @return the outcome to report downstream
 */
@Override
public IterOutcome innerNext() {
  switch (sortState) {
    case START:
    case LOAD:
      return load();
    case DELIVER:
      return nextOutputBatch();
    case DONE:
      return IterOutcome.NONE;
    default:
      throw new IllegalStateException("Unexpected sort state: " + sortState);
  }
}
/**
 * Delivers the next batch of sorted results, or finishes the delivery
 * phase when the results are exhausted.
 *
 * @return OK if a batch is available, NONE once all results have been
 * returned
 */
private IterOutcome nextOutputBatch() {
  if (!resultsIterator.next()) {
    logger.trace("Deliver phase complete: Returned {} batches, {} records",
        resultsIterator.getBatchCount(), resultsIterator.getRecordCount());
    sortState = SortState.DONE;

    // Close the iterator right away rather than waiting for close():
    // this releases remaining resources such as spill files. It matters
    // when a query has a join, where the sort on the first branch may
    // finish long before the fragment executor calls this operator's
    // close method.
    resultsIterator.close();
    resultsIterator = null;
    return IterOutcome.NONE;
  }

  injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_WHILE_MERGING);
  return IterOutcome.OK;
}
/**
 * Loads and processes one upstream batch, handling schema changes. In
 * general, the external sort accepts only one schema.
 *
 * @return OK if a batch was consumed (or memory was freed by spilling),
 * NONE or STOP as passed up from upstream, or OUT_OF_MEMORY if upstream
 * ran out of memory and there was too little buffered data to spill
 */
private IterOutcome loadBatch() {

  // The very first batch was already fetched by buildSchema(), so do not
  // ask upstream again; just treat it as a batch with a new schema.
  final IterOutcome upstream;
  if (sortState != SortState.START) {
    upstream = next(incoming);
  } else {
    sortState = SortState.LOAD;
    upstream = IterOutcome.OK_NEW_SCHEMA;
  }

  switch (upstream) {
    case OK:
    case OK_NEW_SCHEMA:
      setupSchema(upstream);

      // Add the batch to the in-memory generation, spilling if needed.
      processBatch();
      return IterOutcome.OK;

    case OUT_OF_MEMORY:
      // Note: it is highly doubtful that this code actually works. It
      // requires that the upstream batches got to a safe place to run
      // out of memory and that no work was in-flight and thus abandoned.
      // Consider removing this case once resource management is in place.
      logger.error("received OUT_OF_MEMORY, trying to spill");
      if (bufferedBatches.size() <= 2) {
        logger.error("not enough batches to spill, sending OUT_OF_MEMORY downstream");
        return IterOutcome.OUT_OF_MEMORY;
      }
      spillFromMemory();
      return IterOutcome.OK;

    case NONE:
    case STOP:
      return upstream;

    default:
      throw new IllegalStateException("Unexpected iter outcome: " + upstream);
  }
}
/**
 * Runs the load phase: reads every upstream batch, then starts either
 * the in-memory sort or the merge of spilled runs. Bails out early if
 * upstream reports anything other than data or end-of-data.
 *
 * @return the outcome of the sort or merge if rows were read, NONE if
 * there were no rows, or the exceptional outcome passed up from the
 * input
 */
private IterOutcome load() {
  logger.trace("Start of load phase");

  // Discard the temporary container set up by buildSchema().
  container.clear();

  // Consume input until upstream reports NONE (all batches read). Any
  // outcome other than OK along the way means something went wrong and
  // is returned to the caller as-is.
  IterOutcome outcome = loadBatch();
  while (outcome != IterOutcome.NONE) {
    if (outcome != IterOutcome.OK) {
      return outcome;
    }
    outcome = loadBatch();
  }

  // Nothing to sort: we are finished.
  if (inputRecordCount == 0) {
    sortState = SortState.DONE;
    return IterOutcome.NONE;
  }
  logger.debug("Completed load phase: read {} batches, spilled {} times, total input bytes: {}",
      inputBatchCount, spilledRuns.size(), totalInputBytes);

  // Sort entirely in memory when the data fits; otherwise do a
  // disk-based merge of the pre-sorted spilled batches.
  return canUseMemoryMerge() ? sortInMemory() : mergeSpilledRuns();
}
/**
 * Called once all upstream data has been read to choose between the fast
 * in-memory sort and a merge (which typically, but not always, involves
 * spilled batches).
 *
 * @return true if nothing was spilled, enough memory remains for the
 * in-memory sorter, and the number of buffered batches is addressable
 * by an SV4; false otherwise
 */
private boolean canUseMemoryMerge() {
  // Once anything has spilled, a merge is the only option.
  if (spillSet.hasSpilled()) {
    return false;
  }

  // Is there room left for MSorter (the in-memory sorter)?
  final long freeMemory = memoryLimit - allocator.getAllocatedMemory();
  final boolean sorterFits = freeMemory >= MSortTemplate.memoryNeeded(inputRecordCount);

  // An SV4 can address only a limited number of batches.
  return sorterFits && bufferedBatches.size() <= Character.MAX_VALUE;
}
/**
 * Reconciles the operator's schema with that of the batch just received.
 * The ESB is quite limited in its ability to handle schema changes: a
 * genuinely different schema is accepted only when the union type is
 * enabled, in which case the two schemas are merged.
 *
 * @param upstream the status code from upstream: either OK or OK_NEW_SCHEMA
 * @throws UserException if the schema truly changed and the union type
 * is not enabled
 */
private void setupSchema(IterOutcome upstream) {
  if (schema == null) {
    // First batch: simply adopt its schema.
    schema = incoming.getSchema();
  } else {
    // Same schema as before: nothing to do.
    if (upstream == IterOutcome.OK) {
      return;
    }

    // Upstream announced a new schema, but it is identical to ours.
    // Such artificial schema changes are ignored.
    if (incoming.getSchema().equals(schema)) {
      return;
    }

    if (!unionTypeEnabled) {
      throw UserException.unsupportedError()
          .message("Schema changes not supported in External Sort. Please enable Union type.")
          .build(logger);
    }

    // Real change: merge the schemas and regenerate the sorter and copier.
    schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema());
    opCodeGen.setSchema(schema);
  }

  // Bring everything already accepted in line with the current schema.
  for (BatchGroup buffered : bufferedBatches) {
    buffered.setSchema(schema);
  }
  for (BatchGroup spilled : spilledRuns) {
    spilled.setSchema(schema);
  }
}
/**
 * Converts the incoming batch into the agreed-upon format. (Also seems
 * to make a persistent shallow copy of the batch saved until we are
 * ready to sort or spill.)
 *
 * @return the converted batch, or null if the incoming batch is empty
 */
@SuppressWarnings("resource")
private VectorContainer convertBatch() {

  // The batch must be accepted even when it holds no records.
  final VectorContainer coerced = SchemaUtil.coerceContainer(incoming, schema, oContext);
  if (incoming.getRecordCount() != 0) {
    return coerced;
  }

  // Empty batch: no further processing will happen, so release the
  // memory held by its vectors and by the incoming selection vector.
  for (VectorWrapper<?> column : coerced) {
    column.clear();
  }
  final SelectionVector2 incomingSv2 = incoming.getSelectionVector2();
  if (incomingSv2 != null) {
    incomingSv2.clear();
  }
  return null;
}
/**
 * Obtains the two-byte selection vector used to sort the incoming batch.
 *
 * @return a clone of the incoming batch's selection vector when the
 * batch carries one, otherwise a newly created selection vector
 */
private SelectionVector2 makeSelectionVector() {
  final boolean incomingHasSv2 =
      incoming.getSchema().getSelectionVectorMode() == SelectionVectorMode.TWO_BYTE;
  return incomingHasSv2 ? incoming.getSelectionVector2().clone() : newSV2();
}
/**
 * Process the incoming batch by adding it to the in-memory store of
 * data, spilling buffered data to disk first when necessary. The batch
 * is converted to the operator's schema, sorted individually via a
 * selection vector, and appended to the list of buffered batches.
 * <p>
 * Every buffer acquired here (the converted batch and its selection
 * vector) is released again if a later step fails, so a failed batch
 * does not leave memory behind in the operator's allocator.
 *
 * @throws UserException if the sorter reports a schema change
 */
@SuppressWarnings("resource")
private void processBatch() {

  // Skip empty batches (such as the first one.)
  if (incoming.getRecordCount() == 0) {
    return;
  }

  // Determine actual sizes of the incoming batch before taking
  // ownership. Allows us to figure out if we need to spill first,
  // to avoid overflowing memory simply due to ownership transfer.
  RecordBatchSizer sizer = analyzeIncomingBatch();

  // The heart of the external sort operator: spill to disk when
  // the in-memory generation exceeds the allowed memory limit.
  // Preemptively spill BEFORE accepting the new batch into our memory
  // pool. The allocator will throw an OOM exception if we accept the
  // batch when we are near the limit - despite the fact that the batch
  // is already in memory and no new memory is allocated during the transfer.
  if (isSpillNeeded(sizer.actualSize())) {
    spillFromMemory();
  }

  // Sanity check. We should now be below the buffer memory maximum.
  long startMem = allocator.getAllocatedMemory();
  if (startMem > bufferMemoryPool) {
    logger.error( "ERROR: Failed to spill above buffer limit. Buffer pool = {}, memory = {}",
        bufferMemoryPool, startMem);
  }

  // Convert the incoming batch to the agreed-upon schema.
  // No converted batch means we got an empty input batch.
  // Converting the batch transfers memory ownership to our
  // allocator. This gives a round-about way to learn the batch
  // size: check the before and after memory levels, then use
  // the difference as the batch size, in bytes.
  VectorContainer convertedBatch = convertBatch();
  if (convertedBatch == null) {
    return;
  }

  SelectionVector2 sv2;
  try {
    sv2 = makeSelectionVector();
  } catch (Exception e) {
    convertedBatch.clear();
    throw e;
  }

  // Compute batch size, including allocation of an sv2.
  long endMem = allocator.getAllocatedMemory();
  long batchSize = endMem - startMem;
  int count = sv2.getCount();
  inputRecordCount += count;
  inputBatchCount++;
  totalInputBytes += sizer.actualSize();

  // Update the minimum buffer space metric.
  // NOTE(review): the value recorded is the allocated-memory level after
  // accepting the batch, not the remaining memory - confirm this is the
  // intended meaning of the MIN_BUFFER metric.
  if (minimumBufferSpace == 0) {
    minimumBufferSpace = endMem;
  } else {
    minimumBufferSpace = Math.min(minimumBufferSpace, endMem);
  }
  stats.setLongStat(Metric.MIN_BUFFER, minimumBufferSpace);

  // Update the size based on the actual record count, not
  // the effective count as given by the selection vector
  // (which may exclude some records due to filtering.)
  updateMemoryEstimates(batchSize, sizer);

  // Sort the incoming batch using either the original selection vector,
  // or a new one created here. Until the batch is handed to the
  // RecordBatchData below, this method is the only owner of both the
  // converted batch and the selection vector, so both must be released
  // here on any failure.
  try {
    SingleBatchSorter sorter = opCodeGen.getSorter(convertedBatch);
    try {
      sorter.setup(context, sv2, convertedBatch);
      sorter.sort(sv2);
    } catch (SchemaChangeException e) {
      throw UserException.unsupportedError(e)
          .message("Unexpected schema change.")
          .build(logger);
    }
  } catch (Throwable t) {
    sv2.clear();
    convertedBatch.clear();
    throw t;
  }

  // Hand the sorted batch over to the in-memory buffer and keep the
  // peak-batches metric current.
  RecordBatchData rbd = new RecordBatchData(convertedBatch, allocator);
  try {
    rbd.setSv2(sv2);
    bufferedBatches.add(new BatchGroup.InputBatch(rbd.getContainer(), rbd.getSv2(), oContext, sizer.netSize()));
    if (peakNumBatches < bufferedBatches.size()) {
      peakNumBatches = bufferedBatches.size();
      stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches);
    }
  } catch (Throwable t) {
    rbd.clear();
    throw t;
  }
}
/**
 * Scans the vectors of the incoming batch to determine the batch size
 * and whether any oversize columns exist. (Oversize columns cause memory
 * fragmentation.) The analysis is logged at debug level while no batch
 * has yet been counted as input, that is, for the first batch analyzed.
 *
 * @return an analysis of the incoming batch
 */
private RecordBatchSizer analyzeIncomingBatch() {
  final RecordBatchSizer batchSizer = new RecordBatchSizer(incoming);
  batchSizer.applySv2();
  final boolean noBatchCountedYet = (inputBatchCount == 0);
  if (noBatchCountedYet) {
    logger.debug("{}", batchSizer.toString());
  }
  return batchSizer;
}
/**
* Update the data-driven memory use numbers including:
* <ul>
* <li>The average size of incoming records.</li>
* <li>The estimated spill and output batch size.</li>
* <li>The estimated number of average-size records per
* spill and output batch.</li>
* <li>The amount of memory set aside to hold the incoming
* batches before spilling starts.</li>
* </ul>
*
* @param actualBatchSize the overall size of the current batch received from
* upstream
* @param actualRecordCount the number of actual (not filtered) records in
* that upstream batch
*/
private void updateMemoryEstimates(long memoryDelta, RecordBatchSizer sizer) {

  // Overview: refresh the row-width and input-batch-size estimates from the
  // batch described by the sizer, then re-derive the spill and merge batch
  // sizes (in rows and bytes) and the buffer and merge memory pools.
  // memoryDelta is used only for a diagnostic comparison against the
  // size reported by the sizer.

  long actualBatchSize = sizer.actualSize();
  int actualRecordCount = sizer.rowCount();

  if (actualBatchSize != memoryDelta) {
    logger.debug("Memory delta: {}, actual batch size: {}, Diff: {}",
                 memoryDelta, actualBatchSize, memoryDelta - actualBatchSize);
  }

  // The record count should never be zero, but better safe than sorry...

  if (actualRecordCount == 0) {
    return;
  }

  // If the vector is less than 75% full, just ignore it, except in the
  // unfortunate case where it is the first batch. Low-density batches generally
  // occur only at the end of a file or at the end of a DFS block. In such a
  // case, we will continue to rely on estimates created on previous, high-
  // density batches.
  // We actually track the max density seen, and compare to 75% of that since
  // Parquet produces very low density record batches.
  // A batch whose density equals the last low density seen is not skipped.

  if (sizer.avgDensity() < maxDensity * 3 / 4 && sizer.avgDensity() != lastDensity) {
    logger.trace("Saw low density batch. Density: {}", sizer.avgDensity());
    lastDensity = sizer.avgDensity();
    return;
  }
  maxDensity = Math.max(maxDensity, sizer.avgDensity());

  // We know the batch size and number of records. Use that to estimate
  // the average record size. Since a typical batch has many records,
  // the average size is a fairly good estimator. Note that the batch
  // size includes not just the actual vector data, but any unused space
  // resulting from power-of-two allocation. This means that we don't
  // have to do size adjustments for input batches as we will do below
  // when estimating the size of other objects.

  int batchRowWidth = sizer.netRowWidth();

  // Record sizes may vary across batches. To be conservative, use
  // the largest size observed from incoming batches.

  int origRowEstimate = estimatedRowWidth;
  estimatedRowWidth = Math.max(estimatedRowWidth, batchRowWidth);

  // Maintain an estimate of the incoming batch size: the largest
  // batch yet seen. Used to reserve memory for the next incoming
  // batch. Because we are using the actual observed batch size,
  // the size already includes overhead due to power-of-two rounding.

  long origInputBatchSize = estimatedInputBatchSize;
  estimatedInputBatchSize = Math.max(estimatedInputBatchSize, actualBatchSize);

  // The row width may end up as zero if all fields are nulls or some
  // other unusual situation. In this case, assume a width of 10 just
  // to avoid lots of special case code.

  if (estimatedRowWidth == 0) {
    estimatedRowWidth = 10;
  }

  // Go no further if nothing changed.

  if (estimatedRowWidth == origRowEstimate && estimatedInputBatchSize == origInputBatchSize) {
    return;
  }

  // Estimate the total size of each incoming batch plus sv2. Note that, due
  // to power-of-two rounding, the allocated sv2 size might be twice the data size.

  long estimatedInputSize = estimatedInputBatchSize + 4 * actualRecordCount;

  // Determine the number of records to spill per spill batch. The goal is to
  // spill batches of either 64K records, or as many records as fit into the
  // amount of memory dedicated to each spill batch, whichever is less.

  spillBatchRowCount = (int) Math.max(1, preferredSpillBatchSize / estimatedRowWidth / 2);
  spillBatchRowCount = Math.min(spillBatchRowCount, Character.MAX_VALUE);

  // Compute the actual spill batch size which may be larger or smaller
  // than the preferred size depending on the row width. Double the estimated
  // memory needs to allow for power-of-two rounding.

  targetSpillBatchSize = spillBatchRowCount * estimatedRowWidth * 2;

  // Determine the number of records per batch per merge step. The goal is to
  // merge batches of either 64K records, or as many records as fit into the
  // amount of memory dedicated to each merge batch, whichever is less.

  mergeBatchRowCount = (int) Math.max(1, preferredMergeBatchSize / estimatedRowWidth / 2);
  mergeBatchRowCount = Math.min(mergeBatchRowCount, Character.MAX_VALUE);
  mergeBatchRowCount = Math.max(1, mergeBatchRowCount);
  targetMergeBatchSize = mergeBatchRowCount * estimatedRowWidth * 2;

  // Determine the minimum memory needed for spilling. Spilling is done just
  // before accepting a batch, so we must spill if we don't have room for a
  // (worst case) input batch. To spill, we need room for the output batch created
  // by merging the batches already in memory. Double this to allow for power-of-two
  // memory allocations.

  long spillPoint = estimatedInputBatchSize + 2 * targetSpillBatchSize;

  // The merge memory pool assumes we can spill all input batches. To make
  // progress, we must have at least two spill-sized batches to read from
  // and one output (merge) batch.

  long minMergeMemory = 2 * targetSpillBatchSize + targetMergeBatchSize;

  // If we are in a low-memory condition, then we might not have room for the
  // default output batch size. In that case, pick a smaller size.

  if (minMergeMemory > memoryLimit) {

    // Figure out the minimum output batch size based on memory,
    // must hold at least one complete row.

    long mergeAllowance = memoryLimit - 2 * targetSpillBatchSize;
    targetMergeBatchSize = Math.max(estimatedRowWidth, mergeAllowance / 2);

    // When the target size collapses to a single row width the division
    // below yields zero; clamp to one row so the merge can still advance.

    mergeBatchRowCount = (int) Math.max(1, targetMergeBatchSize / estimatedRowWidth / 2);
    minMergeMemory = 2 * targetSpillBatchSize + targetMergeBatchSize;
  }

  // Determine the minimum total memory we would need to receive two input
  // batches (the minimum needed to make progress) and the allowance for the
  // output batch.

  long minLoadMemory = spillPoint + estimatedInputSize;

  // Determine how much memory can be used to hold in-memory batches of spilled
  // runs when reading from disk.

  bufferMemoryPool = memoryLimit - spillPoint;
  mergeMemoryPool = Math.max(memoryLimit - minMergeMemory,
                             (long) ((memoryLimit - 3 * targetMergeBatchSize) * 0.95));

  // Sanity check: if we've been given too little memory to make progress,
  // issue a warning but proceed anyway. Should only occur if something is
  // configured terribly wrong.

  long minMemoryNeeds = Math.max(minLoadMemory, minMergeMemory);
  if (minMemoryNeeds > memoryLimit) {
    logger.warn("Potential memory overflow! " +
                "Minumum needed = {} bytes, actual available = {} bytes",
                minMemoryNeeds, memoryLimit);
  }

  // Log the calculated values. Turn this on if things seem amiss.
  // Message will appear only when the values change.

  logger.debug("Input Batch Estimates: record size = {} bytes; input batch = {} bytes, {} records",
               estimatedRowWidth, estimatedInputBatchSize, actualRecordCount);
  logger.debug("Merge batch size = {} bytes, {} records; spill file size: {} bytes",
               targetSpillBatchSize, spillBatchRowCount, spillFileSize);
  logger.debug("Output batch size = {} bytes, {} records",
               targetMergeBatchSize, mergeBatchRowCount);
  logger.debug("Available memory: {}, buffer memory = {}, merge memory = {}",
               memoryLimit, bufferMemoryPool, mergeMemoryPool);
}
/**
 * Decide whether buffered batches must be spilled before the next
 * incoming batch is accepted. The decision rests on two things only:
 * how many batches are buffered and how much memory is in use.
 *
 * @param incomingSize size of the batch about to be received, added to
 * the allocator's currently allocated memory for the comparison
 * @return true if spilling is needed, false otherwise
 */
private boolean isSpillNeeded(int incomingSize) {

  // A spill merges buffered batches, so fewer than two buffered batches
  // can never be spilled. Otherwise spill once the memory already
  // allocated plus the incoming batch reaches the buffer memory pool.

  return bufferedBatches.size() >= 2
      && allocator.getAllocatedMemory() + incomingSize >= bufferMemoryPool;
}
/**
 * Sort the buffered batches entirely in memory; applicable only when
 * nothing has been spilled.
 * <p>
 * Unlike the spill/merge path, where this class chooses the size of the
 * batches sent downstream, the in-memory path hands over all batches
 * behind a single SV4. The downstream Selection Vector Remover is
 * expected to split that one large SV4 into smaller batches.
 *
 * @return STOP if the merge produced no selection vector,
 * OK_NEW_SCHEMA once sorted results are ready to deliver
 */
private IterOutcome sortInMemory() {
  logger.debug("Starting in-memory sort. Batches = {}, Records = {}, Memory = {}",
               bufferedBatches.size(), inputRecordCount, allocator.getAllocatedMemory());

  final MergeSort sorter = new MergeSort(context, allocator, opCodeGen);

  // Until the sorter is installed as the results iterator this method
  // owns it and must close it; afterwards it is closed along with the
  // results iterator.

  boolean handedOff = false;
  try {
    sv4 = sorter.merge(bufferedBatches, this, container);
    if (sv4 != null) {
      logger.debug("Completed in-memory sort. Memory = {}",
                   allocator.getAllocatedMemory());
      resultsIterator = sorter;
      handedOff = true;
      sortState = SortState.DELIVER;
      return IterOutcome.OK_NEW_SCHEMA;
    }

    // No selection vector: nothing to deliver.

    sortState = SortState.DONE;
    return IterOutcome.STOP;
  } finally {
    if (!handedOff) {
      sorter.close();
    }
  }
}
/**
 * Merge the (typically spilled) batches. Runs consolidation passes until
 * no further pass is required, then starts a final merge over the
 * remaining in-memory batches and spilled runs. That final merge becomes
 * the results iterator and is read one batch at a time.
 *
 * @return always returns OK_NEW_SCHEMA
 */
private IterOutcome mergeSpilledRuns() {
  logger.debug("Starting consolidate phase. Batches = {}, Records = {}, Memory = {}, In-memory batches {}, spilled runs {}",
               inputBatchCount, inputRecordCount, allocator.getAllocatedMemory(),
               bufferedBatches.size(), spilledRuns.size());

  // Keep consolidating until the remaining runs can be merged in a
  // single last pass; count the passes for the operator stats.

  int consolidationPasses = 0;
  for (;;) {
    if (!consolidateBatches()) {
      break;
    }
    consolidationPasses++;
  }
  stats.addLongStat(Metric.MERGE_COUNT, consolidationPasses);

  // Gather in-memory batches first, then spilled runs, into one list
  // for the final merge; both source lists are emptied.

  final List<BatchGroup> finalInputs = new LinkedList<>();
  finalInputs.addAll(bufferedBatches);
  finalInputs.addAll(spilledRuns);
  bufferedBatches.clear();
  spilledRuns.clear();

  logger.debug("Starting merge phase. Runs = {}, Alloc. memory = {}",
               finalInputs.size(), allocator.getAllocatedMemory());

  // Start the final merge and advance it once before handing it over
  // as the results iterator.

  final CopierHolder.BatchMerger finalMerge =
      copierHolder.startFinalMerge(schema, finalInputs, container, mergeBatchRowCount);
  finalMerge.next();
  resultsIterator = finalMerge;
  sortState = SortState.DELIVER;
  return IterOutcome.OK_NEW_SCHEMA;
}
/**
 * Perform one consolidation step: either spill some in-memory batches or
 * merge some on-disk runs into a new run.
 *
 * @return true if a spill or merge was performed (the caller should call
 * again), false if nothing more needs consolidating
 */
private boolean consolidateBatches() {
  final int bufferedCount = bufferedBatches.size();
  final int runCount = spilledRuns.size();
  final boolean haveBuffered = bufferedCount > 0;

  // Merge width: as many spill batches as fit in the merge pool,
  // capped by the configured merge limit, but never fewer than two.

  final int poolWidth = (int) (mergeMemoryPool / targetSpillBatchSize);
  final int maxMergeWidth = Math.max(Math.min(mergeLimit, poolWidth), 2);

  if (haveBuffered) {

    // Spill from memory when there are more runs than can be merged at
    // once, or when buffered batches plus runs exceed the merge limit...

    boolean mustSpill = runCount > maxMergeWidth
        || bufferedCount + runCount > mergeLimit;

    // ...or when one spill batch per run plus everything currently
    // allocated would not fit in the merge pool.

    if (!mustSpill) {
      long allocated = allocator.getAllocatedMemory();
      long totalNeeds = runCount * targetSpillBatchSize + allocated;
      mustSpill = totalNeeds > mergeMemoryPool;
    }
    if (mustSpill) {
      spillFromMemory();
      return true;
    }
  }

  // Nothing to do unless there are more runs than the merge width.

  final int excessRuns = runCount - maxMergeWidth;
  if (excessRuns <= 0) {
    return false;
  }

  // Merge at least two runs, plus one to account for the run the merge
  // itself creates, but never more than the merge width.

  final int mergeCount = Math.min(Math.max(2, excessRuns) + 1, maxMergeWidth);

  // Before merging runs, get any in-memory batches out of the way so
  // the memory is available for the merge batches.

  if (haveBuffered) {
    spillFromMemory();
    return true;
  }

  // Merge, then let the caller loop in case one pass was not enough.

  logger.trace("Merging {} on-disk runs, Alloc. memory = {}",
               mergeCount, allocator.getAllocatedMemory());
  mergeRuns(mergeCount);
  return true;
}
/**
 * Spill a leading portion of the buffered (already sorted) input batches
 * to disk. The chosen batches are merged into a stream of new batches,
 * which are written out as one spilled run.
 * <p>
 * Batches are selected from the head of the buffered list until their
 * combined data size approaches the target spill file size. At least two
 * batches are spilled when two are available.
 */
private void spillFromMemory() {

  // Walk the buffered batches, accumulating data size, and stop once
  // adding half of another same-sized batch would overshoot the target
  // spill file size. The resulting file may be somewhat larger or
  // smaller than the target.

  int selected = 0;
  long accumulated = 0;
  for (InputBatch candidate : bufferedBatches) {
    final long size = candidate.getDataSize();
    accumulated += size;
    selected++;
    if (accumulated + size / 2 > spillFileSize) {
      break;
    }
  }

  // Spill at least two batches even if that over-sizes the file, but
  // never more than are buffered (a final consolidation may have one).

  final int spillCount = Math.min(Math.max(selected, 2), bufferedBatches.size());

  mergeAndSpill(bufferedBatches, spillCount);
}
/**
 * Merge a group of spilled runs, taken from the head of the spilled-run
 * list, into a single new spilled run.
 *
 * @param targetCount the desired number of runs to merge; fewer are taken
 * if their batch sizes would exceed the merge memory pool
 */
private void mergeRuns(int targetCount) {

  // Take runs until the target count is reached, or until the next
  // run's batch would push the total past the merge memory pool.

  int selected = 0;
  long poolUse = 0;
  for (SpilledRun candidate : spilledRuns) {
    final long projected = poolUse + candidate.getBatchSize();
    if (projected > mergeMemoryPool) {
      break;
    }
    poolUse = projected;
    if (++selected == targetCount) {
      break;
    }
  }

  // Merge at least two runs even if that exceeds the pool, but never
  // more than exist (a final consolidation may have only one).

  final int mergeCount = Math.min(Math.max(selected, 2), spilledRuns.size());

  mergeAndSpill(spilledRuns, mergeCount);
}
/**
 * Merge the first {@code count} batch groups of {@code source} into a new
 * spilled run and append that run to the list of spilled runs.
 *
 * @param source list from which the batch groups are taken
 * @param count number of batch groups to merge and spill
 */
private void mergeAndSpill(LinkedList<? extends BatchGroup> source, int count) {
  final BatchGroup.SpilledRun newRun = doMergeAndSpill(source, count);
  spilledRuns.add(newRun);
  logger.trace("Completed spill: memory = {}", allocator.getAllocatedMemory());
}
/**
 * Remove up to {@code spillCount} batch groups from the head of the given
 * list, merge them, and write the merged batches to a new spill file.
 * <p>
 * The removed batch groups and the merger are closed when the
 * try-with-resources block exits, whether the spill succeeds or fails.
 *
 * @param batchGroups list to take batch groups from; the taken groups are
 * removed from this list
 * @param spillCount number of batch groups to take; capped at the size of
 * the list
 * @return the spilled run that holds the merged batches
 * @throws UserException if the spill fails; a UserException raised during
 * the spill is rethrown as is, any other failure is wrapped as a
 * resource error
 */
private BatchGroup.SpilledRun doMergeAndSpill(LinkedList<? extends BatchGroup> batchGroups, int spillCount) {
List<BatchGroup> batchesToSpill = Lists.newArrayList();
// Never take more groups than the list holds.
spillCount = Math.min(batchGroups.size(), spillCount);
assert spillCount > 0 : "Spill count to mergeAndSpill must not be zero";
// pollFirst() removes each selected group from the source list.
for (int i = 0; i < spillCount; i++) {
batchesToSpill.add(batchGroups.pollFirst());
}
// Merge the selected set of batches and write them to the
// spill file. After each write, we release the memory associated
// with the just-written batch.
String outputFile = spillSet.getNextSpillFile();
stats.setLongStat(Metric.SPILL_COUNT, spillSet.getFileCount());
BatchGroup.SpilledRun newGroup = null;
// Both resources are closed before the catch block below runs.
try (AutoCloseable ignored = AutoCloseables.all(batchesToSpill);
CopierHolder.BatchMerger merger = copierHolder.startMerge(schema, batchesToSpill, spillBatchRowCount)) {
// NOTE(review): the "of {}" total is computed from bufferedBatches even
// when this method is invoked with the spilled-run list (see mergeRuns).
logger.trace("Spilling {} of {} batches, spill batch size = {} rows, memory = {}, write to {}",
batchesToSpill.size(), bufferedBatches.size() + batchesToSpill.size(),
spillBatchRowCount,
allocator.getAllocatedMemory(), outputFile);
newGroup = new BatchGroup.SpilledRun(spillSet, outputFile, oContext);
// The copier will merge records from the buffered batches into
// the outputContainer up to targetRecordCount number of rows.
// The actual count may be less if fewer records are available.
while (merger.next()) {
// Add a new batch of records (given by merger.getOutput()) to the spill
// file.
//
// note that addBatch also clears the merger's output container
newGroup.addBatch(merger.getOutput());
}
// Fault-injection hook; it is declared to raise IOException here.
injector.injectChecked(context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class);
newGroup.closeOutputStream();
logger.trace("Spilled {} batches, {} records; memory = {} to {}",
merger.getBatchCount(), merger.getRecordCount(),
allocator.getAllocatedMemory(), outputFile);
// Record the merger's estimated batch size on the new run; mergeRuns
// reads it back through getBatchSize().
newGroup.setBatchSize(merger.getEstBatchSize());
return newGroup;
} catch (Throwable e) {
// we only need to clean up newGroup if spill failed
try {
if (newGroup != null) {
AutoCloseables.close(e, newGroup);
}
} catch (Throwable t) { /* close() may hit the same IO issue; just ignore */ }
// Here the merger is holding onto a partially-completed batch.
// It will release the memory in the close() call.
try {
// Rethrow so we can decide how to handle the error.
throw e;
}
// If error is a User Exception, just use as is.
catch (UserException ue) { throw ue; }
// Anything else is reported as a resource error with the cause attached.
catch (Throwable ex) {
throw UserException.resourceError(ex)
.message("External Sort encountered an error while spilling to disk")
.build(logger);
}
}
}
/**
 * Create the selection vector used as the sort index for the incoming
 * batch. The vector is sized to the incoming record count and filled
 * with the identity mapping (entry i holds i).
 *
 * @return a new, populated selection vector 2
 * @throws UserException (resource error) if the vector's buffer cannot
 * be allocated
 */
private SelectionVector2 newSV2() {
  final SelectionVector2 indexVector = new SelectionVector2(allocator);
  final int rowCount = incoming.getRecordCount();

  if (!indexVector.allocateNewSafe(rowCount)) {
    throw UserException.resourceError(new OutOfMemoryException("Unable to allocate sv2 buffer"))
          .build(logger);
  }

  // Identity mapping: position i initially refers to row i.

  int row = 0;
  while (row < rowCount) {
    indexVector.setIndex(row, (char) row);
    row++;
  }
  indexVector.setRecordCount(rowCount);
  return indexVector;
}
/**
 * Not supported by the sort operator.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public WritableBatch getWritableBatch() {
  final String reason = "A sort batch is not writable.";
  throw new UnsupportedOperationException(reason);
}
/**
 * Forward a kill request to the incoming (upstream) batch.
 *
 * @param sendUpstream passed through unchanged to the incoming batch's
 * {@code kill} call
 */
@Override
protected void killIncoming(boolean sendUpstream) {
  this.incoming.kill(sendUpstream);
}
/**
 * Release every resource held by the sort: buffered batches, spilled
 * runs, the SV4, the results iterator, the copier holder, the spill set,
 * the generated-code holder and finally the superclass resources.
 * <p>
 * Each step runs in its own try block so that a failure in one does not
 * prevent the others from running. Only the first runtime exception is
 * remembered; it is rethrown after all steps have been attempted.
 * <p>
 * Some Drill code ends up calling close() two or more times. To tolerate
 * that, the buffered batches, spilled runs and results iterator are set
 * to null once closed and are skipped on later calls.
 */
@Override
public void close() {
  if (spillSet.getWriteBytes() > 0) {
    // Report write and read totals separately; the read total must come
    // from the read counter, not the write counter.
    logger.debug("End of sort. Total write bytes: {}, Total read bytes: {}",
                 spillSet.getWriteBytes(), spillSet.getReadBytes());
  }
  stats.setLongStat(Metric.SPILL_MB,
      (int) Math.round( spillSet.getWriteBytes() / 1024.0D / 1024.0 ) );

  // First failure seen, if any; later failures are dropped.
  RuntimeException ex = null;
  try {
    if (bufferedBatches != null) {
      closeBatchGroups(bufferedBatches);
      bufferedBatches = null;
    }
  } catch (RuntimeException e) {
    ex = e;
  }
  try {
    if (spilledRuns != null) {
      closeBatchGroups(spilledRuns);
      spilledRuns = null;
    }
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }
  try {
    if (sv4 != null) {
      sv4.clear();
    }
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }
  try {
    if (resultsIterator != null) {
      resultsIterator.close();
      resultsIterator = null;
    }
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }
  try {
    copierHolder.close();
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }
  try {
    spillSet.close();
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }
  try {
    opCodeGen.close();
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }

  // The call to super.close() clears out the output container.
  // Doing so requires the allocator, so the allocator must not be
  // closed before this call.

  try {
    super.close();
  } catch (RuntimeException e) {
    ex = (ex == null) ? e : ex;
  }

  // Note: the allocator is not closed here; it is closed by the
  // FragmentManager.

  if (ex != null) {
    throw ex;
  }
}
}