/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.impl.xsort.managed;

import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;

import org.apache.drill.common.AutoCloseables;
import org.apache.drill.common.config.DrillConfig;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.MetricDef;
import org.apache.drill.exec.physical.config.ExternalSort;
import org.apache.drill.exec.physical.impl.sort.RecordBatchData;
import org.apache.drill.exec.physical.impl.spill.RecordBatchSizer;
import org.apache.drill.exec.physical.impl.spill.SpillSet;
import org.apache.drill.exec.physical.impl.xsort.MSortTemplate;
import org.apache.drill.exec.physical.impl.xsort.SingleBatchSorter;
import org.apache.drill.exec.physical.impl.xsort.managed.BatchGroup.InputBatch;
import org.apache.drill.exec.physical.impl.xsort.managed.BatchGroup.SpilledRun;
import org.apache.drill.exec.record.AbstractRecordBatch;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.SchemaUtil;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.WritableBatch;
import org.apache.drill.exec.record.selection.SelectionVector2;
import org.apache.drill.exec.record.selection.SelectionVector4;
import org.apache.drill.exec.testing.ControlsInjector;
import org.apache.drill.exec.testing.ControlsInjectorFactory;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.AbstractContainerVector;

import com.google.common.collect.Lists;

/**
 * External sort batch: a sort batch which can spill to disk in
 * order to operate within a defined memory footprint.
 * <p>
 * <h4>Basic Operation</h4>
 * The operator has three key phases:
 * <p>
 * <ul>
 * <li>The load phase in which batches are read from upstream.</li>
 * <li>The merge phase in which spilled batches are combined to
 * reduce the number of files below the configured limit. (Best
 * practice is to configure the system to avoid this phase.)</li>
 * <li>The delivery phase in which batches are combined to produce
 * the final output.</li>
 * </ul>
 * During the load phase:
 * <p>
 * <ul>
 * <li>The incoming (upstream) operator provides a series of batches.</li>
 * <li>This operator sorts each batch, and accumulates them in an in-memory
 * buffer.</li>
 * <li>If the in-memory buffer becomes too large, this operator selects
 * a subset of the buffered batches to spill.</li>
 * <li>Each spill set is merged to create a new, sorted collection of
 * batches, and each is spilled to disk.</li>
 * <li>To allow the use of multiple disks, each spill group is written
 * round-robin to a set of spill directories.</li>
 * </ul>
 * <p>
 * Data is spilled to disk as a "run". A run consists of one or more (typically
 * many) batches, each of which is itself a sorted run of records.
 * <p>
 * During the sort/merge phase:
 * <p>
 * <ul>
 * <li>When the input operator is complete, this operator merges the accumulated
 * batches (which may be all in memory or partially on disk), and returns
 * them to the output (downstream) operator in chunks of no more than
 * 64K records.</li>
 * <li>The final merge must combine a collection of in-memory and spilled
 * batches. Several limits apply to the maximum "width" of this merge. For
 * example, each open spill run consumes a file handle, and we may wish
 * to limit the number of file handles. Further, memory must hold one batch
 * from each run, so we may need to reduce the number of runs so that the
 * remaining runs can fit into memory. A consolidation phase combines
 * in-memory and spilled batches prior to the final merge to control final
 * merge width.</li>
 * <li>A special case occurs if no batches were spilled. In this case, the input
 * batches are sorted in memory without merging.</li>
 * </ul>
 * <p>
 * Many complex details are involved in doing the above; the details are explained
 * in the methods of this class.
 * <p>
 * <h4>Configuration Options</h4>
 * <dl>
 * <dt>drill.exec.sort.external.spill.fs</dt>
 * <dd>The file system (file://, hdfs://, etc.) of the spill directory.</dd>
 * <dt>drill.exec.sort.external.spill.directories</dt>
 * <dd>The comma-delimited list of directories, on the above file
 * system, to which to spill files in round-robin fashion. The query will
 * fail if any one of the directories becomes full.</dd>
 * <dt>drill.exec.sort.external.spill.file_size</dt>
 * <dd>Target size for first-generation spill files. Set this large
 * enough to get nice long writes, but not so large that spill directories
 * are overwhelmed.</dd>
 * <dt>drill.exec.sort.external.mem_limit</dt>
 * <dd>Maximum memory to use for the in-memory buffer. (Primarily for testing.)</dd>
 * <dt>drill.exec.sort.external.batch_limit</dt>
 * <dd>Maximum number of batches to hold in memory. (Primarily for testing.)</dd>
 * <dt>drill.exec.sort.external.spill.max_count</dt>
 * <dd>Maximum number of batches to add to "first generation" files.
 * Defaults to 0 (no limit). (Primarily for testing.)</dd>
 * <dt>drill.exec.sort.external.spill.min_count</dt>
 * <dd>Minimum number of batches to add to "first generation" files.
 * Defaults to 0 (no limit).
 * (Primarily for testing.)</dd>
 * <dt>drill.exec.sort.external.merge_limit</dt>
 * <dd>Sets the maximum number of runs to be merged in a single pass (limits
 * the number of open files.)</dd>
 * </dl>
 * <p>
 * The memory limit observed by this operator is the lesser of:
 * <ul>
 * <li>The maximum allocation allowed to the allocator assigned to this batch
 * as set by the Foreman, or</li>
 * <li>The maximum limit configured in the mem_limit parameter above. (Primarily for
 * testing.)</li>
 * </ul>
 * <h4>Output</h4>
 * It is helpful to note that the sort operator will produce one of two kinds of
 * output batches.
 * <ul>
 * <li>A large output with an sv4 if data is sorted in memory. The sv4 addresses
 * the entire in-memory sort set. A selection vector remover will copy results
 * into new batches of a size determined by that operator.</li>
 * <li>A series of batches, without a selection vector, if the sort spills to
 * disk. In this case, the downstream operator will still be a selection vector
 * remover, but there is nothing for that operator to remove. Each batch is
 * of the size given by the preferred merge batch size
 * ({@link #preferredMergeBatchSize}).</li>
 * </ul>
 * Note that, even in the in-memory sort case, this operator could do the copying
 * itself and eliminate the extra selection vector remover. That is left as an
 * exercise for another time.
 * <h4>Logging</h4>
 * Logging in this operator serves two purposes:
 * <ul>
 * <li>Normal diagnostic information.</li>
 * <li>Capturing the essence of the operator functionality for analysis in unit
 * tests.</li>
 * </ul>
 * Test logging is designed to capture key events and timings. Take care
 * when changing or removing log messages as you may need to adjust unit tests
 * accordingly.
 */
public class ExternalSortBatch extends AbstractRecordBatch<ExternalSort> {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ExternalSortBatch.class);
  protected static final ControlsInjector injector = ControlsInjectorFactory.getInjector(ExternalSortBatch.class);

  /**
   * Smallest allowed output batch size: the smallest output batch
   * created even under constrained memory conditions.
   */
  private static final int MIN_MERGED_BATCH_SIZE = 256 * 1024;

  /**
   * In the bizarre case where the user gave us an unrealistically low
   * spill file size, set a floor at some bare minimum size. (Note that,
   * at this size, big queries will create a huge number of files, which
   * is why the configuration default is on the order of hundreds of MB.)
   */
  private static final long MIN_SPILL_FILE_SIZE = 1 * 1024 * 1024;

  public static final String INTERRUPTION_AFTER_SORT = "after-sort";
  public static final String INTERRUPTION_AFTER_SETUP = "after-setup";
  public static final String INTERRUPTION_WHILE_SPILLING = "spilling";
  public static final String INTERRUPTION_WHILE_MERGING = "merging";
  public static final long DEFAULT_SPILL_BATCH_SIZE = 8L * 1024 * 1024;
  public static final long MIN_SPILL_BATCH_SIZE = 256 * 1024;

  private final RecordBatch incoming;

  /**
   * Memory allocator for this operator itself. Incoming batches are
   * transferred into this allocator. Intermediate batches used during
   * merge also reside here.
   */
  private final BufferAllocator allocator;

  /**
   * Schema of batches that this operator produces.
   */
  private BatchSchema schema;

  /**
   * Incoming batches buffered in memory prior to spilling
   * or an in-memory merge.
   */
  private LinkedList<BatchGroup.InputBatch> bufferedBatches = Lists.newLinkedList();
  private LinkedList<BatchGroup.SpilledRun> spilledRuns = Lists.newLinkedList();
  private SelectionVector4 sv4;

  /**
   * The number of records to add to each output batch sent to the
   * downstream operator or spilled to disk.
   */
  private int mergeBatchRowCount;
  private int peakNumBatches = -1;

  /**
   * Maximum memory this operator may use. Usually comes from the
   * operator definition, but may be overridden by a configuration
   * parameter for unit testing.
   */
  private long memoryLimit;

  /**
   * Iterates over the final, sorted results.
   */
  private SortResults resultsIterator;

  /**
   * Manages the set of spill directories and files.
   */
  private final SpillSet spillSet;

  /**
   * Manages the copier used to merge a collection of batches into
   * a new set of batches.
   */
  private final CopierHolder copierHolder;

  private enum SortState { START, LOAD, DELIVER, DONE }
  private SortState sortState = SortState.START;
  private int inputRecordCount = 0;
  private int inputBatchCount = 0; // total number of batches received so far

  private final OperatorCodeGenerator opCodeGen;

  /**
   * Estimated size of the records for this query, updated on each
   * new batch received from upstream.
   */
  private int estimatedRowWidth;

  /**
   * Size of the merge batches that this operator produces. Generally
   * the same as the preferred merge batch size, unless low memory forces
   * a smaller value.
   */
  private long targetMergeBatchSize;

  /**
   * Estimate of the input batch size based on the largest batch seen
   * thus far.
   */
  private long estimatedInputBatchSize;

  /**
   * Maximum number of spilled runs that can be merged in a single pass.
   */
  private int mergeLimit;

  /**
   * Target size of the first-generation spill files.
   */
  private long spillFileSize;

  /**
   * Tracks the minimum amount of remaining memory for use
   * in populating an operator metric.
   */
  private long minimumBufferSpace;

  /**
   * Maximum memory level before spilling occurs. That is, we can buffer input
   * batches in memory until we reach the level given by the buffer memory pool.
   */
  private long bufferMemoryPool;

  /**
   * Maximum memory that can hold batches during the merge
   * phase.
   */
  private long mergeMemoryPool;

  /**
   * The target size for merge batches sent downstream.
   */
  private long preferredMergeBatchSize;

  /**
   * Total number of bytes read from upstream. This is the raw memory
   * footprint of the batches, not the actual data bytes.
   */
  private long totalInputBytes;

  /**
   * The configured size for each spill batch.
   */
  private long preferredSpillBatchSize;

  /**
   * Tracks the maximum density of input batches. Density is
   * the amount of actual data / amount of memory consumed.
   * Low-density batches indicate an EOF or something wrong in
   * an upstream operator because a low-density batch wastes
   * memory.
   */
  private int maxDensity;
  private int lastDensity = -1;

  /**
   * Estimated number of rows that fit into a single spill batch.
   */
  private int spillBatchRowCount;

  /**
   * The estimated actual spill batch size which depends on the
   * details of the data rows for any particular query.
   */
  private int targetSpillBatchSize;

  // WARNING: The enum here is used within this class. But, the members of
  // this enum MUST match those in the (unmanaged) ExternalSortBatch since
  // that is the enum used in the UI to display metrics for the query profile.
  public enum Metric implements MetricDef {
    SPILL_COUNT,            // number of times operator spilled to disk
    RETIRED1,               // Was: peak value for totalSizeInMemory
                            // But operator already provides this value
    PEAK_BATCHES_IN_MEMORY, // maximum number of batches kept in memory
    MERGE_COUNT,            // Number of second+ generation merges
    MIN_BUFFER,             // Minimum memory level observed in operation.
    SPILL_MB;               // Number of MB of data spilled to disk. This
                            // amount is first written, then later re-read.
                            // So, disk I/O is twice this amount.

    @Override
    public int metricId() {
      return ordinal();
    }
  }

  /**
   * Iterates over the final sorted results. Implemented differently
   * depending on whether the results are in-memory or spilled to
   * disk.
   */
  public interface SortResults {
    boolean next();
    void close();
    int getBatchCount();
    int getRecordCount();
  }

  public ExternalSortBatch(ExternalSort popConfig, FragmentContext context, RecordBatch incoming) {
    super(popConfig, context, true);
    this.incoming = incoming;
    allocator = oContext.getAllocator();
    opCodeGen = new OperatorCodeGenerator(context, popConfig);

    spillSet = new SpillSet(context, popConfig, "sort", "run");
    copierHolder = new CopierHolder(context, allocator, opCodeGen);
    configure(context.getConfig());
  }

  private void configure(DrillConfig config) {

    // The maximum memory this operator can use as set by the
    // operator definition (propagated to the allocator.)
    memoryLimit = allocator.getLimit();

    // Optional configured memory limit, typically used only for testing.
    long configLimit = config.getBytes(ExecConstants.EXTERNAL_SORT_MAX_MEMORY);
    if (configLimit > 0) {
      memoryLimit = Math.min(memoryLimit, configLimit);
    }

    // Optional limit on the number of spilled runs to merge in a single
    // pass. Limits the number of open file handles. Must allow at least
    // two batches to merge to make progress.
    mergeLimit = getConfigLimit(config, ExecConstants.EXTERNAL_SORT_MERGE_LIMIT, Integer.MAX_VALUE, 2);

    // Limits the size of first-generation spill files.
    spillFileSize = config.getBytes(ExecConstants.EXTERNAL_SORT_SPILL_FILE_SIZE);

    // Ensure the size is reasonable.
    spillFileSize = Math.max(spillFileSize, MIN_SPILL_FILE_SIZE);

    // The spill batch size. This is a critical setting for performance.
    // Set too large and the ratio between memory and input data sizes becomes
    // small. Set too small and disk seek times dominate performance.
    preferredSpillBatchSize = config.getBytes(ExecConstants.EXTERNAL_SORT_SPILL_BATCH_SIZE);

    // In low memory, use no more than 1/4 of memory for each spill batch. Ensures we
    // can merge.
    preferredSpillBatchSize = Math.min(preferredSpillBatchSize, memoryLimit / 4);

    // But, the spill batch should be above some minimum size to prevent complete
    // thrashing.
    preferredSpillBatchSize = Math.max(preferredSpillBatchSize, MIN_SPILL_BATCH_SIZE);

    // Set the target output batch size. Use the maximum size, but only if
    // this represents less than 10% of available memory. Otherwise, use 10%
    // of memory, but no smaller than the minimum size. In any event, an
    // output batch can contain no fewer than a single record.
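    // Worked example (hypothetical numbers, for illustration only): with a
    // 100 MB memory limit and 8 MB spill batches, the allowance below is
    // 100 MB - 2 * 8 MB = 84 MB. A configured 16 MB merge batch size passes
    // through unchanged, a configured 200 MB size is clamped to 84 MB, and
    // anything below MIN_MERGED_BATCH_SIZE (256K) is raised to that floor.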
    preferredMergeBatchSize = config.getBytes(ExecConstants.EXTERNAL_SORT_MERGE_BATCH_SIZE);
    long maxAllowance = memoryLimit - 2 * preferredSpillBatchSize;
    preferredMergeBatchSize = Math.min(maxAllowance, preferredMergeBatchSize);
    preferredMergeBatchSize = Math.max(preferredMergeBatchSize, MIN_MERGED_BATCH_SIZE);

    logger.debug("Config: memory limit = {}, " +
                 "spill file size = {}, spill batch size = {}, merge limit = {}, merge batch size = {}",
                 memoryLimit, spillFileSize, preferredSpillBatchSize,
                 mergeLimit, preferredMergeBatchSize);
  }

  private int getConfigLimit(DrillConfig config, String paramName, int valueIfZero, int minValue) {
    int limit = config.getInt(paramName);
    if (limit > 0) {
      limit = Math.max(limit, minValue);
    } else {
      limit = valueIfZero;
    }
    return limit;
  }

  @Override
  public int getRecordCount() {
    if (sv4 != null) {
      return sv4.getCount();
    }
    return container.getRecordCount();
  }

  @Override
  public SelectionVector4 getSelectionVector4() {
    return sv4;
  }

  private void closeBatchGroups(Collection<? extends BatchGroup> groups) {
    for (BatchGroup group: groups) {
      try {
        group.close();
      } catch (Exception e) {
        // Collect all failures and make sure to clean up all remaining batches.
        // Originally we would have thrown a RuntimeException that would propagate
        // to FragmentExecutor.closeOutResources(), where it would have been passed
        // to context.fail(). Passing the exception directly to context.fail(e)
        // lets the cleanup process continue instead of stopping right away, and
        // also ensures we collect any additional exceptions raised while cleaning up.
        context.fail(e);
      }
    }
  }

  /**
   * Called by {@link AbstractRecordBatch} as a fast-path to obtain
   * the first record batch and set up the schema of this batch in order
   * to quickly return the schema to the client. Note that this method
   * fetches the first batch from upstream; that batch will be waiting
   * for us the first time that {@link #innerNext()} is called.
   */
  @Override
  public void buildSchema() {
    IterOutcome outcome = next(incoming);
    switch (outcome) {
      case OK:
      case OK_NEW_SCHEMA:
        for (VectorWrapper<?> w : incoming) {
          @SuppressWarnings("resource")
          ValueVector v = container.addOrGet(w.getField());
          if (v instanceof AbstractContainerVector) {
            w.getValueVector().makeTransferPair(v); // Can we remove this hack?
            v.clear();
          }
          v.allocateNew(); // Can we remove this? - SVR fails with NPE (TODO)
        }
        container.buildSchema(SelectionVectorMode.NONE);
        container.setRecordCount(0);
        break;
      case STOP:
        state = BatchState.STOP;
        break;
      case OUT_OF_MEMORY:
        state = BatchState.OUT_OF_MEMORY;
        break;
      case NONE:
        state = BatchState.DONE;
        break;
      default:
        throw new IllegalStateException("Unexpected iter outcome: " + outcome);
    }
  }

  /**
   * Process each request for a batch. The first request retrieves
   * all the incoming batches and sorts them, optionally spilling to
   * disk as needed. Subsequent calls retrieve the sorted results in
   * fixed-size batches.
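   * <p>
   * As a sketch (hypothetical driver code, for illustration only; the
   * {@code consume} helper is assumed), a downstream operator consumes this
   * batch through the standard {@link RecordBatch} protocol:
   * <pre>{@code
   * IterOutcome outcome = sortBatch.next();   // first call: load, sort, maybe spill
   * while (outcome == IterOutcome.OK_NEW_SCHEMA || outcome == IterOutcome.OK) {
   *   consume(sortBatch);                     // read the current output batch
   *   outcome = sortBatch.next();             // deliver the next sorted batch
   * }
   * }</pre>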
   */
  @Override
  public IterOutcome innerNext() {
    switch (sortState) {
    case DONE:
      return IterOutcome.NONE;
    case START:
    case LOAD:
      return load();
    case DELIVER:
      return nextOutputBatch();
    default:
      throw new IllegalStateException("Unexpected sort state: " + sortState);
    }
  }

  private IterOutcome nextOutputBatch() {
    if (resultsIterator.next()) {
      injector.injectUnchecked(context.getExecutionControls(), INTERRUPTION_WHILE_MERGING);
      return IterOutcome.OK;
    } else {
      logger.trace("Deliver phase complete: Returned {} batches, {} records",
                   resultsIterator.getBatchCount(), resultsIterator.getRecordCount());
      sortState = SortState.DONE;

      // Close the iterator here to release any remaining resources such
      // as spill files. This is important when a query has a join: the
      // first branch sort may complete before the second branch starts;
      // it may be quite a while after returning the last row before the
      // fragment executor calls this operator's close method.
      resultsIterator.close();
      resultsIterator = null;
      return IterOutcome.NONE;
    }
  }

  /**
   * Load and process a single batch, handling schema changes. In general, the
   * external sort accepts only one schema.
   *
   * @return return code depending on the amount of data read from upstream
   */
  private IterOutcome loadBatch() {

    // If this is the very first batch, then AbstractRecordBatch
    // already loaded it for us in buildSchema().
    IterOutcome upstream;
    if (sortState == SortState.START) {
      sortState = SortState.LOAD;
      upstream = IterOutcome.OK_NEW_SCHEMA;
    } else {
      upstream = next(incoming);
    }
    switch (upstream) {
    case NONE:
    case STOP:
      return upstream;
    case OK_NEW_SCHEMA:
    case OK:
      setupSchema(upstream);

      // Add the batch to the in-memory generation, spilling if
      // needed.
      processBatch();
      break;
    case OUT_OF_MEMORY:

      // Note: it is highly doubtful that this code actually works. It
      // requires that the upstream batches got to a safe place to run
      // out of memory and that no work was in-flight and thus abandoned.
      // Consider removing this case once resource management is in place.
      logger.error("received OUT_OF_MEMORY, trying to spill");
      if (bufferedBatches.size() > 2) {
        spillFromMemory();
      } else {
        logger.error("not enough batches to spill, sending OUT_OF_MEMORY downstream");
        return IterOutcome.OUT_OF_MEMORY;
      }
      break;
    default:
      throw new IllegalStateException("Unexpected iter outcome: " + upstream);
    }
    return IterOutcome.OK;
  }

  /**
   * Load the results and sort them. May bail out early if an exceptional
   * condition is passed up from the input batch.
   *
   * @return return code: OK_NEW_SCHEMA if rows were sorted,
   * NONE if no rows
   */
  private IterOutcome load() {
    logger.trace("Start of load phase");

    // Clear the temporary container created by buildSchema().
    container.clear();

    // Loop over all input batches.
    for (;;) {
      IterOutcome result = loadBatch();

      // NONE means all batches have been read.
      if (result == IterOutcome.NONE) {
        break;
      }

      // Any outcome other than OK means something went wrong.
      if (result != IterOutcome.OK) {
        return result;
      }
    }

    // Anything to actually sort?
    if (inputRecordCount == 0) {
      sortState = SortState.DONE;
      return IterOutcome.NONE;
    }
    logger.debug("Completed load phase: read {} batches, spilled {} times, total input bytes: {}",
                 inputBatchCount, spilledRuns.size(), totalInputBytes);

    // Do the merge of the loaded batches. The merge can be done entirely in
    // memory if the results fit; else we have to do a disk-based merge of
    // pre-sorted spilled batches.
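    // For example (hypothetical numbers): if nothing spilled during load and
    // 40 MB of buffered batches plus the in-memory sorter's overhead fit
    // within a 100 MB limit, the in-memory path is taken; had the limit been
    // 50 MB, earlier spills would force the merge path below.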
    if (canUseMemoryMerge()) {
      return sortInMemory();
    } else {
      return mergeSpilledRuns();
    }
  }

  /**
   * All data has been read from the upstream batch. Determine if we
   * can use a fast in-memory sort, or must use a merge (which typically,
   * but not always, involves spilled batches.)
   *
   * @return whether sufficient resources exist to do an in-memory sort
   * if all batches are still in memory
   */
  private boolean canUseMemoryMerge() {
    if (spillSet.hasSpilled()) {
      return false;
    }

    // Do we have enough memory for MSorter (the in-memory sorter)?
    long allocMem = allocator.getAllocatedMemory();
    long availableMem = memoryLimit - allocMem;
    long neededForInMemorySort = MSortTemplate.memoryNeeded(inputRecordCount);
    if (availableMem < neededForInMemorySort) {
      return false;
    }

    // Make sure we don't exceed the maximum number of batches SV4 can address.
    if (bufferedBatches.size() > Character.MAX_VALUE) {
      return false;
    }

    // We can do an in-memory merge.
    return true;
  }

  /**
   * Handle a new schema from upstream. The ESB is quite limited in its ability
   * to handle schema changes.
   *
   * @param upstream the status code from upstream: either OK or OK_NEW_SCHEMA
   */
  private void setupSchema(IterOutcome upstream) {

    // First batch: we won't have a schema.
    if (schema == null) {
      schema = incoming.getSchema();

    // Subsequent batches: nothing to do if same schema.
    } else if (upstream == IterOutcome.OK) {
      return;

    // Only change in the case that the schema truly changes. Artificial schema changes are ignored.
    } else if (incoming.getSchema().equals(schema)) {
      return;
    } else if (unionTypeEnabled) {
      schema = SchemaUtil.mergeSchemas(schema, incoming.getSchema());

      // New schema: must generate a new sorter and copier.
      opCodeGen.setSchema(schema);
    } else {
      throw UserException.unsupportedError()
            .message("Schema changes not supported in External Sort. Please enable Union type.")
            .build(logger);
    }

    // Coerce all existing batches to the new schema.
    for (BatchGroup b : bufferedBatches) {
      b.setSchema(schema);
    }
    for (BatchGroup b : spilledRuns) {
      b.setSchema(schema);
    }
  }

  /**
   * Convert an incoming batch into the agreed-upon format. (It also appears
   * to make a persistent shallow copy of the batch that is retained until
   * we are ready to sort or spill.)
   *
   * @return the converted batch, or null if the incoming batch is empty
   */
  @SuppressWarnings("resource")
  private VectorContainer convertBatch() {

    // Must accept the batch even if no records. Then clear
    // the vectors to release memory since we won't do any
    // further processing with the empty batch.
    VectorContainer convertedBatch = SchemaUtil.coerceContainer(incoming, schema, oContext);
    if (incoming.getRecordCount() == 0) {
      for (VectorWrapper<?> w : convertedBatch) {
        w.clear();
      }
      SelectionVector2 sv2 = incoming.getSelectionVector2();
      if (sv2 != null) {
        sv2.clear();
      }
      return null;
    }
    return convertedBatch;
  }

  private SelectionVector2 makeSelectionVector() {
    if (incoming.getSchema().getSelectionVectorMode() == BatchSchema.SelectionVectorMode.TWO_BYTE) {
      return incoming.getSelectionVector2().clone();
    } else {
      return newSV2();
    }
  }

  /**
   * Process the converted incoming batch by adding it to the in-memory store
   * of data, or spilling data to disk when necessary.
   */
  @SuppressWarnings("resource")
  private void processBatch() {

    // Skip empty batches (such as the first one.)
    if (incoming.getRecordCount() == 0) {
      return;
    }

    // Determine actual sizes of the incoming batch before taking
    // ownership. Allows us to figure out if we need to spill first,
    // to avoid overflowing memory simply due to ownership transfer.
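    // Worked example (hypothetical numbers, for illustration only): if
    // bufferMemoryPool is 80 MB, 76 MB is already allocated, and the incoming
    // batch occupies 6 MB, then isSpillNeeded() reports true (76 + 6 > 80)
    // and we merge-and-spill buffered batches before accepting the new one.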
    RecordBatchSizer sizer = analyzeIncomingBatch();

    // The heart of the external sort operator: spill to disk when
    // the in-memory generation exceeds the allowed memory limit.
    // Preemptively spill BEFORE accepting the new batch into our memory
    // pool. The allocator will throw an OOM exception if we accept the
    // batch when we are near the limit - despite the fact that the batch
    // is already in memory and no new memory is allocated during the transfer.
    if (isSpillNeeded(sizer.actualSize())) {
      spillFromMemory();
    }

    // Sanity check. We should now be below the buffer memory maximum.
    long startMem = allocator.getAllocatedMemory();
    if (startMem > bufferMemoryPool) {
      logger.error("ERROR: Failed to spill above buffer limit. Buffer pool = {}, memory = {}",
                   bufferMemoryPool, startMem);
    }

    // Convert the incoming batch to the agreed-upon schema.
    // No converted batch means we got an empty input batch.
    // Converting the batch transfers memory ownership to our
    // allocator. This gives a roundabout way to learn the batch
    // size: check the before and after memory levels, then use
    // the difference as the batch size, in bytes.
    VectorContainer convertedBatch = convertBatch();
    if (convertedBatch == null) {
      return;
    }

    SelectionVector2 sv2;
    try {
      sv2 = makeSelectionVector();
    } catch (Exception e) {
      convertedBatch.clear();
      throw e;
    }

    // Compute batch size, including allocation of an sv2.
    long endMem = allocator.getAllocatedMemory();
    long batchSize = endMem - startMem;
    int count = sv2.getCount();
    inputRecordCount += count;
    inputBatchCount++;
    totalInputBytes += sizer.actualSize();

    // Update the minimum buffer space metric.
    if (minimumBufferSpace == 0) {
      minimumBufferSpace = endMem;
    } else {
      minimumBufferSpace = Math.min(minimumBufferSpace, endMem);
    }
    stats.setLongStat(Metric.MIN_BUFFER, minimumBufferSpace);

    // Update the size estimates based on the actual record count, not
    // the effective count as given by the selection vector
    // (which may exclude some records due to filtering.)
    updateMemoryEstimates(batchSize, sizer);

    // Sort the incoming batch using either the original selection vector,
    // or a new one created here.
    SingleBatchSorter sorter = opCodeGen.getSorter(convertedBatch);
    try {
      sorter.setup(context, sv2, convertedBatch);
      sorter.sort(sv2);
    } catch (SchemaChangeException e) {
      convertedBatch.clear();
      throw UserException.unsupportedError(e)
            .message("Unexpected schema change.")
            .build(logger);
    }
    RecordBatchData rbd = new RecordBatchData(convertedBatch, allocator);
    try {
      rbd.setSv2(sv2);
      bufferedBatches.add(new BatchGroup.InputBatch(rbd.getContainer(), rbd.getSv2(),
                          oContext, sizer.netSize()));
      if (peakNumBatches < bufferedBatches.size()) {
        peakNumBatches = bufferedBatches.size();
        stats.setLongStat(Metric.PEAK_BATCHES_IN_MEMORY, peakNumBatches);
      }
    } catch (Throwable t) {
      rbd.clear();
      throw t;
    }
  }

  /**
   * Scan the vectors in the incoming batch to determine batch size and if
   * any oversize columns exist. (Oversize columns cause memory fragmentation.)
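   * <p>
   * The returned sizer also reports the batch "density": the ratio of actual
   * data bytes to allocated memory bytes. For example (hypothetical numbers):
   * a batch holding 3 MB of row data in 4 MB of allocated vectors has a
   * density of 75%.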
   *
   * @return an analysis of the incoming batch
   */
  private RecordBatchSizer analyzeIncomingBatch() {
    RecordBatchSizer sizer = new RecordBatchSizer(incoming);
    sizer.applySv2();
    if (inputBatchCount == 0) {
      logger.debug("{}", sizer.toString());
    }
    return sizer;
  }

  /**
   * Update the data-driven memory use numbers including:
   * <ul>
   * <li>The average size of incoming records.</li>
   * <li>The estimated spill and output batch size.</li>
   * <li>The estimated number of average-size records per
   * spill and output batch.</li>
   * <li>The amount of memory set aside to hold the incoming
   * batches before spilling starts.</li>
   * </ul>
   *
   * @param memoryDelta the change in this operator's allocated memory
   * caused by accepting the current batch from upstream
   * @param sizer the size analysis of that upstream batch
   */
  private void updateMemoryEstimates(long memoryDelta, RecordBatchSizer sizer) {
    long actualBatchSize = sizer.actualSize();
    int actualRecordCount = sizer.rowCount();

    if (actualBatchSize != memoryDelta) {
      logger.debug("Memory delta: {}, actual batch size: {}, Diff: {}",
                   memoryDelta, actualBatchSize, memoryDelta - actualBatchSize);
    }

    // The record count should never be zero, but better safe than sorry...
    if (actualRecordCount == 0) {
      return;
    }

    // If the batch is less than 75% dense, just ignore it, except in the
    // unfortunate case where it is the first batch. Low-density batches generally
    // occur only at the end of a file or at the end of a DFS block. In such a
    // case, we will continue to rely on estimates created on previous, high-
    // density batches.
    // We actually track the max density seen, and compare to 75% of that, since
    // Parquet produces very low density record batches.
    if (sizer.avgDensity() < maxDensity * 3 / 4 && sizer.avgDensity() != lastDensity) {
      logger.trace("Saw low density batch. Density: {}", sizer.avgDensity());
      lastDensity = sizer.avgDensity();
      return;
    }
    maxDensity = Math.max(maxDensity, sizer.avgDensity());

    // We know the batch size and number of records. Use that to estimate
    // the average record size. Since a typical batch has many records,
    // the average size is a fairly good estimator. Note that the batch
    // size includes not just the actual vector data, but any unused space
    // resulting from power-of-two allocation. This means that we don't
    // have to do size adjustments for input batches as we will do below
    // when estimating the size of other objects.
    int batchRowWidth = sizer.netRowWidth();

    // Record sizes may vary across batches. To be conservative, use
    // the largest size observed from incoming batches.
    int origRowEstimate = estimatedRowWidth;
    estimatedRowWidth = Math.max(estimatedRowWidth, batchRowWidth);

    // Maintain an estimate of the incoming batch size: the largest
    // batch yet seen. Used to reserve memory for the next incoming
    // batch. Because we are using the actual observed batch size,
    // the size already includes overhead due to power-of-two rounding.
    long origInputBatchSize = estimatedInputBatchSize;
    estimatedInputBatchSize = Math.max(estimatedInputBatchSize, actualBatchSize);

    // The row width may end up as zero if all fields are nulls or some
    // other unusual situation. In this case, assume a width of 10 just
    // to avoid lots of special case code.
    if (estimatedRowWidth == 0) {
      estimatedRowWidth = 10;
    }

    // Go no further if nothing changed.
    if (estimatedRowWidth == origRowEstimate && estimatedInputBatchSize == origInputBatchSize) {
      return;
    }

    // Estimate the total size of each incoming batch plus sv2.
    // Note that, due to power-of-two rounding, the allocated sv2 size
    // might be twice the data size.
    long estimatedInputSize = estimatedInputBatchSize + 4 * actualRecordCount;

    // Determine the number of records to spill per spill batch. The goal is to
    // spill batches of either 64K records, or as many records as fit into the
    // amount of memory dedicated to each spill batch, whichever is less.
    spillBatchRowCount = (int) Math.max(1, preferredSpillBatchSize / estimatedRowWidth / 2);
    spillBatchRowCount = Math.min(spillBatchRowCount, Character.MAX_VALUE);

    // Compute the actual spill batch size which may be larger or smaller
    // than the preferred size depending on the row width. Double the estimated
    // memory needs to allow for power-of-two rounding.
    targetSpillBatchSize = spillBatchRowCount * estimatedRowWidth * 2;

    // Determine the number of records per batch per merge step. The goal is to
    // merge batches of either 64K records, or as many records as fit into the
    // amount of memory dedicated to each merge batch, whichever is less.
    mergeBatchRowCount = (int) Math.max(1, preferredMergeBatchSize / estimatedRowWidth / 2);
    mergeBatchRowCount = Math.min(mergeBatchRowCount, Character.MAX_VALUE);
    mergeBatchRowCount = Math.max(1, mergeBatchRowCount);
    targetMergeBatchSize = mergeBatchRowCount * estimatedRowWidth * 2;

    // Determine the minimum memory needed for spilling. Spilling is done just
    // before accepting a batch, so we must spill if we don't have room for a
    // (worst case) input batch. To spill, we need room for the output batch created
    // by merging the batches already in memory. Double this to allow for power-of-two
    // memory allocations.
    long spillPoint = estimatedInputBatchSize + 2 * targetSpillBatchSize;

    // The merge memory pool assumes we can spill all input batches. To make
    // progress, we must have at least two merge batches (same size as an output
    // batch) and one output batch. The spill and merge batch sizes above are
    // already doubled to allow for power-of-two allocation.
    long minMergeMemory = 2 * targetSpillBatchSize + targetMergeBatchSize;

    // If we are in a low-memory condition, then we might not have room for the
    // default output batch size. In that case, pick a smaller size.
    if (minMergeMemory > memoryLimit) {

      // Figure out the minimum output batch size based on memory;
      // it must hold at least one complete row.
      long mergeAllowance = memoryLimit - 2 * targetSpillBatchSize;
      targetMergeBatchSize = Math.max(estimatedRowWidth, mergeAllowance / 2);
      mergeBatchRowCount = (int) (targetMergeBatchSize / estimatedRowWidth / 2);
      minMergeMemory = 2 * targetSpillBatchSize + targetMergeBatchSize;
    }

    // Determine the minimum total memory we would need to receive two input
    // batches (the minimum needed to make progress) and the allowance for the
    // output batch.
    long minLoadMemory = spillPoint + estimatedInputSize;

    // Compute the pool used to buffer input batches before spilling
    // (bufferMemoryPool) and the pool available to hold one batch from each
    // spilled run when merging from disk (mergeMemoryPool).
    bufferMemoryPool = memoryLimit - spillPoint;
    mergeMemoryPool = Math.max(memoryLimit - minMergeMemory,
                               (long) ((memoryLimit - 3 * targetMergeBatchSize) * 0.95));

    // Sanity check: if we've been given too little memory to make progress,
    // issue a warning but proceed anyway. Should only occur if something is
    // configured terribly wrong.
    long minMemoryNeeds = Math.max(minLoadMemory, minMergeMemory);
    if (minMemoryNeeds > memoryLimit) {
      logger.warn("Potential memory overflow! " +
                  "Minimum needed = {} bytes, actual available = {} bytes",
                  minMemoryNeeds, memoryLimit);
    }

    // Log the calculated values.
    // Turn this on if things seem amiss. Message will appear only when
    // the values change.
    logger.debug("Input Batch Estimates: record size = {} bytes; input batch = {} bytes, {} records",
                 estimatedRowWidth, estimatedInputBatchSize, actualRecordCount);
    logger.debug("Merge batch size = {} bytes, {} records; spill file size: {} bytes",
                 targetSpillBatchSize, spillBatchRowCount, spillFileSize);
    logger.debug("Output batch size = {} bytes, {} records",
                 targetMergeBatchSize, mergeBatchRowCount);
    logger.debug("Available memory: {}, buffer memory = {}, merge memory = {}",
                 memoryLimit, bufferMemoryPool, mergeMemoryPool);
  }

  /**
   * Determine if spill is needed before receiving the new record batch.
   * Spilling is driven purely by memory availability (and an optional
   * batch limit for testing.)
   *
   * @return true if spilling is needed, false otherwise
   */
  private boolean isSpillNeeded(int incomingSize) {

    // Can't spill if less than two batches else the merge
    // can't make progress.
    if (bufferedBatches.size() < 2) {
      return false;
    }

    // Must spill if we are below the spill point (the amount of memory
    // needed to do the minimal spill.)
    return allocator.getAllocatedMemory() + incomingSize >= bufferMemoryPool;
  }

  /**
   * Perform an in-memory sort of the buffered batches. Obviously can
   * be used only for the non-spilling case.
   *
   * @return DONE if no rows, OK_NEW_SCHEMA if at least one row
   */
  private IterOutcome sortInMemory() {
    logger.debug("Starting in-memory sort. Batches = {}, Records = {}, Memory = {}",
                 bufferedBatches.size(), inputRecordCount, allocator.getAllocatedMemory());

    // Note the difference between how we handle batches here and in the spill/merge
    // case. In the spill/merge case, this class decides on the batch size to send
    // downstream. However, in the in-memory case, we must pass along all batches
    // in a single SV4. Attempts to do paging will result in errors. In the memory
    // merge case, the downstream Selection Vector Remover will split the one
    // big SV4 into multiple smaller batches to send further downstream.

    // If the sort fails or is empty, clean up here. Otherwise, cleanup is done
    // by closing the resultsIterator after all results are returned downstream.
    MergeSort memoryMerge = new MergeSort(context, allocator, opCodeGen);
    try {
      sv4 = memoryMerge.merge(bufferedBatches, this, container);
      if (sv4 == null) {
        sortState = SortState.DONE;
        return IterOutcome.STOP;
      } else {
        logger.debug("Completed in-memory sort. Memory = {}",
                     allocator.getAllocatedMemory());
        resultsIterator = memoryMerge;
        memoryMerge = null;
        sortState = SortState.DELIVER;
        return IterOutcome.OK_NEW_SCHEMA;
      }
    } finally {
      if (memoryMerge != null) {
        memoryMerge.close();
      }
    }
  }

  /**
   * Perform merging of (typically spilled) batches. First consolidates batches
   * as needed, then performs a final merge that is read one batch at a time
   * to deliver batches to the downstream operator.
   *
   * @return always returns OK_NEW_SCHEMA
   */
  private IterOutcome mergeSpilledRuns() {
    logger.debug("Starting consolidate phase. Batches = {}, Records = {}, Memory = {}, In-memory batches {}, spilled runs {}",
                 inputBatchCount, inputRecordCount, allocator.getAllocatedMemory(),
                 bufferedBatches.size(), spilledRuns.size());

    // Consolidate batches to a number that can be merged in
    // a single last pass.
    int mergeCount = 0;
    while (consolidateBatches()) {
      mergeCount++;
    }
    stats.addLongStat(Metric.MERGE_COUNT, mergeCount);

    // Merge in-memory batches and spilled runs for the final merge.
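    // Worked example (hypothetical numbers, for illustration only): with a
    // 160 MB merge memory pool and 8 MB spill batches, the final merge width
    // is 20 runs. If the load phase left 46 spilled runs, the loop above
    // performs intermediate merges until no more than 20 runs remain, and
    // everything is then merged in a single final pass below.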
    List<BatchGroup> allBatches = new LinkedList<>();
    allBatches.addAll(bufferedBatches);
    bufferedBatches.clear();
    allBatches.addAll(spilledRuns);
    spilledRuns.clear();
    logger.debug("Starting merge phase. Runs = {}, Alloc. memory = {}",
                 allBatches.size(), allocator.getAllocatedMemory());

    // Do the final merge as a results iterator.
    CopierHolder.BatchMerger merger = copierHolder.startFinalMerge(schema, allBatches, container, mergeBatchRowCount);
    merger.next();
    resultsIterator = merger;
    sortState = SortState.DELIVER;
    return IterOutcome.OK_NEW_SCHEMA;
  }

  private boolean consolidateBatches() {

    // Determine additional memory needed to hold one batch from each
    // spilled run.
    int inMemCount = bufferedBatches.size();
    int spilledRunsCount = spilledRuns.size();

    // Can't merge more than will fit into memory at one time.
    int maxMergeWidth = (int) (mergeMemoryPool / targetSpillBatchSize);
    maxMergeWidth = Math.min(mergeLimit, maxMergeWidth);

    // But, must merge at least two batches.
    maxMergeWidth = Math.max(maxMergeWidth, 2);

    // If we can't fit all batches in memory, must spill any in-memory
    // batches to make room for multiple spill-merge-spill cycles.
    if (inMemCount > 0) {
      if (spilledRunsCount > maxMergeWidth) {
        spillFromMemory();
        return true;
      }

      // If we just plain have too many batches to merge, spill some
      // in-memory batches to reduce the burden.
      if (inMemCount + spilledRunsCount > mergeLimit) {
        spillFromMemory();
        return true;
      }

      // If the on-disk batches and in-memory batches need more memory than
      // is available, spill some in-memory batches.
      long allocated = allocator.getAllocatedMemory();
      long totalNeeds = spilledRunsCount * targetSpillBatchSize + allocated;
      if (totalNeeds > mergeMemoryPool) {
        spillFromMemory();
        return true;
      }
    }

    // Merge on-disk batches if we have too many.
    int mergeCount = spilledRunsCount - maxMergeWidth;
    if (mergeCount <= 0) {
      return false;
    }

    // Must merge at least 2 batches to make progress.
    mergeCount = Math.max(2, mergeCount);

    // We will merge. This will create yet another spilled
    // run. Account for that.
    mergeCount += 1;
    mergeCount = Math.min(mergeCount, maxMergeWidth);

    // If we are going to merge, and we have batches in memory,
    // spill them and try again. We need to do this to ensure we
    // have adequate memory to hold the merge batches. We are into
    // a second-generation sort/merge so there is no point in holding
    // onto batches in memory.
    if (inMemCount > 0) {
      spillFromMemory();
      return true;
    }

    // Do the merge, then loop to try again in case not
    // all the target batches spilled in one go.
    logger.trace("Merging {} on-disk runs, Alloc. memory = {}",
                 mergeCount, allocator.getAllocatedMemory());
    mergeRuns(mergeCount);
    return true;
  }

  /**
   * This operator has accumulated a set of sorted incoming record batches.
   * We wish to spill some of them to disk. To do this, a "copier"
   * merges the target batches to produce a stream of new (merged) batches
   * which are then written to disk.
   * <p>
   * This method spills only enough batches to fill one spill file of
   * roughly the configured size, minimizing unnecessary disk writes;
   * at least two batches are always spilled so that the merge makes
   * progress.
   */
  private void spillFromMemory() {

    // Determine the number of batches to spill to create a spill file
    // of the desired size. The actual file size might be a bit larger
    // or smaller than the target, which is expected.
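    // Worked example (hypothetical numbers, for illustration only): with a
    // 256 MB spill file target and buffered batches holding roughly 32 MB
    // of data each, the loop below accumulates about 8 batches before the
    // half-batch look-ahead trips the limit and the spill proceeds.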
    int spillCount = 0;
    long spillSize = 0;
    for (InputBatch batch : bufferedBatches) {
      long batchSize = batch.getDataSize();
      spillSize += batchSize;
      spillCount++;
      if (spillSize + batchSize / 2 > spillFileSize) {
        break;
      }
    }

    // Must always spill at least 2, even if this creates an over-size
    // spill file. But, if this is a final consolidation, we may have only
    // a single batch.
    spillCount = Math.max(spillCount, 2);
    spillCount = Math.min(spillCount, bufferedBatches.size());

    // Do the actual spill.
    mergeAndSpill(bufferedBatches, spillCount);
  }

  private void mergeRuns(int targetCount) {

    // Determine the number of runs to merge. The count should be the
    // target count. However, to prevent possible memory overrun, we
    // double-check with actual spill batch size and only merge as much
    // as fits in the merge memory pool.
    int mergeCount = 0;
    long mergeSize = 0;
    for (SpilledRun run : spilledRuns) {
      long batchSize = run.getBatchSize();
      if (mergeSize + batchSize > mergeMemoryPool) {
        break;
      }
      mergeSize += batchSize;
      mergeCount++;
      if (mergeCount == targetCount) {
        break;
      }
    }

    // Must always merge at least 2 runs, even if this creates an over-size
    // spill file. But, if this is a final consolidation, we may have only
    // a single run.
    mergeCount = Math.max(mergeCount, 2);
    mergeCount = Math.min(mergeCount, spilledRuns.size());

    // Do the actual merge, which writes a new, larger spilled run.
    mergeAndSpill(spilledRuns, mergeCount);
  }

  private void mergeAndSpill(LinkedList<? extends BatchGroup> source, int count) {
    spilledRuns.add(doMergeAndSpill(source, count));
    logger.trace("Completed spill: memory = {}",
                 allocator.getAllocatedMemory());
  }

  private BatchGroup.SpilledRun doMergeAndSpill(LinkedList<? extends BatchGroup> batchGroups, int spillCount) {
    List<BatchGroup> batchesToSpill = Lists.newArrayList();
    spillCount = Math.min(batchGroups.size(), spillCount);
    assert spillCount > 0 : "Spill count to mergeAndSpill must not be zero";
    for (int i = 0; i < spillCount; i++) {
      batchesToSpill.add(batchGroups.pollFirst());
    }

    // Merge the selected set of batches and write them to the
    // spill file. After each write, we release the memory associated
    // with the just-written batch.
    String outputFile = spillSet.getNextSpillFile();
    stats.setLongStat(Metric.SPILL_COUNT, spillSet.getFileCount());
    BatchGroup.SpilledRun newGroup = null;
    try (AutoCloseable ignored = AutoCloseables.all(batchesToSpill);
         CopierHolder.BatchMerger merger = copierHolder.startMerge(schema, batchesToSpill, spillBatchRowCount)) {
      logger.trace("Spilling {} of {} batches, spill batch size = {} rows, memory = {}, write to {}",
                   batchesToSpill.size(), bufferedBatches.size() + batchesToSpill.size(),
                   spillBatchRowCount, allocator.getAllocatedMemory(), outputFile);
      newGroup = new BatchGroup.SpilledRun(spillSet, outputFile, oContext);

      // The copier will merge records from the buffered batches into
      // the outputContainer up to targetRecordCount number of rows.
      // The actual count may be less if fewer records are available.
      while (merger.next()) {

        // Add a new batch of records (given by merger.getOutput()) to the
        // spill file.
        // Note that addBatch also clears the merger's output container.
        newGroup.addBatch(merger.getOutput());
      }
      injector.injectChecked(context.getExecutionControls(), INTERRUPTION_WHILE_SPILLING, IOException.class);
      newGroup.closeOutputStream();
      logger.trace("Spilled {} batches, {} records; memory = {} to {}",
                   merger.getBatchCount(), merger.getRecordCount(),
                   allocator.getAllocatedMemory(), outputFile);
      newGroup.setBatchSize(merger.getEstBatchSize());
      return newGroup;
    } catch (Throwable e) {
      // We only need to clean up newGroup if the spill failed.
      try {
        if (newGroup != null) {
          AutoCloseables.close(e, newGroup);
        }
      } catch (Throwable t) { /* close() may hit the same IO issue; just ignore */ }

      // Here the merger is holding onto a partially-completed batch.
      // It will release the memory in the close() call.
      try {
        // Rethrow so we can decide how to handle the error.
        throw e;
      }

      // If the error is a UserException, rethrow it as is.
      catch (UserException ue) {
        throw ue;
      } catch (Throwable ex) {
        throw UserException.resourceError(ex)
              .message("External Sort encountered an error while spilling to disk")
              .build(logger);
      }
    }
  }

  /**
   * Allocate and initialize the selection vector used as the sort index.
   * Assumes that memory is available for the vector since memory management
   * ensured space is available.
   *
   * @return a new, populated selection vector 2
   */
  private SelectionVector2 newSV2() {
    SelectionVector2 sv2 = new SelectionVector2(allocator);
    if (!sv2.allocateNewSafe(incoming.getRecordCount())) {
      throw UserException.resourceError(new OutOfMemoryException("Unable to allocate sv2 buffer"))
            .build(logger);
    }
    for (int i = 0; i < incoming.getRecordCount(); i++) {
      sv2.setIndex(i, (char) i);
    }
    sv2.setRecordCount(incoming.getRecordCount());
    return sv2;
  }

  @Override
  public WritableBatch getWritableBatch() {
    throw new UnsupportedOperationException("A sort batch is not writable.");
  }

  @Override
  protected void killIncoming(boolean sendUpstream) {
    incoming.kill(sendUpstream);
  }

  /**
   * Extreme paranoia to avoid leaving resources unclosed in the case
   * of an error. Since generally only the first error is of interest,
   * we track only the first exception, not potential cascading downstream
   * exceptions.
   * <p>
   * Some Drill code ends up calling close() two or more times. The code
   * here protects itself from these undesirable semantics.
   */
  @Override
  public void close() {
    if (spillSet.getWriteBytes() > 0) {
      logger.debug("End of sort. Total write bytes: {}, Total read bytes: {}",
                   spillSet.getWriteBytes(), spillSet.getReadBytes());
    }
    stats.setLongStat(Metric.SPILL_MB,
        (int) Math.round(spillSet.getWriteBytes() / 1024.0D / 1024.0));
    RuntimeException ex = null;
    try {
      if (bufferedBatches != null) {
        closeBatchGroups(bufferedBatches);
        bufferedBatches = null;
      }
    } catch (RuntimeException e) {
      ex = e;
    }
    try {
      if (spilledRuns != null) {
        closeBatchGroups(spilledRuns);
        spilledRuns = null;
      }
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }
    try {
      if (sv4 != null) {
        sv4.clear();
      }
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }
    try {
      if (resultsIterator != null) {
        resultsIterator.close();
        resultsIterator = null;
      }
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }
    try {
      copierHolder.close();
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }
    try {
      spillSet.close();
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }
    try {
      opCodeGen.close();
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }

    // The call to super.close() clears out the output container.
    // Doing so requires the allocator here, so it must be closed
    // after the super call.
    try {
      super.close();
    } catch (RuntimeException e) {
      ex = (ex == null) ? e : ex;
    }

    // Note: allocator is closed by the FragmentManager.
    // try {
    //   allocator.close();
    // } catch (RuntimeException e) {
    //   ex = (ex == null) ? e : ex;
    // }
    if (ex != null) {
      throw ex;
    }
  }
}