Merge.java example

Explorer
myria-master
package edu.washington.escience.myria.operator;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;

/**
 * Merges the sorted output of a set of operators.
 * */
public final class Merge extends NAryOperator {

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** True if the tuples coming from children are in ascending order. */
  private boolean[] ascending;

  /** Indexes of columns that are sorted. */
  private int[] sortedColumns;

  /**
   * Contains a tuple batch for each child.
   *
   * Contains null if there is no batch.
   */
  private transient ArrayList<TupleBatch> childBatches = new ArrayList<TupleBatch>();

  /**
   * Index that points to the current location in the tupleBuffer of all children.
   *
   * -1 indicates an invalid pointer.
   */
  private transient ArrayList<Integer> childRowIndexes = new ArrayList<Integer>();

  /**
   * The buffer holding the results.
   */
  private transient TupleBatchBuffer ans;

  /**
   * Use a heap to find the smallest tuple (actually just the index in {@link #childBatches} to a child, where the
   * pointer in {@link #childRowIndexes} points to the smallest tuple).
   */
  private transient Queue<Integer> heap;

  /**
   * Comparator for tuples in tuple batch.
   */
  class TupleComparator implements Comparator<Integer> {
    @Override
    public int compare(final Integer left, final Integer right) {
      Integer leftPointer = childRowIndexes.get(left);
      Integer rightPointer = childRowIndexes.get(right);
      TupleBatch leftTb = childBatches.get(left);
      TupleBatch rightTb = childBatches.get(right);
      Preconditions.checkArgument(leftTb.numTuples() > leftPointer);
      Preconditions.checkArgument(rightTb.numTuples() > rightPointer);
      for (int columnIndex = 0; columnIndex < sortedColumns.length; columnIndex++) {
        int compared =
            TupleUtils.cellCompare(
                leftTb, columnIndex, leftPointer, rightTb, columnIndex, rightPointer);
        if (compared != 0) {
          if (ascending[columnIndex]) {
            return compared;
          } else {
            return -compared;
          }
        }
      }
      return 0;
    }
  }

  /**
   * @param children the children to be merged.
   * @param sortedColumns the indexes of columns that tuples are ordered by in the input
   * @param ascending true for each column that is ordered ascending
   *
   * */
  public Merge(final Operator[] children, final int[] sortedColumns, final boolean[] ascending) {
    super(children);

    setSortedColumns(sortedColumns, ascending);
  }

  @Override
  protected void cleanup() throws DbException {
    childBatches.clear();
  }

  @Override
  protected TupleBatch fetchNextReady() throws Exception {
    TupleBatch nexttb = ans.popFilled();
    if (nexttb != null) {
      return nexttb;
    }

    int numEOS = 0;

    while (nexttb == null) {
      numEOS = 0;

      // fill the buffers, if possible and necessary
      boolean notEnoughData = false;
      for (int childIdx = 0; childIdx < getNumChildren(); childIdx++) {
        Operator child = getChild(childIdx);
        if (child.eos()) {
          numEOS++;
          continue;
        }
        if (childBatches.get(childIdx) == null) {
          TupleBatch tb = child.nextReady();

          // After fetching from a child, it might be EOS.
          // If we don't catch this case here but return null,
          // this method might not be called again.
          if (child.eos()) {
            numEOS++;
            continue;
          }
          if (tb != null) {
            childBatches.set(childIdx, tb);
            childRowIndexes.set(childIdx, 0);

            // add the index after we refilled the batch or
            // when the batches are initially loaded
            heap.add(childIdx);
          } else {
            notEnoughData = true;
            break;
          }
        }
      }

      if (notEnoughData) {
        // break if this is EOS to ensure that we don't return
        // null and have data in buffer which would mean that this
        // method is never called again.
        if (numEOS == getNumChildren()) {
          break;
        }
        return null;
      }

      Integer smallestTb = heap.poll();
      if (smallestTb != null) {
        Integer positionInSmallestTb = childRowIndexes.get(smallestTb);
        ans.append(childBatches.get(smallestTb), positionInSmallestTb);

        // reset tb or advance position pointer
        if (positionInSmallestTb == childBatches.get(smallestTb).numTuples() - 1) {
          childRowIndexes.set(smallestTb, -1);
          childBatches.set(smallestTb, null);
        } else {
          childRowIndexes.set(smallestTb, positionInSmallestTb + 1);
          // We either re-add the index to the child here or when we fetch more data.
          // We cannot re-add it here because we don't know whether there will be data or not.
          heap.add(smallestTb);
        }
        nexttb = ans.popFilled();
      }

      if (numEOS == getNumChildren()) {
        setEOS();
        Preconditions.checkArgument(heap.size() == 0);
        Preconditions.checkArgument(nexttb == null);
        break;
      }
    }

    if (nexttb == null && ans.numTuples() > 0) {
      nexttb = ans.popAny();
    }

    return nexttb;
  }

  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) throws Exception {
    Objects.requireNonNull(getChildren());
    Preconditions.checkArgument(getNumChildren() > 0);

    Preconditions.checkArgument(ascending.length == sortedColumns.length);

    ans = new TupleBatchBuffer(getSchema());
    for (Operator child : getChildren()) {
      Preconditions.checkNotNull(child);
      Preconditions.checkArgument(getSchema().equals(child.getSchema()));

      childBatches.add(null);
      childRowIndexes.add(-1);
    }

    Comparator<Integer> comparator = new TupleComparator();
    heap = new PriorityQueue<Integer>(getNumChildren(), comparator);
  }

  @Override
  public Schema generateSchema() {
    if (getChildren() == null) {
      return null;
    }
    if (getChild(0) == null) {
      return null;
    }
    return getChild(0).getSchema();
  }

  /**
   * Define how the tuples are sorted in the input and how they should be sorted in the output.
   *
   * @param sortedColumns the indexes of columns that tuples are ordered by in the input
   * @param ascending true for each column that is ordered ascending
   */
  void setSortedColumns(final int[] sortedColumns, final boolean[] ascending) {
    this.sortedColumns = sortedColumns;
    this.ascending = ascending;
  }
}