DupElimRefOnly.java example

Explorer
myria-master
package edu.washington.escience.myria.operator;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;

import com.google.common.collect.ImmutableMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.util.HashUtils;

/**
 * A simple implementation of duplicate elimination. Instead of copying unique tuples it keeps references to the
 * TupleBatches that contain them, together with the row index of each unique tuple.
 * */
public final class DupElimRefOnly extends UnaryOperator {

  /**
   * Pointer data structure for pointing to a tuple (one row) in a TupleBatch. Equality and hashing are defined on the
   * row's values, not on the identity of the pointer.
   *
   * Declared static because it never uses the enclosing operator; a non-static inner class would make every stored
   * pointer hold a hidden reference to the operator.
   * */
  private static final class IndexedTuple {
    /**
     * The row index. Mutable so that a single instance can be reused as a cursor over all rows of a batch.
     * */
    private int index;
    /**
     * The source data TB.
     * */
    private final TupleBatch tb;

    /**
     * Creates a pointer whose row index is left at its default (0); the caller is expected to set it.
     *
     * @param tb the source data TB.
     * */
    public IndexedTuple(final TupleBatch tb) {
      this.tb = tb;
    }

    /**
     * @param tb the source data TB.
     * @param index the row index.
     * */
    public IndexedTuple(final TupleBatch tb, final int index) {
      this.tb = tb;
      this.index = index;
    }

    /**
     * Compare the equality of a column of two tuples. The column type is taken from this tuple's schema, so callers
     * must make sure both tuples share the same schema (as {@link #equals(Object)} does).
     *
     * @return true if equal.
     * @param another the tuple to compare with.
     * @param colIndx columnIndex to compare
     * */
    public boolean columnEquals(final IndexedTuple another, final int colIndx) {
      final Type type = tb.getSchema().getColumnType(colIndx);
      final int rowIndx1 = index;
      final int rowIndx2 = another.index;
      switch (type) {
        case BOOLEAN_TYPE:
          return tb.getBoolean(colIndx, rowIndx1) == another.tb.getBoolean(colIndx, rowIndx2);
        case DOUBLE_TYPE:
          // Double.compare instead of ==: with == a NaN never equals a NaN, so rows containing NaN would never be
          // recognized as duplicates. Note this also treats 0.0 and -0.0 as different values.
          return Double.compare(
                  tb.getDouble(colIndx, rowIndx1), another.tb.getDouble(colIndx, rowIndx2))
              == 0;
        case FLOAT_TYPE:
          // Same reasoning as for DOUBLE_TYPE.
          return Float.compare(
                  tb.getFloat(colIndx, rowIndx1), another.tb.getFloat(colIndx, rowIndx2))
              == 0;
        case INT_TYPE:
          return tb.getInt(colIndx, rowIndx1) == another.tb.getInt(colIndx, rowIndx2);
        case LONG_TYPE:
          return tb.getLong(colIndx, rowIndx1) == another.tb.getLong(colIndx, rowIndx2);
        case STRING_TYPE:
          return tb.getString(colIndx, rowIndx1).equals(another.tb.getString(colIndx, rowIndx2));
        case DATETIME_TYPE:
          return tb.getDateTime(colIndx, rowIndx1)
              .equals(another.tb.getDateTime(colIndx, rowIndx2));
        case BLOB_TYPE:
          return tb.getBlob(colIndx, rowIndx1).equals(another.tb.getBlob(colIndx, rowIndx2));
        default:
          // A column type not handled above is reported as "not equal", which keeps the row in the output.
          return false;
      }
    }

    /**
     * Two pointers are equal when their batches have equal schemas and every column of the two rows is equal.
     *
     * @param o the object to compare with.
     * @return true if {@code o} points to a row with the same values.
     * */
    @Override
    public boolean equals(final Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof IndexedTuple)) {
        return false;
      }
      final IndexedTuple another = (IndexedTuple) o;
      if (!(tb.getSchema().equals(another.tb.getSchema()))) {
        return false;
      }
      final int numColumns = tb.getSchema().numColumns();
      for (int i = 0; i < numColumns; ++i) {
        if (!columnEquals(another, i)) {
          return false;
        }
      }
      return true;
    }

    /**
     * @return the hash of the pointed-to row as computed by {@link HashUtils#hashRow}.
     * */
    @Override
    public int hashCode() {
      return HashUtils.hashRow(tb, index);
    }
  }

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /**
   * Storing the unique tuples, bucketed by row hash code. Each bucket lists the distinct tuples seen so far that share
   * that hash code. Created in {@link #init} and released in {@link #cleanup}.
   * */
  private transient HashMap<Integer, List<IndexedTuple>> uniqueTuples;

  /**
   * @param child the child
   * */
  public DupElimRefOnly(final Operator child) {
    super(child);
  }

  /**
   * Releases the unique-tuple index so that the TupleBatches it references are no longer kept alive by this operator.
   *
   * NOTE(review): assumes the operator framework does not call {@link #fetchNextReady()} after cleanup without calling
   * {@link #init} again - confirm against the Operator lifecycle.
   * */
  @Override
  protected void cleanup() throws DbException {
    uniqueTuples = null;
  }

  /**
   * Do duplicate elimination for the tb. Every row that has not been seen before (in this or any earlier batch) is
   * recorded in {@link #uniqueTuples}; every other row is marked for removal.
   *
   * @param tb the TB.
   * @return {@code tb} itself if it has no tuples, otherwise the result of filtering the duplicate rows out of it.
   * */
  protected TupleBatch doDupElim(final TupleBatch tb) {
    final int numTuples = tb.numTuples();
    if (numTuples <= 0) {
      return tb;
    }
    final BitSet toRemove = new BitSet(numTuples);
    // A single reusable cursor avoids allocating a pointer object for rows that turn out to be duplicates.
    final IndexedTuple currentTuple = new IndexedTuple(tb);
    for (int i = 0; i < numTuples; ++i) {
      currentTuple.index = i;
      final int cntHashCode = currentTuple.hashCode();
      // TODO (carried over from the original code): might need to check invalid | change to use outputTuples later
      List<IndexedTuple> tupleList = uniqueTuples.get(cntHashCode);
      if (tupleList == null) {
        // First row with this hash code: it cannot be a duplicate.
        tupleList = new ArrayList<IndexedTuple>();
        uniqueTuples.put(cntHashCode, tupleList);
        tupleList.add(new IndexedTuple(tb, i));
        continue;
      }
      boolean unique = true;
      for (final IndexedTuple oldTuple : tupleList) {
        if (currentTuple.equals(oldTuple)) {
          unique = false;
          break;
        }
      }
      if (unique) {
        // Hash collision with different values: remember this row with a pointer of its own.
        tupleList.add(new IndexedTuple(tb, i));
      } else {
        toRemove.set(i);
      }
    }
    return tb.filterOut(toRemove);
  }

  /**
   * Pulls batches from the child until one still contains tuples after duplicate elimination.
   *
   * @return the first de-duplicated batch with at least one tuple, or null once the child returns null.
   * @throws DbException if the child fails while producing a batch.
   * */
  @Override
  public TupleBatch fetchNextReady() throws DbException {
    TupleBatch tb = getChild().nextReady();
    while (tb != null) {
      tb = doDupElim(tb);
      if (tb.numTuples() > 0) {
        return tb;
      }
      // Every row of this batch was a duplicate; try the next one.
      tb = getChild().nextReady();
    }
    return null;
  }

  /**
   * @return the child's schema, or null if there is no child.
   * */
  @Override
  public Schema generateSchema() {
    Operator child = getChild();
    if (child == null) {
      return null;
    }
    return child.getSchema();
  }

  /**
   * Creates an empty unique-tuple index.
   *
   * @param execEnvVars execution environment variables (not used here).
   * @throws DbException declared by the overridden method; not thrown here.
   * */
  @Override
  protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    uniqueTuples = new HashMap<Integer, List<IndexedTuple>>();
  }
}