package edu.washington.escience.myria.operator;

import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Ints;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;

import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;

/**
 * Streaming state that keeps, for every distinct key, the tuple whose value columns are the
 * smallest seen so far, and exports its contents sorted on the value columns.
 *
 * <p>Newly met keys are copied into an internal buffer so that the source TupleBatches do not
 * need to be referenced once a call to {@link #update(TupleBatch)} has returned.
 */
public final class KeepAndSortOnMinValue extends StreamingState {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** The logger for this class. */
  static final Logger LOGGER = LoggerFactory.getLogger(KeepAndSortOnMinValue.class);

  /** Maps the hash code of a key to the row indices in {@link #uniqueTuples} with that hash. */
  private transient IntObjectHashMap<IntArrayList> uniqueTupleIndices;

  /** The buffer storing one tuple per distinct key. */
  private transient MutableTupleBuffer uniqueTuples = null;

  /** Column indices of the key. */
  private final int[] keyColIndices;

  /** Column indices of the key as a set, used to skip key columns when replacing a tuple. */
  private final Set<Integer> keyColIndicesSet;

  /** Column indices of the value. */
  private final int[] valueColIndices;

  /** Visitor applied to every stored row that shares the hash code of the current input row. */
  private transient ReplaceProcedure doReplace;

  /**
   * Creates the state. Both index arrays are copied.
   *
   * @param keyColIndices column indices of the key
   * @param valueColIndices column indices of the value
   */
  public KeepAndSortOnMinValue(final int[] keyColIndices, final int[] valueColIndices) {
    this.keyColIndices = Arrays.copyOf(keyColIndices, keyColIndices.length);
    keyColIndicesSet = ImmutableSet.copyOf(Ints.asList(keyColIndices));
    this.valueColIndices = Arrays.copyOf(valueColIndices, valueColIndices.length);
  }

  @Override
  public void cleanup() {
    uniqueTuples = null;
    uniqueTupleIndices = null;
  }

  /**
   * Check if a tuple in uniqueTuples should be replaced by a given tuple. The value columns are
   * compared in the order given by {@code valueColIndices}; the first column that differs
   * decides.
   *
   * @param index the row index of the tuple in uniqueTuples
   * @param columns the columns of the given tuple
   * @param row the row index of the given tuple
   * @return true if the given tuple is strictly smaller than the stored one, false otherwise
   *     (including when all value columns are equal)
   * @throws IllegalStateException if a value column is not of type INT, LONG, FLOAT or DOUBLE
   */
  private boolean shouldReplace(
      final int index, final List<? extends Column<?>> columns, final int row) {
    for (int valueColIndex : valueColIndices) {
      Column<?> column = columns.get(valueColIndex);
      switch (column.getType()) {
        case INT_TYPE:
          {
            int t1 = column.getInt(row);
            int t2 = uniqueTuples.getInt(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        case LONG_TYPE:
          {
            long t1 = column.getLong(row);
            long t2 = uniqueTuples.getLong(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        case FLOAT_TYPE:
          {
            float t1 = column.getFloat(row);
            float t2 = uniqueTuples.getFloat(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        case DOUBLE_TYPE:
          {
            double t1 = column.getDouble(row);
            double t2 = uniqueTuples.getDouble(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        default:
          throw new IllegalStateException(
              "type " + column.getType() + " is not supported in KeepMinValue.replace()");
      }
    }
    return false;
  }

  /**
   * Merges tb into the stored state and filters tb accordingly.
   *
   * <p>A row of tb is kept in the result if its key has not been stored before, or if it
   * replaced the stored tuple for its key. A row whose key is already stored and which did not
   * replace the stored tuple is filtered out.
   *
   * @param tb the TupleBatch to merge
   * @return tb itself if it has no tuples, otherwise tb with the non-contributing rows filtered
   *     out
   */
  protected TupleBatch keepMinValue(final TupleBatch tb) {
    final int numTuples = tb.numTuples();
    if (numTuples <= 0) {
      return tb;
    }
    doReplace.inputTB = tb;
    final List<? extends Column<?>> columns = tb.getDataColumns();
    final BitSet toRemove = new BitSet(numTuples);
    for (int i = 0; i < numTuples; ++i) {
      final int nextIndex = uniqueTuples.numTuples();
      final int cntHashCode = HashUtils.hashSubRow(tb, keyColIndices, i);
      IntArrayList tupleIndexList = uniqueTupleIndices.get(cntHashCode);
      doReplace.unique = true;
      if (tupleIndexList == null) {
        // First tuple with this hash code. The row index itself is registered below, in the
        // "unique" branch; adding it here as well would store the same index twice and make
        // every later lookup in this bucket compare against the same row two times.
        tupleIndexList = new IntArrayList();
        uniqueTupleIndices.put(cntHashCode, tupleIndexList);
      } else {
        doReplace.replaced = false;
        doReplace.sourceRow = i;
        tupleIndexList.forEach(doReplace);
        if (!doReplace.unique && !doReplace.replaced) {
          toRemove.set(i);
        }
      }
      if (doReplace.unique) {
        for (int j = 0; j < tb.numColumns(); ++j) {
          uniqueTuples.put(j, columns.get(j), i);
        }
        tupleIndexList.add(nextIndex);
      }
    }
    final TupleBatch result = tb.filterOut(toRemove);
    // Do not keep the input batch reachable through the visitor after this call.
    doReplace.inputTB = null;
    return result;
  }

  @Override
  public Schema getSchema() {
    return getOp().getInputSchema();
  }

  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) {
    uniqueTupleIndices = new IntObjectHashMap<IntArrayList>();
    uniqueTuples = new MutableTupleBuffer(getSchema());
    doReplace = new ReplaceProcedure();
  }

  @Override
  public TupleBatch update(final TupleBatch tb) {
    TupleBatch newtb = keepMinValue(tb);
    if (newtb.numTuples() > 0 || newtb.isEOI()) {
      return newtb;
    }
    return null;
  }

  @Override
  public List<TupleBatch> exportState() {
    // Sort a clone so that the row indices stored in uniqueTupleIndices stay valid.
    MutableTupleBuffer tmp = uniqueTuples.clone();
    sortOn(tmp, valueColIndices);
    return tmp.getAll();
  }

  /**
   * Visits the stored rows that share a hash code with the current input row. If a stored row has
   * the same key, the input row is marked as not unique, and the stored row is overwritten when
   * the input row has a smaller value.
   */
  private final class ReplaceProcedure implements IntProcedure {
    /** serial version id. */
    private static final long serialVersionUID = 1L;

    /** row index of the tuple in inputTB. */
    private int sourceRow;

    /** input TupleBatch. */
    private TupleBatch inputTB;

    /** if the input tuple replaced a stored tuple. */
    private boolean replaced;

    /** if no stored tuple has the same key as the input tuple. */
    private boolean unique;

    @Override
    public void value(final int destRow) {
      if (TupleUtils.tupleEquals(
          inputTB, keyColIndices, sourceRow, uniqueTuples, keyColIndices, destRow)) {
        unique = false;
        final List<? extends Column<?>> inputColumns = inputTB.getDataColumns();
        if (shouldReplace(destRow, inputColumns, sourceRow)) {
          for (int i = 0; i < uniqueTuples.numColumns(); ++i) {
            if (!keyColIndicesSet.contains(i)) {
              // replace the whole tuple except key columns.
              uniqueTuples.replace(i, destRow, inputColumns.get(i), sourceRow);
            }
          }
          replaced = true;
        }
      }
    }
  }

  /**
   * Sorts the given tuple buffer in place, in ascending order of the given columns.
   *
   * @param tuples tuples
   * @param col column indices to sort on, compared in the given order
   */
  private void sortOn(final MutableTupleBuffer tuples, final int[] col) {
    quicksort(tuples, col, 0, tuples.numTuples() - 1);
  }

  /**
   * Quick sort on the columns in col; tuples with smaller values are put in the front.
   *
   * @param tuples tuples
   * @param col the column indices to sort on
   * @param low lower bound (inclusive row index)
   * @param high upper bound (inclusive row index)
   */
  private void quicksort(
      final MutableTupleBuffer tuples, final int[] col, final int low, final int high) {
    int i = low, j = high;
    int pivot = low + (high - low) / 2;
    while (i <= j) {
      while (compare(tuples, col, i, pivot) < 0) {
        i++;
      }
      while (compare(tuples, col, j, pivot) > 0) {
        j--;
      }
      if (i <= j) {
        if (i != j) {
          // The pivot is tracked by row index, so follow it when its row is swapped.
          if (i == pivot) {
            pivot = j;
          } else if (j == pivot) {
            pivot = i;
          }
          for (int c = 0; c < tuples.numColumns(); ++c) {
            tuples.swap(c, i, j);
          }
        }
        i++;
        j--;
      }
    }
    if (low < j) {
      quicksort(tuples, col, low, j);
    }
    if (i < high) {
      quicksort(tuples, col, i, high);
    }
  }

  /**
   * Compares a row with the pivot row on the given columns. The columns are compared in the given
   * order; the first column that differs decides.
   *
   * @param tuples tuples
   * @param columns the column indices
   * @param row row index to compare with
   * @param pivot the index of the pivot row
   * @return if the row is smaller than (-1), equal to (0) or bigger than (1) the pivot row
   * @throws IllegalStateException if a column is not of type INT, LONG, FLOAT or DOUBLE
   */
  public int compare(
      final MutableTupleBuffer tuples, final int[] columns, final int row, final int pivot) {
    for (int column : columns) {
      Type t = getSchema().getColumnType(column);
      switch (t) {
        case LONG_TYPE:
          {
            long t1 = tuples.getLong(column, row);
            long t2 = tuples.getLong(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        case INT_TYPE:
          {
            int t1 = tuples.getInt(column, row);
            int t2 = tuples.getInt(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        case FLOAT_TYPE:
          {
            float t1 = tuples.getFloat(column, row);
            float t2 = tuples.getFloat(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        case DOUBLE_TYPE:
          {
            double t1 = tuples.getDouble(column, row);
            double t2 = tuples.getDouble(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        default:
          throw new IllegalStateException("type " + t + " is not supported");
      }
    }
    return 0;
  }

  @Override
  public int numTuples() {
    return uniqueTuples.numTuples();
  }

  @Override
  public StreamingState duplicate() {
    return new KeepAndSortOnMinValue(keyColIndices, valueColIndices);
  }
}