package edu.washington.escience.myria.operator;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Ints;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;
/**
 * Streaming state that keeps, for every distinct key, one tuple with the minimum value.
 *
 * <p>Tuples with a key that has not been seen before are copied into an internal buffer, so the
 * source TupleBatches are not referenced. When a later tuple has the same key and a strictly
 * smaller value (value columns are compared in the order given to the constructor), the non-key
 * columns of the stored tuple are overwritten in place. {@link #exportState()} returns a copy of
 * the stored tuples sorted on the value columns.
 */
public final class KeepAndSortOnMinValue extends StreamingState {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** The logger for this class. */
  static final Logger LOGGER = LoggerFactory.getLogger(KeepAndSortOnMinValue.class);

  /**
   * Maps the hash code of a key to the row indices, in {@link #uniqueTuples}, of the stored tuples
   * whose key has that hash code.
   */
  private transient IntObjectHashMap<IntArrayList> uniqueTupleIndices;

  /** The buffer for storing one tuple per distinct key. */
  private transient MutableTupleBuffer uniqueTuples = null;

  /** Reusable procedure that compares an input tuple against the stored tuples of one bucket. */
  private transient ReplaceProcedure doReplace;

  /** Column indices of the key. */
  private final int[] keyColIndices;

  /** Column indices of the key as a set. */
  private final Set<Integer> keyColIndicesSet;

  /** Column indices of the value. */
  private final int[] valueColIndices;

  /**
   * Creates the state. Both arrays are copied.
   *
   * @param keyColIndices column indices of the key
   * @param valueColIndices column indices of the value
   */
  public KeepAndSortOnMinValue(final int[] keyColIndices, final int[] valueColIndices) {
    this.keyColIndices = Arrays.copyOf(keyColIndices, keyColIndices.length);
    keyColIndicesSet = ImmutableSet.copyOf(Ints.asList(keyColIndices));
    this.valueColIndices = Arrays.copyOf(valueColIndices, valueColIndices.length);
  }

  /** Drops the references to the stored tuples and to the index over them. */
  @Override
  public void cleanup() {
    uniqueTuples = null;
    uniqueTupleIndices = null;
  }

  /**
   * Checks if a tuple in uniqueTuples should be replaced by a given tuple. The value columns are
   * compared in order; the first column in which the two tuples differ decides.
   *
   * @param index the row index of the tuple in uniqueTuples
   * @param columns the columns of the given tuple
   * @param row the row index of the given tuple
   * @return true if the given tuple is strictly smaller on the value columns, false if it is
   *     larger or equal
   * @throws IllegalStateException if a value column is not of type int, long, float or double
   */
  private boolean shouldReplace(
      final int index, final List<? extends Column<?>> columns, final int row) {
    for (int valueColIndex : valueColIndices) {
      Column<?> column = columns.get(valueColIndex);
      switch (column.getType()) {
        case INT_TYPE:
          {
            int t1 = column.getInt(row);
            int t2 = uniqueTuples.getInt(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        case LONG_TYPE:
          {
            long t1 = column.getLong(row);
            long t2 = uniqueTuples.getLong(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        case FLOAT_TYPE:
          {
            float t1 = column.getFloat(row);
            float t2 = uniqueTuples.getFloat(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        case DOUBLE_TYPE:
          {
            double t1 = column.getDouble(row);
            double t2 = uniqueTuples.getDouble(valueColIndex, index);
            if (t1 < t2) {
              return true;
            }
            if (t1 > t2) {
              return false;
            }
            break;
          }
        default:
          throw new IllegalStateException(
              "type " + column.getType() + " is not supported in KeepMinValue.replace()");
      }
    }
    // All value columns compared equal: keep the stored tuple.
    return false;
  }

  /**
   * Merges tb into the stored tuples and filters tb accordingly.
   *
   * <p>A tuple whose key is not stored yet is copied into the buffer. A tuple whose key is already
   * stored overwrites the stored non-key columns if its value is strictly smaller. A tuple whose
   * key is already stored and whose value is not smaller is removed from the returned batch.
   *
   * @param tb the input TupleBatch
   * @return tb itself if it has no tuples, otherwise tb without the tuples that neither added a
   *     new key nor replaced a stored value
   */
  protected TupleBatch keepMinValue(final TupleBatch tb) {
    final int numTuples = tb.numTuples();
    if (numTuples <= 0) {
      return tb;
    }
    doReplace.inputTB = tb;
    final List<? extends Column<?>> columns = tb.getDataColumns();
    final BitSet toRemove = new BitSet(numTuples);
    for (int row = 0; row < numTuples; ++row) {
      // Row index the tuple will get in uniqueTuples if it turns out to be a new key.
      final int nextIndex = uniqueTuples.numTuples();
      final int keyHashCode = HashUtils.hashSubRow(tb, keyColIndices, row);
      IntArrayList tupleIndexList = uniqueTupleIndices.get(keyHashCode);
      doReplace.unique = true;
      if (tupleIndexList == null) {
        // First tuple with this hash code. Its index is recorded exactly once, by the shared
        // insertion code below; adding it here as well would store the index twice.
        tupleIndexList = new IntArrayList();
        uniqueTupleIndices.put(keyHashCode, tupleIndexList);
      } else {
        doReplace.replaced = false;
        doReplace.sourceRow = row;
        tupleIndexList.forEach(doReplace);
        if (!doReplace.unique && !doReplace.replaced) {
          // Key already stored with a value that is at least as small: drop the input tuple.
          toRemove.set(row);
        }
      }
      if (doReplace.unique) {
        for (int j = 0; j < tb.numColumns(); ++j) {
          uniqueTuples.put(j, columns.get(j), row);
        }
        tupleIndexList.add(nextIndex);
      }
    }
    return tb.filterOut(toRemove);
  }

  /** @return the input schema of the owning operator. */
  @Override
  public Schema getSchema() {
    return getOp().getInputSchema();
  }

  /**
   * Creates the empty tuple buffer, the empty index and the reusable replace procedure.
   *
   * @param execEnvVars execution environment variables; not used here
   */
  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) {
    uniqueTupleIndices = new IntObjectHashMap<IntArrayList>();
    uniqueTuples = new MutableTupleBuffer(getSchema());
    doReplace = new ReplaceProcedure();
  }

  /**
   * Feeds tb through {@link #keepMinValue(TupleBatch)}.
   *
   * @param tb the input TupleBatch
   * @return the filtered batch if it still has tuples or is EOI, otherwise null
   */
  @Override
  public TupleBatch update(final TupleBatch tb) {
    TupleBatch newtb = keepMinValue(tb);
    if (newtb.numTuples() > 0 || newtb.isEOI()) {
      return newtb;
    }
    return null;
  }

  /**
   * Exports the stored tuples sorted on the value columns. The sort is done on a clone, so the
   * row indices held by the index stay valid for the live buffer.
   *
   * @return the batches of the sorted clone
   */
  @Override
  public List<TupleBatch> exportState() {
    MutableTupleBuffer tmp = uniqueTuples.clone();
    sortOn(tmp, valueColIndices);
    return tmp.getAll();
  }

  /**
   * Visits the stored tuples that share a hash code with the current input tuple, and replaces the
   * stored value when the keys are equal and the input value is smaller.
   */
  private final class ReplaceProcedure implements IntProcedure {
    /** serial version id. */
    private static final long serialVersionUID = 1L;
    /** row index of the input tuple in inputTB. */
    private int sourceRow;
    /** input TupleBatch. */
    private TupleBatch inputTB;
    /** set to true if the stored tuple was overwritten by the input tuple. */
    private boolean replaced;
    /** stays true as long as no stored tuple with the same key has been found. */
    private boolean unique;

    /**
     * Compares the input tuple with the stored tuple at destRow.
     *
     * @param destRow row index of a stored tuple in uniqueTuples
     */
    @Override
    public void value(final int destRow) {
      if (!TupleUtils.tupleEquals(
          inputTB, keyColIndices, sourceRow, uniqueTuples, keyColIndices, destRow)) {
        // Same hash code but a different key: this stored tuple is unrelated.
        return;
      }
      unique = false;
      final List<? extends Column<?>> inputColumns = inputTB.getDataColumns();
      if (shouldReplace(destRow, inputColumns, sourceRow)) {
        for (int i = 0; i < uniqueTuples.numColumns(); ++i) {
          if (!keyColIndicesSet.contains(i)) {
            // replace the whole tuple except key columns.
            uniqueTuples.replace(i, destRow, inputColumns.get(i), sourceRow);
          }
        }
        replaced = true;
      }
    }
  }

  /**
   * Sorts the given tuple buffer in place on the given columns, smallest first.
   *
   * @param tuples tuples
   * @param col column indices to sort on, in order of significance
   */
  private void sortOn(final MutableTupleBuffer tuples, final int[] col) {
    quicksort(tuples, col, 0, tuples.numTuples() - 1);
  }

  /**
   * Quick sort of the rows in [low, high] on the columns col; tuples with smaller values are put
   * in the front.
   *
   * @param tuples tuples
   * @param col the column indices to sort on
   * @param low lower bound (inclusive)
   * @param high upper bound (inclusive)
   */
  private void quicksort(
      final MutableTupleBuffer tuples, final int[] col, final int low, final int high) {
    int i = low, j = high;
    // The pivot is tracked by row index, so it has to follow its row when that row is swapped.
    int pivot = low + (high - low) / 2;
    while (i <= j) {
      while (compare(tuples, col, i, pivot) < 0) {
        i++;
      }
      while (compare(tuples, col, j, pivot) > 0) {
        j--;
      }
      if (i <= j) {
        if (i != j) {
          if (i == pivot) {
            pivot = j;
          } else if (j == pivot) {
            pivot = i;
          }
          for (int c = 0; c < tuples.numColumns(); ++c) {
            tuples.swap(c, i, j);
          }
        }
        i++;
        j--;
      }
    }
    if (low < j) {
      quicksort(tuples, col, low, j);
    }
    if (i < high) {
      quicksort(tuples, col, i, high);
    }
  }

  /**
   * Compares two rows of a tuple buffer on the given columns. The columns are compared in order;
   * the first column in which the rows differ decides.
   *
   * @param tuples tuples
   * @param columns the column indices
   * @param row row index to compare with
   * @param pivot the row index of the pivot
   * @return -1 if the row is smaller than the pivot row, 1 if it is bigger, 0 if neither
   * @throws IllegalStateException if a column is not of type int, long, float or double
   */
  public int compare(
      final MutableTupleBuffer tuples, final int[] columns, final int row, final int pivot) {
    for (int column : columns) {
      Type t = getSchema().getColumnType(column);
      switch (t) {
        case LONG_TYPE:
          {
            long t1 = tuples.getLong(column, row);
            long t2 = tuples.getLong(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        case INT_TYPE:
          {
            int t1 = tuples.getInt(column, row);
            int t2 = tuples.getInt(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        case FLOAT_TYPE:
          {
            float t1 = tuples.getFloat(column, row);
            float t2 = tuples.getFloat(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        case DOUBLE_TYPE:
          {
            double t1 = tuples.getDouble(column, row);
            double t2 = tuples.getDouble(column, pivot);
            if (t1 < t2) {
              return -1;
            }
            if (t1 > t2) {
              return 1;
            }
            break;
          }
        default:
          throw new IllegalStateException("type " + t + " is not supported");
      }
    }
    return 0;
  }

  /** @return the number of tuples currently stored. */
  @Override
  public int numTuples() {
    return uniqueTuples.numTuples();
  }

  /** @return a new, uninitialized state with the same key and value column indices. */
  @Override
  public StreamingState duplicate() {
    return new KeepAndSortOnMinValue(keyColIndices, valueColIndices);
  }
}