package edu.washington.escience.myria.operator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import java.util.List;
import org.joda.time.DateTime;
import org.junit.Test;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.util.TestEnvVars;
public class DupElimTest {
private static List<TupleBatch> makeTestData() {
List<Type> types =
ImmutableList.of(
Type.BOOLEAN_TYPE, Type.DATETIME_TYPE, Type.INT_TYPE, Type.LONG_TYPE, Type.STRING_TYPE);
List<String> names = ImmutableList.of("boolean", "datetime", "int", "long", "string");
Schema schema = Schema.of(types, names);
TupleBatchBuffer tbb = new TupleBatchBuffer(schema);
List<TupleBatch> ret = Lists.newLinkedList();
boolean bool0 = false;
DateTime dateTime0 = DateTime.now();
int int0 = 17;
long long0 = 4000000000L;
long long1 = -4000000000L;
String string0 = "row0";
String string1 = "row1";
assertNotEquals(long0, long1);
assertNotEquals(string0, string1);
/* First row */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long0);
tbb.putString(4, string0);
/* Second row is the same as the first row with one difference in column 3. */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long1);
tbb.putString(4, string0);
/* Third row is the same as the first. */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long0);
tbb.putString(4, string0);
/* Fourth row is the same as the second. */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long1);
tbb.putString(4, string0);
/* Fifth row is the same as the second. */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long1);
tbb.putString(4, string0);
/* Sixth row is different in column 4. */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long0);
tbb.putString(4, string1);
/* Pop a TB of 6 rows, 3 unique (0, 1, 5). */
ret.add(tbb.popAny());
/* Copy that tuplebatch twice more */
ret.add(ret.get(0));
ret.add(ret.get(0));
/* Add a single tuple that's different from all prev in column 4 */
tbb.putBoolean(0, bool0);
tbb.putDateTime(1, dateTime0);
tbb.putInt(2, int0);
tbb.putLong(3, long1);
tbb.putString(4, string1);
/* Add it to ret. */
ret.add(tbb.popAny());
/* At the end we should have 4 different rows. */
return ret;
}
@Test
public void testDupElim() throws DbException {
BatchTupleSource src = new BatchTupleSource(makeTestData());
DupElim dupElim = new DupElim(src);
List<TupleBatch> ans = Lists.newLinkedList();
dupElim.open(TestEnvVars.get());
while (!dupElim.eos()) {
TupleBatch tb = dupElim.nextReady();
if (tb != null) {
ans.add(tb);
}
}
dupElim.close();
int count = 0;
for (TupleBatch tb : ans) {
count += tb.numTuples();
}
assertEquals(2, ans.size());
assertEquals(4, count);
assertEquals(3, ans.get(0).numTuples());
assertEquals(1, ans.get(1).numTuples());
}
@Test
public void testStatefulDupElim() throws DbException {
BatchTupleSource src = new BatchTupleSource(makeTestData());
StreamingStateWrapper dupElim = new StreamingStateWrapper(src, new StatefulDupElim());
List<TupleBatch> ans = Lists.newLinkedList();
dupElim.open(TestEnvVars.get());
while (!dupElim.eos()) {
TupleBatch tb = dupElim.nextReady();
if (tb != null) {
ans.add(tb);
}
}
assertEquals(2, ans.size());
assertEquals(3, ans.get(0).numTuples());
assertEquals(1, ans.get(1).numTuples());
List<TupleBatch> state = dupElim.getStreamingState().exportState();
assertEquals(1, state.size());
assertEquals(4, state.get(0).numTuples());
dupElim.close();
}
}