package edu.washington.escience.myria.operator; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.joda.time.DateTime; import org.junit.Test; import com.google.common.collect.ImmutableList; import com.google.common.primitives.Doubles; import com.google.common.primitives.Floats; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.column.Column; import edu.washington.escience.myria.column.ConstantValueColumn; import edu.washington.escience.myria.column.builder.BooleanColumnBuilder; import edu.washington.escience.myria.column.builder.ColumnBuilder; import edu.washington.escience.myria.column.builder.DateTimeColumnBuilder; import edu.washington.escience.myria.column.builder.DoubleColumnBuilder; import edu.washington.escience.myria.column.builder.FloatColumnBuilder; import edu.washington.escience.myria.column.builder.IntColumnBuilder; import edu.washington.escience.myria.column.builder.LongColumnBuilder; import edu.washington.escience.myria.column.builder.StringColumnBuilder; import edu.washington.escience.myria.operator.agg.Aggregate; import edu.washington.escience.myria.operator.agg.AggregatorFactory; import edu.washington.escience.myria.operator.agg.PrimitiveAggregator.AggregationOp; import edu.washington.escience.myria.operator.agg.PrimitiveAggregatorFactory; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.storage.TupleBatchBuffer; import edu.washington.escience.myria.storage.TupleBuffer; import edu.washington.escience.myria.storage.TupleUtils; import edu.washington.escience.myria.util.HashUtils; import edu.washington.escience.myria.util.TestEnvVars; import edu.washington.escience.myria.util.TestUtils; import edu.washington.escience.myria.util.Tuple; public class AggregateTest { /** * Ensure that the given Schema matches the expected numeric aggregate types for the given Type. * * All numeric aggs, in order: COUNT, MIN, MAX, SUM, AVG, STDEV * * MIN,MAX match the input type * * SUM is the largest type compatible with the input type (int->long and float->double) * * COUNT is always long * * AVG and STDEV are always double * * @param schema the schema * @param type the type being aggregated */ private void allNumericAggsTestSchema(final Schema schema, final Type type) { Type bigType = type; if (type == Type.INT_TYPE) { bigType = Type.LONG_TYPE; } else if (type == Type.FLOAT_TYPE) { bigType = Type.DOUBLE_TYPE; } assertEquals(6, schema.numColumns()); assertEquals(Type.LONG_TYPE, schema.getColumnType(0)); assertEquals(type, schema.getColumnType(1)); assertEquals(type, schema.getColumnType(2)); assertEquals(bigType, schema.getColumnType(3)); assertEquals(Type.DOUBLE_TYPE, schema.getColumnType(4)); assertEquals(Type.DOUBLE_TYPE, schema.getColumnType(5)); } /** * Ensure that the given Schema matches the expected non-numeric aggregate types for the given Type. * * All non-numeric aggs, in order: COUNT, MIN, MAX * * MIN,MAX match the input type * * COUNT is always long * * @param schema the schema * @param type the type being aggregated */ private void allNonNumericAggsTestSchema(final Schema schema, final Type type) { assertEquals(3, schema.numColumns()); assertEquals(Type.LONG_TYPE, schema.getColumnType(0)); assertEquals(type, schema.getColumnType(1)); assertEquals(type, schema.getColumnType(2)); } /** * Helper function to turn a single {@link ColumnBuilder} into a {@link TupleBatch}. * * @param builder the column builder * @return the TupleBatch */ private TupleBatch makeTrivialTupleBatch(final ColumnBuilder<?> builder) { Schema schema = Schema.of(ImmutableList.of(builder.getType()), ImmutableList.of("col0")); return new TupleBatch(schema, ImmutableList.<Column<?>>of(builder.build())); } /** * Helper function to instantiate an aggregator and do the aggregation. Do not use if more than one TupleBatch are * expected. * * @param builder the tuples to be aggregated * @param aggOps the aggregate operations over the column * @param noColumns whether to group by no columns (if true) or to append a constant value single column and group by * it (if false). * @return a single TupleBatch containing the results of the aggregation * @throws Exception if there is an error */ private TupleBatch doAggOpsToCol( final ColumnBuilder<?> builder, final AggregationOp[] aggOps, final boolean noColumns) throws Exception { if (noColumns == false) { return doAggOpsToSingleCol(builder, aggOps); } BatchTupleSource source = new BatchTupleSource(makeTrivialTupleBatch(builder)); AggregatorFactory aggs = new PrimitiveAggregatorFactory(0, aggOps); Aggregate agg = new Aggregate(source, new int[] {}, aggs); /* Do it -- this should cause an error. */ agg.open(TestEnvVars.get()); TupleBatch tb = agg.nextReady(); agg.close(); return tb; } /** * Helper function to instantiate an aggregator and do the aggregation. Do not use if more than one TupleBatch are * expected. * * @param builder the tuples to be aggregated * @param aggOps the aggregate operations over the column * @return a single TupleBatch containing the results of the aggregation * @throws Exception if there is an error */ private TupleBatch doAggOpsToSingleCol( final ColumnBuilder<?> builder, final AggregationOp[] aggOps) throws Exception { TupleBatch trivialTb = makeTrivialTupleBatch(builder); ConstantValueColumn constCol = new ConstantValueColumn(3, Type.INT_TYPE, trivialTb.numTuples()); Schema newSchema = Schema.merge(trivialTb.getSchema(), Schema.ofFields("_const_col", Type.INT_TYPE)); List<Column<?>> columns = ImmutableList.<Column<?>>builder().addAll(trivialTb.getDataColumns()).add(constCol).build(); BatchTupleSource source = new BatchTupleSource(new TupleBatch(newSchema, columns)); AggregatorFactory[] aggs = new AggregatorFactory[aggOps.length]; for (int i = 0; i < aggs.length; ++i) { aggs[i] = new PrimitiveAggregatorFactory(0, aggOps[i]); } Aggregate agg = new Aggregate(source, new int[] {trivialTb.numColumns()}, aggs); /* Do it -- this should cause an error. */ agg.open(TestEnvVars.get()); TupleBatch tb = agg.nextReady(); agg.close(); /* Take the 1st through nth column, because the first column is the thing we grouped by. */ int[] colsToKeep = new int[tb.numColumns() - 1]; for (int i = 0; i < colsToKeep.length; ++i) { colsToKeep[i] = i + 1; } return tb.selectColumns(colsToKeep); } @Test public void testNumericAggs() throws Exception { ColumnBuilder<?> builder; TupleBatch tb; AggregationOp[] numericAggBitsInOrder = new AggregationOp[] { AggregationOp.COUNT, AggregationOp.MIN, AggregationOp.MAX, AggregationOp.SUM, AggregationOp.AVG, AggregationOp.STDEV }; AggregationOp[] justCount = new AggregationOp[] {AggregationOp.COUNT}; AggregationOp[] justMin = new AggregationOp[] {AggregationOp.MIN}; AggregationOp[] justMax = new AggregationOp[] {AggregationOp.MAX}; AggregationOp[] justSum = new AggregationOp[] {AggregationOp.SUM}; AggregationOp[] justAvg = new AggregationOp[] {AggregationOp.AVG}; AggregationOp[] justStdev = new AggregationOp[] {AggregationOp.STDEV}; for (int variant = 0; variant < 2; ++variant) { /* Whether to group by zero or 1 columns. */ boolean noColumns = (variant == 0); /* Ints, all as a group */ int[] ints = new int[] {3, 5, 6}; builder = new IntColumnBuilder(); for (int i : ints) { builder.appendInt(i); } tb = doAggOpsToCol(builder, numericAggBitsInOrder, noColumns); allNumericAggsTestSchema(tb.getSchema(), builder.getType()); assertEquals(ints.length, tb.getLong(0, 0)); assertEquals(Ints.min(ints), tb.getInt(1, 0)); assertEquals(Ints.max(ints), tb.getInt(2, 0)); assertEquals(3 + 5 + 6, tb.getLong(3, 0)); assertEquals((3 + 5 + 6) / 3.0, tb.getDouble(4, 0), 0.0001); // Wolfram Alpha: population standard deviation {3,5,6} assertEquals(1.2472, tb.getDouble(5, 0), 0.0001); /* Ints, one aggregate at a time */ tb = doAggOpsToCol(builder, justCount, noColumns); assertEquals(ints.length, tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justMin, noColumns); assertEquals(Ints.min(ints), tb.getInt(0, 0)); tb = doAggOpsToCol(builder, justMax, noColumns); assertEquals(Ints.max(ints), tb.getInt(0, 0)); tb = doAggOpsToCol(builder, justSum, noColumns); assertEquals(3 + 5 + 6, tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justAvg, noColumns); assertEquals((3 + 5 + 6) / 3.0, tb.getDouble(0, 0), 0.0001); tb = doAggOpsToCol(builder, justStdev, noColumns); assertEquals(1.2472, tb.getDouble(0, 0), 0.0001); /* Longs */ long[] longs = new long[] {3, 5, 9}; builder = new LongColumnBuilder(); for (long l : longs) { builder.appendLong(l); } tb = doAggOpsToCol(builder, numericAggBitsInOrder, noColumns); allNumericAggsTestSchema(tb.getSchema(), builder.getType()); assertEquals(longs.length, tb.getLong(0, 0)); assertEquals(Longs.min(longs), tb.getLong(1, 0)); assertEquals(Longs.max(longs), tb.getLong(2, 0)); assertEquals(3 + 5 + 9, tb.getLong(3, 0)); assertEquals((3 + 5 + 9) / 3.0, tb.getDouble(4, 0), 0.0001); // Wolfram Alpha: population standard deviation {3,5,9} assertEquals(2.4944, tb.getDouble(5, 0), 0.0001); /* Longs, one aggregate at a time */ tb = doAggOpsToCol(builder, justCount, noColumns); assertEquals(longs.length, tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justMin, noColumns); assertEquals(Longs.min(longs), tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justMax, noColumns); assertEquals(Longs.max(longs), tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justSum, noColumns); assertEquals(3 + 5 + 9, tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justAvg, noColumns); assertEquals((3 + 5 + 9) / 3.0, tb.getDouble(0, 0), 0.0001); tb = doAggOpsToCol(builder, justStdev, noColumns); assertEquals(2.4944, tb.getDouble(0, 0), 0.0001); /* Floats */ float[] floats = new float[] {3, 5, 11}; builder = new FloatColumnBuilder(); for (float f : floats) { builder.appendFloat(f); } tb = doAggOpsToCol(builder, numericAggBitsInOrder, noColumns); allNumericAggsTestSchema(tb.getSchema(), builder.getType()); assertEquals(floats.length, tb.getLong(0, 0)); assertEquals(Floats.min(floats), tb.getFloat(1, 0), 0.000001); assertEquals(Floats.max(floats), tb.getFloat(2, 0), 0.000001); assertEquals(3f + 5f + 11f, tb.getDouble(3, 0), 0.0000001); assertEquals((3 + 5 + 11) / 3.0, tb.getDouble(4, 0), 0.0001); // Wolfram Alpha: population standard deviation {3,5,11} assertEquals(3.3993, tb.getDouble(5, 0), 0.0001); /* Floats, one aggregate at a time */ tb = doAggOpsToCol(builder, justCount, noColumns); assertEquals(floats.length, tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justMin, noColumns); assertEquals(Floats.min(floats), tb.getFloat(0, 0), 0.000001); tb = doAggOpsToCol(builder, justMax, noColumns); assertEquals(Floats.max(floats), tb.getFloat(0, 0), 0.000001); tb = doAggOpsToCol(builder, justSum, noColumns); assertEquals(3f + 5f + 11f, tb.getDouble(0, 0), 0.000001); tb = doAggOpsToCol(builder, justAvg, noColumns); assertEquals((3f + 5f + 11f) / 3.0, tb.getDouble(0, 0), 0.0001); tb = doAggOpsToCol(builder, justStdev, noColumns); assertEquals(3.3993, tb.getDouble(0, 0), 0.0001); /* Double */ double[] doubles = new double[] {3, 5, 13}; builder = new DoubleColumnBuilder(); for (double d : doubles) { builder.appendDouble(d); } tb = doAggOpsToCol(builder, numericAggBitsInOrder, noColumns); allNumericAggsTestSchema(tb.getSchema(), builder.getType()); assertEquals(doubles.length, tb.getLong(0, 0)); assertEquals(Doubles.min(doubles), tb.getDouble(1, 0), 0.000001); assertEquals(Doubles.max(doubles), tb.getDouble(2, 0), 0.000001); assertEquals(3 + 5 + 13, tb.getDouble(3, 0), 0.0000001); assertEquals((3 + 5 + 13) / 3.0, tb.getDouble(4, 0), 0.0001); // Wolfram Alpha: population standard deviation {3,5,13} assertEquals(4.3205, tb.getDouble(5, 0), 0.0001); /* Doubles, one aggregate at a time */ tb = doAggOpsToCol(builder, justCount, noColumns); assertEquals(doubles.length, tb.getLong(0, 0)); tb = doAggOpsToCol(builder, justMin, noColumns); assertEquals(Doubles.min(doubles), tb.getDouble(0, 0), 0.000001); tb = doAggOpsToCol(builder, justMax, noColumns); assertEquals(Doubles.max(doubles), tb.getDouble(0, 0), 0.000001); tb = doAggOpsToCol(builder, justSum, noColumns); assertEquals(3f + 5f + 13f, tb.getDouble(0, 0), 0.000001); tb = doAggOpsToCol(builder, justAvg, noColumns); assertEquals((3f + 5f + 13f) / 3.0, tb.getDouble(0, 0), 0.0001); tb = doAggOpsToCol(builder, justStdev, noColumns); assertEquals(4.3205, tb.getDouble(0, 0), 0.0001); } } @Test public void testNonNumericAggs() throws Exception { ColumnBuilder<?> builder; TupleBatch tb; AggregationOp[] nonNumAggBitsInOrder = new AggregationOp[] {AggregationOp.COUNT, AggregationOp.MIN, AggregationOp.MAX}; for (int variant = 0; variant < 2; ++variant) { boolean noColumns = (variant == 0); /* Dates */ DateTime[] dates = new DateTime[] { DateTime.parse("2014-04-01T11:30"), DateTime.parse("2014-04-01T11:31"), DateTime.parse("2012-02-29T12:00") }; builder = new DateTimeColumnBuilder(); for (DateTime d : dates) { builder.appendDateTime(d); } tb = doAggOpsToCol(builder, nonNumAggBitsInOrder, noColumns); allNonNumericAggsTestSchema(tb.getSchema(), builder.getType()); assertEquals(dates.length, tb.getLong(0, 0)); assertEquals(DateTime.parse("2012-02-29T12:00"), tb.getDateTime(1, 0)); assertEquals(DateTime.parse("2014-04-01T11:31"), tb.getDateTime(2, 0)); /* Strings */ String[] strings = new String[] {"abcd", "abc", "abcde", "fghij0", "fghij1"}; builder = new StringColumnBuilder(); for (String s : strings) { builder.appendString(s); } tb = doAggOpsToCol(builder, nonNumAggBitsInOrder, noColumns); allNonNumericAggsTestSchema(tb.getSchema(), builder.getType()); assertEquals(strings.length, tb.getLong(0, 0)); assertEquals("abc", tb.getString(1, 0)); assertEquals("fghij1", tb.getString(2, 0)); /* Booleans */ AggregationOp[] booleanAggs = new AggregationOp[] {AggregationOp.COUNT}; boolean[] booleans = new boolean[] {true, false, true}; builder = new BooleanColumnBuilder(); for (boolean b : booleans) { builder.appendBoolean(b); } tb = doAggOpsToCol(builder, booleanAggs, noColumns); assertEquals(1, tb.getSchema().numColumns()); assertEquals(Type.LONG_TYPE, tb.getSchema().getColumnType(0)); assertEquals(booleans.length, tb.getLong(0, 0)); } } public TupleBatchBuffer generateRandomTuples(final int numTuples) { final String[] names = TestUtils.randomFixedLengthNumericString(1000, 1005, numTuples, 20); final long[] ids = TestUtils.randomLong(1000, 1005, names.length); final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); for (int i = 0; i < names.length; i++) { tbb.putLong(0, ids[i]); tbb.putString(1, names[i]); } return tbb; } @Test public void testSingleGroupAvg() throws DbException, InterruptedException { final int numTuples = 2 * TupleUtils.getBatchSize(Type.DOUBLE_TYPE) + 1; final TupleBatchBuffer testBase = generateRandomTuples(numTuples); // group by name, aggregate on id final Aggregate agg = new Aggregate( new BatchTupleSource(testBase), new int[] {1}, new PrimitiveAggregatorFactory(0, AggregationOp.AVG)); agg.open(TestEnvVars.get()); TupleBatch tb = null; final TupleBatchBuffer result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); final HashMap<Tuple, Integer> actualResult = TestUtils.tupleBatchToTupleBag(result); TestUtils.assertTupleBagEqual(TestUtils.groupByAvgLongColumn(testBase, 1, 0), actualResult); } @Test public void testSingleGroupMax() throws DbException, InterruptedException { final int numTuples = 2 * TupleUtils.getBatchSize(Type.LONG_TYPE) + 1; final TupleBatchBuffer testBase = generateRandomTuples(numTuples); // group by name, aggregate on id Aggregate agg = new Aggregate( new BatchTupleSource(testBase), new int[] {1}, new PrimitiveAggregatorFactory(0, AggregationOp.MAX)); agg.open(TestEnvVars.get()); TupleBatch tb = null; TupleBatchBuffer result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); HashMap<Tuple, Integer> actualResult = TestUtils.tupleBatchToTupleBag(result); TestUtils.assertTupleBagEqual(TestUtils.groupByMax(testBase, 1, 0), actualResult); agg = new Aggregate( new BatchTupleSource(testBase), new int[] {0}, new PrimitiveAggregatorFactory(1, AggregationOp.MAX)); agg.open(TestEnvVars.get()); tb = null; result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); actualResult = TestUtils.tupleBatchToTupleBag(result); TestUtils.assertTupleBagEqual(TestUtils.groupByMax(testBase, 0, 1), actualResult); } @Test public void testSingleGroupMin() throws DbException, InterruptedException { final int numTuples = 2 * TupleUtils.getBatchSize(Type.LONG_TYPE) + 1; final TupleBatchBuffer testBase = generateRandomTuples(numTuples); // group by name, aggregate on id Aggregate agg = new Aggregate( new BatchTupleSource(testBase), new int[] {1}, new PrimitiveAggregatorFactory(0, AggregationOp.MIN)); agg.open(TestEnvVars.get()); TupleBatch tb = null; TupleBatchBuffer result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); HashMap<Tuple, Integer> actualResult = TestUtils.tupleBatchToTupleBag(result); TestUtils.assertTupleBagEqual(TestUtils.groupByMin(testBase, 1, 0), actualResult); agg = new Aggregate( new BatchTupleSource(testBase), new int[] {0}, new PrimitiveAggregatorFactory(1, AggregationOp.MIN)); agg.open(TestEnvVars.get()); tb = null; result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); actualResult = TestUtils.tupleBatchToTupleBag(result); TestUtils.assertTupleBagEqual(TestUtils.groupByMin(testBase, 0, 1), actualResult); } @Test public void testSingleGroupSum() throws DbException, InterruptedException { final int numTuples = 2 * TupleUtils.getBatchSize(Type.DOUBLE_TYPE) + 1; final TupleBatchBuffer testBase = generateRandomTuples(numTuples); // group by name, aggregate on id final Aggregate agg = new Aggregate( new BatchTupleSource(testBase), new int[] {1}, new PrimitiveAggregatorFactory(0, AggregationOp.SUM)); agg.open(TestEnvVars.get()); TupleBatch tb = null; final TupleBatchBuffer result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); final HashMap<Tuple, Integer> actualResult = TestUtils.tupleBatchToTupleBag(result); TestUtils.assertTupleBagEqual(TestUtils.groupBySumLongColumn(testBase, 1, 0), actualResult); } @Test public void testSingleGroupStd() throws Exception { /* The source tuples -- integers 2 through 5 */ int from = 2, to = 5; int n = to - from + 1; // we are using a biased version of variance final TupleBatchBuffer testBase = new TupleBatchBuffer( Schema.of( ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE), ImmutableList.of("group", "value"))); int sum = 0; for (int i = from; i <= to; ++i) { testBase.putInt(0, 0); testBase.putInt(1, i); sum += i; } /* Generate expected values for stdev */ double mean = (double) sum / n; double diffSquared = 0.0; for (int i = from; i <= to; ++i) { double diff = i - mean; diffSquared += diff * diff; } double expectedStdev = Math.sqrt(diffSquared / n); /* Group by group, aggregate on value */ final Aggregate agg = new Aggregate( new BatchTupleSource(testBase), new int[] {0}, new PrimitiveAggregatorFactory(1, AggregationOp.STDEV)); agg.open(TestEnvVars.get()); TupleBatch tb = null; final TupleBatchBuffer result = new TupleBatchBuffer(agg.getSchema()); while (!agg.eos()) { tb = agg.nextReady(); if (tb != null) { tb.compactInto(result); } } agg.close(); tb = result.popAny(); assertEquals(expectedStdev, tb.getDouble(1, 0), 0.000001); } @Test public void testMultiGroupSum() throws DbException { final int numTuples = 2 * TupleUtils.getBatchSize(Type.DOUBLE_TYPE) + 2; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); long expectedFirst = 0; long expectedSecond = 0; // The idea of the tests is to generate altering data in the following scheme: // inserting { 0, 1, 2, i / 2 } on even rows, { 0, 1, 4, i / 2 } on odd rows for (long i = 0; i < numTuples; i++) { long value = i / 2; tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); expectedSecond += value; } else { tbb.putLong(2, 4L); } tbb.putLong(3, value); expectedFirst += value; } // test for grouping at the first and second column // expected all the i / 2 to be sum up Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory(3, AggregationOp.SUM)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(1, result.numTuples()); assertEquals(expectedFirst, result.getLong(result.numColumns() - 1, 0)); mga.close(); // test for grouping at the first, second and third column // expecting half of i / 2 to be sum up on each group Aggregate mgaTwo = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1, 2}, new PrimitiveAggregatorFactory(3, AggregationOp.SUM)); mgaTwo.open(TestEnvVars.get()); TupleBatch resultTwo = mgaTwo.nextReady(); assertNotNull(result); assertEquals(2, resultTwo.numTuples()); assertEquals(expectedSecond, resultTwo.getLong(resultTwo.numColumns() - 1, 0)); assertEquals(expectedSecond, resultTwo.getLong(resultTwo.numColumns() - 1, 1)); mgaTwo.close(); } @Test public void testMultiGroupAvg() throws DbException { final int numTuples = 10; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); double expected = 0.0; for (long i = 0; i < numTuples; i++) { tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); expected += i; } else { tbb.putLong(2, 4L); } tbb.putLong(3, i / 2); } expected /= numTuples; Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1, 2}, new PrimitiveAggregatorFactory(3, AggregationOp.AVG)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(2, result.numTuples()); assertEquals(expected, result.getDouble(result.numColumns() - 1, 0), 0.000001); mga.close(); } @Test public void testMultiGroupMin() throws DbException { final int numTuples = 10; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); long expected = 0; for (long i = 0; i < numTuples; i++) { tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); } else { tbb.putLong(2, 4L); } tbb.putLong(3, i / 2); } Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory(3, AggregationOp.MIN)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(1, result.numTuples()); assertEquals(expected, result.getLong(result.numColumns() - 1, 0)); mga.close(); } @Test public void testMultiGroupMax() throws DbException { final int numTuples = 10; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); long expected = numTuples - 1; for (long i = 0; i < numTuples; i++) { tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); } else { tbb.putLong(2, 4L); } tbb.putLong(3, i); } Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory(3, AggregationOp.MAX)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(1, result.numTuples()); assertEquals(3, result.numColumns()); assertEquals(expected, result.getLong(2, 0)); mga.close(); } @Test public void testMultiGroupMaxAndMin() throws DbException { final int numTuples = 10; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); long expected = numTuples - 1; for (long i = 0; i < numTuples; i++) { tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); } else { tbb.putLong(2, 4L); } tbb.putLong(3, i); } Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory( 3, new AggregationOp[] {AggregationOp.MAX, AggregationOp.MIN})); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(1, result.numTuples()); assertEquals(4, result.numColumns()); assertEquals(expected, result.getLong(2, 0)); assertEquals(0, result.getLong(3, 0)); mga.close(); } @Test public void testMultiGroupMaxMultiColumn() throws DbException { final int numTuples = 10; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); long expectedMin = 0; long expectedMax = numTuples - 1 + expectedMin; for (long i = expectedMin; i < numTuples; i++) { tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); } else { tbb.putLong(2, 4L); } tbb.putLong(3, i); } Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory( 3, new AggregationOp[] {AggregationOp.MAX, AggregationOp.MIN})); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(1, result.numTuples()); assertEquals(4, result.getSchema().numColumns()); assertEquals(expectedMin, result.getLong(result.numColumns() - 1, 0)); assertEquals(expectedMax, result.getLong(result.numColumns() - 2, 0)); mga.close(); } @Test public void testMultiGroupCountMultiColumn() throws DbException { final int numTuples = 10; final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); for (long i = 0; i < numTuples; i++) { tbb.putLong(0, 0L); tbb.putLong(1, 1L); if (i % 2 == 0) { tbb.putLong(2, 2L); } else { tbb.putLong(2, 4L); } tbb.putLong(3, i); } Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory(0, AggregationOp.COUNT)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(1, result.numTuples()); assertEquals(3, result.getSchema().numColumns()); assertEquals(numTuples, result.getLong(result.numColumns() - 1, 0)); mga.close(); } /** * Finds a collision of a tuple of all integers with the given grouping. * * @param numCols the columns to group by * @param groupCols the number of columns in the tuples * @return two rows that should have the same hash value */ @SuppressWarnings("unused") private static TupleBatch findIntsHashCollision(final int numCols, final int[] groupCols) { for (int i : groupCols) { assertTrue(i < numCols); } List<Type> types = new LinkedList<>(); for (int i = 0; i < numCols; ++i) { types.add(Type.INT_TYPE); } Schema schema = new Schema(types); TupleBuffer buffer = new TupleBuffer(schema); final Map<Integer, Integer> foundSoFar = new HashMap<>(); for (int i = 0; i < Integer.MAX_VALUE; ++i) { for (int j = 0; j < numCols; ++j) { buffer.putInt(j, i); } int hashCode = HashUtils.hashSubRow(buffer, groupCols, i); Integer old = foundSoFar.put(hashCode, i); if (old != null) { /* Found a match */ TupleBatchBuffer ret = new TupleBatchBuffer(schema); for (int j = 0; j < numCols; ++j) { ret.putInt(j, old); } for (int j = 0; j < numCols; ++j) { ret.putInt(j, i); } return ret.popAny(); } } throw new IllegalStateException( "Could not find a collision for hashColumns=" + Arrays.toString(groupCols)); } @Test public void testMultiGroupCountHashCollision() throws DbException { int groupCols[] = new int[] {2, 0}; /* I used the following code to compute these two collision values. */ // TupleBatch collision = findIntsHashCollision(3, groupCols); // System.err.println(collision.getInt(0, 0)); // 94328 // System.err.println(collision.getInt(1, 1)); // 113814 Schema schema = Schema.ofFields(Type.INT_TYPE, Type.INT_TYPE, Type.INT_TYPE); TupleBuffer buffer = new TupleBuffer(schema); /* First row */ buffer.putInt(0, 113814); buffer.putInt(1, 113814); buffer.putInt(2, 113814); /* Second row */ buffer.putInt(0, 3); buffer.putInt(1, 5); buffer.putInt(2, 4); /* Third row */ buffer.putInt(0, 94328); buffer.putInt(1, 94328); buffer.putInt(2, 94328); /* Fourth row */ buffer.putInt(0, 113814); buffer.putInt(1, 113814); buffer.putInt(2, 113814); /* Fifth row */ buffer.putInt(0, 113814); buffer.putInt(1, 113814); buffer.putInt(2, 113814); /* Verify that the collisions hold where expected. */ assertEquals( HashUtils.hashSubRow(buffer, groupCols, 0), HashUtils.hashSubRow(buffer, groupCols, 2)); assertEquals( HashUtils.hashSubRow(buffer, groupCols, 0), HashUtils.hashSubRow(buffer, groupCols, 3)); assertEquals( HashUtils.hashSubRow(buffer, groupCols, 0), HashUtils.hashSubRow(buffer, groupCols, 4)); /* Verify that collisions do not hold where expected. */ assertNotEquals( HashUtils.hashSubRow(buffer, groupCols, 0), HashUtils.hashSubRow(buffer, groupCols, 1)); BatchTupleSource source = new BatchTupleSource(buffer.finalResult()); Aggregate mga = new Aggregate(source, groupCols, new PrimitiveAggregatorFactory(1, AggregationOp.COUNT)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNotNull(result); assertEquals(3, result.numTuples()); assertEquals(3, result.getSchema().numColumns()); // 113814 3 times assertEquals(113814, result.getInt(0, 0)); assertEquals(113814, result.getInt(1, 0)); assertEquals(3, result.getLong(2, 0)); // random vals once assertEquals(4, result.getInt(0, 1)); assertEquals(3, result.getInt(1, 1)); assertEquals(1, result.getLong(2, 1)); // 94328 once assertEquals(94328, result.getInt(0, 2)); assertEquals(94328, result.getInt(1, 2)); assertEquals(1, result.getLong(2, 2)); mga.close(); } @Test public void testMultiGroupCountMultiColumnEmpty() throws DbException { final Schema schema = new Schema( ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("a", "b", "c", "d")); final TupleBatchBuffer tbb = new TupleBatchBuffer(schema); Aggregate mga = new Aggregate( new BatchTupleSource(tbb), new int[] {0, 1}, new PrimitiveAggregatorFactory(0, AggregationOp.COUNT)); mga.open(TestEnvVars.get()); TupleBatch result = mga.nextReady(); assertNull(result); mga.close(); } @Test(expected = ArithmeticException.class) public void testLongAggOverflow() throws Exception { LongColumnBuilder builder = new LongColumnBuilder().appendLong(Long.MAX_VALUE - 1).appendLong(3); doAggOpsToCol(builder, new AggregationOp[] {AggregationOp.SUM}, true); } @Test(expected = ArithmeticException.class) public void testLongAggUnderflow() throws Exception { LongColumnBuilder builder = new LongColumnBuilder().appendLong(Long.MIN_VALUE + 1).appendLong(-3); doAggOpsToCol(builder, new AggregationOp[] {AggregationOp.SUM}, true); } }