/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.voltdb.regressionsuites;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.Random;
import org.voltdb.BackendTarget;
import org.voltdb.VoltTable;
import org.voltdb.client.Client;
import org.voltdb.client.ClientResponse;
import org.voltdb.compiler.VoltProjectBuilder;
import org.voltdb.types.TimestampType;
public class TestApproxCountDistinctSuite extends RegressionSuite {
private static final double ALLOWED_PERCENT_ERROR = 1.5;
private static final String COLUMN_NAMES[] = {
"bi",
"ii",
"si",
"ti",
"dd",
"ts"
};
private static final String TABLE_NAMES[] = {"r", "p"};
private static long getNormalValue(Random r, double magnitude, long min, long max) {
double d;
do {
d = r.nextGaussian() * magnitude;
} while (d > max || d <= min);
return (long) d;
}
private static BigDecimal getNormalDecimalValue(Random r, double magnitude) {
double d;
do {
d = r.nextGaussian() * magnitude;
} while (d > (magnitude * 2) || d <= (-magnitude * 2));
return new BigDecimal(String.format("%.12f", d));
}
private void fillTable(Client client, String tbl) throws Exception {
Random r = new Random(777);
// Insert 1000 rows of data, and 84 (every 13th row) of nulls.
int numRows = 1084;
if (isValgrind()) {
// This test takes 20 minutes if we use 1000 rows in valgrind,
// so reduce the number of rows so it runs in a reasonable amount
// of time.
numRows = 109;
}
// Insert 1000 rows of data, and 84 (every 13th row) of nulls.
for (int i = 0; i < numRows; ++i) {
// Every 13th row, insert null values, just to make sure
// it doesn't mess with the algorithm.
if (i % 13 == 0) {
client.callProcedure(tbl + ".Insert", i,
null, null, null, null, null, null);
}
else {
// Use a a Gaussian distribution (bell curve), to exercise the hyperloglog hash.
final long baseTs = 1437589323966000L; // July 22, 2015 or so
client.callProcedure(tbl + ".Insert",
i, // primary key
getNormalValue(r, 1000, Long.MIN_VALUE, Long.MAX_VALUE),
getNormalValue(r, 1000, Integer.MIN_VALUE, Integer.MAX_VALUE),
getNormalValue(r, 1000, Short.MIN_VALUE, Short.MAX_VALUE),
getNormalValue(r, 100, Byte.MIN_VALUE, Byte.MAX_VALUE),
getNormalDecimalValue(r, 1000000000), // decimal
new TimestampType(baseTs + getNormalValue(r, 10000, Short.MIN_VALUE, Short.MAX_VALUE)));
}
}
}
private void assertEstimateWithin(String col, long exact, long estimate) {
double maxError = ALLOWED_PERCENT_ERROR;
if (isValgrind()) {
// in valgrind mode, table has fewer rows, to estimates are less accurate.
maxError *= 2.0;
}
double percentError;
if (exact != 0) {
double diff = Math.abs(exact - estimate);
percentError = diff / exact * 100.0;
}
else if (estimate == 0.0) {
percentError = 0.0;
}
else {
percentError = Double.MAX_VALUE;
}
/* Uncomment this if you are curious about how accurate the estimates are
System.out.println(String.format(" %s: Percent error: %2.2f%% (Exact: %5d, Estimate: %4.2f)",
col, percentError, exact, estimate));
// */
assertTrue("Estimate for distinct values in " + col + ":\n" +
"estimate: " + estimate + ", exact: " + exact + "\n" +
"Percent error: " + percentError,
percentError < maxError);
}
/**
* Given two tables assert that cardinality estimates are within
* some percentage of the exact cardinality. Relevant data is assumed to
* be in the last column of the table, after any group by keys.
* @param col -- for error reporting
* @param exactTable -- table containing exact values
* @param estimateTable -- table containing estimates
* @param maxError -- Maximum allowed error percentage
*/
private void assertEstimatesAreWithin(String col, VoltTable exactTable, VoltTable estimateTable, double maxError) {
final int whichCol = exactTable.getColumnCount() - 1;
assertEquals(exactTable.getRowCount(), estimateTable.getRowCount());
while(estimateTable.advanceRow()) {
assertTrue(exactTable.advanceRow());
assertEstimateWithin(col, exactTable.getLong(whichCol), estimateTable.getLong(whichCol));
}
assertFalse(exactTable.advanceRow());
}
public void testAsTableAgg() throws Exception {
Client client = getClient();
// lock down some precise values here
// (rather than comparing with "count(distinct col)")
// just so we have an idea of what causes these values to change.
//
// In other tests, we can just verify that the error in the estimate
// is reasonably bounded, by computing an exact answer.
long expectedEstimates[] = {
879,
872,
873,
244,
1003,
983
};
// If there's zero values, then cardinality estimate should be 0.0.
for (int colIdx = 0; colIdx < COLUMN_NAMES.length; ++colIdx) {
for (int tblIdx = 0; tblIdx < TABLE_NAMES.length; ++tblIdx) {
String tbl = TABLE_NAMES[tblIdx];
String col = COLUMN_NAMES[colIdx];
String approxStmt = String.format("select approx_count_distinct(%s) from %s", col, tbl);
VoltTable vt = client.callProcedure("@AdHoc", approxStmt).getResults()[0];
assertTrue(vt.advanceRow());
assertEquals(0, vt.getLong(0));
assertFalse(vt.advanceRow());
}
}
fillTable(client, "p");
fillTable(client, "r");
for (int tblIdx = 0; tblIdx < TABLE_NAMES.length; ++tblIdx) {
for (int colIdx = 0; colIdx < COLUMN_NAMES.length; ++colIdx) {
String tbl = TABLE_NAMES[tblIdx];
String col = COLUMN_NAMES[colIdx];
String approxStmt = String.format("select approx_count_distinct(%s) from %s", col, tbl);
VoltTable vt = client.callProcedure("@AdHoc", approxStmt).getResults()[0];
assertTrue(vt.advanceRow());
long actualEstimate = vt.getLong(0);
if (! isValgrind()) {
// Hard-coded expected values are not valid for valgrind mode, which
// uses fewer rows for brevity.
assertEquals("Actual estimate not expected for column " + col,
expectedEstimates[colIdx], actualEstimate);
}
assertFalse(vt.advanceRow());
// If we filter out the null values, the answer should be exactly the same
String approxStmtNoNulls =
"select approx_count_distinct(" + col + ")" +
" from " + tbl +
" where " + col + " is not null";
vt = client.callProcedure("@AdHoc", approxStmtNoNulls).getResults()[0];
assertTrue(vt.advanceRow());
long actualEstimateNoNulls = vt.getLong(0);
assertEquals(actualEstimate, actualEstimateNoNulls);
assertFalse(vt.advanceRow());
// Compare with the exact distinct count
String exactStmt = String.format("select count(distinct %s) from %s", col, tbl);
vt = client.callProcedure("@AdHoc", exactStmt).getResults()[0];
assertTrue(vt.advanceRow());
long exact = vt.getLong(0);
assertEstimateWithin(col, exact, actualEstimate);
assertFalse(vt.advanceRow());
}
}
// ENG-12466
ClientResponse cr = client.callProcedure("@AdHoc", "SELECT ALL APPROX_COUNT_DISTINCT(bi) C1, COUNT(bi) AS C1 FROM p;");
assertEquals(cr.getStatus(), ClientResponse.SUCCESS);
}
/**
* Run a query against a series of tables and columns to get both approximate
* and exact distinct cardinality, and compare the two.
*
* @param client database client, assumes tables are loaded
* @param tables
* @param columns
* @param queryFormat a printf-style string with three embedded %s markers:
* - a place to put "count( distinct" or "approx_count_distinct("
* - a column name
* - a table name
* @throws Exception
*/
public void compareEstimateAndExact(Client client,
String tables[], String columns[], String queryFormat)
throws Exception {
for (String tbl : tables) {
for (String col : columns) {
String approxStmt = String.format(queryFormat,
"approx_count_distinct(", col, tbl);
String exactStmt = String.format(queryFormat,
"count( distinct ", col, tbl);
VoltTable estimateTable = client.callProcedure("@AdHoc", approxStmt).getResults()[0];
VoltTable exactTable = client.callProcedure("@AdHoc", exactStmt).getResults()[0];
assertEstimatesAreWithin(col, exactTable, estimateTable, ALLOWED_PERCENT_ERROR);
}
}
}
public void testAsGroupByAgg() throws Exception {
Client client = getClient();
fillTable(client, "p");
fillTable(client, "r");
String queryFormat;
// Groups by the low 4 bits of the primary key
queryFormat =
"select bitand(cast(pk as bigint), x'03') as pk_lobits, " +
"%s %s ) " +
"from %s " +
"group by pk_lobits " +
"order by pk_lobits";
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES, queryFormat);
// Test with another aggregate that cannot be pushed down
// (all rows will be sent to coordinator, will lock down
// this behavior with an equivalent planner test.)
queryFormat =
"select bitand(cast(pk as bigint), x'03') as pk_lobits, " +
" count(distinct bi), " +
" %s %s ) " +
"from %s " +
"group by pk_lobits " +
"order by pk_lobits";
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES, queryFormat);
// Test with another aggregate that can be pushed down
// (approx_count_distinct will be distributed)
queryFormat =
"select bitand(cast(pk as bigint), x'03') as pk_lobits, " +
" count(dd), " +
" %s %s ) " +
"from %s " +
"group by pk_lobits " +
"order by pk_lobits";
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES, queryFormat);
}
public void testWithPartitionKey() throws Exception {
Client client = getClient();
fillTable(client, "p");
compareEstimateAndExact(client, new String[] {"p"}, new String[] {"pk"},
"select %s %s ) from %s;");
compareEstimateAndExact(client, new String[] {"p"}, new String[] {"pk"},
"select %s %s ) from %s where mod(pk, 3) = 0;");
compareEstimateAndExact(client, new String[] {"p"}, new String[] {"pk"},
"select bitand(bi, x'05') as gbk, %s %s ) " +
"from %s " +
"group by gbk order by gbk;");
compareEstimateAndExact(client, new String[] {"p"}, new String[] {"pk"},
"select bitand(bi, x'05') as gbk, %s %s ) " +
"from %s " +
"where mod(pk, 3) = 0 " +
"group by gbk order by gbk;");
}
public void testWithSubqueries() throws Exception {
Client client = getClient();
fillTable(client, "p");
fillTable(client, "r");
// Try a query where there may be multiple approx_count_distinct aggs,
// perhaps distributed, perhaps not.
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES,
"select %s %s ) " +
"from (select approx_count_distinct(ii) from r) as repl_subquery," +
" %s");
// As above but with reorder from clause elements
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES,
"select %s %s ) " +
"from %s, " +
"(select approx_count_distinct(ii) from r) as repl_subquery");
// As above but with other aggregates.
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES,
"select count(distinct bi), %s %s ) " +
"from (select sum(dd), approx_count_distinct(ii) from r) as repl_subquery," +
" %s");
// As above but with other aggregates.
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES,
"select count(bi), %s %s ) " +
"from (select sum(distinct dd), approx_count_distinct(ii) from r) as repl_subquery," +
" %s");
// This time let's put the distributed approx_count_distinct in the inner query
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES,
"select approx_count_distinct(bi), subq.a_count " +
"from (select %s %s ) as a_count from %s) as subq," +
" r " +
"group by subq.a_count");
// As above, but break push-down-ability inner query's agg.
compareEstimateAndExact(client, TABLE_NAMES, COLUMN_NAMES,
"select approx_count_distinct(bi), subq.a_count " +
"from (select %s %s ) as a_count, sum(distinct ii) from %s) as subq," +
" r " +
"group by subq.a_count");
}
public void testWithOtherClauses() throws Exception {
Client client = getClient();
fillTable(client, "p");
fillTable(client, "r");
// An ORDER BY query
compareEstimateAndExact(client, TABLE_NAMES, new String[] {"bi", "dd"},
"select pk, %s %s ) as cnt from %s group by pk order by cnt desc");
// ORDER BY with GROUP BY
compareEstimateAndExact(client, TABLE_NAMES, new String[] {"bi", "dd"},
"select bitand(cast(pk as bigint), x'03') lobits, %s %s ) as cnt " +
"from %s " +
"group by lobits " +
"order by lobits");
// HAVING (all rows evaluate to true for HAVING clause)
compareEstimateAndExact(client, TABLE_NAMES, new String[] {"bi", "dd"},
"select bitand(cast(pk as bigint), x'03') lobits, %s %s ) as cnt " +
"from %s " +
"group by lobits " +
"having approx_count_distinct(bi) between 225 and 275 " +
"order by lobits");
// HAVING (all rows evaluate to false for HAVING clause)
compareEstimateAndExact(client, TABLE_NAMES, new String[] {"bi", "dd"},
"select bitand(cast(pk as bigint), x'03') lobits, %s %s ) as cnt " +
"from %s " +
"group by lobits " +
"having approx_count_distinct(bi) > 275 " +
"order by lobits");
}
public void testNegative() throws Exception {
Client client = getClient();
// Currently only fixed-width types are allowed
verifyStmtFails(client,
"select approx_count_distinct(vc) from unsupported_column_types;",
"incompatible data type in operation");
verifyStmtFails(client,
"select approx_count_distinct(vb) from unsupported_column_types;",
"incompatible data type in operation");
verifyStmtFails(client,
"select approx_count_distinct(vc_inline) from unsupported_column_types;",
"incompatible data type in operation");
verifyStmtFails(client,
"select approx_count_distinct(vb_inline) from unsupported_column_types;",
"incompatible data type in operation");
// FLOAT is not allowed because wierdnesses of the floating point type:
// NaN, positive and negative zero, [de]normalized numbers.
verifyStmtFails(client,
"select approx_count_distinct(ff) from unsupported_column_types;",
"incompatible data type in operation");
}
public TestApproxCountDistinctSuite(String name) {
super(name);
}
static public junit.framework.Test suite() {
VoltServerConfig config = null;
MultiConfigSuiteBuilder builder = new MultiConfigSuiteBuilder(TestApproxCountDistinctSuite.class);
VoltProjectBuilder project = new VoltProjectBuilder();
final String literalSchema =
"CREATE TABLE r ( " +
"pk integer primary key not null, " +
"bi bigint, " +
"ii integer, " +
"si smallint, " +
"ti tinyint, " +
"dd decimal, " +
"ts timestamp " +
");" +
"CREATE TABLE p ( " +
"pk integer primary key not null, " +
"bi bigint, " +
"ii integer, " +
"si smallint, " +
"ti tinyint, " +
"dd decimal, " +
"ts timestamp " +
"); " +
"partition table p on column pk;" +
"CREATE TABLE unsupported_column_types ( " +
"vb varbinary(256), " +
"vc varchar(256)," +
"vb_inline varbinary(4), " +
"vc_inline varchar(4), " +
"ff float " +
");";
try {
project.addLiteralSchema(literalSchema);
}
catch (IOException e) {
assertFalse(true);
}
boolean success;
config = new LocalCluster("testApproxCountDistinctSuite-onesite.jar", 3, 1, 0, BackendTarget.NATIVE_EE_JNI);
success = config.compile(project);
assert(success);
builder.addServerConfig(config);
config = new LocalCluster("testApproxCountDistinctSuite-onesite.jar", 1, 1, 0, BackendTarget.NATIVE_EE_JNI);
success = config.compile(project);
assert(success);
builder.addServerConfig(config);
return builder;
}
}