/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.integration.tests; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.commons.lang.StringUtils; import org.json.JSONArray; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * The <code>QueryGenerator</code> class is used to generate random equivalent PQL/SQL query pairs based on Avro files. * <ul> * <li> * Supports COMPARISON, IN and BETWEEN predicate for both single-value and multi-value columns. * <ul> * <li>For multi-value columns, does not support NOT EQUAL and NOT IN.</li> * <li>The reason for this restriction is that number of elements in multi-value columns is not fixed.</li> * </ul> * </li> * <li>Supports single-value data type: BOOLEAN, INT, LONG, FLOAT, DOUBLE, STRING.</li> * <li>Supports multi-value data type: INT, LONG, FLOAT, DOUBLE, STRING.</li> * <li> * Supports aggregation function: SUM, MIN, MAX, AVG, COUNT, DISTINCTCOUNT. * <ul> * <li>SUM, MIN, MAX, AVG can only work on numeric single-value columns.</li> * <li>COUNT, DISTINCTCOUNT can work on any single-value columns.</li> * </ul> * </li> * </ul> */ public class QueryGenerator { private static final Logger LOGGER = LoggerFactory.getLogger(QueryGenerator.class); // Configurable variables. private static final int MAX_NUM_SELECTION_COLUMNS = 3; private static final int MAX_NUM_AGGREGATION_COLUMNS = 3; private static final int MAX_NUM_ORDER_BY_COLUMNS = 3; private static final int MAX_NUM_GROUP_BY_COLUMNS = 3; private static final int MAX_NUM_PREDICATES = 3; private static final int MAX_NUM_IN_CLAUSE_VALUES = 5; private static final int MAX_RESULT_LIMIT = 30; private static final List<String> BOOLEAN_OPERATORS = Arrays.asList("OR", "AND"); private static final List<String> COMPARISON_OPERATORS = Arrays.asList("=", "<>", "<", ">", "<=", ">="); // TODO: fix DISTINCTCOUNT implementation and add it back. // Currently for DISTINCTCOUNT we use hashcode as the key, should change it to use raw values. private static final List<String> AGGREGATION_FUNCTIONS = Arrays.asList("SUM", "MIN", "MAX", "AVG", "COUNT"/*, "DISTINCTCOUNT"*/); private static final Random RANDOM = new Random(); private final Map<String, Set<String>> _columnToValueSet = new HashMap<>(); private final Map<String, List<String>> _columnToValueList = new HashMap<>(); private final List<String> _columnNames = new ArrayList<>(); private final List<String> _singleValueColumnNames = new ArrayList<>(); private final List<String> _singleValueNumericalColumnNames = new ArrayList<>(); private final Map<String, Integer> _multiValueColumnMaxNumElements = new HashMap<>(); private final List<QueryGenerationStrategy> _queryGenerationStrategies = Arrays.asList(new SelectionQueryGenerationStrategy(), new AggregationQueryGenerationStrategy()); private final List<PredicateGenerator> _singleValuePredicateGenerators = Arrays.asList(new SingleValueComparisonPredicateGenerator(), new SingleValueInPredicateGenerator(), new SingleValueBetweenPredicateGenerator(), new SingleValueRegexPredicateGenerator()); private final List<PredicateGenerator> _multiValuePredicateGenerators = Arrays.asList(new MultiValueComparisonPredicateGenerator(), new MultiValueInPredicateGenerator(), new MultiValueBetweenPredicateGenerator()); private final String _pinotTableName; private final String _h2TableName; private boolean _skipMultiValuePredicates = false; /** * Constructor for <code>QueryGenerator</code>. * * @param avroFiles list of Avro files. * @param pinotTableName Pinot table name. * @param h2TableName H2 table name. */ public QueryGenerator(List<File> avroFiles, String pinotTableName, String h2TableName) { _pinotTableName = pinotTableName; _h2TableName = h2TableName; // Read Avro schema and initialize storage. File schemaAvroFile = avroFiles.get(0); GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(schemaAvroFile, datumReader)) { Schema schema = fileReader.getSchema(); for (Schema.Field field : schema.getFields()) { String fieldName = field.name(); Schema fieldSchema = field.schema(); Schema.Type fieldType = fieldSchema.getType(); switch (fieldType) { case UNION: _columnNames.add(fieldName); _columnToValueSet.put(fieldName, new HashSet<String>()); Schema.Type type = fieldSchema.getTypes().get(0).getType(); if (type == Schema.Type.ARRAY) { _multiValueColumnMaxNumElements.put(fieldName, 0); } else { _singleValueColumnNames.add(fieldName); if (type != Schema.Type.STRING && type != Schema.Type.BOOLEAN) { _singleValueNumericalColumnNames.add(fieldName); } } break; case ARRAY: _columnNames.add(fieldName); _columnToValueSet.put(fieldName, new HashSet<String>()); _multiValueColumnMaxNumElements.put(fieldName, 0); break; case INT: case LONG: case FLOAT: case DOUBLE: _columnNames.add(fieldName); _columnToValueSet.put(fieldName, new HashSet<String>()); _singleValueColumnNames.add(fieldName); _singleValueNumericalColumnNames.add(fieldName); break; case BOOLEAN: case STRING: _columnNames.add(fieldName); _columnToValueSet.put(fieldName, new HashSet<String>()); _singleValueColumnNames.add(fieldName); break; default: LOGGER.warn("Ignoring field {} of type {}", fieldName, fieldType); break; } } } catch (Exception e) { throw new RuntimeException(e); } // Load Avro data into storage. for (File avroFile : avroFiles) { addAvroData(avroFile); } // Ignore multi-value columns with too many elements. prepareToGenerateQueries(); } /** * Helper method to read in an Avro file and add data to the storage. * * @param avroFile Avro file. */ private void addAvroData(File avroFile) { // Read in records and update the values stored. GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader)) { for (GenericRecord genericRecord : fileReader) { for (String columnName : _columnNames) { Set<String> values = _columnToValueSet.get(columnName); // Turn the Avro value into a valid SQL String token. Object avroValue = genericRecord.get(columnName); if (avroValue != null) { Integer storedMaxNumElements = _multiValueColumnMaxNumElements.get(columnName); if (storedMaxNumElements != null) { // Multi-value column GenericData.Array array = (GenericData.Array) avroValue; int numElements = array.size(); if (storedMaxNumElements < numElements) { _multiValueColumnMaxNumElements.put(columnName, numElements); } for (Object element : array) { storeAvroValueIntoValueSet(values, element); } } else { // Single-value column storeAvroValueIntoValueSet(values, avroValue); } } } } } catch (Exception e) { throw new RuntimeException(e); } } /** * Helper method to store an Avro value into the valid SQL String value set. * * @param valueSet value set. * @param avroValue Avro value. */ private static void storeAvroValueIntoValueSet(Set<String> valueSet, Object avroValue) { if (avroValue instanceof Number) { // For Number object, store raw value. valueSet.add(avroValue.toString()); } else { // For non-Number object, escape single quote. valueSet.add("'" + avroValue.toString().replace("'", "''") + "'"); } } /** * Helper method to finish initialization of the <code>QueryGenerator</code>, removing multi-value columns with too * many elements and dumping storage into the final map from column name to list of column values. * <p>Called after all Avro data loaded. */ private void prepareToGenerateQueries() { Iterator<String> columnNameIterator = _columnNames.iterator(); while (columnNameIterator.hasNext()) { String columnName = columnNameIterator.next(); // Remove multi-value columns with more than MAX_ELEMENTS_IN_MULTI_VALUE elements. Integer maxNumElements = _multiValueColumnMaxNumElements.get(columnName); if (maxNumElements != null && maxNumElements > BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE) { LOGGER.debug("Ignoring column {} with max number of {} elements", columnName, maxNumElements); columnNameIterator.remove(); _multiValueColumnMaxNumElements.remove(columnName); } else { _columnToValueList.put(columnName, new ArrayList<>(_columnToValueSet.get(columnName))); } } // Free the other copy of the data. _columnToValueSet.clear(); } /** * Set whether to skip predicates on multi-value columns. * * @param skipMultiValuePredicates whether to skip predicates on multi-value columns. */ public void setSkipMultiValuePredicates(boolean skipMultiValuePredicates) { _skipMultiValuePredicates = skipMultiValuePredicates; } /** * Helper method to pick a random value from the values list. * * @param list values list. * @param <T> type of the value. * @return randomly picked value. */ private <T> T pickRandom(List<T> list) { return list.get(RANDOM.nextInt(list.size())); } /** * Helper method to join several {@link String} elements with ' '. * * @param elements elements to be joined. * @return joined result. */ private static String joinWithSpaces(String... elements) { StringBuilder stringBuilder = new StringBuilder(); for (String element : elements) { if (!element.isEmpty()) { stringBuilder.append(element).append(' '); } } return stringBuilder.substring(0, stringBuilder.length() - 1); } /** * Query interface with capability of generating PQL and H2 SQL query. */ public interface Query { /** * Generate PQL query. * * @return generated PQL query. */ String generatePql(); /** * Generate H2 SQL queries equivalent to the PQL query. * * @return generated H2 SQL queries. */ List<String> generateH2Sql(); } /** * Selection query. */ private class SelectionQuery implements Query { private final List<String> _projectionColumns; private final PredicateQueryFragment _predicate; private final OrderByQueryFragment _orderBy; private final LimitQueryFragment _limit; /** * Constructor for <code>SelectionQuery</code>. * * @param projectionColumns projection columns. * @param orderBy order by fragment. * @param predicate predicate fragment. * @param limit limit fragment. */ public SelectionQuery(List<String> projectionColumns, PredicateQueryFragment predicate, OrderByQueryFragment orderBy, LimitQueryFragment limit) { _projectionColumns = projectionColumns; _orderBy = orderBy; _predicate = predicate; _limit = limit; } @Override public String generatePql() { return joinWithSpaces("SELECT", StringUtils.join(_projectionColumns, ", "), "FROM", _pinotTableName, _predicate.generatePql(), _orderBy.generatePql(), _limit.generatePql()); } @Override public List<String> generateH2Sql() { List<String> h2ProjectionColumns = new ArrayList<>(); for (String projectionColumn : _projectionColumns) { if (_multiValueColumnMaxNumElements.containsKey(projectionColumn)) { // Multi-value column. for (int i = 0; i < BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE; i++) { h2ProjectionColumns.add(projectionColumn + "__MV" + i); } } else { // Single-value column. h2ProjectionColumns.add(projectionColumn); } } return Collections.singletonList( joinWithSpaces("SELECT", StringUtils.join(h2ProjectionColumns, ", "), "FROM", _h2TableName, _predicate.generateH2Sql(), _orderBy.generateH2Sql(), _limit.generateH2Sql())); } } /** * Aggregation query. */ private class AggregationQuery implements Query { private List<String> _aggregateColumnsAndFunctions; private PredicateQueryFragment _predicate; private Set<String> _groupColumns; private TopQueryFragment _top; /** * Constructor for <code>AggregationQuery</code>. * * @param aggregateColumnsAndFunctions aggregation functions. * @param predicate predicate fragment. * @param groupColumns group-by columns. * @param top top fragment. */ public AggregationQuery(List<String> aggregateColumnsAndFunctions, PredicateQueryFragment predicate, Set<String> groupColumns, TopQueryFragment top) { _aggregateColumnsAndFunctions = aggregateColumnsAndFunctions; _predicate = predicate; _groupColumns = groupColumns; _top = top; } @Override public String generatePql() { if (_groupColumns.isEmpty()) { return joinWithSpaces("SELECT", StringUtils.join(_aggregateColumnsAndFunctions, ", "), "FROM", _pinotTableName, _predicate.generatePql(), _top.generatePql()); } else { return joinWithSpaces("SELECT", StringUtils.join(_aggregateColumnsAndFunctions, ", "), "FROM", _pinotTableName, _predicate.generatePql(), "GROUP BY", StringUtils.join(_groupColumns, ", "), _top.generatePql()); } } @Override public List<String> generateH2Sql() { List<String> queries = new ArrayList<>(); // For each aggregation function, generate one separate H2 SQL query. for (String aggregateColumnAndFunction : _aggregateColumnsAndFunctions) { // Make 'AVG' and 'DISTINCTCOUNT' compatible with H2 SQL query. if (aggregateColumnAndFunction.startsWith("AVG(")) { aggregateColumnAndFunction = aggregateColumnAndFunction.replace("AVG(", "AVG(CAST(").replace(")", " AS DOUBLE))"); } else if (aggregateColumnAndFunction.startsWith("DISTINCTCOUNT(")) { aggregateColumnAndFunction = aggregateColumnAndFunction.replace("DISTINCTCOUNT(", "COUNT(DISTINCT "); } if (_groupColumns.isEmpty()) { // Aggregation query. queries.add( joinWithSpaces("SELECT", aggregateColumnAndFunction, "FROM", _h2TableName, _predicate.generateH2Sql(), _top.generateH2Sql())); } else { // Group-by query. // Unlike PQL, SQL expects the group columns in select statements. String groupByColumns = StringUtils.join(_groupColumns, ", "); queries.add(joinWithSpaces("SELECT", groupByColumns + ",", aggregateColumnAndFunction, "FROM", _h2TableName, _predicate.generateH2Sql(), "GROUP BY", groupByColumns, _top.generateH2Sql())); } } return queries; } } /** * Generate one selection or aggregation query. * * @return generated query. */ public Query generateQuery() { return pickRandom(_queryGenerationStrategies).generateQuery(); } /** * Query fragment interface with capability of generating PQL and H2 SQL query fragment. */ private interface QueryFragment { /** * Generate PQL query fragment. * * @return generated PQL query fragment. */ String generatePql(); /** * Generate H2 SQL query fragment equivalent to the PQL query fragment. * * @return generated H2 SQL query fragment. */ String generateH2Sql(); } /** * Most basic query fragment. */ private class StringQueryFragment implements QueryFragment { String _pql; String _sql; /** * Constructor for with same PQL and H2 SQL query fragment. * * @param pql PQL (H2 SQL) query fragment. */ StringQueryFragment(String pql) { _pql = pql; _sql = pql; } /** * Constructor for <code>StringQueryFragment</code> with different PQL and H2 SQL query fragment. * * @param pql PQL query fragment. * @param sql H2 SQL query fragment. */ StringQueryFragment(String pql, String sql) { _pql = pql; _sql = sql; } @Override public String generatePql() { return _pql; } @Override public String generateH2Sql() { return _sql; } } /** * Limit query fragment for selection queries. * <ul> * <li>SELECT ... FROM ... WHERE ... 'LIMIT ...'</li> * </ul> */ private class LimitQueryFragment extends StringQueryFragment { LimitQueryFragment(int limit) { // When limit is MAX_RESULT_LIMIT, construct query without LIMIT. super(limit == MAX_RESULT_LIMIT ? "" : "LIMIT " + limit, "LIMIT " + BaseClusterIntegrationTest.MAX_COMPARISON_LIMIT); } } /** * Top query fragment for aggregation queries. * <ul> * <li>SELECT ... FROM ... WHERE ... 'TOP ...'</li> * </ul> */ private class TopQueryFragment extends StringQueryFragment { TopQueryFragment(int top) { // When top is MAX_RESULT_LIMIT, construct query without TOP. super(top == MAX_RESULT_LIMIT ? "" : "TOP " + top, "LIMIT " + BaseClusterIntegrationTest.MAX_COMPARISON_LIMIT); } } /** * Order by query fragment for aggregation queries. * <ul> * <li>SELECT ... FROM ... WHERE ... 'ORDER BY ...'</li> * </ul> */ private class OrderByQueryFragment extends StringQueryFragment { OrderByQueryFragment(Set<String> columns) { super(columns.isEmpty() ? "" : "ORDER BY " + StringUtils.join(columns, ", ")); } } /** * Predicate query fragment. * <ul> * <li>SELECT ... FROM ... 'WHERE ...'</li> * </ul> */ private class PredicateQueryFragment implements QueryFragment { List<QueryFragment> _predicates; List<QueryFragment> _operators; /** * Constructor for <code>PredicateQueryFragment</code>. * * @param predicates predicates. * @param operators operators between predicates. */ PredicateQueryFragment(List<QueryFragment> predicates, List<QueryFragment> operators) { _predicates = predicates; _operators = operators; } @Override public String generatePql() { if (_predicates.isEmpty()) { return ""; } else { StringBuilder pql = new StringBuilder("WHERE "); // One less than the number of predicates. int operatorCount = _operators.size(); for (int i = 0; i < operatorCount; i++) { pql.append(_predicates.get(i).generatePql()).append(' ').append(_operators.get(i).generatePql()).append(' '); } pql.append(_predicates.get(operatorCount).generatePql()); return pql.toString(); } } @Override public String generateH2Sql() { if (_predicates.isEmpty()) { return ""; } else { StringBuilder sql = new StringBuilder("WHERE "); // One less than the number of predicates. int operatorCount = _operators.size(); for (int i = 0; i < operatorCount; i++) { sql.append(_predicates.get(i).generateH2Sql()) .append(' ') .append(_operators.get(i).generateH2Sql()) .append(' '); } sql.append(_predicates.get(operatorCount).generateH2Sql()); return sql.toString(); } } } /** * Helper method to generate a predicate query fragment. * * @return generated predicate query fragment. */ private PredicateQueryFragment generatePredicate() { // Generate at most MAX_NUM_PREDICATES predicates. int predicateCount = RANDOM.nextInt(MAX_NUM_PREDICATES + 1); List<QueryFragment> predicates = new ArrayList<>(predicateCount); while (predicates.size() < predicateCount) { String columnName = pickRandom(_columnNames); if (!_columnToValueList.get(columnName).isEmpty()) { if (!_multiValueColumnMaxNumElements.containsKey(columnName)) { // Single-value column. predicates.add(pickRandom(_singleValuePredicateGenerators).generatePredicate(columnName)); } else if (!_skipMultiValuePredicates) { // Multi-value column. predicates.add(pickRandom(_multiValuePredicateGenerators).generatePredicate(columnName)); } } } if (predicateCount < 2) { // No need to join. return new PredicateQueryFragment(predicates, Collections.<QueryFragment>emptyList()); } else { // Join predicates with ANDs and ORs. List<QueryFragment> operators = new ArrayList<>(predicateCount - 1); for (int i = 1; i < predicateCount; i++) { operators.add(new StringQueryFragment(pickRandom(BOOLEAN_OPERATORS))); } return new PredicateQueryFragment(predicates, operators); } } /** * Query generation strategy interface with capability of generating query using specific strategy. */ private interface QueryGenerationStrategy { /** * Generate a query using specific strategy. * * @return generated query. */ Query generateQuery(); } /** * Strategy to generate selection queries. * <ul> * <li>SELECT a, b FROM table WHERE a = 'foo' AND b = 'bar' ORDER BY c LIMIT 10</li> * </ul> */ private class SelectionQueryGenerationStrategy implements QueryGenerationStrategy { @Override public Query generateQuery() { // Select at most MAX_NUM_SELECTION_COLUMNS columns. int projectionColumnCount = Math.min(RANDOM.nextInt(MAX_NUM_SELECTION_COLUMNS) + 1, _columnNames.size()); Set<String> projectionColumns = new HashSet<>(); while (projectionColumns.size() < projectionColumnCount) { projectionColumns.add(pickRandom(_columnNames)); } // Select at most MAX_NUM_ORDER_BY_COLUMNS columns for ORDER BY clause. int orderByColumnCount = Math.min(RANDOM.nextInt(MAX_NUM_ORDER_BY_COLUMNS + 1), _singleValueColumnNames.size()); Set<String> orderByColumns = new HashSet<>(); while (orderByColumns.size() < orderByColumnCount) { orderByColumns.add(pickRandom(_singleValueColumnNames)); } // Generate a predicate. PredicateQueryFragment predicate = generatePredicate(); // Generate a result limit of at most MAX_RESULT_LIMIT columns for ORDER BY clause. int resultLimit = RANDOM.nextInt(MAX_RESULT_LIMIT + 1); LimitQueryFragment limit = new LimitQueryFragment(resultLimit); return new SelectionQuery(new ArrayList<>(projectionColumns), predicate, new OrderByQueryFragment(orderByColumns), limit); } } /** * Strategy to generate aggregation queries. * <ul> * <li>SELECT SUM(a), MAX(b) FROM table WHERE a = 'foo' AND b = 'bar' GROUP BY c TOP 10</li> * </ul> */ private class AggregationQueryGenerationStrategy implements QueryGenerationStrategy { @Override public Query generateQuery() { // Generate at most MAX_NUM_AGGREGATION_COLUMNS columns on which to aggregate, map 0 to 'COUNT(*)'. int aggregationColumnCount = RANDOM.nextInt(MAX_NUM_AGGREGATION_COLUMNS + 1); Set<String> aggregationColumnsAndFunctions = new HashSet<>(); if (aggregationColumnCount == 0) { aggregationColumnsAndFunctions.add("COUNT(*)"); } else { while (aggregationColumnsAndFunctions.size() < aggregationColumnCount) { String aggregationFunction = pickRandom(AGGREGATION_FUNCTIONS); String aggregationColumn; switch (aggregationFunction) { // "COUNT" and "DISTINCTCOUNT" support all single-value columns. case "COUNT": case "DISTINCTCOUNT": aggregationColumn = pickRandom(_singleValueColumnNames); break; // Other functions only support single-value numeric columns. default: aggregationColumn = pickRandom(_singleValueNumericalColumnNames); } aggregationColumnsAndFunctions.add(aggregationFunction + "(" + aggregationColumn + ")"); } } // Generate a predicate. PredicateQueryFragment predicate = generatePredicate(); // Generate at most MAX_NUM_GROUP_BY_COLUMNS columns on which to group. int groupColumnCount = Math.min(RANDOM.nextInt(MAX_NUM_GROUP_BY_COLUMNS + 1), _singleValueColumnNames.size()); Set<String> groupColumns = new HashSet<>(); while (groupColumns.size() < groupColumnCount) { groupColumns.add(pickRandom(_singleValueColumnNames)); } // Generate a result limit of at most MAX_RESULT_LIMIT. TopQueryFragment top = new TopQueryFragment(RANDOM.nextInt(MAX_RESULT_LIMIT + 1)); return new AggregationQuery(new ArrayList<>(aggregationColumnsAndFunctions), predicate, groupColumns, top); } } /** * Predicate generator interface with capability of generating a predicate query fragment on a column. */ private interface PredicateGenerator { /** * Generate a predicate query fragment on a column. * * @param columnName column name. * @return generated predicate query fragment. */ QueryFragment generatePredicate(String columnName); } /** * Generator for single-value column comparison predicate query fragment. */ private class SingleValueComparisonPredicateGenerator implements PredicateGenerator { @Override public QueryFragment generatePredicate(String columnName) { String columnValue = pickRandom(_columnToValueList.get(columnName)); String comparisonOperator = pickRandom(COMPARISON_OPERATORS); return new StringQueryFragment(joinWithSpaces(columnName, comparisonOperator, columnValue)); } } /** * Generator for single-value column <code>IN</code> predicate query fragment. */ private class SingleValueInPredicateGenerator implements PredicateGenerator { @Override public QueryFragment generatePredicate(String columnName) { List<String> columnValues = _columnToValueList.get(columnName); int numValues = Math.min(RANDOM.nextInt(MAX_NUM_IN_CLAUSE_VALUES) + 1, columnValues.size()); Set<String> values = new HashSet<>(); while (values.size() < numValues) { values.add(pickRandom(columnValues)); } String inValues = StringUtils.join(values, ", "); boolean notIn = RANDOM.nextBoolean(); if (notIn) { return new StringQueryFragment(columnName + " NOT IN (" + inValues + ")"); } else { return new StringQueryFragment(columnName + " IN (" + inValues + ")"); } } } /** * Generator for single-value column <code>BETWEEN</code> predicate query fragment. */ private class SingleValueBetweenPredicateGenerator implements PredicateGenerator { @Override public QueryFragment generatePredicate(String columnName) { List<String> columnValues = _columnToValueList.get(columnName); String leftValue = pickRandom(columnValues); String rightValue = pickRandom(columnValues); return new StringQueryFragment(columnName + " BETWEEN " + leftValue + " AND " + rightValue); } } /** * Generator for single-value column <code>REGEX</code> predicate query fragment. */ private class SingleValueRegexPredicateGenerator implements PredicateGenerator { Random random = new Random(); @Override public QueryFragment generatePredicate(String columnName) { List<String> columnValues = _columnToValueList.get(columnName); String value; value = pickRandom(columnValues); StringBuilder pqlRegexBuilder = new StringBuilder(); StringBuilder sqlRegexBuilder = new StringBuilder(); // do regex only for string type if (value.startsWith("'") && value.endsWith("'")) { // replace only one character for now with .* ignore the first and last character int indexToReplaceWithRegex = 1 + random.nextInt(value.length() - 2); for (int i = 1; i < value.length() - 1; i++) { if (i == indexToReplaceWithRegex) { pqlRegexBuilder.append(".*"); sqlRegexBuilder.append(".*"); } else { pqlRegexBuilder.append(value.charAt(i)); sqlRegexBuilder.append(value.charAt(i)); } } String pql = String.format(" REGEXP_LIKE(%s, '%s')", columnName, pqlRegexBuilder.toString()); String sql = String.format(" REGEXP_LIKE(%s, '%s', 'i')", columnName, sqlRegexBuilder.toString()); return new StringQueryFragment(pql, sql); } else { String equalsPredicate = String.format("%s = %s", columnName, value); return new StringQueryFragment(equalsPredicate); } } } /** * Generator for multi-value column comparison predicate query fragment. * <p>DO NOT SUPPORT '<code>NOT EQUAL</code>'. */ private class MultiValueComparisonPredicateGenerator implements PredicateGenerator { @Override public QueryFragment generatePredicate(String columnName) { String columnValue = pickRandom(_columnToValueList.get(columnName)); String comparisonOperator = pickRandom(COMPARISON_OPERATORS); // Not equal works differently on multi-value, so avoid '<>' comparison. while (comparisonOperator.equals("<>")) { comparisonOperator = pickRandom(COMPARISON_OPERATORS); } List<String> h2ComparisonClauses = new ArrayList<>(BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE); for (int i = 0; i < BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE; i++) { h2ComparisonClauses.add(joinWithSpaces(columnName + "__MV" + i, comparisonOperator, columnValue)); } return new StringQueryFragment(joinWithSpaces(columnName, comparisonOperator, columnValue), "(" + StringUtils.join(h2ComparisonClauses, " OR ") + ")"); } } /** * Generator for multi-value column <code>IN</code> predicate query fragment. * <p>DO NOT SUPPORT '<code>NOT IN</code>'. */ private class MultiValueInPredicateGenerator implements PredicateGenerator { @Override public QueryFragment generatePredicate(String columnName) { List<String> columnValues = _columnToValueList.get(columnName); int numValues = Math.min(RANDOM.nextInt(MAX_NUM_IN_CLAUSE_VALUES) + 1, columnValues.size()); Set<String> values = new HashSet<>(); while (values.size() < numValues) { values.add(pickRandom(columnValues)); } String inValues = StringUtils.join(values, ", "); List<String> h2InClauses = new ArrayList<>(BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE); for (int i = 0; i < BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE; i++) { h2InClauses.add(columnName + "__MV" + i + " IN (" + inValues + ")"); } return new StringQueryFragment(columnName + " IN (" + inValues + ")", "(" + StringUtils.join(h2InClauses, " OR ") + ")"); } } /** * Generator for multi-value column <code>BETWEEN</code> predicate query fragment. */ private class MultiValueBetweenPredicateGenerator implements PredicateGenerator { @Override public QueryFragment generatePredicate(String columnName) { List<String> columnValues = _columnToValueList.get(columnName); String leftValue = pickRandom(columnValues); String rightValue = pickRandom(columnValues); List<String> h2ComparisonClauses = new ArrayList<>(BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE); for (int i = 0; i < BaseClusterIntegrationTest.MAX_ELEMENTS_IN_MULTI_VALUE; i++) { h2ComparisonClauses.add(columnName + "__MV" + i + " BETWEEN " + leftValue + " AND " + rightValue); } return new StringQueryFragment(columnName + " BETWEEN " + leftValue + " AND " + rightValue, "(" + StringUtils.join(h2ComparisonClauses, " OR ") + ")"); } } /** * Sample main class for the query generator. * * @param args arguments. */ public static void main(String[] args) throws Exception { File avroFile = new File("pinot-integration-tests/src/test/resources/On_Time_On_Time_Performance_2014_1.avro"); QueryGenerator queryGenerator = new QueryGenerator(Collections.singletonList(avroFile), "mytable", "mytable"); File outputFile = new File( "pinot-integration-tests/src/test/resources/On_Time_On_Time_Performance_2014_100k_subset.test_queries_10K"); try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) { for (int i = 0; i < 10000; i++) { Query query = queryGenerator.generateQuery(); JSONObject queryJson = new JSONObject(); queryJson.put("pql", query.generatePql()); queryJson.put("hsqls", new JSONArray(query.generateH2Sql())); writer.write(queryJson.toString()); writer.newLine(); } } } }