/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.startree.hll;

import com.clearspring.analytics.stream.cardinality.HyperLogLog;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.request.BrokerRequest;
import com.linkedin.pinot.common.segment.SegmentMetadata;
import com.linkedin.pinot.common.utils.request.FilterQueryTree;
import com.linkedin.pinot.common.utils.request.RequestUtils;
import com.linkedin.pinot.core.common.*;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.operator.filter.StarTreeIndexOperator;
import com.linkedin.pinot.core.plan.FilterPlanNode;
import com.linkedin.pinot.core.segment.index.readers.Dictionary;
import com.linkedin.pinot.pql.parsers.Pql2Compiler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import java.util.*;

/**
 * Base class containing common functionality for HLL star-tree index tests.
 */
public class BaseHllStarTreeIndexTest {
private static final Logger LOGGER = LoggerFactory.getLogger(BaseHllStarTreeIndexTest.class);
protected final long _randomSeed = System.nanoTime();
  /**
   * We test on d3 and d4, since we deliberately make their cardinality much larger than that of
   * the other columns, to mimic actual use cases.
   */
  private static final Set<String> COLUMNS_TO_DERIVE_HLL_FIELDS = new HashSet<>(Arrays.asList("d3", "d4"));
  protected static final HllConfig HLL_CONFIG = new HllConfig(
      HllConstants.DEFAULT_LOG2M, COLUMNS_TO_DERIVE_HLL_FIELDS, HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX);
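  // For each column in COLUMNS_TO_DERIVE_HLL_FIELDS, segment generation adds a companion metric
  // column holding a serialized HLL sketch per document, named by appending the derive suffix.
  // Assuming the default suffix is "_hll", the derived columns here would be "d3_hll" and "d4_hll".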
protected String[] _hardCodedQueries =
new String[]{
"select fasthll(d4) from T",
"select fasthll(d4) from T where d1 = 'd1-v1'",
"select fasthll(d4) from T where d1 <> 'd1-v1'",
"select fasthll(d4) from T where d1 between 'd1-v1' and 'd1-v3'",
"select fasthll(d4) from T where d1 in ('d1-v1', 'd1-v2')",
"select fasthll(d4) from T where d1 in ('d1-v1', 'd1-v2') and d2 not in ('d2-v1')",
"select fasthll(d4) from T group by d1",
"select fasthll(d4) from T group by d1, d2",
"select fasthll(d4) from T where d1 = 'd1-v2' group by d1",
"select fasthll(d4) from T where d1 between 'd1-v1' and 'd1-v3' group by d2",
"select fasthll(d4) from T where d1 = 'd1-v2' group by d2, d3",
"select fasthll(d4) from T where d1 <> 'd1-v1' group by d2",
"select fasthll(d4) from T where d1 in ('d1-v1', 'd1-v2') group by d2",
"select fasthll(d4) from T where d1 in ('d1-v1', 'd1-v2') and d2 not in ('d2-v1') group by d3",
"select fasthll(d4) from T where d1 in ('d1-v1', 'd1-v2') and d2 not in ('d2-v1') group by d3, d4"
};
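  // The queries above cover equality, not-equal, range (BETWEEN), IN and NOT IN predicates, with
  // and without GROUP BY, so the star-tree is exercised on both the aggregation-only and the
  // group-by execution paths.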
void testHardCodedQueries(IndexSegment segment, Schema schema) throws Exception {
    // Only use the derived HLL metrics corresponding to COLUMNS_TO_DERIVE_HLL_FIELDS
    List<String> metricNames = new ArrayList<>();
    for (String column : COLUMNS_TO_DERIVE_HLL_FIELDS) {
      metricNames.add(column + HLL_CONFIG.getHllDeriveColumnSuffix());
}
SegmentMetadata segmentMetadata = segment.getSegmentMetadata();
LOGGER.info("[Schema] Dim: {} Metric: {}", schema.getDimensionNames(), schema.getMetricNames());
    Pql2Compiler compiler = new Pql2Compiler();
    for (int i = 0; i < _hardCodedQueries.length; i++) {
      BrokerRequest brokerRequest = compiler.compileToBrokerRequest(_hardCodedQueries[i]);
FilterQueryTree filterQueryTree = RequestUtils.generateFilterQueryTree(brokerRequest);
Assert.assertTrue(RequestUtils.isFitForStarTreeIndex(segmentMetadata, filterQueryTree, brokerRequest));
      // Map from group key to the per-metric HLL cardinality estimates for that group
Map<String, long[]> expectedResult = computeHllUsingRawDocs(segment, metricNames, brokerRequest);
Map<String, long[]> actualResult = computeHllUsingAggregatedDocs(segment, metricNames, brokerRequest);
      Assert.assertEquals(actualResult.size(), expectedResult.size(), "Mis-match in number of groups");
for (Map.Entry<String, long[]> entry : expectedResult.entrySet()) {
String expectedKey = entry.getKey();
        Assert.assertTrue(actualResult.containsKey(expectedKey), "Missing group '" + expectedKey + "'");
long[] expectedSums = entry.getValue();
long[] actualSums = actualResult.get(expectedKey);
for (int j = 0; j < expectedSums.length; j++) {
LOGGER.info("actual hll: {} ", actualSums[j]);
LOGGER.info("expected hll: {} ", expectedSums[j]);
Assert.assertEquals(actualSums[j], expectedSums[j],
"Mis-match hll for key '" + expectedKey + "', Metric: " + metricNames.get(j) + ", Random Seed: "
+ _randomSeed);
}
}
}
}
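
  // A minimal usage sketch for subclasses (hypothetical: segment/schema construction is elided,
  // and buildSegmentWithHllConfig is a placeholder, not an actual Pinot API):
  //
  //   public class MyHllStarTreeIndexTest extends BaseHllStarTreeIndexTest {
  //     @Test
  //     public void testQueriesAgainstStarTreeSegment() throws Exception {
  //       IndexSegment segment = buildSegmentWithHllConfig(HLL_CONFIG); // hypothetical helper
  //       Schema schema = ...;  // the schema the segment was built with
  //       testHardCodedQueries(segment, schema);
  //     }
  //   }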
  /**
   * Helper method to compute per-group HLL cardinalities by scanning the raw (non-aggregated) docs.
   *
   * @param segment index segment to scan
   * @param metricNames names of the derived HLL metric columns
   * @param brokerRequest compiled query, providing the filter and group-by columns
   * @return map from group key to the per-metric HLL cardinality estimates
   */
private Map<String, long[]> computeHllUsingRawDocs(IndexSegment segment, List<String> metricNames,
BrokerRequest brokerRequest) throws Exception {
FilterPlanNode planNode = new FilterPlanNode(segment, brokerRequest);
Operator rawOperator = planNode.run();
BlockDocIdIterator rawDocIdIterator = rawOperator.nextBlock().getBlockDocIdSet().iterator();
    List<String> groupByColumns = Collections.emptyList();
if (brokerRequest.isSetAggregationsInfo() && brokerRequest.isSetGroupBy()) {
groupByColumns = brokerRequest.getGroupBy().getColumns();
}
return computeHll(segment, rawDocIdIterator, metricNames, groupByColumns);
}
  /**
   * Helper method to compute per-group HLL cardinalities using the star-tree's aggregated docs.
   *
   * @param segment index segment to scan
   * @param metricNames names of the derived HLL metric columns
   * @param brokerRequest compiled query, providing the filter and group-by columns
   * @return map from group key to the per-metric HLL cardinality estimates
   */
private Map<String, long[]> computeHllUsingAggregatedDocs(IndexSegment segment, List<String> metricNames,
BrokerRequest brokerRequest) throws Exception {
StarTreeIndexOperator starTreeOperator = new StarTreeIndexOperator(segment, brokerRequest);
starTreeOperator.open();
BlockDocIdIterator starTreeDocIdIterator = starTreeOperator.nextBlock().getBlockDocIdSet().iterator();
    List<String> groupByColumns = Collections.emptyList();
if (brokerRequest.isSetAggregationsInfo() && brokerRequest.isSetGroupBy()) {
groupByColumns = brokerRequest.getGroupBy().getColumns();
}
return computeHll(segment, starTreeDocIdIterator, metricNames, groupByColumns);
}
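
  // Background: merging two HyperLogLog sketches estimates the cardinality of the union of their
  // inputs, which is why scanning pre-aggregated star-tree docs can reproduce raw-doc results.
  // A minimal illustration with the clearspring API (log2m of 12 chosen arbitrarily here;
  // addAll throws CardinalityMergeException if the two sketches' log2m values differ):
  //
  //   HyperLogLog a = new HyperLogLog(12);
  //   HyperLogLog b = new HyperLogLog(12);
  //   a.offer("x");
  //   b.offer("x");
  //   b.offer("y");
  //   a.addAll(b);                     // union-merge
  //   long estimate = a.cardinality(); // ~= 2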
  /**
   * Compute the HLL cardinality estimate for a given list of metrics, by scanning the given set
   * of doc-ids and union-merging the per-doc HLL sketches within each group.
   *
   * @param segment index segment to scan
   * @param docIdIterator iterator over the doc-ids to aggregate
   * @param metricNames names of the derived HLL metric columns
   * @param groupByColumns group-by columns (empty for non-group-by queries)
   * @return map from group key to the per-metric HLL cardinality estimates
   */
private Map<String, long[]> computeHll(IndexSegment segment, BlockDocIdIterator docIdIterator,
List<String> metricNames, List<String> groupByColumns)
throws Exception {
int docId;
int numMetrics = metricNames.size();
Dictionary[] metricDictionaries = new Dictionary[numMetrics];
BlockSingleValIterator[] metricValIterators = new BlockSingleValIterator[numMetrics];
int numGroupByColumns = groupByColumns.size();
Dictionary[] groupByDictionaries = new Dictionary[numGroupByColumns];
BlockSingleValIterator[] groupByValIterators = new BlockSingleValIterator[numGroupByColumns];
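    // One dictionary and one single-value iterator per metric column and per group-by column;
    // the iterators yield dictionary ids, which are decoded to values via the dictionaries below.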
for (int i = 0; i < numMetrics; i++) {
String metricName = metricNames.get(i);
DataSource dataSource = segment.getDataSource(metricName);
metricDictionaries[i] = dataSource.getDictionary();
metricValIterators[i] = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
}
for (int i = 0; i < numGroupByColumns; i++) {
String groupByColumn = groupByColumns.get(i);
DataSource dataSource = segment.getDataSource(groupByColumn);
groupByDictionaries[i] = dataSource.getDictionary();
groupByValIterators[i] = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
}
Map<String, HyperLogLog[]> result = new HashMap<>();
while ((docId = docIdIterator.next()) != Constants.EOF) {
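      // Build the group key by concatenating the group-by values, e.g. "d1-v1_d2-v2_"; for
      // non-group-by queries the key is the empty string, yielding a single global group.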
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < numGroupByColumns; i++) {
groupByValIterators[i].skipTo(docId);
int dictId = groupByValIterators[i].nextIntVal();
stringBuilder.append(groupByDictionaries[i].getStringValue(dictId));
stringBuilder.append("_");
}
String key = stringBuilder.toString();
if (!result.containsKey(key)) {
// init
HyperLogLog[] initHllArray = new HyperLogLog[numMetrics];
for (int i = 0; i < numMetrics; i++) {
initHllArray[i] = new HyperLogLog(HLL_CONFIG.getHllLog2m());
}
result.put(key, initHllArray);
}
HyperLogLog[] hllSoFar = result.get(key);
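      // Union-merge this doc's serialized HLL sketch for each metric into the group's sketch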
for (int i = 0; i < numMetrics; i++) {
metricValIterators[i].skipTo(docId);
int dictId = metricValIterators[i].nextIntVal();
HyperLogLog value = HllUtil.convertStringToHll(metricDictionaries[i].getStringValue(dictId));
hllSoFar[i].addAll(value);
}
}
    // Convert each group's merged HLL sketches into cardinality estimates
    Map<String, long[]> ret = new HashMap<>();
    for (Map.Entry<String, HyperLogLog[]> entry : result.entrySet()) {
      long[] valueArray = new long[numMetrics];
      ret.put(entry.getKey(), valueArray);
      for (int i = 0; i < numMetrics; i++) {
        valueArray[i] = entry.getValue()[i].cardinality();
      }
    }
return ret;
}
}