/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.startree.hll;

import com.linkedin.pinot.common.data.*;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder;
import com.linkedin.pinot.core.startree.StarTreeBuilderConfig;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.util.*;
import java.util.concurrent.TimeUnit;
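
/**
 * Tests {@link OffHeapStarTreeBuilder} with an HLL-derived metric column: rows carrying a member
 * id dimension and a pre-computed HLL string metric are appended, the star tree is built, and the
 * cardinality estimated from the aggregated HLL field is compared against the exact cardinality of
 * the generated member ids.
 */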
public class OffheapStarTreeBuilderWithHllFieldTest {
private static final Logger LOGGER = LoggerFactory.getLogger(OffheapStarTreeBuilderWithHllFieldTest.class);
private static final long randomSeed = 31; // a fixed value
private final String memberIdFieldName = "id";
private final String hllDeriveFieldSuffix = HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX;
private final int log2m = 8; //HllUtil.Constants.DEFAULT_LOG2M;
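
/**
 * Builds a star tree over synthetic rows and verifies that skipped dimensions are aggregated to
 * "ALL" and that the HLL-derived metric of the last aggregated document approximates the precise
 * cardinality within 10% relative error.
 *
 * @param numDimensions total number of dimensions, including the member id dimension
 * @param numMetrics total number of metrics, including the HLL-derived metric
 * @param numSkipMaterializationDimensions number of trailing dimensions to skip materializing
 * @param memberIdColumnValues member id values, one per input row
 * @param preciseCardinality exact number of distinct member ids
 */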
private void testSimpleCore(int numDimensions, int numMetrics, int numSkipMaterializationDimensions,
int[] memberIdColumnValues, long preciseCardinality) throws Exception {
StarTreeBuilderConfig builderConfig = null;
try {
builderConfig = new StarTreeBuilderConfig();
Schema schema = new Schema();
builderConfig.dimensionsSplitOrder = new ArrayList<>();
builderConfig.setSkipMaterializationForDimensions(new HashSet<String>());
Set<String> skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();
// add member id dimension spec
String dimName = memberIdFieldName;
DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.INT, true);
schema.addField(dimensionFieldSpec);
// add other dimension specs
for (int i = 1; i < numDimensions; i++) {
dimName = "d" + (i + 1);
dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.STRING, true);
schema.addField(dimensionFieldSpec);
if (i < (numDimensions - numSkipMaterializationDimensions)) {
builderConfig.dimensionsSplitOrder.add(dimName);
} else {
builderConfig.getSkipMaterializationForDimensions().add(dimName);
}
}
schema.setTimeFieldSpec(new TimeFieldSpec("daysSinceEpoch", DataType.INT, TimeUnit.DAYS));
// add other metric specs
for (int i = 0; i < numMetrics - 1; i++) {
String metricName = "m" + (i + 1);
MetricFieldSpec metricFieldSpec = new MetricFieldSpec(metricName, DataType.INT);
schema.addField(metricFieldSpec);
}
// add hll metric
String hllMetricName = memberIdFieldName + hllDeriveFieldSuffix;
MetricFieldSpec hllDerivedFieldSpec = new MetricFieldSpec(hllMetricName, DataType.STRING,
HllUtil.getHllFieldSizeFromLog2m(log2m), MetricFieldSpec.DerivedMetricType.HLL);
schema.addField(hllDerivedFieldSpec);
// configure the builder
builderConfig.maxLeafRecords = 10;
builderConfig.schema = schema;
builderConfig.setOutDir(new File("/tmp/startree"));
// initialize the builder
OffHeapStarTreeBuilder builder = new OffHeapStarTreeBuilder();
builder.init(builderConfig);
// fill values
HashMap<String, Object> map = new HashMap<>();
for (int row = 0; row < memberIdColumnValues.length; row++) {
// add member id column
dimName = memberIdFieldName;
map.put(dimName, memberIdColumnValues[row]);
// add other dimensions
for (int i = 1; i < numDimensions; i++) {
dimName = schema.getDimensionFieldSpecs().get(i).getName();
map.put(dimName, dimName + "-v" + (row % (numDimensions - i)));
}
// add time column
map.put("daysSinceEpoch", 1);
// add other metrics
for (int i = 0; i < numMetrics - 1; i++) {
String metName = schema.getMetricFieldSpecs().get(i).getName();
map.put(metName, 1);
}
// add hll column value
map.put(hllMetricName, HllUtil.singleValueHllAsString(log2m, memberIdColumnValues[row]));
// append the assembled row to the builder
GenericRow genericRow = new GenericRow();
genericRow.init(map);
builder.append(genericRow);
}
builder.build();
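// iterate over all documents (raw followed by aggregated) and log them for inspection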
int totalDocs = builder.getTotalRawDocumentCount() + builder.getTotalAggregateDocumentCount();
Iterator<GenericRow> iterator = builder.iterator(0, totalDocs);
while (iterator.hasNext()) {
GenericRow row = iterator.next();
LOGGER.info(HllUtil.inspectGenericRow(row, hllDeriveFieldSuffix));
}
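// verify the aggregated documents: skipped dimensions must be rolled up to "ALL";
// keep the last aggregated document for the cardinality check below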
iterator = builder.iterator(builder.getTotalRawDocumentCount(), totalDocs);
GenericRow lastRow = null;
while (iterator.hasNext()) {
GenericRow row = iterator.next();
for (String skipDimension : skipMaterializationForDimensions) {
String rowValue = (String) row.getValue(skipDimension);
Assert.assertEquals(rowValue, "ALL");
}
lastRow = row;
}
Assert.assertNotNull(lastRow);
assertApproximation(
HllUtil.convertStringToHll((String) lastRow.getValue(hllMetricName)).cardinality(),
preciseCardinality,
0.1
);
} finally {
if (builderConfig != null) {
FileUtils.deleteDirectory(builderConfig.getOutDir());
}
}
}
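
/**
 * Asserts that the relative error between the estimate and the actual value is below the given
 * precision, e.g. precision = 0.1 allows up to 10% error.
 */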
private static void assertApproximation(double estimate, double actual, double precision) {
estimate = Math.abs(estimate);
actual = Math.abs(actual);
double errorRate = 1;
if (actual > 0) {
errorRate = Math.abs((actual - estimate) / actual);
}
LOGGER.info("estimate: " + estimate + " actual: " + actual + " error (in rate): " + errorRate);
Assert.assertEquals(errorRate < precision, true);
}
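
/**
 * Helper that produces a shuffled int array with a controlled amount of duplication and keeps
 * track of the exact set of distinct values, so tests can compare the HLL estimate against the
 * precise cardinality.
 */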
private static class RandomNumberArray {
private static final Random _rnd = new Random(randomSeed);
private final int[] arr;
private final HashSet<Integer> set = new HashSet<Integer>();
/**
 * Generates a shuffled array of the given size whose values fall in [0, size), where each
 * randomly drawn value is repeated duplicationPerItem times (any remainder is filled with
 * single random values).
 * @param size total number of values to generate
 * @param duplicationPerItem how many times each randomly drawn value is duplicated
 */
RandomNumberArray(int size, int duplicationPerItem) {
List<Integer> lst = new ArrayList<Integer>();
for (int i = 0; i < size / duplicationPerItem; i++) {
Integer item = _rnd.nextInt(size);
for (int j = 0; j < duplicationPerItem; j++) {
lst.add(item); // add duplicates
}
}
// add remaining items
int st = lst.size();
for (int i = st; i < size; i++) {
Integer item = _rnd.nextInt(size);
lst.add(item);
}
// add to set
set.addAll(lst);
// shuffle
Collections.shuffle(lst, new Random(10L));
// toIntArray
arr = convertToIntArray(lst);
if (arr.length != size) {
throw new IllegalStateException("generated array length " + arr.length + " does not match requested size " + size);
}
}
private int[] convertToIntArray(List<Integer> list) {
int[] ret = new int[list.size()];
for (int i = 0; i < ret.length; i++) {
ret[i] = list.get(i);
}
return ret;
}
public int[] toIntArray() {
return arr;
}
public int size() {
return arr.length;
}
public int getPreciseCardinality() {
return set.size();
}
}

@Test
public void testSmallDuplicates() throws Exception {
RandomNumberArray rand = new RandomNumberArray(500, 1);
testSimpleCore(3, 3, 0, rand.toIntArray(), rand.getPreciseCardinality());
}

@Test
public void testMediumDuplicates() throws Exception {
RandomNumberArray rand = new RandomNumberArray(500, 5);
testSimpleCore(3, 3, 0, rand.toIntArray(), rand.getPreciseCardinality());
}

@Test
public void testLargeDuplicates() throws Exception {
RandomNumberArray rand = new RandomNumberArray(500, 50);
testSimpleCore(3, 3, 0, rand.toIntArray(), rand.getPreciseCardinality());
}

@Test
public void testSkipMaterialization() throws Exception {
RandomNumberArray rand = new RandomNumberArray(250, 3);
testSimpleCore(6, 4, 2, rand.toIntArray(), rand.getPreciseCardinality());
}
}