/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.query.aggregation.groupby;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.request.transform.TransformExpressionTree;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.data.readers.TestRecordReader;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.operator.BReusableFilteredDocIdSetOperator;
import com.linkedin.pinot.core.operator.BaseOperator;
import com.linkedin.pinot.core.operator.MProjectionOperator;
import com.linkedin.pinot.core.operator.blocks.TransformBlock;
import com.linkedin.pinot.core.operator.filter.MatchEntireSegmentOperator;
import com.linkedin.pinot.core.operator.transform.TransformExpressionOperator;
import com.linkedin.pinot.core.plan.DocIdSetPlanNode;
import com.linkedin.pinot.core.query.aggregation.groupby.AggregationGroupByTrimmingService;
import com.linkedin.pinot.core.query.aggregation.groupby.GroupKeyGenerator;
import com.linkedin.pinot.core.query.aggregation.groupby.NoDictionaryMultiColumnGroupKeyGenerator;
import com.linkedin.pinot.core.query.aggregation.groupby.NoDictionarySingleColumnGroupKeyGenerator;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
/**
 * Unit tests for the no-dictionary group key generators:
 * {@link NoDictionarySingleColumnGroupKeyGenerator} and {@link NoDictionaryMultiColumnGroupKeyGenerator}.
 *
 * <p>A segment is built once from generated rows. Each test then runs the group key generator over the
 * segment and compares the generated group keys against keys computed directly from the input rows.
 */
public class NoDictionaryGroupKeyGeneratorTest {
  private static final String SEGMENT_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "rawIndexPerf";
  private static final String SEGMENT_NAME = "perfTestSegment";

  private static final String STRING_DICT_COLUMN = "string_dict_column";

  // All columns of the test segment. DATA_TYPES is index-aligned with this array.
  private static final String[] COLUMN_NAMES =
      {"int_column", "long_column", "float_column", "double_column", "string_column", STRING_DICT_COLUMN};

  // Columns configured for raw index creation. This must stay a prefix of COLUMN_NAMES, because
  // testMultiColumnHybridGroupKeyGenerator() looks up DATA_TYPES using indexes into this array.
  private static final String[] NO_DICT_COLUMN_NAMES =
      {"int_column", "long_column", "float_column", "double_column", "string_column"};

  private static final FieldSpec.DataType[] DATA_TYPES =
      {FieldSpec.DataType.INT, FieldSpec.DataType.LONG, FieldSpec.DataType.FLOAT, FieldSpec.DataType.DOUBLE, FieldSpec.DataType.STRING, FieldSpec.DataType.STRING};

  private static final int NUM_COLUMNS = DATA_TYPES.length;

  // More than one row is required for the group key assertions to be meaningful: with a single row every
  // group-by trivially yields exactly one group.
  private static final int NUM_ROWS = 1000;

  // Fixed seed so that the generated data, and therefore any failure, is reproducible.
  private static final long RANDOM_SEED = 42L;

  private TestRecordReader _recordReader;
  private Map<String, BaseOperator> _dataSourceMap;
  private IndexSegment _indexSegment;

  /**
   * Builds the test segment, loads it, and collects a data source for each of its columns.
   *
   * @throws Exception
   */
  @BeforeClass
  public void setup()
      throws Exception {
    buildSegment();

    // Load the segment.
    File segment = new File(SEGMENT_DIR_NAME, SEGMENT_NAME);
    _indexSegment = Loaders.IndexSegment.load(segment, ReadMode.heap);

    // Build the data source map
    _dataSourceMap = new HashMap<>();
    for (String column : _indexSegment.getColumnNames()) {
      DataSource dataSource = _indexSegment.getDataSource(column);
      _dataSourceMap.put(column, dataSource);
    }
  }

  /**
   * Unit test for {@link com.linkedin.pinot.core.query.aggregation.groupby.NoDictionarySingleColumnGroupKeyGenerator}.
   * Groups by each column of the segment individually.
   *
   * @throws Exception
   */
  @Test
  public void testSingleColumnGroupKeyGenerator()
      throws Exception {
    for (int i = 0; i < COLUMN_NAMES.length; i++) {
      testGroupKeyGenerator(new String[]{COLUMN_NAMES[i]}, new FieldSpec.DataType[]{DATA_TYPES[i]});
    }
  }

  /**
   * Unit test for {@link NoDictionaryMultiColumnGroupKeyGenerator}.
   * Groups by all columns of the segment at once.
   *
   * @throws Exception
   */
  @Test
  public void testMultiColumnGroupKeyGenerator()
      throws Exception {
    // Use COLUMN_NAMES rather than the segment's reported column names: DATA_TYPES is index-aligned with
    // COLUMN_NAMES, whereas the order in which the segment returns its columns is not fixed by this test.
    testGroupKeyGenerator(COLUMN_NAMES, DATA_TYPES);
  }

  /**
   * Tests multi-column group key generator when at least one column has a dictionary, and others don't.
   * Each raw-index column is paired with {@link #STRING_DICT_COLUMN}.
   *
   * @throws Exception
   */
  @Test
  public void testMultiColumnHybridGroupKeyGenerator()
      throws Exception {
    for (int i = 0; i < NO_DICT_COLUMN_NAMES.length; i++) {
      testGroupKeyGenerator(new String[]{NO_DICT_COLUMN_NAMES[i], STRING_DICT_COLUMN},
          new FieldSpec.DataType[]{DATA_TYPES[i], FieldSpec.DataType.STRING});
    }
  }

  /**
   * Runs a group key generator over the whole segment for the given group-by columns and verifies the
   * generated keys against the keys computed from the input rows.
   *
   * <p>A single group-by column uses {@link NoDictionarySingleColumnGroupKeyGenerator}; more than one uses
   * {@link NoDictionaryMultiColumnGroupKeyGenerator}.
   *
   * @param groupByColumns Group-by columns to generate keys for.
   * @param dataTypes Data types of the group-by columns, index-aligned with groupByColumns.
   * @throws Exception
   */
  private void testGroupKeyGenerator(String[] groupByColumns, FieldSpec.DataType[] dataTypes)
      throws Exception {
    // Build the projection operator.
    // NOTE(review): the last argument is taken to be the maximum number of docs per block; it uses the same
    // constant that sizes docIdToGroupKeys below so that the two cannot drift apart.
    MatchEntireSegmentOperator matchEntireSegmentOperator = new MatchEntireSegmentOperator(NUM_ROWS);
    BReusableFilteredDocIdSetOperator docIdSetOperator =
        new BReusableFilteredDocIdSetOperator(matchEntireSegmentOperator, NUM_ROWS,
            DocIdSetPlanNode.MAX_DOC_PER_CALL);
    MProjectionOperator projectionOperator = new MProjectionOperator(_dataSourceMap, docIdSetOperator);
    TransformExpressionOperator transformOperator =
        new TransformExpressionOperator(projectionOperator, new ArrayList<TransformExpressionTree>());

    // Iterate over all transform blocks and generate group keys.
    TransformBlock transformBlock;
    int[] docIdToGroupKeys = new int[DocIdSetPlanNode.MAX_DOC_PER_CALL];
    GroupKeyGenerator groupKeyGenerator = null;

    while ((transformBlock = (TransformBlock) transformOperator.nextBlock()) != null) {
      if (groupKeyGenerator == null) {
        // Build the group key generator lazily: the multi-column generator needs a block to be constructed.
        groupKeyGenerator =
            (groupByColumns.length == 1) ? new NoDictionarySingleColumnGroupKeyGenerator(groupByColumns[0],
                dataTypes[0]) : new NoDictionaryMultiColumnGroupKeyGenerator(transformBlock, groupByColumns);
      }
      groupKeyGenerator.generateKeysForBlock(transformBlock, docIdToGroupKeys);
    }

    // Assert total number of group keys is as expected
    Assert.assertNotNull(groupKeyGenerator, "No blocks were produced, group key generator was never built.");
    Set<String> expectedGroupKeys = getExpectedGroupKeys(_recordReader, groupByColumns);
    Assert.assertEquals(groupKeyGenerator.getCurrentGroupKeyUpperBound(), expectedGroupKeys.size(),
        "Number of group keys mis-match.");

    // Assert all group key values are as expected
    Iterator<GroupKeyGenerator.GroupKey> uniqueGroupKeys = groupKeyGenerator.getUniqueGroupKeys();
    int numUniqueGroupKeys = 0;
    while (uniqueGroupKeys.hasNext()) {
      GroupKeyGenerator.GroupKey groupKey = uniqueGroupKeys.next();
      String actual = groupKey.getStringKey();
      Assert.assertTrue(expectedGroupKeys.contains(actual), "Unexpected group key: " + actual);
      numUniqueGroupKeys++;
    }

    // Without this check an iterator that returns no keys at all would pass the loop above.
    Assert.assertEquals(numUniqueGroupKeys, expectedGroupKeys.size(),
        "Number of unique group keys returned mis-match.");
  }

  /**
   * Helper method to build group keys for a given array of group-by columns.
   * The key of a row is the values of its group-by columns, in order, joined with
   * {@link AggregationGroupByTrimmingService#GROUP_KEY_DELIMITER}.
   *
   * @param recordReader Record reader to read the rows from; it is rewound before reading.
   * @param groupByColumns Group-by columns for which to generate the group-keys.
   * @return Set of unique group keys.
   * @throws Exception
   */
  private Set<String> getExpectedGroupKeys(RecordReader recordReader, String[] groupByColumns)
      throws Exception {
    Set<String> groupKeys = new HashSet<>();
    StringBuilder stringBuilder = new StringBuilder();

    recordReader.rewind();
    while (recordReader.hasNext()) {
      GenericRow row = recordReader.next();

      stringBuilder.setLength(0);
      for (int i = 0; i < groupByColumns.length; i++) {
        stringBuilder.append(row.getValue(groupByColumns[i]));
        if (i < groupByColumns.length - 1) {
          stringBuilder.append(AggregationGroupByTrimmingService.GROUP_KEY_DELIMITER);
        }
      }
      groupKeys.add(stringBuilder.toString());
    }
    return groupKeys;
  }

  /**
   * Helper method to build a segment as follows:
   * <ul>
   *   <li> One dimension column per entry of {@link #COLUMN_NAMES}, typed by the matching entry of
   *        {@link #DATA_TYPES}. </li>
   *   <li> The columns in {@link #NO_DICT_COLUMN_NAMES} are configured for raw index creation;
   *        {@link #STRING_DICT_COLUMN} is the only column left out of that list. </li>
   *   <li> {@link #NUM_ROWS} rows: numeric columns get pseudo-random values, string columns get
   *        <code>"value_" + rowIndex</code>. </li>
   * </ul>
   *
   * As a side effect, the record reader over the generated rows is stored in {@link #_recordReader}.
   *
   * @return Record reader over the rows the segment was built from.
   *
   * @throws Exception
   */
  private TestRecordReader buildSegment()
      throws Exception {
    Schema schema = new Schema();

    for (int i = 0; i < COLUMN_NAMES.length; i++) {
      DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(COLUMN_NAMES[i], DATA_TYPES[i], true);
      schema.addField(dimensionFieldSpec);
    }

    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setRawIndexCreationColumns(Arrays.asList(NO_DICT_COLUMN_NAMES));

    config.setOutDir(SEGMENT_DIR_NAME);
    config.setSegmentName(SEGMENT_NAME);

    Random random = new Random(RANDOM_SEED);
    List<GenericRow> rows = new ArrayList<>(NUM_ROWS);
    for (int i = 0; i < NUM_ROWS; i++) {
      Map<String, Object> map = new HashMap<>(NUM_COLUMNS);

      for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
        String column = fieldSpec.getName();

        FieldSpec.DataType dataType = fieldSpec.getDataType();
        switch (dataType) {
          case INT:
            map.put(column, random.nextInt());
            break;

          case LONG:
            map.put(column, random.nextLong());
            break;

          case FLOAT:
            map.put(column, random.nextFloat());
            break;

          case DOUBLE:
            map.put(column, random.nextDouble());
            break;

          case STRING:
            map.put(column, "value_" + i);
            break;

          default:
            throw new IllegalArgumentException("Illegal data type specified: " + dataType);
        }
      }

      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      rows.add(genericRow);
    }

    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    _recordReader = new TestRecordReader(rows, schema);

    driver.init(config, _recordReader);
    driver.build();

    return _recordReader;
  }
}