/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.startree.hll;

import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.common.DataBlockCache;
import com.linkedin.pinot.core.common.DataFetcher;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.columnar.ColumnarSegmentLoader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion;
import com.linkedin.pinot.core.operator.BaseOperator;
import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl;
import com.linkedin.pinot.core.segment.index.converter.SegmentV1V2ToV3FormatConverter;
import com.linkedin.pinot.core.segment.index.loader.IndexLoadingConfig;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.store.SegmentDirectoryPaths;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.attribute.FileTime;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;

/**
 * The dictionary index for an HLL-derived field is roughly 10 times the size of the
 * corresponding index for a Long field.
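 * <p>This is expected: the HLL-derived column stores a serialized HyperLogLog sketch per
 * document as a string (the tests below read it back with getStringValueArrayForColumn),
 * so each dictionary entry is far wider than an 8-byte Long value.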
 */
public class HllIndexCreationTest {
  private static final Logger LOGGER = LoggerFactory.getLogger(HllIndexCreationTest.class);

  private static final String hllDeriveColumnSuffix = HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX;
  // Change this set to control which columns an HLL index is derived for.
  private static final Set<String> columnsToDeriveHllFields = new HashSet<>(
      Arrays.asList("column1", "column2", "column3", "count", "weeksSinceEpochSunday", "daysSinceEpoch",
          "column17", "column18"));
  private static final String AVRO_DATA = "data/test_data-sv.avro";
  private static final String timeColumnName = "daysSinceEpoch";
  private static final TimeUnit timeUnit = TimeUnit.DAYS;
  private static final int hllLog2m = HllConstants.DEFAULT_LOG2M;

  private IndexLoadingConfig v3LoadingConfig;
  private HllConfig hllConfig;

  @BeforeMethod
  public void setUp() throws Exception {
    hllConfig = new HllConfig(hllLog2m, columnsToDeriveHllFields, hllDeriveColumnSuffix);

    v3LoadingConfig = new IndexLoadingConfig();
    v3LoadingConfig.setReadMode(ReadMode.mmap);
    v3LoadingConfig.setSegmentVersion(SegmentVersion.v3);
  }

  @AfterMethod
  public void tearDown() throws Exception {
  }

  @Test
  public void testColumnStatsWithoutStarTree() {
    SegmentWithHllIndexCreateHelper helper = null;
    boolean hasException = false;
    try {
      LOGGER.debug("================ Without StarTree ================");
      helper = new SegmentWithHllIndexCreateHelper("noStarTree", getClass().getClassLoader().getResource(AVRO_DATA),
          timeColumnName, timeUnit, "starTreeSegment");
      SegmentIndexCreationDriver driver = helper.build(false, null);
      LOGGER.debug("================ Cardinality ================");
      for (String name : helper.getSchema().getColumnNames()) {
        LOGGER.debug("* " + name + ": " + driver.getColumnStatisticsCollector(name).getCardinality());
      }
    } catch (Exception e) {
      hasException = true;
      LOGGER.error(e.getMessage());
    } finally {
      if (helper != null) {
        helper.cleanTempDir();
      }
      Assert.assertFalse(hasException);
    }
  }

  @Test
  public void testColumnStatsWithStarTree() throws Exception {
    SegmentWithHllIndexCreateHelper helper = null;
    boolean hasException = false;
    int maxDocLength = 10000;
    try {
      LOGGER.debug("================ With StarTree ================");
      helper = new SegmentWithHllIndexCreateHelper("withStarTree", getClass().getClassLoader().getResource(AVRO_DATA),
          timeColumnName, timeUnit, "starTreeSegment");
      SegmentIndexCreationDriver driver = helper.build(true, hllConfig);
      LOGGER.debug("================ Cardinality ================");
      for (String name : helper.getSchema().getColumnNames()) {
        LOGGER.debug("* " + name + ": " + driver.getColumnStatisticsCollector(name).getCardinality());
      }

      LOGGER.debug("Loading ...");
      IndexSegment indexSegment = Loaders.IndexSegment.load(helper.getSegmentDirectory(), ReadMode.mmap);

      // Fetch the first maxDocLength documents and verify that both the original column and
      // its HLL-derived counterpart are readable through the block cache.
      int[] docIdSet = new int[maxDocLength];
      for (int i = 0; i < maxDocLength; i++) {
        docIdSet[i] = i;
      }
      Map<String, BaseOperator> dataSourceMap = new HashMap<>();
      for (String column : indexSegment.getColumnNames()) {
        dataSourceMap.put(column, indexSegment.getDataSource(column));
      }

      DataBlockCache blockCache = new DataBlockCache(new DataFetcher(dataSourceMap));
      blockCache.initNewBlock(docIdSet, 0, maxDocLength);

      // The HLL-derived column is stored as strings (serialized HLL sketches).
      String[] strings = blockCache.getStringValueArrayForColumn("column1_hll");
      Assert.assertEquals(strings.length, maxDocLength);
      double[] doubles = blockCache.getDoubleValueArrayForColumn("column1");
      Assert.assertEquals(doubles.length, maxDocLength);
    } catch (Exception e) {
      hasException = true;
      LOGGER.error(e.getMessage());
    } finally {
      if (helper != null) {
        helper.cleanTempDir();
      }
      Assert.assertFalse(hasException);
    }
  }
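
  // Layout sketch for the conversion exercised below (hedged: inferred from the assertions in
  // testConvert(), not a full specification of the v3 format):
  //   <segmentDir>/        original v1/v2 segment, with one file per column index
  //   <segmentDir>/v3/     created by SegmentV1V2ToV3FormatConverter; must contain the
  //                        star-tree index file, the creation metadata file, and report
  //                        SegmentVersion.v3 in its segment metadata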

  @Test
  public void testConvert() throws Exception {
    SegmentWithHllIndexCreateHelper helper = null;
    try {
      helper = new SegmentWithHllIndexCreateHelper("testConvert", getClass().getClassLoader().getResource(AVRO_DATA),
          timeColumnName, timeUnit, "starTreeSegment");
      SegmentIndexCreationDriver driver = helper.build(true, hllConfig);

      File segmentDirectory = new File(helper.getIndexDir(), driver.getSegmentName());
      LOGGER.debug("Segment Directory: " + segmentDirectory.getAbsolutePath());

      SegmentV1V2ToV3FormatConverter converter = new SegmentV1V2ToV3FormatConverter();
      converter.convert(segmentDirectory);

      File v3Location = SegmentDirectoryPaths.segmentDirectoryFor(segmentDirectory, SegmentVersion.v3);
      LOGGER.debug("v3Location: " + v3Location.getAbsolutePath());
      Assert.assertTrue(v3Location.exists());
      Assert.assertTrue(v3Location.isDirectory());
      Assert.assertTrue(new File(v3Location, V1Constants.STAR_TREE_INDEX_FILE).exists());

      SegmentMetadataImpl metadata = new SegmentMetadataImpl(v3Location);
      LOGGER.debug("metadata all columns: " + metadata.getAllColumns());
      Assert.assertEquals(metadata.getVersion(), SegmentVersion.v3.toString());
      Assert.assertTrue(new File(v3Location, V1Constants.SEGMENT_CREATION_META).exists());

      // Drop the star tree index file because it has invalid data
      // new File(v3Location, V1Constants.STAR_TREE_INDEX_FILE).delete();
      // new File(segmentDirectory, V1Constants.STAR_TREE_INDEX_FILE).delete();

      FileTime afterConversionTime = Files.getLastModifiedTime(v3Location.toPath());

      // Verify that the segment loads correctly. Loading exercises the full read path, so a
      // successful load is a necessary and sufficient check that the conversion succeeded.
      IndexSegment indexSegment = ColumnarSegmentLoader.load(segmentDirectory, v3LoadingConfig);
      Assert.assertNotNull(indexSegment);
      Assert.assertEquals(indexSegment.getSegmentName(), metadata.getName());
      Assert.assertEquals(SegmentVersion.v3, SegmentVersion.valueOf(indexSegment.getSegmentMetadata().getVersion()));
    } finally {
      if (helper != null) {
        helper.cleanTempDir();
      }
    }
  }
}
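
// To run just this test class (a sketch, assuming the module's standard Maven Surefire/TestNG
// setup; the module name is an assumption and may differ in your checkout):
//   mvn -pl pinot-core test -Dtest=HllIndexCreationTest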