/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.startree.hll;

import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.common.DataBlockCache;
import com.linkedin.pinot.core.common.DataFetcher;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.columnar.ColumnarSegmentLoader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion;
import com.linkedin.pinot.core.operator.BaseOperator;
import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl;
import com.linkedin.pinot.core.segment.index.converter.SegmentV1V2ToV3FormatConverter;
import com.linkedin.pinot.core.segment.index.loader.IndexLoadingConfig;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.store.SegmentDirectoryPaths;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.attribute.FileTime;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;

/**
 * The dictionary index for an HLL-derived field is roughly 10 times the size of the
 * corresponding index for a Long field.
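 * <p>This is expected: the HLL-derived column stores a serialized HyperLogLog sketch per
 * document as a string (the tests below read it back with getStringValueArrayForColumn),
 * so each dictionary entry is far wider than an 8-byte Long value.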
 */
public class HllIndexCreationTest {
  private static final Logger LOGGER = LoggerFactory.getLogger(HllIndexCreationTest.class);

  private static final String hllDeriveColumnSuffix = HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX;
  // Change this set to control which columns an HLL index is derived for.
  private static final Set<String> columnsToDeriveHllFields = new HashSet<>(
      Arrays.asList("column1", "column2", "column3", "count", "weeksSinceEpochSunday", "daysSinceEpoch",
          "column17", "column18"));
  private static final String AVRO_DATA = "data/test_data-sv.avro";
  private static final String timeColumnName = "daysSinceEpoch";
  private static final TimeUnit timeUnit = TimeUnit.DAYS;
  private static final int hllLog2m = HllConstants.DEFAULT_LOG2M;

  private IndexLoadingConfig v3LoadingConfig;
  private HllConfig hllConfig;

  @BeforeMethod
  public void setUp() throws Exception {
    hllConfig = new HllConfig(hllLog2m, columnsToDeriveHllFields, hllDeriveColumnSuffix);

    v3LoadingConfig = new IndexLoadingConfig();
    v3LoadingConfig.setReadMode(ReadMode.mmap);
    v3LoadingConfig.setSegmentVersion(SegmentVersion.v3);
  }

  @AfterMethod
  public void tearDown() throws Exception {
  }

  @Test
  public void testColumnStatsWithoutStarTree() {
    SegmentWithHllIndexCreateHelper helper = null;
    boolean hasException = false;
    try {
      LOGGER.debug("================ Without StarTree ================");
      helper = new SegmentWithHllIndexCreateHelper("noStarTree", getClass().getClassLoader().getResource(AVRO_DATA),
          timeColumnName, timeUnit, "starTreeSegment");
      SegmentIndexCreationDriver driver = helper.build(false, null);
      LOGGER.debug("================ Cardinality ================");
      for (String name : helper.getSchema().getColumnNames()) {
        LOGGER.debug("* " + name + ": " + driver.getColumnStatisticsCollector(name).getCardinality());
      }
    } catch (Exception e) {
      hasException = true;
      LOGGER.error(e.getMessage());
    } finally {
      if (helper != null) {
        helper.cleanTempDir();
      }
      Assert.assertFalse(hasException);
    }
  }

  @Test
  public void testColumnStatsWithStarTree() throws Exception {
    SegmentWithHllIndexCreateHelper helper = null;
    boolean hasException = false;
    int maxDocLength = 10000;
    try {
      LOGGER.debug("================ With StarTree ================");
      helper = new SegmentWithHllIndexCreateHelper("withStarTree", getClass().getClassLoader().getResource(AVRO_DATA),
          timeColumnName, timeUnit, "starTreeSegment");
      SegmentIndexCreationDriver driver = helper.build(true, hllConfig);
      LOGGER.debug("================ Cardinality ================");
      for (String name : helper.getSchema().getColumnNames()) {
        LOGGER.debug("* " + name + ": " + driver.getColumnStatisticsCollector(name).getCardinality());
      }

      LOGGER.debug("Loading ...");
      IndexSegment indexSegment = Loaders.IndexSegment.load(helper.getSegmentDirectory(), ReadMode.mmap);

      // Fetch the first maxDocLength documents and verify that both the original column and
      // its HLL-derived counterpart are readable through the block cache.
      int[] docIdSet = new int[maxDocLength];
      for (int i = 0; i < maxDocLength; i++) {
        docIdSet[i] = i;
      }
      Map<String, BaseOperator> dataSourceMap = new HashMap<>();
      for (String column : indexSegment.getColumnNames()) {
        dataSourceMap.put(column, indexSegment.getDataSource(column));
      }

      DataBlockCache blockCache = new DataBlockCache(new DataFetcher(dataSourceMap));
      blockCache.initNewBlock(docIdSet, 0, maxDocLength);

      // The HLL-derived column is stored as strings (serialized HLL sketches).
      String[] strings = blockCache.getStringValueArrayForColumn("column1_hll");
      Assert.assertEquals(strings.length, maxDocLength);
      double[] doubles = blockCache.getDoubleValueArrayForColumn("column1");
      Assert.assertEquals(doubles.length, maxDocLength);
    } catch (Exception e) {
      hasException = true;
      LOGGER.error(e.getMessage());
    } finally {
      if (helper != null) {
        helper.cleanTempDir();
      }
      Assert.assertFalse(hasException);
    }
  }
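
  // Layout sketch for the conversion exercised below (hedged: inferred from the assertions in
  // testConvert(), not a full specification of the v3 format):
  //   <segmentDir>/        original v1/v2 segment, with one file per column index
  //   <segmentDir>/v3/     created by SegmentV1V2ToV3FormatConverter; must contain the
  //                        star-tree index file, the creation metadata file, and report
  //                        SegmentVersion.v3 in its segment metadata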

  @Test
  public void testConvert() throws Exception {
    SegmentWithHllIndexCreateHelper helper = null;
    try {
      helper = new SegmentWithHllIndexCreateHelper("testConvert", getClass().getClassLoader().getResource(AVRO_DATA),
          timeColumnName, timeUnit, "starTreeSegment");
      SegmentIndexCreationDriver driver = helper.build(true, hllConfig);

      File segmentDirectory = new File(helper.getIndexDir(), driver.getSegmentName());
      LOGGER.debug("Segment Directory: " + segmentDirectory.getAbsolutePath());

      SegmentV1V2ToV3FormatConverter converter = new SegmentV1V2ToV3FormatConverter();
      converter.convert(segmentDirectory);

      File v3Location = SegmentDirectoryPaths.segmentDirectoryFor(segmentDirectory, SegmentVersion.v3);
      LOGGER.debug("v3Location: " + v3Location.getAbsolutePath());
      Assert.assertTrue(v3Location.exists());
      Assert.assertTrue(v3Location.isDirectory());
      Assert.assertTrue(new File(v3Location, V1Constants.STAR_TREE_INDEX_FILE).exists());

      SegmentMetadataImpl metadata = new SegmentMetadataImpl(v3Location);
      LOGGER.debug("metadata all columns: " + metadata.getAllColumns());
      Assert.assertEquals(metadata.getVersion(), SegmentVersion.v3.toString());
      Assert.assertTrue(new File(v3Location, V1Constants.SEGMENT_CREATION_META).exists());

      // Drop the star tree index file because it has invalid data
      // new File(v3Location, V1Constants.STAR_TREE_INDEX_FILE).delete();
      // new File(segmentDirectory, V1Constants.STAR_TREE_INDEX_FILE).delete();

      FileTime afterConversionTime = Files.getLastModifiedTime(v3Location.toPath());

      // Verify that the segment loads correctly. Loading exercises the full read path, so a
      // successful load is a necessary and sufficient check that the conversion succeeded.
      IndexSegment indexSegment = ColumnarSegmentLoader.load(segmentDirectory, v3LoadingConfig);
      Assert.assertNotNull(indexSegment);
      Assert.assertEquals(indexSegment.getSegmentName(), metadata.getName());
      Assert.assertEquals(SegmentVersion.v3, SegmentVersion.valueOf(indexSegment.getSegmentMetadata().getVersion()));
    } finally {
      if (helper != null) {
        helper.cleanTempDir();
      }
    }
  }
}
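
// To run just this test class (a sketch, assuming the module's standard Maven Surefire/TestNG
// setup; the module name is an assumption and may differ in your checkout):
//   mvn -pl pinot-core test -Dtest=HllIndexCreationTest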