/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.startree.hll;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.StarTreeIndexSpec;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion;
import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver;
import com.linkedin.pinot.core.segment.creator.impl.SegmentCreationDriverFactory;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.segments.v1.creator.SegmentTestUtils;
import com.linkedin.pinot.util.TestUtils;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class SegmentWithHllIndexCreateHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(SegmentWithHllIndexCreateHelper.class);
private static final String hllDeriveColumnSuffix = HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX;
private final String tableName;
private final File INDEX_DIR;
private final File inputAvro;
private final String timeColumnName;
private final TimeUnit timeUnit;
private String segmentName = "starTreeSegment";
private Schema schema;
public SegmentWithHllIndexCreateHelper(String tableName, URL avroUrl, String timeColumnName,
TimeUnit timeUnit, String segmentName) throws IOException {
this(tableName, TestUtils.getFileFromResourceUrl(avroUrl), timeColumnName, timeUnit, segmentName);
}
public SegmentWithHllIndexCreateHelper(String tableName, String avroDataPath, String timeColumnName,
TimeUnit timeUnit, String segmentName) throws IOException {
INDEX_DIR = Files.createTempDirectory(SegmentWithHllIndexCreateHelper.class.getName() + "_" + tableName).toFile();
LOGGER.info("INDEX_DIR: {}", INDEX_DIR.getAbsolutePath());
inputAvro = new File(avroDataPath);
LOGGER.info("Input Avro: {}", inputAvro.getAbsolutePath());
this.timeColumnName = timeColumnName;
this.timeUnit = timeUnit;
this.tableName = tableName;
this.segmentName = segmentName;
}
/**
* must call this to clean up
*/
public void cleanTempDir() {
if (INDEX_DIR != null) {
FileUtils.deleteQuietly(INDEX_DIR);
}
}
private static void printSchema(Schema schema) {
LOGGER.info("schemaName: {}", schema.getSchemaName());
LOGGER.info("Dimension columnNames: ");
int i = 0;
for (DimensionFieldSpec spec: schema.getDimensionFieldSpecs()) {
String columnInfo = i + " " + spec.getName();
if (!spec.isSingleValueField()) {
LOGGER.info(columnInfo + " Multi-Value.");
} else {
LOGGER.info(columnInfo);
}
i += 1;
}
LOGGER.info("Metric columnNames: ");
i = 0;
for (MetricFieldSpec spec: schema.getMetricFieldSpecs()) {
String columnInfo = i + " " + spec.getName();
if (!spec.isSingleValueField()) {
LOGGER.info(columnInfo + " Multi-Value.");
} else {
LOGGER.info(columnInfo);
}
i += 1;
}
LOGGER.info("Time column: {}", schema.getTimeColumnName());
}
private void setupStarTreeConfig(SegmentGeneratorConfig segmentGenConfig) {
// StarTree related
segmentGenConfig.setEnableStarTreeIndex(true);
StarTreeIndexSpec starTreeIndexSpec = new StarTreeIndexSpec();
starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
segmentGenConfig.setStarTreeIndexSpec(starTreeIndexSpec);
LOGGER.info("segmentGenConfig Schema (w/o derived fields): ");
printSchema(segmentGenConfig.getSchema());
}
public SegmentIndexCreationDriver build(boolean enableStarTree, HllConfig hllConfig) throws Exception {
final SegmentGeneratorConfig segmentGenConfig = new SegmentGeneratorConfig(
SegmentTestUtils.extractSchemaFromAvroWithoutTime(inputAvro));
// set other fields in segmentGenConfig
segmentGenConfig.setInputFilePath(inputAvro.getAbsolutePath());
segmentGenConfig.setTimeColumnName(timeColumnName);
segmentGenConfig.setSegmentTimeUnit(timeUnit);
segmentGenConfig.setFormat(FileFormat.AVRO);
segmentGenConfig.setSegmentVersion(SegmentVersion.v1);
segmentGenConfig.setTableName(tableName);
segmentGenConfig.setOutDir(INDEX_DIR.getAbsolutePath());
segmentGenConfig.createInvertedIndexForAllColumns();
segmentGenConfig.setSegmentName(segmentName);
segmentGenConfig.setSegmentNamePostfix("1");
if (enableStarTree) {
setupStarTreeConfig(segmentGenConfig);
segmentGenConfig.setHllConfig(hllConfig);
}
if (hllConfig != null) {
segmentGenConfig.setHllConfig(hllConfig);
}
final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
driver.init(segmentGenConfig);
/**
* derived field (hll) is added during the segment build process
*
* {@link SegmentIndexCreationDriverImpl#buildStarTree}
* {@link SegmentIndexCreationDriverImpl#augmentSchemaWithDerivedColumns}
* {@link SegmentIndexCreationDriverImpl#populateDefaultDerivedColumnValues}
*/
driver.build();
LOGGER.info("segmentGenConfig Schema (w/ derived fields): ");
schema = segmentGenConfig.getSchema();
printSchema(schema);
return driver;
}
public Schema getSchema() {
if (schema == null) {
throw new RuntimeException("Call build first to get schema.");
}
return schema;
}
public String getSegmentName() {
return segmentName;
}
public File getSegmentDirectory() {
return new File(INDEX_DIR, segmentName);
}
public File getIndexDir() {
return INDEX_DIR;
}
}