/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.indexsegment.generator;

import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.config.SegmentPartitionConfig;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.FieldType;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.StarTreeIndexSpec;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.core.data.readers.CSVRecordReaderConfig;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.data.readers.RecordReaderConfig;
import com.linkedin.pinot.core.indexsegment.utils.AvroUtils;
import com.linkedin.pinot.core.segment.DefaultSegmentNameGenerator;
import com.linkedin.pinot.core.segment.SegmentNameGenerator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.startree.hll.HllConfig;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nonnull;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import org.codehaus.jackson.annotate.JsonIgnoreProperties;
import org.codehaus.jackson.map.ObjectMapper;
import org.joda.time.format.DateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Configuration properties used in the creation of index segments.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public class SegmentGeneratorConfig {

  /**
   * How the time column's values are encoded: epoch-based numeric values, or a
   * string formatted according to {@link #setSimpleDateFormat(String)}.
   */
  public enum TimeColumnType {
    EPOCH, SIMPLE_DATE
  }

  private static final Logger LOGGER = LoggerFactory.getLogger(SegmentGeneratorConfig.class);

  private Map<String, String> _customProperties = new HashMap<>();
  private Set<String> _rawIndexCreationColumns = new HashSet<>();
  private List<String> _invertedIndexCreationColumns = new ArrayList<>();
  private String _dataDir = null;
  private String _inputFilePath = null;
  private FileFormat _format = FileFormat.AVRO;
  private String _outDir = null;
  private boolean _overwrite = false;
  private String _tableName = null;
  private String _segmentName = null;
  private String _segmentNamePostfix = null;
  private String _segmentTimeColumnName = null;
  private TimeUnit _segmentTimeUnit = null;
  private String _segmentCreationTime = null;
  private String _segmentStartTime = null;
  private String _segmentEndTime = null;
  private SegmentVersion _segmentVersion = SegmentVersion.v3;
  private String _schemaFile = null;
  private Schema _schema = null;
  private String _readerConfigFile = null;
  private RecordReaderConfig _readerConfig = null;
  private boolean _enableStarTreeIndex = false;
  private String _starTreeIndexSpecFile = null;
  private StarTreeIndexSpec _starTreeIndexSpec = null;
  private String _creatorVersion = null;
  private char _paddingCharacter = V1Constants.Str.DEFAULT_STRING_PAD_CHAR;
  private HllConfig _hllConfig = null;
  private SegmentNameGenerator _segmentNameGenerator = null;
  private SegmentPartitionConfig _segmentPartitionConfig = null;
  private int _sequenceId = -1;
  private TimeColumnType _timeColumnType = TimeColumnType.EPOCH;
  private String _simpleDateFormat = null;

  public SegmentGeneratorConfig() {
  }

  /**
   * Copy constructor.
   *
   * @deprecated To be replaced by a builder pattern. Use set methods in the meantime.
   * For now, this works only if no setters are called after this copy constructor.
   * @param config to copy from
   */
  @Deprecated
  public SegmentGeneratorConfig(SegmentGeneratorConfig config) {
    Preconditions.checkNotNull(config);
    _customProperties.putAll(config._customProperties);
    _rawIndexCreationColumns.addAll(config._rawIndexCreationColumns);
    _invertedIndexCreationColumns.addAll(config._invertedIndexCreationColumns);
    _dataDir = config._dataDir;
    _inputFilePath = config._inputFilePath;
    _format = config._format;
    _outDir = config._outDir;
    _overwrite = config._overwrite;
    _tableName = config._tableName;
    _segmentName = config._segmentName;
    _segmentNamePostfix = config._segmentNamePostfix;
    _segmentTimeColumnName = config._segmentTimeColumnName;
    _segmentTimeUnit = config._segmentTimeUnit;
    _segmentCreationTime = config._segmentCreationTime;
    _segmentStartTime = config._segmentStartTime;
    _segmentEndTime = config._segmentEndTime;
    _segmentVersion = config._segmentVersion;
    _schemaFile = config._schemaFile;
    _schema = config._schema;
    _readerConfigFile = config._readerConfigFile;
    _readerConfig = config._readerConfig;
    _enableStarTreeIndex = config._enableStarTreeIndex;
    _starTreeIndexSpecFile = config._starTreeIndexSpecFile;
    _starTreeIndexSpec = config._starTreeIndexSpec;
    _creatorVersion = config._creatorVersion;
    _paddingCharacter = config._paddingCharacter;
    _hllConfig = config._hllConfig;
    _segmentNameGenerator = config._segmentNameGenerator;
    // Previously missing from the copy: without these, copies silently lost the
    // partition config and any SIMPLE_DATE time-column configuration.
    _segmentPartitionConfig = config._segmentPartitionConfig;
    _timeColumnType = config._timeColumnType;
    _simpleDateFormat = config._simpleDateFormat;
    _sequenceId = config._sequenceId;
  }

  public SegmentGeneratorConfig(Schema schema) {
    _schema = schema;
  }

  public Map<String, String> getCustomProperties() {
    return _customProperties;
  }

  public void setCustomProperties(Map<String, String> properties) {
    Preconditions.checkNotNull(properties);
    _customProperties.putAll(properties);
  }

  /**
   * Declares the time column to be a SIMPLE_DATE column and records its pattern.
   * The pattern is validated eagerly via Joda-Time so a bad spec fails fast.
   *
   * @param simpleDateFormat a Joda-Time compatible date-time pattern
   * @throws RuntimeException if the pattern is not a valid Joda-Time format
   */
  public void setSimpleDateFormat(@Nonnull String simpleDateFormat) {
    _timeColumnType = TimeColumnType.SIMPLE_DATE;
    try {
      DateTimeFormat.forPattern(simpleDateFormat);
    } catch (Exception e) {
      throw new RuntimeException("Illegal simple date format specification", e);
    }
    _simpleDateFormat = simpleDateFormat;
  }

  public String getSimpleDateFormat() {
    return _simpleDateFormat;
  }

  public TimeColumnType getTimeColumnType() {
    return _timeColumnType;
  }

  public boolean containsCustomProperty(String key) {
    Preconditions.checkNotNull(key);
    return _customProperties.containsKey(key);
  }

  public Set<String> getRawIndexCreationColumns() {
    return _rawIndexCreationColumns;
  }

  public List<String> getInvertedIndexCreationColumns() {
    return _invertedIndexCreationColumns;
  }

  public void setRawIndexCreationColumns(List<String> rawIndexCreationColumns) {
    Preconditions.checkNotNull(rawIndexCreationColumns);
    _rawIndexCreationColumns.addAll(rawIndexCreationColumns);
  }

  public void setInvertedIndexCreationColumns(List<String> indexCreationColumns) {
    Preconditions.checkNotNull(indexCreationColumns);
    _invertedIndexCreationColumns.addAll(indexCreationColumns);
  }

  /**
   * Requests an inverted index for the given column. If a schema is already set
   * and does not contain the column, the request is ignored with a warning.
   */
  public void createInvertedIndexForColumn(String column) {
    Preconditions.checkNotNull(column);
    if (_schema != null && _schema.getFieldSpecFor(column) == null) {
      LOGGER.warn("Cannot find column {} in schema, will not create inverted index.", column);
      return;
    }
    if (_schema == null) {
      LOGGER.warn("Schema has not been set, column {} might not exist in schema after all.", column);
    }
    _invertedIndexCreationColumns.add(column);
  }

  /**
   * Requests an inverted index for every column of the current schema.
   * No-op (with a warning) if the schema has not been set yet.
   */
  public void createInvertedIndexForAllColumns() {
    if (_schema == null) {
      LOGGER.warn("Schema has not been set, will not create inverted index for all columns.");
      return;
    }
    for (FieldSpec spec : _schema.getAllFieldSpecs()) {
      _invertedIndexCreationColumns.add(spec.getName());
    }
  }

  public String getDataDir() {
    return _dataDir;
  }

  public void setDataDir(String dataDir) {
    _dataDir = dataDir;
  }

  public String getInputFilePath() {
    return _inputFilePath;
  }

  /**
   * Sets the input file, normalizing it to an absolute path.
   *
   * @throws IllegalStateException if the path does not exist
   */
  public void setInputFilePath(String inputFilePath) {
    Preconditions.checkNotNull(inputFilePath);
    File inputFile = new File(inputFilePath);
    // Guava Preconditions templates use %s, not SLF4J-style {} — the previous
    // message dropped the offending path from the exception text.
    Preconditions.checkState(inputFile.exists(), "Input path %s does not exist.", inputFilePath);
    _inputFilePath = inputFile.getAbsolutePath();
  }

  public FileFormat getFormat() {
    return _format;
  }

  public void setFormat(FileFormat format) {
    _format = format;
  }

  public String getOutDir() {
    return _outDir;
  }

  /**
   * Sets the output directory, creating it (including parents) if necessary,
   * and normalizing it to an absolute path.
   *
   * @throws IllegalStateException if the path exists but is not a directory,
   *         or if the directory cannot be created
   */
  public void setOutDir(String dir) {
    Preconditions.checkNotNull(dir);
    final File outputDir = new File(dir);
    if (outputDir.exists()) {
      Preconditions.checkState(outputDir.isDirectory(), "Path %s is not a directory.", dir);
    } else {
      Preconditions.checkState(outputDir.mkdirs(), "Cannot create output dir: %s", dir);
    }
    _outDir = outputDir.getAbsolutePath();
  }

  public boolean isOverwrite() {
    return _overwrite;
  }

  public void setOverwrite(boolean overwrite) {
    _overwrite = overwrite;
  }

  public String getTableName() {
    return _tableName;
  }

  public void setTableName(String tableName) {
    _tableName = tableName;
  }

  public String getSegmentName() {
    return _segmentName;
  }

  public void setSegmentName(String segmentName) {
    _segmentName = segmentName;
  }

  public String getCreatorVersion() {
    return _creatorVersion;
  }

  public void setCreatorVersion(String creatorVersion) {
    _creatorVersion = creatorVersion;
  }

  public char getPaddingCharacter() {
    return _paddingCharacter;
  }

  public void setPaddingCharacter(char paddingCharacter) {
    _paddingCharacter = paddingCharacter;
  }

  public String getSegmentNamePostfix() {
    return _segmentNamePostfix;
  }

  /**
   * If you are adding a sequence Id to the segment, please use setSequenceId.
   */
  public void setSegmentNamePostfix(String postfix) {
    _segmentNamePostfix = postfix;
  }

  /**
   * Returns the explicitly configured time column name, falling back to the
   * schema's TIME field(s) when none was set.
   */
  public String getTimeColumnName() {
    if (_segmentTimeColumnName != null) {
      return _segmentTimeColumnName;
    }
    return getQualifyingDimensions(FieldType.TIME);
  }

  public void setTimeColumnName(String timeColumnName) {
    _segmentTimeColumnName = timeColumnName;
  }

  public int getSequenceId() {
    return _sequenceId;
  }

  /**
   * This method should be used instead of setPostfix if you are adding a sequence number.
   */
  public void setSequenceId(int sequenceId) {
    _sequenceId = sequenceId;
  }

  /**
   * Returns the configured time unit, falling back to the schema's outgoing
   * (preferred) or incoming granularity spec, and finally to {@link TimeUnit#DAYS}.
   */
  public TimeUnit getSegmentTimeUnit() {
    if (_segmentTimeUnit != null) {
      return _segmentTimeUnit;
    }
    // Guard against a missing schema (consistent with the other schema-dependent
    // methods in this class) instead of throwing NPE.
    if (_schema != null) {
      TimeFieldSpec timeFieldSpec = _schema.getTimeFieldSpec();
      if (timeFieldSpec != null) {
        if (timeFieldSpec.getOutgoingGranularitySpec() != null) {
          return timeFieldSpec.getOutgoingGranularitySpec().getTimeType();
        }
        if (timeFieldSpec.getIncomingGranularitySpec() != null) {
          return timeFieldSpec.getIncomingGranularitySpec().getTimeType();
        }
      }
    }
    return TimeUnit.DAYS;
  }

  public void setSegmentTimeUnit(TimeUnit timeUnit) {
    _segmentTimeUnit = timeUnit;
  }

  public String getCreationTime() {
    return _segmentCreationTime;
  }

  public void setCreationTime(String creationTime) {
    _segmentCreationTime = creationTime;
  }

  public String getStartTime() {
    return _segmentStartTime;
  }

  public void setStartTime(String startTime) {
    _segmentStartTime = startTime;
  }

  public String getEndTime() {
    return _segmentEndTime;
  }

  public void setEndTime(String endTime) {
    _segmentEndTime = endTime;
  }

  public SegmentVersion getSegmentVersion() {
    return _segmentVersion;
  }

  public void setSegmentVersion(SegmentVersion segmentVersion) {
    _segmentVersion = segmentVersion;
  }

  public String getSchemaFile() {
    return _schemaFile;
  }

  public void setSchemaFile(String schemaFile) {
    _schemaFile = schemaFile;
  }

  public Schema getSchema() {
    return _schema;
  }

  /**
   * Sets the schema and prunes any previously requested inverted-index columns
   * that do not exist in it (each pruned column is logged).
   */
  public void setSchema(Schema schema) {
    Preconditions.checkNotNull(schema);
    _schema = schema;
    if (_invertedIndexCreationColumns != null) {
      Iterator<String> iterator = _invertedIndexCreationColumns.iterator();
      while (iterator.hasNext()) {
        String column = iterator.next();
        if (_schema.getFieldSpecFor(column) == null) {
          LOGGER.warn("Cannot find column {} in schema, will not create inverted index.", column);
          iterator.remove();
        }
      }
    }
  }

  public String getReaderConfigFile() {
    return _readerConfigFile;
  }

  public void setReaderConfigFile(String readerConfigFile) {
    _readerConfigFile = readerConfigFile;
  }

  public RecordReaderConfig getReaderConfig() {
    return _readerConfig;
  }

  public void setReaderConfig(RecordReaderConfig readerConfig) {
    _readerConfig = readerConfig;
  }

  public boolean isEnableStarTreeIndex() {
    return _enableStarTreeIndex;
  }

  public void setEnableStarTreeIndex(boolean enableStarTreeIndex) {
    _enableStarTreeIndex = enableStarTreeIndex;
  }

  public String getStarTreeIndexSpecFile() {
    return _starTreeIndexSpecFile;
  }

  public void setStarTreeIndexSpecFile(String starTreeIndexSpecFile) {
    _starTreeIndexSpecFile = starTreeIndexSpecFile;
  }

  public StarTreeIndexSpec getStarTreeIndexSpec() {
    return _starTreeIndexSpec;
  }

  public void setStarTreeIndexSpec(StarTreeIndexSpec starTreeIndexSpec) {
    _starTreeIndexSpec = starTreeIndexSpec;
  }

  public HllConfig getHllConfig() {
    return _hllConfig;
  }

  public void setHllConfig(HllConfig hllConfig) {
    _hllConfig = hllConfig;
  }

  /**
   * Returns the configured name generator, or a {@link DefaultSegmentNameGenerator}
   * built from either the fixed segment name or the time-column/table/postfix/sequence
   * naming scheme.
   */
  public SegmentNameGenerator getSegmentNameGenerator() {
    if (_segmentNameGenerator != null) {
      return _segmentNameGenerator;
    }
    if (_segmentName != null) {
      return new DefaultSegmentNameGenerator(_segmentName);
    }
    return new DefaultSegmentNameGenerator(getTimeColumnName(), getTableName(), getSegmentNamePostfix(),
        getSequenceId());
  }

  public void setSegmentNameGenerator(SegmentNameGenerator segmentNameGenerator) {
    _segmentNameGenerator = segmentNameGenerator;
  }

  @JsonIgnore
  public String getMetrics() {
    return getQualifyingDimensions(FieldType.METRIC);
  }

  /**
   * Loads schema, reader config and star-tree index spec from the configured files.
   * The schema may alternatively be extracted from an Avro input file.
   *
   * @deprecated Load outside the class and use the setter for schema setting.
   * @throws IOException on failure to read any of the configured files
   */
  @Deprecated
  public void loadConfigFiles() throws IOException {
    ObjectMapper objectMapper = new ObjectMapper();
    Schema schema;
    if (_schemaFile != null) {
      schema = Schema.fromFile(new File(_schemaFile));
      setSchema(schema);
    } else if (_format == FileFormat.AVRO) {
      schema = AvroUtils.extractSchemaFromAvro(new File(_inputFilePath));
      setSchema(schema);
    } else {
      throw new RuntimeException("Input format " + _format + " requires schema.");
    }
    setTimeColumnName(schema.getTimeColumnName());
    TimeFieldSpec timeFieldSpec = schema.getTimeFieldSpec();
    if (timeFieldSpec != null) {
      setSegmentTimeUnit(timeFieldSpec.getIncomingGranularitySpec().getTimeType());
    } else {
      setSegmentTimeUnit(TimeUnit.DAYS);
    }
    if (_readerConfigFile != null) {
      setReaderConfig(objectMapper.readValue(new File(_readerConfigFile), CSVRecordReaderConfig.class));
    }
    if (_starTreeIndexSpecFile != null) {
      setStarTreeIndexSpec(objectMapper.readValue(new File(_starTreeIndexSpecFile), StarTreeIndexSpec.class));
    }
  }

  @JsonIgnore
  public String getDimensions() {
    return getQualifyingDimensions(FieldType.DIMENSION);
  }

  public void setSegmentPartitionConfig(SegmentPartitionConfig segmentPartitionConfig) {
    _segmentPartitionConfig = segmentPartitionConfig;
  }

  public SegmentPartitionConfig getSegmentPartitionConfig() {
    return _segmentPartitionConfig;
  }

  /**
   * Returns a comma separated list of qualifying dimension name strings
   * @param type FieldType to filter on
   * @return sorted, comma-joined column names of the given type
   */
  @JsonIgnore
  private String getQualifyingDimensions(FieldType type) {
    List<String> dimensions = new ArrayList<>();
    for (final FieldSpec spec : getSchema().getAllFieldSpecs()) {
      if (spec.getFieldType() == type) {
        dimensions.add(spec.getName());
      }
    }
    Collections.sort(dimensions);
    return StringUtils.join(dimensions, ",");
  }
}