/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.tools.segment.converter;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.common.utils.TarGzCompressionUtils;
import com.linkedin.pinot.core.common.BlockSingleValIterator;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.common.DataSourceMetadata;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.segment.creator.SingleValueRawIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.index.readers.Dictionary;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.charset.Charset;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.io.FileUtils;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Class to convert dictionary-encoded columns of a segment to a raw index (without dictionary).
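 * <p>Example command-line invocation (the jar name, directory paths and column names below are
 * illustrative placeholders, not taken from this code):
 * <pre>
 *   java -cp pinot-tools-with-dependencies.jar \
 *       com.linkedin.pinot.tools.segment.converter.DictionaryToRawIndexConverter \
 *       -dataDir /path/to/untarred/segments -columns colA,colB -outputDir /path/to/output -overwrite
 * </pre>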
*/
@SuppressWarnings({"FieldCanBeLocal", "unused"})
public class DictionaryToRawIndexConverter {
private static final Logger LOGGER = LoggerFactory.getLogger(DictionaryToRawIndexConverter.class);
private static final Charset UTF_8 = Charset.forName("UTF-8");
@Option(name = "-dataDir", required = true, usage = "Directory containing uncompressed segments")
private String _dataDir = null;
@Option(name = "-columns", required = true, usage = "Comma separated list of column names to convert")
private String _columns = null;
@Option(name = "-tableName", required = false, usage = "New table name, if different from original")
private String _tableName = null;
@Option(name = "-outputDir", required = true, usage = "Output directory for writing results")
private String _outputDir = null;
@Option(name = "-overwrite", required = false, usage = "Overwrite output directory")
private boolean _overwrite = false;
@Option(name = "-numThreads", required = false, usage = "Number of threads to launch for conversion")
private int _numThreads = 4;
@Option(name = "-compressOutput", required = false, usage = "Compress (tar + gzip) output segment")
private boolean _compressOutput = false;
@Option(name = "-help", required = false, help = true, aliases = {"-h"}, usage = "print this message")
private boolean _help = false;
/**
* Setter for {@link #_dataDir}
* @param dataDir Data directory containing un-tarred segments.
* @return this
*/
public DictionaryToRawIndexConverter setDataDir(String dataDir) {
_dataDir = dataDir;
return this;
}
/**
* Setter for {@link #_outputDir}
*
* @param outputDir Directory where output segments should be written
* @return this
*/
public DictionaryToRawIndexConverter setOutputDir(String outputDir) {
_outputDir = outputDir;
return this;
}
/**
* Setter for columns to convert.
*
* @param columns Comma separated list of columns
* @return this
*/
public DictionaryToRawIndexConverter setColumns(String columns) {
_columns = columns;
return this;
}
/**
* Setter for {@link #_overwrite}
* When set to true, already existing output directory is overwritten.
*
* @param overwrite True for overwriting existing output dir, False otherwise
* @return this
*/
public DictionaryToRawIndexConverter setOverwrite(boolean overwrite) {
_overwrite = overwrite;
return this;
}
/**
* Method to perform the conversion for a set of segments in the {@link #_dataDir}
*
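   * <p>A minimal sketch of programmatic usage via the builder-style setters (directory paths and
   * column names are illustrative):
   * <pre>{@code
   *   boolean converted = new DictionaryToRawIndexConverter()
   *       .setDataDir("/path/to/untarred/segments")
   *       .setOutputDir("/path/to/output")
   *       .setColumns("colA,colB")
   *       .setOverwrite(true)
   *       .convert();
   * }</pre>
   *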
* @return True if successful, False otherwise
* @throws Exception
*/
public boolean convert()
throws Exception {
if (_help) {
printUsage();
return true;
}
File dataDir = new File(_dataDir);
File outputDir = new File(_outputDir);
if (!dataDir.exists()) {
LOGGER.error("Data directory '{}' does not exist.", _dataDir);
return false;
} else if (outputDir.exists()) {
if (_overwrite) {
LOGGER.info("Overwriting existing output directory '{}'", _outputDir);
FileUtils.deleteQuietly(outputDir);
        outputDir.mkdirs();
} else {
LOGGER.error("Output directory '{}' already exists, use -overwrite to overwrite", outputDir);
return false;
}
    } else {
      // Output directory does not exist yet; create it (and any missing parents) up front.
      outputDir.mkdirs();
    }
File[] segmentFiles = dataDir.listFiles();
if (segmentFiles == null || segmentFiles.length == 0) {
LOGGER.error("Empty data directory '{}'.", _dataDir);
return false;
}
boolean ret = true;
final File outDir = outputDir;
ExecutorService executorService = Executors.newFixedThreadPool(_numThreads);
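    // Fan out one conversion task per entry in the data directory; the pool size is controlled by -numThreads.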
for (final File segmentDir : segmentFiles) {
executorService.execute(new Runnable() {
@Override
public void run() {
try {
convertSegment(segmentDir, _columns.split("\\s*,\\s*"), outDir, _compressOutput);
} catch (Exception e) {
LOGGER.error("Exception caught while converting segment {}", segmentDir.getName(), e);
}
}
});
}
executorService.shutdown();
executorService.awaitTermination(1, TimeUnit.HOURS);
return ret;
}
/**
   * This method converts the specified columns of the given segment from a dictionary-encoded
   * forward index to a raw index without dictionary.
*
* @param segmentDir Segment directory
* @param columns Columns to convert
* @param outputDir Directory for writing output segment
* @param compressOutput Tar/gzip the output segment
* @return True if successful, False otherwise
* @throws Exception
*/
public boolean convertSegment(File segmentDir, String[] columns, File outputDir, boolean compressOutput)
throws Exception {
File newSegment;
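    // The input may be a tar.gz/tgz segment archive or an already un-tarred segment directory.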
if (segmentDir.isFile()) {
if (segmentDir.getName().endsWith(".tar.gz") || segmentDir.getName().endsWith(".tgz")) {
LOGGER.info("Uncompressing input segment '{}'", segmentDir);
newSegment = TarGzCompressionUtils.unTar(segmentDir, outputDir).get(0);
} else {
LOGGER.warn("Skipping non-segment file '{}'", segmentDir.getAbsoluteFile());
return false;
}
} else {
newSegment = new File(outputDir, segmentDir.getName());
newSegment.mkdir();
FileUtils.copyDirectory(segmentDir, newSegment);
}
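    // Load the working copy of the segment (memory-mapped) so its columns can be read while the
    // raw index files are written alongside the existing ones.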
IndexSegment segment = Loaders.IndexSegment.load(newSegment, ReadMode.mmap);
for (String column : columns) {
LOGGER.info("Converting column '{}' for segment '{}'.", column, segment.getSegmentName());
convertOneColumn(segment, column, newSegment);
}
updateMetadata(newSegment, columns, _tableName);
segment.destroy();
if (compressOutput) {
LOGGER.info("Compressing segment '{}'", newSegment);
TarGzCompressionUtils.createTarGzOfDirectory(newSegment.getAbsolutePath(), newSegment.getAbsolutePath());
FileUtils.deleteQuietly(newSegment);
}
return true;
}
/**
* Helper method to update the metadata.properties for the converted segment.
*
* @param segmentDir Segment directory
* @param columns Converted columns
* @param tableName New table name to be written in the meta-data. Skipped if null.
* @throws IOException
* @throws ConfigurationException
*/
private void updateMetadata(File segmentDir, String[] columns, String tableName)
throws IOException, ConfigurationException {
File metadataFile = new File(segmentDir, V1Constants.MetadataKeys.METADATA_FILE_NAME);
PropertiesConfiguration properties = new PropertiesConfiguration(metadataFile);
if (tableName != null) {
properties.setProperty(V1Constants.MetadataKeys.Segment.TABLE_NAME, tableName);
}
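    // Mark each converted column as no-dictionary and clear its dictionary bit-width in metadata.properties.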
for (String column : columns) {
properties.setProperty(
V1Constants.MetadataKeys.Column.getKeyFor(column, V1Constants.MetadataKeys.Column.HAS_DICTIONARY), false);
properties.setProperty(
V1Constants.MetadataKeys.Column.getKeyFor(column, V1Constants.MetadataKeys.Column.BITS_PER_ELEMENT), -1);
}
properties.save();
}
/**
* Helper method to print usage at the command line interface.
*/
private static void printUsage() {
System.out.println("Usage: DictionaryTORawIndexConverter");
for (Field field : ColumnarToStarTreeConverter.class.getDeclaredFields()) {
if (field.isAnnotationPresent(Option.class)) {
Option option = field.getAnnotation(Option.class);
System.out.println(
String.format("\t%-15s: %s (required=%s)", option.name(), option.usage(), option.required()));
}
}
}
/**
   * Helper method to perform the conversion for the specified column.
*
* @param segment Input segment to convert
* @param column Column to convert
   * @param newSegment Directory where the raw index is to be written
* @throws IOException
*/
private void convertOneColumn(IndexSegment segment, String column, File newSegment)
throws IOException {
DataSource dataSource = segment.getDataSource(column);
Dictionary dictionary = dataSource.getDictionary();
if (dictionary == null) {
LOGGER.error("Column '{}' does not have dictionary, cannot convert to raw index.", column);
return;
}
DataSourceMetadata dataSourceMetadata = dataSource.getDataSourceMetadata();
if (!dataSourceMetadata.isSingleValue()) {
LOGGER.error("Cannot convert multi-valued columns '{}'", column);
return;
}
int totalDocs = segment.getSegmentMetadata().getTotalDocs();
BlockSingleValIterator bvIter = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
FieldSpec.DataType dataType = dataSourceMetadata.getDataType();
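    // Only STRING columns need the length (in UTF-8 bytes) of their longest entry; other data types
    // pass -1 to the raw index creator below.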
int lengthOfLongestEntry =
(dataType == FieldSpec.DataType.STRING) ? getLengthOfLongestEntry(bvIter, dictionary) : -1;
SingleValueRawIndexCreator rawIndexCreator =
SegmentColumnarIndexCreator.getRawIndexCreatorForColumn(newSegment, column, dataType, totalDocs,
lengthOfLongestEntry);
int docId = 0;
bvIter.reset();
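    // Walk the forward index: each entry is a dictionary id, so look up the raw value and write it
    // to the raw index at the same doc id.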
while (bvIter.hasNext()) {
int dictId = bvIter.nextIntVal();
Object value = dictionary.get(dictId);
rawIndexCreator.index(docId++, value);
if (docId % 1000000 == 0) {
LOGGER.info("Converted {} records.", docId);
}
}
rawIndexCreator.close();
deleteForwardIndex(newSegment.getParentFile(), column, dataSourceMetadata.isSorted());
}
/**
   * Helper method to remove the dictionary and the dictionary-encoded forward index for the given column.
*
* @param segmentDir Segment directory from which to remove the forward index.
* @param column Column for which to remove the index.
* @param sorted True if column is sorted, False otherwise
*/
private void deleteForwardIndex(File segmentDir, String column, boolean sorted) {
File dictionaryFile = new File(segmentDir, (column + V1Constants.Dict.FILE_EXTENTION));
FileUtils.deleteQuietly(dictionaryFile);
String fwdIndexFileExtension = (sorted) ? V1Constants.Indexes.SORTED_FWD_IDX_FILE_EXTENTION
: V1Constants.Indexes.UN_SORTED_SV_FWD_IDX_FILE_EXTENTION;
File fwdIndexFile = new File(segmentDir, (column + fwdIndexFileExtension));
FileUtils.deleteQuietly(fwdIndexFile);
}
/**
   * Helper method to compute the length (in UTF-8 bytes) of the longest entry in the column.
   *
   * @param bvIter Block value-set iterator over the column's forward index
* @param dictionary Column dictionary
* @return Length of longest entry
*/
private int getLengthOfLongestEntry(BlockSingleValIterator bvIter, Dictionary dictionary) {
int lengthOfLongestEntry = 0;
bvIter.reset();
while (bvIter.hasNext()) {
int dictId = bvIter.nextIntVal();
String value = (String) dictionary.get(dictId);
lengthOfLongestEntry = Math.max(lengthOfLongestEntry, value.getBytes(UTF_8).length);
}
return lengthOfLongestEntry;
}
/**
* Main method for the class.
*
* @param args Arguments for the converter
* @throws Exception
*/
public static void main(String[] args)
throws Exception {
DictionaryToRawIndexConverter converter = new DictionaryToRawIndexConverter();
CmdLineParser parser = new CmdLineParser(converter);
parser.parseArgument(args);
converter.convert();
}
}