DictionaryToRawIndexConverter.java example

Explorer
pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.tools.segment.converter;

import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.common.utils.TarGzCompressionUtils;
import com.linkedin.pinot.core.common.BlockSingleValIterator;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.common.DataSourceMetadata;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.segment.creator.SingleValueRawIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.index.readers.Dictionary;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.charset.Charset;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.io.FileUtils;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Class to convert segment with dictionary encoded column to raw index (without dictionary).
 */
@SuppressWarnings({"FieldCanBeLocal", "unused"})
public class DictionaryToRawIndexConverter {
  private static final Logger LOGGER = LoggerFactory.getLogger(DictionaryToRawIndexConverter.class);
  private static final Charset UTF_8 = Charset.forName("UTF-8");

  @Option(name = "-dataDir", required = true, usage = "Directory containing uncompressed segments")
  private String _dataDir = null;

  @Option(name = "-columns", required = true, usage = "Comma separated list of column names to convert")
  private String _columns = null;

  @Option(name = "-tableName", required = false, usage = "New table name, if different from original")
  private String _tableName = null;

  @Option(name = "-outputDir", required = true, usage = "Output directory for writing results")
  private String _outputDir = null;

  @Option(name = "-overwrite", required = false, usage = "Overwrite output directory")
  private boolean _overwrite = false;

  @Option(name = "-numThreads", required = false, usage = "Number of threads to launch for conversion")
  private int _numThreads = 4;

  @Option(name = "-compressOutput", required = false, usage = "Compress (tar + gzip) output segment")
  private boolean _compressOutput = false;

  @Option(name = "-help", required = false, help = true, aliases = {"-h"}, usage = "print this message")
  private boolean _help = false;

  /**
   * Setter for {@link #_dataDir}
   * @param dataDir Data directory containing un-tarred segments.
   * @return this
   */
  public DictionaryToRawIndexConverter setDataDir(String dataDir) {
    _dataDir = dataDir;
    return this;
  }

  /**
   * Setter for {@link #_outputDir}
   *
   * @param outputDir Directory where output segments should be written
   * @return this
   */
  public DictionaryToRawIndexConverter setOutputDir(String outputDir) {
    _outputDir = outputDir;
    return this;
  }

  /**
   * Setter for columns to convert.
   *
   * @param columns Comma separated list of columns
   * @return this
   */
  public DictionaryToRawIndexConverter setColumns(String columns) {
    _columns = columns;
    return this;
  }

  /**
   * Setter for {@link #_overwrite}
   * When set to true, already existing output directory is overwritten.
   *
   * @param overwrite True for overwriting existing output dir, False otherwise
   * @return this
   */
  public DictionaryToRawIndexConverter setOverwrite(boolean overwrite) {
    _overwrite = overwrite;
    return this;
  }

  /**
   * Method to perform the conversion for a set of segments in the {@link #_dataDir}
   *
   * @return True if successful, False otherwise
   * @throws Exception
   */
  public boolean convert()
      throws Exception {
    if (_help) {
      printUsage();
      return true;
    }

    File dataDir = new File(_dataDir);
    File outputDir = new File(_outputDir);

    if (!dataDir.exists()) {
      LOGGER.error("Data directory '{}' does not exist.", _dataDir);
      return false;
    } else if (outputDir.exists()) {
      if (_overwrite) {
        LOGGER.info("Overwriting existing output directory '{}'", _outputDir);
        FileUtils.deleteQuietly(outputDir);
        outputDir = new File(_outputDir);
        outputDir.mkdir();
      } else {
        LOGGER.error("Output directory '{}' already exists, use -overwrite to overwrite", outputDir);
        return false;
      }
    }

    File[] segmentFiles = dataDir.listFiles();
    if (segmentFiles == null || segmentFiles.length == 0) {
      LOGGER.error("Empty data directory '{}'.", _dataDir);
      return false;
    }

    boolean ret = true;
    final File outDir = outputDir;
    ExecutorService executorService = Executors.newFixedThreadPool(_numThreads);
    for (final File segmentDir : segmentFiles) {
      executorService.execute(new Runnable() {
        @Override
        public void run() {
          try {
            convertSegment(segmentDir, _columns.split("\\s*,\\s*"), outDir, _compressOutput);
          } catch (Exception e) {
            LOGGER.error("Exception caught while converting segment {}", segmentDir.getName(), e);
            e.printStackTrace();
          }
        }
      });
    }

    executorService.shutdown();
    executorService.awaitTermination(1, TimeUnit.HOURS);
    return ret;
  }

  /**
   * This method converts the specified columns of the given segment from dictionary encoded
   * forward index to raw index without dictionary.
   *
   * @param segmentDir Segment directory
   * @param columns Columns to convert
   * @param outputDir Directory for writing output segment
   * @param compressOutput Tar/gzip the output segment
   * @return True if successful, False otherwise
   * @throws Exception
   */
  public boolean convertSegment(File segmentDir, String[] columns, File outputDir, boolean compressOutput)
      throws Exception {
    File newSegment;

    if (segmentDir.isFile()) {
      if (segmentDir.getName().endsWith(".tar.gz") || segmentDir.getName().endsWith(".tgz")) {
        LOGGER.info("Uncompressing input segment '{}'", segmentDir);
        newSegment = TarGzCompressionUtils.unTar(segmentDir, outputDir).get(0);
      } else {
        LOGGER.warn("Skipping non-segment file '{}'", segmentDir.getAbsoluteFile());
        return false;
      }
    } else {
      newSegment = new File(outputDir, segmentDir.getName());
      newSegment.mkdir();
      FileUtils.copyDirectory(segmentDir, newSegment);
    }

    IndexSegment segment = Loaders.IndexSegment.load(newSegment, ReadMode.mmap);
    for (String column : columns) {
      LOGGER.info("Converting column '{}' for segment '{}'.", column, segment.getSegmentName());
      convertOneColumn(segment, column, newSegment);
    }

    updateMetadata(newSegment, columns, _tableName);
    segment.destroy();

    if (compressOutput) {
      LOGGER.info("Compressing segment '{}'", newSegment);
      TarGzCompressionUtils.createTarGzOfDirectory(newSegment.getAbsolutePath(), newSegment.getAbsolutePath());
      FileUtils.deleteQuietly(newSegment);
    }
    return true;
  }

  /**
   * Helper method to update the metadata.properties for the converted segment.
   *
   * @param segmentDir Segment directory
   * @param columns Converted columns
   * @param tableName New table name to be written in the meta-data. Skipped if null.
   * @throws IOException
   * @throws ConfigurationException
   */
  private void updateMetadata(File segmentDir, String[] columns, String tableName)
      throws IOException, ConfigurationException {
    File metadataFile = new File(segmentDir, V1Constants.MetadataKeys.METADATA_FILE_NAME);
    PropertiesConfiguration properties = new PropertiesConfiguration(metadataFile);

    if (tableName != null) {
      properties.setProperty(V1Constants.MetadataKeys.Segment.TABLE_NAME, tableName);
    }

    for (String column : columns) {
      properties.setProperty(
          V1Constants.MetadataKeys.Column.getKeyFor(column, V1Constants.MetadataKeys.Column.HAS_DICTIONARY), false);
      properties.setProperty(
          V1Constants.MetadataKeys.Column.getKeyFor(column, V1Constants.MetadataKeys.Column.BITS_PER_ELEMENT), -1);
    }
    properties.save();
  }

  /**
   * Helper method to print usage at the command line interface.
   */
  private static void printUsage() {
    System.out.println("Usage: DictionaryTORawIndexConverter");
    for (Field field : ColumnarToStarTreeConverter.class.getDeclaredFields()) {

      if (field.isAnnotationPresent(Option.class)) {
        Option option = field.getAnnotation(Option.class);

        System.out.println(
            String.format("\t%-15s: %s (required=%s)", option.name(), option.usage(), option.required()));
      }
    }
  }

  /**
   * Helper method to perform conversion for the specific column.
   *
   * @param segment Input segment to convert
   * @param column Column to convert
   * @param newSegment Directory where raw index to be written
   * @throws IOException
   */
  private void convertOneColumn(IndexSegment segment, String column, File newSegment)
      throws IOException {
    DataSource dataSource = segment.getDataSource(column);
    Dictionary dictionary = dataSource.getDictionary();

    if (dictionary == null) {
      LOGGER.error("Column '{}' does not have dictionary, cannot convert to raw index.", column);
      return;
    }

    DataSourceMetadata dataSourceMetadata = dataSource.getDataSourceMetadata();
    if (!dataSourceMetadata.isSingleValue()) {
      LOGGER.error("Cannot convert multi-valued columns '{}'", column);
      return;
    }

    int totalDocs = segment.getSegmentMetadata().getTotalDocs();
    BlockSingleValIterator bvIter = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();

    FieldSpec.DataType dataType = dataSourceMetadata.getDataType();
    int lengthOfLongestEntry =
        (dataType == FieldSpec.DataType.STRING) ? getLengthOfLongestEntry(bvIter, dictionary) : -1;

    SingleValueRawIndexCreator rawIndexCreator =
        SegmentColumnarIndexCreator.getRawIndexCreatorForColumn(newSegment, column, dataType, totalDocs,
            lengthOfLongestEntry);

    int docId = 0;
    bvIter.reset();
    while (bvIter.hasNext()) {
      int dictId = bvIter.nextIntVal();
      Object value = dictionary.get(dictId);
      rawIndexCreator.index(docId++, value);

      if (docId % 1000000 == 0) {
        LOGGER.info("Converted {} records.", docId);
      }
    }
    rawIndexCreator.close();
    deleteForwardIndex(newSegment.getParentFile(), column, dataSourceMetadata.isSorted());
  }

  /**
   * Helper method to remove the forward index for the given column.
   *
   * @param segmentDir Segment directory from which to remove the forward index.
   * @param column Column for which to remove the index.
   * @param sorted True if column is sorted, False otherwise
   */
  private void deleteForwardIndex(File segmentDir, String column, boolean sorted) {
    File dictionaryFile = new File(segmentDir, (column + V1Constants.Dict.FILE_EXTENTION));
    FileUtils.deleteQuietly(dictionaryFile);

    String fwdIndexFileExtension = (sorted) ? V1Constants.Indexes.SORTED_FWD_IDX_FILE_EXTENTION
        : V1Constants.Indexes.UN_SORTED_SV_FWD_IDX_FILE_EXTENTION;
    File fwdIndexFile = new File(segmentDir, (column + fwdIndexFileExtension));
    FileUtils.deleteQuietly(fwdIndexFile);
  }

  /**
   * Helper method to get the length
   * @param bvIter Data source blockvalset iterator
   * @param dictionary Column dictionary
   * @return Length of longest entry
   */
  private int getLengthOfLongestEntry(BlockSingleValIterator bvIter, Dictionary dictionary) {
    int lengthOfLongestEntry = 0;

    bvIter.reset();
    while (bvIter.hasNext()) {
      int dictId = bvIter.nextIntVal();
      String value = (String) dictionary.get(dictId);
      lengthOfLongestEntry = Math.max(lengthOfLongestEntry, value.getBytes(UTF_8).length);
    }

    return lengthOfLongestEntry;
  }

  /**
   * Main method for the class.
   *
   * @param args Arguments for the converter
   * @throws Exception
   */
  public static void main(String[] args)
      throws Exception {
    DictionaryToRawIndexConverter converter = new DictionaryToRawIndexConverter();
    CmdLineParser parser = new CmdLineParser(converter);
    parser.parseArgument(args);
    converter.convert();
  }
}