/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.tools.segment.converter;

import com.linkedin.pinot.common.utils.TarGzCompressionUtils;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.tools.AbstractBaseCommand;
import com.linkedin.pinot.tools.Command;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * The <code>PinotSegmentConvertCommand</code> class provides tools to convert Pinot segments into another format.
 * <p>Currently supports converting Pinot segments to:
 * <ul>
 *   <li>AVRO format</li>
 *   <li>CSV format</li>
 *   <li>JSON format</li>
 * </ul>
 */
@SuppressWarnings("FieldCanBeLocal")
public class PinotSegmentConvertCommand extends AbstractBaseCommand implements Command {
  private static final Logger LOGGER = LoggerFactory.getLogger(PinotSegmentConvertCommand.class);
  private static final String TEMP_DIR_NAME = "temp";

  @Option(name = "-dataDir", required = true, metaVar = "<String>",
      usage = "Path to data directory containing Pinot segments.")
  private String _dataDir;

  @Option(name = "-outputDir", required = true, metaVar = "<String>", usage = "Path to output directory.")
  private String _outputDir;

  @Option(name = "-outputFormat", required = true, metaVar = "<String>",
      usage = "Format to convert to (AVRO/CSV/JSON).")
  private String _outputFormat;

  @Option(name = "-csvDelimiter", required = false, metaVar = "<char>", usage = "CSV delimiter (default ',').")
  private char _csvDelimiter = ',';

  @Option(name = "-csvListDelimiter", required = false, metaVar = "<char>",
      usage = "CSV list delimiter for multi-value columns (default ';').")
  private char _csvListDelimiter = ';';

  @Option(name = "-csvWithHeader", required = false, metaVar = "<boolean>",
      usage = "Print CSV header (default false).")
  private boolean _csvWithHeader;

  @Option(name = "-overwrite", required = false, metaVar = "<boolean>",
      usage = "Overwrite the existing file (default false).")
  private boolean _overwrite;

  @Option(name = "-help", required = false, help = true, aliases = {"-h", "--h", "--help"},
      usage = "Print this message.")
  private boolean _help;

  @Override
  public boolean execute() throws Exception {
    // Make sure the output directory is empty.
    File outputDir = new File(_outputDir);
    if (outputDir.exists()) {
      if (_overwrite) {
        if (!FileUtils.deleteQuietly(outputDir)) {
          throw new RuntimeException(
              "Output directory: " + outputDir.getAbsolutePath() + " already exists and cannot be deleted.");
        }
      } else {
        throw new RuntimeException(
            "Output directory: " + outputDir.getAbsolutePath() + " already exists and overwrite flag is not set.");
      }
    }
    if (!outputDir.mkdirs()) {
      throw new RuntimeException("Output directory: " + outputDir.getAbsolutePath() + " cannot be created.");
    }

    File tempDir = new File(outputDir, TEMP_DIR_NAME);
    try {
      // Add all segments to the segment path map.
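      // The map is keyed by segment name; the value is the absolute path to the segment directory.
      // Plain directories under dataDir are taken as-is (uncompressed segments), while .tar.gz/.tgz
      // files are first extracted into the temporary directory.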
      Map<String, String> segmentPath = new HashMap<>();
      File dataDir = new File(_dataDir);
      File[] files = dataDir.listFiles();
      if (files == null || files.length == 0) {
        throw new RuntimeException("Data directory does not contain any files.");
      }
      for (File file : files) {
        String fileName = file.getName();
        if (file.isDirectory()) {
          // Uncompressed segment.
          if (segmentPath.containsKey(fileName)) {
            throw new RuntimeException("Multiple segments with the same segment name: " + fileName);
          }
          segmentPath.put(fileName, file.getAbsolutePath());
        } else if (fileName.toLowerCase().endsWith(".tar.gz") || fileName.toLowerCase().endsWith(".tgz")) {
          // Compressed segment.
          File segment = TarGzCompressionUtils.unTar(file, new File(tempDir, fileName)).get(0);
          String segmentName = segment.getName();
          if (segmentPath.containsKey(segmentName)) {
            throw new RuntimeException("Multiple segments with the same segment name: " + segmentName);
          }
          segmentPath.put(segmentName, segment.getAbsolutePath());
        }
      }

      // Do the conversion according to the output format.
      for (Map.Entry<String, String> entry : segmentPath.entrySet()) {
        String segmentName = entry.getKey();
        String inputPath = entry.getValue();
        String outputPath = new File(outputDir, segmentName).getAbsolutePath();
        switch (FileFormat.valueOf(_outputFormat.toUpperCase())) {
          case AVRO:
            outputPath += ".avro";
            new PinotSegmentToAvroConverter(inputPath, outputPath).convert();
            break;
          case CSV:
            outputPath += ".csv";
            new PinotSegmentToCsvConverter(inputPath, outputPath, _csvDelimiter, _csvListDelimiter, _csvWithHeader)
                .convert();
            break;
          case JSON:
            outputPath += ".json";
            new PinotSegmentToJsonConverter(inputPath, outputPath).convert();
            break;
          default:
            throw new RuntimeException("Unsupported conversion to file format: " + _outputFormat);
        }
        LOGGER.info("Finished converting segment: {} into file: {}", segmentName, outputPath);
      }
      return true;
    } finally {
      FileUtils.deleteQuietly(tempDir);
    }
  }

  @Override
  public String description() {
    return "Convert Pinot segments to another format such as AVRO/CSV/JSON.";
  }

  @Override
  public boolean getHelp() {
    return _help;
  }
}