/*
* Copyright 2014 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.cli.commands;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.hadoop.io.IOUtils;
import org.kitesdk.cli.commands.tarimport.avro.TarFileEntry;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.View;
import org.slf4j.Logger;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
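
/**
 * Command that imports the contents of a tarball (optionally gzip- or
 * bzip2-compressed) into a dataset, storing each file as a
 * {@link TarFileEntry} record of filename and file content.
 */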
@Parameters(commandDescription = "Import files in tarball into a Dataset")
public class TarImportCommand extends BaseDatasetCommand {

  protected enum CompressionType {
    NONE,
    GZIP,
    BZIP2
  }
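
  // Values accepted by --compression; the empty string means "detect the
  // codec from the file extension".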
  private static final List<String> SUPPORTED_TAR_COMPRESSION_TYPES =
      Lists.newArrayList("", "none", "gzip", "bzip2");
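
  // Fallback of 128 MB, used when dfs.blocksize is absent from the Hadoop
  // configuration.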
  private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024;

  public TarImportCommand(Logger console) {
    super(console);
  }

  @Parameter(description = "<tar path> <dataset URI>")
  List<String> targets;

  @Parameter(names = "--compression",
      description = "Override compression type (none, gzip, bzip2)")
  String compressionType = "";
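
  /**
   * Reads the tar archive named by the first argument and writes one record
   * per file into the dataset named by the second argument.
   */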
  @Override
  public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 2,
        "Tar path and target dataset URI are required.");
    Preconditions.checkArgument(
        SUPPORTED_TAR_COMPRESSION_TYPES.contains(compressionType),
        "Compression type " + compressionType + " is not supported");

    String source = targets.get(0);
    String datasetUri = targets.get(1);
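
    // The block size is only used to warn about entries too large to fit in
    // a single HDFS block (see below).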
    long blockSize = getConf().getLong("dfs.blocksize", DEFAULT_BLOCK_SIZE);

    int success = 0;
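
    // Load the target dataset if it already exists; otherwise create it with
    // a schema derived from the TarFileEntry class.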
    View<TarFileEntry> targetDataset;
    if (Datasets.exists(datasetUri)) {
      console.debug("Using existing dataset: {}", datasetUri);
      targetDataset = Datasets.load(datasetUri, TarFileEntry.class);
    } else {
      console.info("Creating new dataset: {}", datasetUri);
      DatasetDescriptor.Builder descriptorBuilder =
          new DatasetDescriptor.Builder();
      descriptorBuilder.format(Formats.AVRO);
      descriptorBuilder.schema(TarFileEntry.class);
      targetDataset = Datasets.create(datasetUri,
          descriptorBuilder.build(), TarFileEntry.class);
    }
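
    // A single writer receives every record; it is closed in the finally
    // block below, together with the tar stream.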
    DatasetWriter<TarFileEntry> writer = targetDataset.newWriter();

    // Create a Tar input stream wrapped in appropriate decompressor
    // TODO: Enhancement would be to use native compression libs
    TarArchiveInputStream tis;
    CompressionType tarCompressionType = CompressionType.NONE;
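
    // With no --compression override, infer the codec from the extension.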
    if (compressionType.isEmpty()) {
      if (source.endsWith(".tar")) {
        tarCompressionType = CompressionType.NONE;
      } else if (source.endsWith(".tar.gz")) {
        tarCompressionType = CompressionType.GZIP;
      } else if (source.endsWith(".tar.bz2")) {
        tarCompressionType = CompressionType.BZIP2;
      }
    } else if (compressionType.equals("gzip")) {
      tarCompressionType = CompressionType.GZIP;
    } else if (compressionType.equals("bzip2")) {
      tarCompressionType = CompressionType.BZIP2;
    } else {
      tarCompressionType = CompressionType.NONE;
    }

    console.info("Using {} compression", tarCompressionType);

    switch (tarCompressionType) {
      case GZIP:
        tis = new TarArchiveInputStream(
            new GzipCompressorInputStream(open(source)));
        break;
      case BZIP2:
        tis = new TarArchiveInputStream(
            new BZip2CompressorInputStream(open(source)));
        break;
      case NONE:
      default:
        tis = new TarArchiveInputStream(open(source));
    }
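
    // Walk the archive sequentially; directory entries carry no file content
    // and are skipped.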
    TarArchiveEntry entry;
    try {
      int count = 0;
      while ((entry = tis.getNextTarEntry()) != null) {
        if (!entry.isDirectory()) {
          long size = entry.getSize();
          if (size >= blockSize) {
            console.warn("Entry \"{}\" (size {}) is larger than the " +
                "HDFS block size of {}. This may result in remote block reads",
                new Object[] { entry.getName(), size, blockSize });
          }
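          // Each entry is buffered fully in memory; the (int) cast caps
          // importable entry sizes at Integer.MAX_VALUE bytes.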
          byte[] buf = new byte[(int) size];
          try {
            IOUtils.readFully(tis, buf, 0, (int) size);
          } catch (IOException e) {
            console.error("Did not read entry {} successfully (entry size {})",
                entry.getName(), size);
            success = 1;
            throw e;
          }
          writer.write(
              TarFileEntry.newBuilder().setFilename(entry.getName())
                  .setFilecontent(ByteBuffer.wrap(buf)).build()
          );
          count++;
        }
      }
      console.info("Added {} records to \"{}\"", count,
          targetDataset.getDataset().getName());
    } finally {
      IOUtils.closeStream(writer);
      IOUtils.closeStream(tis);
    }

    return success;
  }
  @Override
  public List<String> getExamples() {
    return Lists.newArrayList(
        "# Copy the contents of sample.tar.gz to HDFS dataset \"sample\"",
        "path/to/sample.tar.gz dataset:hdfs:/path/to/sample",
    );
  }
}