/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.copy.writer; import gobblin.configuration.State; import gobblin.data.management.copy.CopyableFile; import gobblin.data.management.copy.FileAwareInputStream; import gobblin.util.io.StreamCopier; import gobblin.util.io.StreamUtils; import java.io.IOException; import java.io.InputStream; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.channels.WritableByteChannel; import java.util.zip.GZIPInputStream; import lombok.extern.slf4j.Slf4j; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; /** * An {@link FileAwareInputStreamDataWriter} to write archived {@link InputStream}s. The {@link #write(FileAwareInputStream)} * method receives a {@link GZIPInputStream} and converts it to a {@link TarArchiveInputStream}. Each * {@link TarArchiveEntry} is then written to the {@link FileSystem}. */ @Slf4j public class TarArchiveInputStreamDataWriter extends FileAwareInputStreamDataWriter { public TarArchiveInputStreamDataWriter(State state, int numBranches, int branchId) throws IOException { super(state, numBranches, branchId); } /** * Untars the passed in {@link FileAwareInputStream} to the task's staging directory. Uses the name of the root * {@link TarArchiveEntry} in the stream as the directory name for the untarred file. The method also commits the data * by moving the file from staging to output directory. * * @see gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter#write(gobblin.data.management.copy.FileAwareInputStream) */ @Override public void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile) throws IOException { this.closer.register(inputStream); TarArchiveInputStream tarIn = new TarArchiveInputStream(inputStream); final ReadableByteChannel inputChannel = Channels.newChannel(tarIn); TarArchiveEntry tarEntry; // flush the first entry in the tar, which is just the root directory tarEntry = tarIn.getNextTarEntry(); String tarEntryRootName = StringUtils.remove(tarEntry.getName(), Path.SEPARATOR); log.info("Unarchiving at " + writeAt); try { while ((tarEntry = tarIn.getNextTarEntry()) != null) { // the API tarEntry.getName() is misleading, it is actually the path of the tarEntry in the tar file String newTarEntryPath = tarEntry.getName().replace(tarEntryRootName, writeAt.getName()); Path tarEntryStagingPath = new Path(writeAt.getParent(), newTarEntryPath); if (tarEntry.isDirectory() && !this.fs.exists(tarEntryStagingPath)) { this.fs.mkdirs(tarEntryStagingPath); } else if (!tarEntry.isDirectory()) { FSDataOutputStream out = this.fs.create(tarEntryStagingPath, true); final WritableByteChannel outputChannel = Channels.newChannel(out); try { StreamCopier copier = new StreamCopier(inputChannel, outputChannel); if (isInstrumentationEnabled()) { copier.withCopySpeedMeter(this.copySpeedMeter); } this.bytesWritten.addAndGet(copier.copy()); if (isInstrumentationEnabled()) { log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate()); } else { log.info("File {} copied.", copyableFile.getOrigin().getPath()); } } finally { out.close(); outputChannel.close(); } } } } finally { tarIn.close(); inputChannel.close(); inputStream.close(); } } }