/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import com.google.common.io.ByteStreams; import com.google.common.primitives.Ints; import java.io.IOException; import java.io.InputStream; import java.io.PushbackInputStream; import java.io.Serializable; import java.nio.ByteBuffer; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.util.NoSuchElementException; import java.util.zip.GZIPInputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import javax.annotation.concurrent.GuardedBy; import org.apache.beam.sdk.annotations.Experimental; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.io.fs.MatchResult.Metadata; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.joda.time.Instant; /** * A Source that reads from compressed files. A {@code CompressedSources} wraps a delegate * {@link FileBasedSource} that is able to read the decompressed file format. * * <p>For example, use the following to read from a gzip-compressed file-based source: * * <pre> {@code * FileBasedSource<T> mySource = ...; * PCollection<T> collection = p.apply(Read.from(CompressedSource * .from(mySource) * .withDecompression(CompressedSource.CompressionMode.GZIP))); * } </pre> * * <p>Supported compression algorithms are {@link CompressionMode#GZIP}, * {@link CompressionMode#BZIP2}, {@link CompressionMode#ZIP} and {@link CompressionMode#DEFLATE}. * User-defined compression types are supported by implementing * {@link DecompressingChannelFactory}. * * <p>By default, the compression algorithm is selected from those supported in * {@link CompressionMode} based on the file name provided to the source, namely * {@code ".bz2"} indicates {@link CompressionMode#BZIP2}, {@code ".gz"} indicates * {@link CompressionMode#GZIP}, {@code ".zip"} indicates {@link CompressionMode#ZIP} and * {@code ".deflate"} indicates {@link CompressionMode#DEFLATE}. If the file name does not match * any of the supported * algorithms, it is assumed to be uncompressed data. * * @param <T> The type to read from the compressed file. */ @Experimental(Experimental.Kind.SOURCE_SINK) public class CompressedSource<T> extends FileBasedSource<T> { /** * Factory interface for creating channels that decompress the content of an underlying channel. */ public interface DecompressingChannelFactory extends Serializable { /** * Given a channel, create a channel that decompresses the content read from the channel. */ ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) throws IOException; } /** * Factory interface for creating channels that decompress the content of an underlying channel, * based on both the channel and the file name. */ private interface FileNameBasedDecompressingChannelFactory extends DecompressingChannelFactory { /** * Given a channel, create a channel that decompresses the content read from the channel. */ ReadableByteChannel createDecompressingChannel(String fileName, ReadableByteChannel channel) throws IOException; /** * Given a file name, returns true if the file name matches any supported compression * scheme. */ boolean isCompressed(String fileName); } /** * Default compression types supported by the {@code CompressedSource}. */ public enum CompressionMode implements DecompressingChannelFactory { /** * Reads a byte channel assuming it is compressed with gzip. */ GZIP { @Override public boolean matches(String fileName) { return fileName.toLowerCase().endsWith(".gz"); } @Override public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) throws IOException { // Determine if the input stream is gzipped. The input stream returned from the // GCS connector may already be decompressed; GCS does this based on the // content-encoding property. PushbackInputStream stream = new PushbackInputStream(Channels.newInputStream(channel), 2); byte[] headerBytes = new byte[2]; int bytesRead = ByteStreams.read( stream /* source */, headerBytes /* dest */, 0 /* offset */, 2 /* len */); stream.unread(headerBytes, 0, bytesRead); if (bytesRead >= 2) { byte zero = 0x00; int header = Ints.fromBytes(zero, zero, headerBytes[1], headerBytes[0]); if (header == GZIPInputStream.GZIP_MAGIC) { return Channels.newChannel(new GzipCompressorInputStream(stream, true)); } } return Channels.newChannel(stream); } }, /** * Reads a byte channel assuming it is compressed with bzip2. */ BZIP2 { @Override public boolean matches(String fileName) { return fileName.toLowerCase().endsWith(".bz2"); } @Override public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) throws IOException { return Channels.newChannel( new BZip2CompressorInputStream(Channels.newInputStream(channel))); } }, /** * Reads a byte channel assuming it is compressed with zip. * If the zip file contains multiple entries, files in the zip are concatenated all together. */ ZIP { @Override public boolean matches(String fileName) { return fileName.toLowerCase().endsWith(".zip"); } public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) throws IOException { FullZipInputStream zip = new FullZipInputStream(Channels.newInputStream(channel)); return Channels.newChannel(zip); } }, /** * Reads a byte channel assuming it is compressed with deflate. */ DEFLATE { @Override public boolean matches(String fileName) { return fileName.toLowerCase().endsWith(".deflate"); } public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) throws IOException { return Channels.newChannel( new DeflateCompressorInputStream(Channels.newInputStream(channel))); } }; /** * Extend of {@link ZipInputStream} to automatically read all entries in the zip. */ private static class FullZipInputStream extends InputStream { private ZipInputStream zipInputStream; private ZipEntry currentEntry; public FullZipInputStream(InputStream is) throws IOException { super(); zipInputStream = new ZipInputStream(is); currentEntry = zipInputStream.getNextEntry(); } @Override public int read() throws IOException { int result = zipInputStream.read(); while (result == -1) { currentEntry = zipInputStream.getNextEntry(); if (currentEntry == null) { return -1; } else { result = zipInputStream.read(); } } return result; } @Override public int read(byte[] b, int off, int len) throws IOException { int result = zipInputStream.read(b, off, len); while (result == -1) { currentEntry = zipInputStream.getNextEntry(); if (currentEntry == null) { return -1; } else { result = zipInputStream.read(b, off, len); } } return result; } } /** * Returns {@code true} if the given file name implies that the contents are compressed * according to the compression embodied by this factory. */ public abstract boolean matches(String fileName); @Override public abstract ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) throws IOException; } /** * Reads a byte channel detecting compression according to the file name. If the filename * is not any other known {@link CompressionMode}, it is presumed to be uncompressed. */ private static class DecompressAccordingToFilename implements FileNameBasedDecompressingChannelFactory { @Override public ReadableByteChannel createDecompressingChannel( String fileName, ReadableByteChannel channel) throws IOException { for (CompressionMode type : CompressionMode.values()) { if (type.matches(fileName)) { return type.createDecompressingChannel(channel); } } // Uncompressed return channel; } @Override public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) { throw new UnsupportedOperationException( String.format("%s does not support createDecompressingChannel(%s) but only" + " createDecompressingChannel(%s,%s)", getClass().getSimpleName(), String.class.getSimpleName(), ReadableByteChannel.class.getSimpleName(), ReadableByteChannel.class.getSimpleName())); } @Override public boolean isCompressed(String fileName) { for (CompressionMode type : CompressionMode.values()) { if (type.matches(fileName)) { return true; } } return false; } } private final FileBasedSource<T> sourceDelegate; private final DecompressingChannelFactory channelFactory; /** * Creates a {@code CompressedSource} from an underlying {@code FileBasedSource}. The type * of compression used will be based on the file name extension unless explicitly * configured via {@link CompressedSource#withDecompression}. */ public static <T> CompressedSource<T> from(FileBasedSource<T> sourceDelegate) { return new CompressedSource<>(sourceDelegate, new DecompressAccordingToFilename()); } /** * Return a {@code CompressedSource} that is like this one but will decompress its underlying file * with the given {@link DecompressingChannelFactory}. */ public CompressedSource<T> withDecompression(DecompressingChannelFactory channelFactory) { return new CompressedSource<>(this.sourceDelegate, channelFactory); } /** * Creates a {@code CompressedSource} from a delegate file based source and a decompressing * channel factory. */ private CompressedSource( FileBasedSource<T> sourceDelegate, DecompressingChannelFactory channelFactory) { super(sourceDelegate.getFileOrPatternSpecProvider(), Long.MAX_VALUE); this.sourceDelegate = sourceDelegate; this.channelFactory = channelFactory; } /** * Creates a {@code CompressedSource} for an individual file. Used by {@link * CompressedSource#createForSubrangeOfFile}. */ private CompressedSource(FileBasedSource<T> sourceDelegate, DecompressingChannelFactory channelFactory, Metadata metadata, long minBundleSize, long startOffset, long endOffset) { super(metadata, minBundleSize, startOffset, endOffset); this.sourceDelegate = sourceDelegate; this.channelFactory = channelFactory; boolean splittable; try { splittable = isSplittable(); } catch (Exception e) { throw new RuntimeException("Failed to determine if the source is splittable", e); } checkArgument( splittable || startOffset == 0, "CompressedSources must start reading at offset 0. Requested offset: %s", startOffset); } /** * Validates that the delegate source is a valid source and that the channel factory is not null. */ @Override public void validate() { super.validate(); checkNotNull(sourceDelegate); sourceDelegate.validate(); checkNotNull(channelFactory); } /** * Creates a {@code CompressedSource} for a subrange of a file. Called by superclass to create a * source for a single file. */ @Override protected FileBasedSource<T> createForSubrangeOfFile(Metadata metadata, long start, long end) { return new CompressedSource<>(sourceDelegate.createForSubrangeOfFile(metadata, start, end), channelFactory, metadata, sourceDelegate.getMinBundleSize(), start, end); } /** * Determines whether a single file represented by this source is splittable. Returns true * if we are using the default decompression factory and and it determines * from the requested file name that the file is not compressed. */ @Override protected final boolean isSplittable() throws Exception { if (channelFactory instanceof FileNameBasedDecompressingChannelFactory) { FileNameBasedDecompressingChannelFactory fileNameBasedChannelFactory = (FileNameBasedDecompressingChannelFactory) channelFactory; return !fileNameBasedChannelFactory.isCompressed(getFileOrPatternSpec()) && sourceDelegate.isSplittable(); } return false; } /** * Creates a {@code FileBasedReader} to read a single file. * * <p>Uses the delegate source to create a single file reader for the delegate source. * Utilizes the default decompression channel factory to not wrap the source reader * if the file name does not represent a compressed file allowing for splitting of * the source. */ @Override protected final FileBasedReader<T> createSingleFileReader(PipelineOptions options) { if (channelFactory instanceof FileNameBasedDecompressingChannelFactory) { FileNameBasedDecompressingChannelFactory fileNameBasedChannelFactory = (FileNameBasedDecompressingChannelFactory) channelFactory; if (!fileNameBasedChannelFactory.isCompressed(getFileOrPatternSpec())) { return sourceDelegate.createSingleFileReader(options); } } return new CompressedReader<T>( this, sourceDelegate.createSingleFileReader(options)); } @Override public void populateDisplayData(DisplayData.Builder builder) { // We explicitly do not register base-class data, instead we use the delegate inner source. builder .include("source", sourceDelegate) .add(DisplayData.item("source", sourceDelegate.getClass()) .withLabel("Read Source")); if (channelFactory instanceof Enum) { // GZIP, BZIP, ZIP and DEFLATE are implemented as enums; Enum classes are anonymous, so use // the .name() value instead builder.add(DisplayData.item("compressionMode", ((Enum) channelFactory).name()) .withLabel("Compression Mode")); } else { builder.add(DisplayData.item("compressionMode", channelFactory.getClass()) .withLabel("Compression Mode")); } } /** * Returns the delegate source's default output coder. */ @Override public final Coder<T> getDefaultOutputCoder() { return sourceDelegate.getDefaultOutputCoder(); } public final DecompressingChannelFactory getChannelFactory() { return channelFactory; } /** * Reader for a {@link CompressedSource}. Decompresses its input and uses a delegate * reader to read elements from the decompressed input. * @param <T> The type of records read from the source. */ public static class CompressedReader<T> extends FileBasedReader<T> { private final FileBasedReader<T> readerDelegate; private final CompressedSource<T> source; private final Object progressLock = new Object(); @GuardedBy("progressLock") private int numRecordsRead; @GuardedBy("progressLock") private CountingChannel channel; /** * Create a {@code CompressedReader} from a {@code CompressedSource} and delegate reader. */ public CompressedReader(CompressedSource<T> source, FileBasedReader<T> readerDelegate) { super(source); this.source = source; this.readerDelegate = readerDelegate; } /** * Gets the current record from the delegate reader. */ @Override public T getCurrent() throws NoSuchElementException { return readerDelegate.getCurrent(); } @Override public boolean allowsDynamicSplitting() { return false; } @Override public final long getSplitPointsConsumed() { synchronized (progressLock) { return (isDone() && numRecordsRead > 0) ? 1 : 0; } } @Override public final long getSplitPointsRemaining() { return isDone() ? 0 : 1; } /** * Returns true only for the first record; compressed sources cannot be split. */ @Override protected final boolean isAtSplitPoint() { // We have to return true for the first record, but not for the state before reading it, // and not for the state after reading any other record. Hence == rather than >= or <=. // This is required because FileBasedReader is intended for readers that can read a range // of offsets in a file and where the range can be split in parts. CompressedReader, // however, is a degenerate case because it cannot be split, but it has to satisfy the // semantics of offsets and split points anyway. synchronized (progressLock) { return numRecordsRead == 1; } } private static class CountingChannel implements ReadableByteChannel { long count; private final ReadableByteChannel inner; public CountingChannel(ReadableByteChannel inner, long count) { this.inner = inner; this.count = count; } public long getCount() { return count; } @Override public int read(ByteBuffer dst) throws IOException { int bytes = inner.read(dst); if (bytes > 0) { // Avoid the -1 from EOF. count += bytes; } return bytes; } @Override public boolean isOpen() { return inner.isOpen(); } @Override public void close() throws IOException { inner.close(); } } /** * Creates a decompressing channel from the input channel and passes it to its delegate reader's * {@link FileBasedReader#startReading(ReadableByteChannel)}. */ @Override protected final void startReading(ReadableByteChannel channel) throws IOException { synchronized (progressLock) { this.channel = new CountingChannel(channel, getCurrentSource().getStartOffset()); channel = this.channel; } if (source.getChannelFactory() instanceof FileNameBasedDecompressingChannelFactory) { FileNameBasedDecompressingChannelFactory channelFactory = (FileNameBasedDecompressingChannelFactory) source.getChannelFactory(); readerDelegate.startReading(channelFactory.createDecompressingChannel( getCurrentSource().getFileOrPatternSpec(), channel)); } else { readerDelegate.startReading(source.getChannelFactory().createDecompressingChannel( channel)); } } /** * Reads the next record via the delegate reader. */ @Override protected final boolean readNextRecord() throws IOException { if (!readerDelegate.readNextRecord()) { return false; } synchronized (progressLock) { ++numRecordsRead; } return true; } // Unsplittable: returns the offset in the input stream that has been read by the input. // these positions are likely to be coarse-grained (in the event of buffering) and // over-estimates (because they reflect the number of bytes read to produce an element, not its // start) but both of these provide better data than e.g., reporting the start of the file. @Override protected final long getCurrentOffset() throws NoSuchElementException { synchronized (progressLock) { if (numRecordsRead <= 1) { // Since the first record is at a split point, it should start at the beginning of the // file. This avoids the bad case where the decompressor read the entire file, which // would cause the file to be treated as empty when returning channel.getCount() as it // is outside the valid range. return 0; } return channel.getCount(); } } @Override public Instant getCurrentTimestamp() throws NoSuchElementException { return readerDelegate.getCurrentTimestamp(); } } }