/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io; import java.io.IOException; import java.util.NoSuchElementException; import javax.annotation.Nullable; import org.apache.beam.sdk.annotations.Experimental; import org.apache.beam.sdk.io.fs.MatchResult.Metadata; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; /** * A {@code BlockBasedSource} is a {@link FileBasedSource} where a file consists of blocks of * records. * * <p>{@code BlockBasedSource} should be derived from when a file format does not support efficient * seeking to a record in the file, but can support efficient seeking to a block. Alternatively, * records in the file cannot be offset-addressed, but blocks can (it is not possible to say * that record {code i} starts at offset {@code m}, but it is possible to say that block {@code j} * starts at offset {@code n}). * * <p>The records that will be read from a {@code BlockBasedSource} that corresponds to a subrange * of a file {@code [startOffset, endOffset)} are those records such that the record is contained in * a block that starts at offset {@code i}, where {@code i >= startOffset} and * {@code i < endOffset}. In other words, a record will be read from the source if its first byte is * contained in a block that begins within the range described by the source. * * <p>This entails that it is possible to determine the start offsets of all blocks in a file. * * <p>Progress reporting for reading from a {@code BlockBasedSource} is inaccurate. A {@link * BlockBasedReader} reports its current offset as {@code (offset of current block) + (current block * size) * (fraction of block consumed)}. However, only the offset of the current block is required * to be accurately reported by subclass implementations. As such, in the worst case, the current * offset is only updated at block boundaries. * * <p>{@code BlockBasedSource} supports dynamic splitting. However, because records in a {@code * BlockBasedSource} are not required to have offsets and progress reporting is inaccurate, {@code * BlockBasedReader} only supports splitting at block boundaries. * In other words, {@link BlockBasedReader#atSplitPoint} returns true iff the current record is the * first record in a block. See {@link FileBasedSource.FileBasedReader} for discussion about split * points. * * @param <T> The type of records to be read from the source. */ @Experimental(Experimental.Kind.SOURCE_SINK) public abstract class BlockBasedSource<T> extends FileBasedSource<T> { /** * Creates a {@code BlockBasedSource} based on a file name or pattern. Subclasses must call this * constructor when creating a {@code BlockBasedSource} for a file pattern. See * {@link FileBasedSource} for more information. */ public BlockBasedSource(String fileOrPatternSpec, long minBundleSize) { super(StaticValueProvider.of(fileOrPatternSpec), minBundleSize); } /** * Creates a {@code BlockBasedSource} for a single file. Subclasses must call this constructor * when implementing {@link BlockBasedSource#createForSubrangeOfFile}. See documentation in * {@link FileBasedSource}. */ public BlockBasedSource(Metadata metadata, long minBundleSize, long startOffset, long endOffset) { super(metadata, minBundleSize, startOffset, endOffset); } /** * Creates a {@code BlockBasedSource} for the specified range in a single file. */ @Override protected abstract BlockBasedSource<T> createForSubrangeOfFile( Metadata metadata, long start, long end); /** * Creates a {@code BlockBasedReader}. */ @Override protected abstract BlockBasedReader<T> createSingleFileReader(PipelineOptions options); /** * A {@code Block} represents a block of records that can be read. */ @Experimental(Experimental.Kind.SOURCE_SINK) protected abstract static class Block<T> { /** * Returns the current record. */ public abstract T getCurrentRecord(); /** * Reads the next record from the block and returns true iff one exists. */ public abstract boolean readNextRecord() throws IOException; /** * Returns the fraction of the block already consumed, if possible, as a value in * {@code [0, 1]}. It should not include the current record. Successive results from this method * must be monotonically increasing. * * <p>If it is not possible to compute the fraction of the block consumed this method may * return zero. For example, when the total number of records in the block is unknown. */ public abstract double getFractionOfBlockConsumed(); } /** * A {@code Reader} that reads records from a {@link BlockBasedSource}. If the source is a * subrange of a file, the blocks that will be read by this reader are those such that the first * byte of the block is within the range {@code [start, end)}. */ @Experimental(Experimental.Kind.SOURCE_SINK) protected abstract static class BlockBasedReader<T> extends FileBasedReader<T> { private boolean atSplitPoint; protected BlockBasedReader(BlockBasedSource<T> source) { super(source); } /** * Read the next block from the input. */ public abstract boolean readNextBlock() throws IOException; /** * Returns the current block (the block that was read by the last successful call to * {@link BlockBasedReader#readNextBlock}). May return null initially, or if no block has been * successfully read. */ @Nullable public abstract Block<T> getCurrentBlock(); /** * Returns the size of the current block in bytes as it is represented in the underlying file, * if possible. This method may return {@code 0} if the size of the current block is unknown. * * <p>The size returned by this method must be such that for two successive blocks A and B, * {@code offset(A) + size(A) <= offset(B)}. If this is not satisfied, the progress reported * by the {@code BlockBasedReader} will be non-monotonic and will interfere with the quality * (but not correctness) of dynamic work rebalancing. * * <p>This method and {@link Block#getFractionOfBlockConsumed} are used to provide an estimate * of progress within a block ({@code getCurrentBlock().getFractionOfBlockConsumed() * * getCurrentBlockSize()}). It is acceptable for the result of this computation to be {@code 0}, * but progress estimation will be inaccurate. */ public abstract long getCurrentBlockSize(); /** * Returns the largest offset such that starting to read from that offset includes the current * block. */ public abstract long getCurrentBlockOffset(); @Override public final T getCurrent() throws NoSuchElementException { Block<T> currentBlock = getCurrentBlock(); if (currentBlock == null) { throw new NoSuchElementException( "No block has been successfully read from " + getCurrentSource()); } return currentBlock.getCurrentRecord(); } /** * Returns true if the reader is at a split point. A {@code BlockBasedReader} is at a split * point if the current record is the first record in a block. In other words, split points * are block boundaries. */ @Override protected boolean isAtSplitPoint() { return atSplitPoint; } /** * Reads the next record from the {@link #getCurrentBlock() current block} if * possible. Will call {@link #readNextBlock()} to advance to the next block if not. * * <p>The first record read from a block is treated as a split point. */ @Override protected final boolean readNextRecord() throws IOException { atSplitPoint = false; while (getCurrentBlock() == null || !getCurrentBlock().readNextRecord()) { if (!readNextBlock()) { return false; } // The first record in a block is a split point. atSplitPoint = true; } return true; } @Override @Nullable public Double getFractionConsumed() { if (!isStarted()) { return 0.0; } if (isDone()) { return 1.0; } FileBasedSource<T> source = getCurrentSource(); if (source.getEndOffset() == Long.MAX_VALUE) { // Unknown end offset, so we cannot tell. return null; } long currentBlockOffset = getCurrentBlockOffset(); long startOffset = source.getStartOffset(); long endOffset = source.getEndOffset(); double fractionAtBlockStart = ((double) (currentBlockOffset - startOffset)) / (endOffset - startOffset); double fractionAtBlockEnd = ((double) (currentBlockOffset + getCurrentBlockSize() - startOffset) / (endOffset - startOffset)); double blockFraction = getCurrentBlock().getFractionOfBlockConsumed(); return Math.min( 1.0, fractionAtBlockStart + blockFraction * (fractionAtBlockEnd - fractionAtBlockStart)); } @Override protected long getCurrentOffset() { return getCurrentBlockOffset(); } } }