/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.beam.sdk.io.range.OffsetRangeTracker;
import org.apache.beam.sdk.io.range.RangeTracker;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link BoundedSource} that uses offsets to define starting and ending positions.
 *
 * <p>{@link OffsetBasedSource} is a common base class for all bounded sources where the input can
 * be represented as a single range, and an input can be efficiently processed in parallel by
 * splitting the range into a set of disjoint ranges whose union is the original range. This class
 * should be used for sources that can be cheaply read starting at any given offset.
 * {@link OffsetBasedSource} stores the range and implements splitting into bundles.
 *
 * <p>Extend {@link OffsetBasedSource} to implement your own offset-based custom source.
 * {@link FileBasedSource}, which is a subclass of this, adds additional functionality useful for
 * custom sources that are based on files. If possible, implementors should start from
 * {@link FileBasedSource} instead of {@link OffsetBasedSource}.
 *
 * <p>Consult {@link RangeTracker} for important semantics common to all sources defined by a range
 * of positions of a certain type, including the semantics of split points
 * ({@link OffsetBasedReader#isAtSplitPoint}).
 *
 * @param <T> Type of records represented by the source.
 * @see BoundedSource
 * @see FileBasedSource
 * @see RangeTracker
 */
public abstract class OffsetBasedSource<T> extends BoundedSource<T> {
  private final long startOffset;
  private final long endOffset;
  private final long minBundleSize;

  /**
   * Creates a source over the offset range {@code [startOffset, endOffset)}.
   *
   * <p>The arguments are stored as given; they are only checked when {@link #validate} is called.
   *
   * @param startOffset starting offset (inclusive) of the source. Must be non-negative.
   *
   * @param endOffset ending offset (exclusive) of the source. Use {@link Long#MAX_VALUE} to
   *                  indicate that the entire source after {@code startOffset} should be read.
   *                  Must be {@code >= startOffset}.
   *
   * @param minBundleSize minimum bundle size in offset units that should be used when splitting the
   *                      source into sub-sources. This value may not be respected if the total
   *                      range of the source is smaller than the specified {@code minBundleSize}.
   *                      Must be non-negative.
   */
  public OffsetBasedSource(long startOffset, long endOffset, long minBundleSize) {
    this.startOffset = startOffset;
    this.endOffset = endOffset;
    this.minBundleSize = minBundleSize;
  }

  /**
   * Returns the starting offset of the source.
   */
  public long getStartOffset() {
    return startOffset;
  }

  /**
   * Returns the specified ending offset of the source. Any returned value greater than or equal to
   * {@link #getMaxEndOffset(PipelineOptions)} should be treated as
   * {@link #getMaxEndOffset(PipelineOptions)}.
   */
  public long getEndOffset() {
    return endOffset;
  }

  /**
   * Returns the minimum bundle size that should be used when splitting the source into sub-sources.
   * This value may not be respected if the total range of the source is smaller than the specified
   * {@code minBundleSize}.
   */
  public long getMinBundleSize() {
    return minBundleSize;
  }

  /**
   * Estimates the size of this source as {@link #getBytesPerOffset} times the length of its range.
   *
   * <p>{@link #getMaxEndOffset} is consulted only when the end offset is the
   * {@link Long#MAX_VALUE} sentinel; any other end offset is used as specified, without clipping.
   */
  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
    long trueEndOffset = (endOffset == Long.MAX_VALUE) ? getMaxEndOffset(options) : endOffset;
    return getBytesPerOffset() * (trueEndOffset - getStartOffset());
  }

  /**
   * Splits the range {@code [startOffset, min(endOffset, maxEndOffset))} into consecutive,
   * disjoint sub-sources created through {@link #createSourceForSubrange}.
   *
   * <p>The final bundle is enlarged to absorb the tail of the range so that no undersized bundle
   * is left at the end. If the desired bundle size is smaller than the {@code minBundleSize} of
   * the source then {@code minBundleSize} is used instead. An empty range yields an empty list.
   *
   * @param desiredBundleSizeBytes desired size of each bundle, in bytes
   * @param options pipeline options, forwarded to {@link #getMaxEndOffset}
   * @return the sub-sources, in increasing offset order
   */
  @Override
  public List<? extends OffsetBasedSource<T>> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
    // Translate the byte-based request into offset units: at least one offset per bundle and
    // never less than minBundleSize.
    long desiredBundleSizeOffsetUnits =
        Math.max(Math.max(1, desiredBundleSizeBytes / getBytesPerOffset()), minBundleSize);

    List<OffsetBasedSource<T>> subSources = new ArrayList<>();
    long start = startOffset;
    long maxEnd = Math.min(endOffset, getMaxEndOffset(options));

    while (start < maxEnd) {
      long end = start + desiredBundleSizeOffsetUnits;
      // The bundle size is always positive, so a sum below 'start' means the addition wrapped
      // around (e.g. a desired size of Long.MAX_VALUE). Treat that like any other bundle that
      // would run past the end of the range.
      if (end < start || end > maxEnd) {
        end = maxEnd;
      }
      // Avoid having a too small bundle at the end and ensure that we respect minBundleSize.
      long remaining = maxEnd - end;
      if ((remaining < desiredBundleSizeOffsetUnits / 4) || (remaining < minBundleSize)) {
        end = maxEnd;
      }
      subSources.add(createSourceForSubrange(start, end));
      start = end;
    }
    return subSources;
  }

  /**
   * Checks that both offsets and {@code minBundleSize} are non-negative and that the start offset
   * does not exceed the end offset.
   *
   * @throws IllegalArgumentException if any of these conditions does not hold
   */
  @Override
  public void validate() {
    checkArgument(
        this.startOffset >= 0,
        "Start offset has value %s, must be non-negative", this.startOffset);
    checkArgument(
        this.endOffset >= 0,
        "End offset has value %s, must be non-negative", this.endOffset);
    checkArgument(
        this.startOffset <= this.endOffset,
        "Start offset %s may not be larger than end offset %s",
        this.startOffset, this.endOffset);
    checkArgument(
        this.minBundleSize >= 0,
        "minBundleSize has value %s, must be non-negative",
        this.minBundleSize);
  }

  /**
   * Returns the range of this source in the form {@code [startOffset, endOffset)}.
   */
  @Override
  public String toString() {
    return "[" + startOffset + ", " + endOffset + ")";
  }

  /**
   * Returns approximately how many bytes of data correspond to a single offset in this source.
   * Used for translation between this source's range and methods defined in terms of bytes, such
   * as {@link #getEstimatedSizeBytes} and {@link #split}.
   *
   * <p>Defaults to {@code 1} byte, which is the common case for, e.g., file sources.
   */
  public long getBytesPerOffset() {
    return 1L;
  }

  /**
   * Returns the actual ending offset of the current source. The value returned by this function
   * will be used to clip the end of the range {@code [startOffset, endOffset)} such that the
   * range used is {@code [startOffset, min(endOffset, maxEndOffset))}.
   *
   * <p>As an example in which {@link OffsetBasedSource} is used to implement a file source, suppose
   * that this source was constructed with an {@code endOffset} of {@link Long#MAX_VALUE} to
   * indicate that a file should be read to the end. Then this function should determine
   * the actual, exact size of the file in bytes and return it.
   */
  public abstract long getMaxEndOffset(PipelineOptions options) throws Exception;

  /**
   * Returns an {@link OffsetBasedSource} for a subrange of the current source. The
   * subrange {@code [start, end)} must be within the range {@code [startOffset, endOffset)} of
   * the current source, i.e. {@code startOffset <= start < end <= endOffset}.
   */
  public abstract OffsetBasedSource<T> createSourceForSubrange(long start, long end);

  /**
   * Registers {@code minBundleSize}, {@code startOffset} and {@code endOffset} as display data,
   * each only when it differs from the default passed alongside it ({@code 1}, {@code 0} and
   * {@link Long#MAX_VALUE} respectively).
   */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    super.populateDisplayData(builder);
    builder
        .addIfNotDefault(DisplayData.item("minBundleSize", minBundleSize)
            .withLabel("Minimum Bundle Size"), 1L)
        .addIfNotDefault(DisplayData.item("startOffset", startOffset)
            .withLabel("Start Read Offset"), 0L)
        .addIfNotDefault(DisplayData.item("endOffset", endOffset)
            .withLabel("End Read Offset"), Long.MAX_VALUE);
  }

  /**
   * A {@link Source.Reader} that implements code common to readers of all
   * {@link OffsetBasedSource}s.
   *
   * <p>Subclasses have to implement:
   * <ul>
   *   <li>The methods {@link #startImpl} and {@link #advanceImpl} for reading the
   *   first or subsequent records.
   *   <li>The methods {@link #getCurrent}, {@link #getCurrentOffset}, and optionally
   *   {@link #isAtSplitPoint} and {@link #getCurrentTimestamp} to access properties of
   *   the last record successfully read by {@link #startImpl} or {@link #advanceImpl}.
   * </ul>
   */
  public abstract static class OffsetBasedReader<T> extends BoundedReader<T> {
    private static final Logger LOG = LoggerFactory.getLogger(OffsetBasedReader.class);

    /**
     * The source currently being read. Replaced by the primary sub-source whenever
     * {@link #splitAtFraction} succeeds, hence not {@code final}.
     */
    private OffsetBasedSource<T> source;

    /** The {@link OffsetRangeTracker} managing the range and current position of the source. */
    private final OffsetRangeTracker rangeTracker;

    /**
     * Returns true if the last call to {@link #start} or {@link #advance} returned false.
     */
    public final boolean isDone() {
      return rangeTracker.isDone();
    }

    /**
     * Returns true if there has been a call to {@link #start}.
     */
    public final boolean isStarted() {
      return rangeTracker.isStarted();
    }

    /**
     * @param source the {@link OffsetBasedSource} to be read by the current reader.
     */
    public OffsetBasedReader(OffsetBasedSource<T> source) {
      this.source = source;
      this.rangeTracker = new OffsetRangeTracker(source.getStartOffset(), source.getEndOffset());
    }

    /**
     * Returns the <i>starting</i> offset of the {@link Source.Reader#getCurrent current record},
     * which has been read by the last successful {@link Source.Reader#start} or
     * {@link Source.Reader#advance} call.
     *
     * <p>If no such call has been made yet, the return value is unspecified.
     *
     * <p>See {@link RangeTracker} for description of offset semantics.
     */
    protected abstract long getCurrentOffset() throws NoSuchElementException;

    /**
     * Returns whether the current record is at a split point (i.e., whether the current record
     * would be the first record to be read by a source with a specified start offset of
     * {@link #getCurrentOffset}).
     *
     * <p>This default implementation always returns {@code true}.
     *
     * <p>See detailed documentation about split points in {@link RangeTracker}.
     */
    protected boolean isAtSplitPoint() throws NoSuchElementException {
      return true;
    }

    /**
     * Reads the first record via {@link #startImpl} and offers it to the range tracker.
     *
     * <p>If there is no record, or the tracker declines it, the tracker is marked done and the
     * result of {@code markDone()} is returned.
     */
    @Override
    public final boolean start() throws IOException {
      return startImpl()
          && rangeTracker.tryReturnRecordAt(isAtSplitPoint(), getCurrentOffset())
          || rangeTracker.markDone();
    }

    /**
     * Reads the next record via {@link #advanceImpl} and offers it to the range tracker.
     *
     * <p>If there is no record, or the tracker declines it, the tracker is marked done and the
     * result of {@code markDone()} is returned.
     */
    @Override
    public final boolean advance() throws IOException {
      return advanceImpl()
          && rangeTracker.tryReturnRecordAt(isAtSplitPoint(), getCurrentOffset())
          || rangeTracker.markDone();
    }

    /**
     * Initializes the {@link OffsetBasedSource.OffsetBasedReader} and advances to the first record,
     * returning {@code true} if there is a record available to be read. This method will be
     * invoked exactly once and may perform expensive setup operations that are needed to
     * initialize the reader.
     *
     * <p>This function is the {@code OffsetBasedReader} implementation of
     * {@link BoundedReader#start}. The key difference is that the implementor can ignore the
     * possibility that it should no longer produce the first record, either because it has exceeded
     * the original {@code endOffset} assigned to the reader, or because a concurrent call to
     * {@link #splitAtFraction} has changed the source to shrink the offset range being read.
     *
     * @see BoundedReader#start
     */
    protected abstract boolean startImpl() throws IOException;

    /**
     * Advances to the next record and returns {@code true}, or returns false if there is no next
     * record.
     *
     * <p>This function is the {@code OffsetBasedReader} implementation of
     * {@link BoundedReader#advance}. The key difference is that the implementor can ignore the
     * possibility that it should no longer produce the next record, either because it has exceeded
     * the original {@code endOffset} assigned to the reader, or because a concurrent call to
     * {@link #splitAtFraction} has changed the source to shrink the offset range being read.
     *
     * @see BoundedReader#advance
     */
    protected abstract boolean advanceImpl() throws IOException;

    /**
     * Returns the source currently being read; after a successful {@link #splitAtFraction} this
     * is the primary sub-source.
     */
    @Override
    public synchronized OffsetBasedSource<T> getCurrentSource() {
      return source;
    }

    /**
     * Returns the fraction of the range consumed so far, as reported by the range tracker.
     */
    @Override
    public Double getFractionConsumed() {
      return rangeTracker.getFractionConsumed();
    }

    /**
     * Returns the number of split points processed so far, as reported by the range tracker.
     */
    @Override
    public long getSplitPointsConsumed() {
      return rangeTracker.getSplitPointsProcessed();
    }

    /**
     * Returns the number of split points remaining, or
     * {@link BoundedReader#SPLIT_POINTS_UNKNOWN} when it cannot be determined.
     */
    @Override
    public long getSplitPointsRemaining() {
      if (isDone()) {
        return 0;
      } else if (!isStarted()) {
        // Note that even if the current source does not allow splitting, we don't know that
        // it's non-empty so we return UNKNOWN instead of 1.
        return BoundedReader.SPLIT_POINTS_UNKNOWN;
      } else if (!allowsDynamicSplitting()) {
        // Started (so non-empty) and unsplittable, so only the current task.
        return 1;
      } else if (getCurrentOffset() >= rangeTracker.getStopPosition() - 1) {
        // If this is true, the next element is outside the range. Note that even getCurrentOffset()
        // might be larger than the stop position when the current record is not a split point.
        return 1;
      } else {
        // Use the default.
        return super.getSplitPointsRemaining();
      }
    }

    /**
     * Whether this reader should allow dynamic splitting of the offset ranges.
     *
     * <p>True by default. Override this to return false if the reader cannot
     * support dynamic splitting correctly. If this returns false,
     * {@link OffsetBasedReader#splitAtFraction} will refuse all split requests.
     */
    public boolean allowsDynamicSplitting() {
      return true;
    }

    /**
     * Attempts to split the range being read at the offset corresponding to {@code fraction}.
     *
     * <p>On success this reader continues with the primary sub-source
     * {@code [start, splitOffset)} and the residual {@code [splitOffset, end)} is returned.
     *
     * @param fraction the fraction of the current range at which to split
     * @return the residual source, or {@code null} if the split was refused: dynamic splitting is
     *     disabled, the stop position is {@link Long#MAX_VALUE}, the proposed offset would leave
     *     an empty primary or residual, or the range tracker rejected the position
     */
    @Override
    public final synchronized OffsetBasedSource<T> splitAtFraction(double fraction) {
      if (!allowsDynamicSplitting()) {
        return null;
      }
      if (rangeTracker.getStopPosition() == Long.MAX_VALUE) {
        LOG.debug(
            "Refusing to split unbounded OffsetBasedReader {} at fraction {}",
            rangeTracker, fraction);
        return null;
      }
      long splitOffset = rangeTracker.getPositionForFractionConsumed(fraction);
      LOG.debug(
          "Proposing to split OffsetBasedReader {} at fraction {} (offset {})",
          rangeTracker, fraction, splitOffset);

      long start = source.getStartOffset();
      long end = source.getEndOffset();
      // createSourceForSubrange requires start < end for the range it is given. A split offset at
      // or outside either boundary would request an empty primary or residual, so refuse it here
      // rather than hand subclasses a range that violates their documented precondition.
      if (splitOffset <= start || splitOffset >= end) {
        LOG.debug(
            "Refusing to split OffsetBasedReader {} at offset {}: not strictly inside [{}, {})",
            rangeTracker, splitOffset, start, end);
        return null;
      }

      // Build both sub-sources before touching the tracker, so that a failure while creating them
      // leaves the tracker's range unchanged.
      OffsetBasedSource<T> primary = source.createSourceForSubrange(start, splitOffset);
      OffsetBasedSource<T> residual = source.createSourceForSubrange(splitOffset, end);
      if (!rangeTracker.trySplitAtPosition(splitOffset)) {
        return null;
      }
      this.source = primary;
      return residual;
    }
  }
}