/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io;
import java.io.IOException;
import java.util.List;
import java.util.NoSuchElementException;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.io.range.OffsetRangeTracker;
import org.apache.beam.sdk.io.range.RangeTracker;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.joda.time.Instant;
/**
* A {@link Source} that reads a finite amount of input and, because of that, supports
* some additional operations.
*
* <p>The operations are:
* <ul>
* <li>Splitting into sources that read bundles of given size: {@link #split};
* <li>Size estimation: {@link #getEstimatedSizeBytes};
* <li>The accompanying {@link BoundedReader reader} has additional functionality to enable runners
* to dynamically adapt based on runtime conditions.
* <ul>
* <li>Progress estimation ({@link BoundedReader#getFractionConsumed})
* <li>Tracking of parallelism, to determine whether the current source can be split
* ({@link BoundedReader#getSplitPointsConsumed()} and
* {@link BoundedReader#getSplitPointsRemaining()}).
* <li>Dynamic splitting of the current source ({@link BoundedReader#splitAtFraction}).
* </ul>
* </li>
* </ul>
*
* @param <T> Type of records read by the source.
*/
public abstract class BoundedSource<T> extends Source<T> {

  /**
   * Divides this source into bundles, each of approximately {@code desiredBundleSizeBytes}.
   */
  public abstract List<? extends BoundedSource<T>> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws Exception;

  /**
   * Estimates the total size, in bytes, of the data that would be read from this source.
   * The estimate is expressed in terms of external storage size, i.e. before any decompression
   * or other processing performed by the reader.
   *
   * <p>Implementations MAY return 0L when the size of the source cannot be estimated.
   */
  public abstract long getEstimatedSizeBytes(PipelineOptions options) throws Exception;

  /**
   * Creates a new {@link BoundedReader} that reads from this source.
   */
  public abstract BoundedReader<T> createReader(PipelineOptions options) throws IOException;

  /**
   * A {@code Reader} over a bounded amount of input. On top of plain reading it supports some
   * additional operations, such as progress estimation and dynamic work rebalancing.
   *
   * <h3>Boundedness</h3>
   *
   * <p>After {@link #start} or {@link #advance} has returned false, neither of them will be
   * called again on this object.
   *
   * <h3>Thread safety</h3>
   *
   * <p>All methods are run from the same thread, with the exception of
   * {@link #splitAtFraction}, {@link #getFractionConsumed}, {@link #getCurrentSource},
   * {@link #getSplitPointsConsumed()}, and {@link #getSplitPointsRemaining()}: all of these can
   * be called concurrently from a different thread. There will not be multiple concurrent calls
   * to {@link #splitAtFraction}.
   *
   * <p>It must be safe to call {@link #splitAtFraction}, {@link #getFractionConsumed},
   * {@link #getCurrentSource}, {@link #getSplitPointsConsumed()}, and
   * {@link #getSplitPointsRemaining()} concurrently with other methods.
   *
   * <p>Additionally, a successful {@link #splitAtFraction} call must, by definition, cause
   * {@link #getCurrentSource} to start returning a different value. Callers of
   * {@link #getCurrentSource} need to be aware that the returned value can change at any time,
   * and must only access those properties of the returned source which do not change between
   * {@link #splitAtFraction} calls.
   *
   * <h3>Implementing {@link #splitAtFraction}</h3>
   *
   * <p>In the course of dynamic work rebalancing, {@link #splitAtFraction} may be called
   * concurrently with {@link #advance} or {@link #start}. It is critical that their interaction
   * is implemented in a thread-safe way; otherwise data loss is possible.
   *
   * <p>Sources which support dynamic work rebalancing should use
   * {@link org.apache.beam.sdk.io.range.RangeTracker} to manage the (source-specific) range of
   * positions that is being split.
   */
  @Experimental(Experimental.Kind.SOURCE_SINK)
  public abstract static class BoundedReader<T> extends Source.Reader<T> {

    /**
     * The value to return from {@link #getSplitPointsConsumed()} or
     * {@link #getSplitPointsRemaining()} when the exact value is unknown.
     */
    public static final long SPLIT_POINTS_UNKNOWN = -1;

    /**
     * Returns a value in [0, 1] that approximates the fraction of the
     * {@link #getCurrentSource current source} this reader has read so far, or {@code null} if
     * no such estimate is available.
     *
     * <p>It is recommended that implementations satisfy the following properties:
     * <ul>
     *   <li>Return 0 before the {@link #start} call.
     *   <li>Return 1 after a {@link #start} or {@link #advance} call that returns false.
     *   <li>Return non-decreasing values (though they do not have to be unique).
     * </ul>
     *
     * <p>The default implementation returns null, indicating that the fraction cannot be
     * estimated.
     *
     * <h3>Thread safety</h3>
     *
     * <p>If {@link #splitAtFraction} is implemented, this method can be called concurrently
     * with other methods (including itself), so it is critical for it to be implemented in a
     * thread-safe way.
     */
    @Nullable
    public Double getFractionConsumed() {
      return null;
    }

    /**
     * Returns the total amount of parallelism in the consumed (returned and processed) range of
     * this reader's current {@link BoundedSource} (as would be returned by
     * {@link #getCurrentSource}). This corresponds to all split point records (see
     * {@link RangeTracker}) returned by this reader, <em>excluding</em> the last split point
     * returned if the reader is not finished.
     *
     * <p>Consider the following examples: (1) An input that can be read in parallel down to the
     * individual records, such as {@link CountingSource#upTo}, is called "perfectly splittable".
     * (2) A "block-compressed" file format such as {@link AvroIO}, in which a block of records
     * has to be read as a whole, but different blocks can be read in parallel. (3) An
     * "unsplittable" input such as a cursor in a database.
     *
     * <ul>
     *   <li>Any {@link BoundedReader reader} that is unstarted (i.e., has never had a call to
     *       {@link #start}) has a consumed parallelism of 0. This condition holds independent
     *       of whether the input is splittable.
     *   <li>Any {@link BoundedReader reader} that has only returned its first element (i.e.,
     *       has never had a call to {@link #advance}) has a consumed parallelism of 0: the
     *       first element is the current element and is still being processed. This condition
     *       holds independent of whether the input is splittable.
     *   <li>For an empty reader (in which the call to {@link #start} returned false), the
     *       consumed parallelism is 0. This condition holds independent of whether the input is
     *       splittable.
     *   <li>For a non-empty, finished reader (in which the call to {@link #start} returned true
     *       and a call to {@link #advance} has returned false), the value returned must be at
     *       least 1 and should equal the total parallelism in the source.
     *   <li>For example (1): After returning record #30 (starting at 1) out of 50 in a
     *       perfectly splittable 50-record input, this value should be 29. When finished, the
     *       consumed parallelism should be 50.
     *   <li>For example (2): In a block-compressed value consisting of 5 blocks, the value
     *       should stay at 0 until the first record of the second block is returned; stay at 1
     *       until the first record of the third block is returned, etc. Only once the
     *       end-of-file is reached has the fifth block been consumed, and the value should then
     *       stay at 5.
     *   <li>For example (3): For any non-empty unsplittable input, the consumed parallelism is
     *       0 until the reader is finished (because the last call to {@link #advance} returned
     *       false), at which point it becomes 1.
     * </ul>
     *
     * <p>A reader that is implemented using a {@link RangeTracker} is encouraged to use the
     * range tracker's ability to count split points to implement this method. See
     * {@link OffsetBasedSource.OffsetBasedReader} and {@link OffsetRangeTracker} for an example.
     *
     * <p>Defaults to {@link #SPLIT_POINTS_UNKNOWN}. Any value less than 0 will be interpreted
     * as unknown.
     *
     * <h3>Thread safety</h3>
     *
     * <p>See the javadoc on {@link BoundedReader} for information about thread safety.
     *
     * @see #getSplitPointsRemaining()
     */
    public long getSplitPointsConsumed() {
      return SPLIT_POINTS_UNKNOWN;
    }

    /**
     * Returns the total amount of parallelism in the unprocessed part of this reader's current
     * {@link BoundedSource} (as would be returned by {@link #getCurrentSource}). This
     * corresponds to all unprocessed split point records (see {@link RangeTracker}), including
     * the last split point returned, in the remainder part of the source.
     *
     * <p>This function should be implemented only <strong>in addition to
     * {@link #getSplitPointsConsumed()}</strong> and only if <em>an exact value can be
     * returned</em>.
     *
     * <p>Consider the following examples: (1) An input that can be read in parallel down to the
     * individual records, such as {@link CountingSource#upTo}, is called "perfectly splittable".
     * (2) A "block-compressed" file format such as {@link AvroIO}, in which a block of records
     * has to be read as a whole, but different blocks can be read in parallel. (3) An
     * "unsplittable" input such as a cursor in a database.
     *
     * <p>Assume for examples (1) and (2) that the number of records or blocks remaining is
     * known:
     *
     * <ul>
     *   <li>Any {@link BoundedReader reader} for which the last call to {@link #start} or
     *       {@link #advance} has returned true should not return 0, because this reader itself
     *       represents parallelism of at least 1. This condition holds independent of whether
     *       the input is splittable.
     *   <li>A finished reader (for which {@link #start} or {@link #advance} has returned false)
     *       should return a value of 0. This condition holds independent of whether the input
     *       is splittable.
     *   <li>For example (1): After returning record #30 (starting at 1) out of 50 in a
     *       perfectly splittable 50-record input, this value should be 21 (20 remaining + 1
     *       current) if the total number of records is known.
     *   <li>For example (2): After returning a record in block 3 in a block-compressed file
     *       consisting of 5 blocks, this value should be 3 (since blocks 4 and 5 can be
     *       processed in parallel by new readers produced via dynamic work rebalancing, while
     *       the current reader continues processing block 3) if the total number of blocks is
     *       known.
     *   <li>For example (3): A reader for any non-empty unsplittable input should return 1
     *       until it is finished, at which point it should return 0.
     *   <li>For any reader: After returning the last split point in a file (e.g., the last
     *       record in example (1), the first record in the last block for example (2), or the
     *       first record in the file for example (3)), this value should be 1: apart from the
     *       current task, no additional remainder can be split off.
     * </ul>
     *
     * <p>Defaults to {@link #SPLIT_POINTS_UNKNOWN}. Any value less than 0 will be interpreted
     * as unknown.
     *
     * <h3>Thread safety</h3>
     *
     * <p>See the javadoc on {@link BoundedReader} for information about thread safety.
     *
     * @see #getSplitPointsConsumed()
     */
    public long getSplitPointsRemaining() {
      return SPLIT_POINTS_UNKNOWN;
    }

    /**
     * Returns a {@code Source} describing the same input that this {@code Reader} currently
     * reads (including items already read).
     *
     * <h3>Usage</h3>
     *
     * <p>Reader subclasses can use this method for convenience to access unchanging properties
     * of the source being read. Alternatively, they can cache these properties in the
     * constructor.
     *
     * <p>The framework will call this method in the course of dynamic work rebalancing, e.g.
     * after a successful {@link BoundedSource.BoundedReader#splitAtFraction} call.
     *
     * <h3>Mutability and thread safety</h3>
     *
     * <p>Remember that {@link Source} objects must always be immutable. However, the return
     * value of this function may be affected by dynamic work rebalancing, happening
     * asynchronously via {@link BoundedSource.BoundedReader#splitAtFraction}, meaning it can
     * return a different {@link Source} object. The returned object itself will nevertheless
     * still be immutable. Callers must take care not to rely on properties of the returned
     * source that may be asynchronously changed as a result of this process (e.g. do not cache
     * an end offset when reading a file).
     *
     * <h3>Implementation</h3>
     *
     * <p>For convenience, subclasses should usually return the most concrete subclass of
     * {@link Source} possible. In practice, the implementation of this method should nearly
     * always be one of the following:
     * <ul>
     *   <li>Source that inherits from a base class that already implements
     *       {@link #getCurrentSource}: delegate to base class. In this case, it is almost
     *       always an error for the subclass to maintain its own copy of the source.
     * <pre>{@code
     *   public FooReader(FooSource<T> source) {
     *     super(source);
     *   }
     *
     *   public FooSource<T> getCurrentSource() {
     *     return (FooSource<T>) super.getCurrentSource();
     *   }
     * }</pre>
     *   <li>Source that does not support dynamic work rebalancing: return a private final
     *       variable.
     * <pre>{@code
     *   private final FooSource<T> source;
     *
     *   public FooReader(FooSource<T> source) {
     *     this.source = source;
     *   }
     *
     *   public FooSource<T> getCurrentSource() {
     *     return source;
     *   }
     * }</pre>
     *   <li>{@link BoundedSource.BoundedReader} that explicitly supports dynamic work
     *       rebalancing: maintain a variable pointing to an immutable source object, and
     *       protect it with synchronization.
     * <pre>{@code
     *   private FooSource<T> source;
     *
     *   public FooReader(FooSource<T> source) {
     *     this.source = source;
     *   }
     *
     *   public synchronized FooSource<T> getCurrentSource() {
     *     return source;
     *   }
     *
     *   public synchronized FooSource<T> splitAtFraction(double fraction) {
     *     ...
     *     FooSource<T> primary = ...;
     *     FooSource<T> residual = ...;
     *     this.source = primary;
     *     return residual;
     *   }
     * }</pre>
     * </ul>
     */
    @Override
    public abstract BoundedSource<T> getCurrentSource();

    /**
     * Tells the reader to narrow the range of the input it is going to read and give up the
     * remainder, so that the new range would contain approximately the given fraction of the
     * amount of data in the current range.
     *
     * <p>Returns a {@code BoundedSource} representing the remainder.
     *
     * <h3>Detailed description</h3>
     *
     * <p>Assuming the following sequence of calls:
     * <pre>{@code
     *   BoundedSource<T> initial = reader.getCurrentSource();
     *   BoundedSource<T> residual = reader.splitAtFraction(fraction);
     *   BoundedSource<T> primary = reader.getCurrentSource();
     * }</pre>
     * <ul>
     *   <li>The "primary" and "residual" sources, when read, should together cover the same
     *       set of records as "initial".
     *   <li>The current reader should continue to be in a valid state, and continuing to read
     *       from it should, together with the records it already read, yield the same records
     *       as would have been read by "primary".
     *   <li>The amount of data read by "primary" should ideally represent approximately the
     *       given fraction of the amount of data read by "initial".
     * </ul>
     *
     * <p>For example, a reader that reads a range of offsets <i>[A, B)</i> in a file might
     * implement this method by truncating the current range to
     * <i>[A, A + fraction*(B-A))</i> and returning a Source representing the range
     * <i>[A + fraction*(B-A), B)</i>.
     *
     * <p>This method should return {@code null} if the split cannot be performed for this
     * fraction while satisfying the semantics above. E.g., a reader that reads a range of
     * offsets in a file should return {@code null} if it is already past the position in its
     * range corresponding to the given fraction. In this case, the method MUST have no effect
     * (the reader must behave as if the method had not been called at all).
     *
     * <h3>Statefulness</h3>
     *
     * <p>Since this method (if successful) affects the reader's source, in subsequent
     * invocations "fraction" should be interpreted relative to the new current source.
     *
     * <h3>Thread safety and blocking</h3>
     *
     * <p>This method will be called concurrently with other methods (however, there will not
     * be multiple concurrent invocations of this method itself), and it is critical for it to
     * be implemented in a thread-safe way (otherwise data loss is possible).
     *
     * <p>It is also very important that this method always completes quickly. In particular,
     * it should not perform or wait on any blocking operations such as I/O, RPCs etc.
     * Violating this requirement may stall completion of the work item or even cause it to
     * fail.
     *
     * <p>It is incorrect to make both this method and {@link #start}/{@link #advance}
     * {@code synchronized}, because those methods can perform blocking operations, and then
     * this method would have to wait for those calls to complete.
     *
     * <p>{@link org.apache.beam.sdk.io.range.RangeTracker} makes it easy to implement this
     * method safely and correctly.
     *
     * <p>The default implementation returns null, indicating that splitting is not possible.
     */
    @Nullable
    public BoundedSource<T> splitAtFraction(double fraction) {
      return null;
    }

    /**
     * Returns the minimum possible timestamp unless overridden.
     */
    @Override
    public Instant getCurrentTimestamp() throws NoSuchElementException {
      return BoundedWindow.TIMESTAMP_MIN_VALUE;
    }
  }
}