/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io;
import java.io.IOException;
import java.io.Serializable;
import java.util.NoSuchElementException;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.HasDisplayData;
import org.joda.time.Instant;
/**
* Base class for defining input formats and creating a {@code Source} for reading the input.
*
* <p>This class is not intended to be subclassed directly. Instead, to define
* a bounded source (a source which produces a finite amount of input), subclass
* {@link BoundedSource}; to define an unbounded source, subclass {@link UnboundedSource}.
*
* <p>A {@code Source} passed to a {@code Read} transform must be
* {@code Serializable}. This allows the {@code Source} instance
* created in this "main program" to be sent (in serialized form) to
* remote worker machines and reconstituted for each batch of elements
* of the input {@code PCollection} being processed or for each source splitting
* operation. A {@code Source} can have instance variable state, and
* non-transient instance variable state will be serialized in the main program
* and then deserialized on remote worker machines.
*
* <p>{@code Source} classes MUST be effectively immutable. The only acceptable use of
* mutable fields is to cache the results of expensive operations, and such fields MUST be
* marked {@code transient}.
*
* <p>{@code Source} objects should override {@link Object#toString}, as it will be
* used in important error and debugging messages.
*
* @param <T> Type of elements read by the source.
*/
@Experimental(Experimental.Kind.SOURCE_SINK)
public abstract class Source<T> implements Serializable, HasDisplayData {
/**
* Checks that this source is valid, before it can be used in a pipeline.
*
* <p>It is recommended to use {@link com.google.common.base.Preconditions} for implementing
* this method.
*/
public abstract void validate();
/**
* Returns the default {@code Coder} to use for the data read from this source.
*/
public abstract Coder<T> getDefaultOutputCoder();
/**
* {@inheritDoc}
*
* <p>By default, does not register any display data. Implementors may override this method
* to provide their own display data.
*/
@Override
public void populateDisplayData(DisplayData.Builder builder) {}
/**
* The interface that readers of custom input sources must implement.
*
* <p>This interface is deliberately distinct from {@link java.util.Iterator} because
* the current model tends to be easier to program and more efficient in practice
* for iterating over sources such as files, databases etc. (rather than pure collections).
*
* <p>Reading data from the {@link Reader} must obey the following access pattern:
* <ul>
* <li> One call to {@link #start}
* <ul><li>If {@link #start} returned true, any number of calls to {@code getCurrent}*
* methods</ul>
* <li> Repeatedly, a call to {@link #advance}. This may be called regardless
* of what the previous {@link #start}/{@link #advance} returned.
* <ul><li>If {@link #advance} returned true, any number of calls to {@code getCurrent}*
* methods</ul>
* </ul>
*
* <p>For example, if the reader is reading a fixed set of data:
* <pre>
* try {
* for (boolean available = reader.start(); available; available = reader.advance()) {
* T item = reader.getCurrent();
* Instant timestamp = reader.getCurrentTimestamp();
* ...
* }
* } finally {
* reader.close();
* }
* </pre>
*
* <p>If the set of data being read is continually growing:
* <pre>
* try {
* boolean available = reader.start();
* while (true) {
* if (available) {
* T item = reader.getCurrent();
* Instant timestamp = reader.getCurrentTimestamp();
* ...
* resetExponentialBackoff();
* } else {
* exponentialBackoff();
* }
* available = reader.advance();
* }
* } finally {
* reader.close();
* }
* </pre>
*
* <p>Note: this interface is a work-in-progress and may change.
*
* <p>All {@code Reader} functions except {@link #getCurrentSource} do not need to be thread-safe;
* they may only be accessed by a single thread at once. However, {@link #getCurrentSource} needs
* to be thread-safe, and other functions should assume that its returned value can change
* asynchronously.
*/
public abstract static class Reader<T> implements AutoCloseable {
/**
* Initializes the reader and advances the reader to the first record.
*
* <p>This method should be called exactly once. The invocation should occur prior to calling
* {@link #advance} or {@link #getCurrent}. This method may perform expensive operations that
* are needed to initialize the reader.
*
* @return {@code true} if a record was read, {@code false} if there is no more input available.
*/
public abstract boolean start() throws IOException;
/**
* Advances the reader to the next valid record.
*
* <p>It is an error to call this without having called {@link #start} first.
*
* @return {@code true} if a record was read, {@code false} if there is no more input available.
*/
public abstract boolean advance() throws IOException;
/**
* Returns the value of the data item that was read by the last {@link #start} or
* {@link #advance} call. The returned value must be effectively immutable and remain valid
* indefinitely.
*
* <p>Multiple calls to this method without an intervening call to {@link #advance} should
* return the same result.
*
* @throws java.util.NoSuchElementException if {@link #start} was never called, or if
* the last {@link #start} or {@link #advance} returned {@code false}.
*/
public abstract T getCurrent() throws NoSuchElementException;
/**
* Returns the timestamp associated with the current data item.
*
* <p>If the source does not support timestamps, this should return
* {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
*
* <p>Multiple calls to this method without an intervening call to {@link #advance} should
* return the same result.
*
* @throws NoSuchElementException if the reader is at the beginning of the input and
* {@link #start} or {@link #advance} wasn't called, or if the last {@link #start} or
* {@link #advance} returned {@code false}.
*/
public abstract Instant getCurrentTimestamp() throws NoSuchElementException;
/**
* Closes the reader. The reader cannot be used after this method is called.
*/
@Override
public abstract void close() throws IOException;
/**
* Returns a {@code Source} describing the same input that this {@code Reader} currently reads
* (including items already read).
*
* <p>Usually, an implementation will simply return the immutable {@link Source} object from
* which the current {@link Reader} was constructed, or delegate to the base class.
* However, when using or implementing this method on a {@link BoundedSource.BoundedReader},
* special considerations apply, see documentation for
* {@link BoundedSource.BoundedReader#getCurrentSource}.
*/
public abstract Source<T> getCurrentSource();
}
}