// OffsetBasedSource.java example
//
// Explorer
// beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.beam.sdk.io.range.OffsetRangeTracker;
import org.apache.beam.sdk.io.range.RangeTracker;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link BoundedSource} that uses offsets to define starting and ending positions.
 *
 * <p>{@link OffsetBasedSource} is a common base class for all bounded sources where the input can
 * be represented as a single range, and an input can be efficiently processed in parallel by
 * splitting the range into a set of disjoint ranges whose union is the original range. This class
 * should be used for sources that can be cheaply read starting at any given offset.
 * {@link OffsetBasedSource} stores the range and implements splitting into bundles.
 *
 * <p>Extend {@link OffsetBasedSource} to implement your own offset-based custom source.
 * {@link FileBasedSource}, which is a subclass of this, adds additional functionality useful for
 * custom sources that are based on files. If possible implementors should start from
 * {@link FileBasedSource} instead of {@link OffsetBasedSource}.
 *
 * <p>Consult {@link RangeTracker} for important semantics common to all sources defined by a range
 * of positions of a certain type, including the semantics of split points
 * ({@link OffsetBasedReader#isAtSplitPoint}).
 *
 * @param <T> Type of records represented by the source.
 * @see BoundedSource
 * @see FileBasedSource
 * @see RangeTracker
 */
public abstract class OffsetBasedSource<T> extends BoundedSource<T> {
  private final long startOffset;
  private final long endOffset;
  private final long minBundleSize;

  /**
   * Creates a source covering the offset range {@code [startOffset, endOffset)}.
   *
   * <p>The constructor only stores its arguments; the constraints listed below are enforced by
   * {@link #validate}, not here.
   *
   * @param startOffset starting offset (inclusive) of the source. Must be non-negative.
   *
   * @param endOffset ending offset (exclusive) of the source. Use {@link Long#MAX_VALUE} to
   *        indicate that the entire source after {@code startOffset} should be read. Must be
   *        non-negative and {@code >= startOffset}.
   *
   * @param minBundleSize minimum bundle size in offset units that should be used when splitting the
   *                      source into sub-sources. This value may not be respected if the total
   *                      range of the source is smaller than the specified {@code minBundleSize}.
   *                      Must be non-negative.
   */
  public OffsetBasedSource(long startOffset, long endOffset, long minBundleSize) {
    this.startOffset = startOffset;
    this.endOffset = endOffset;
    this.minBundleSize = minBundleSize;
  }

  /**
   * Returns the starting offset of the source.
   */
  public long getStartOffset() {
    return startOffset;
  }

  /**
   * Returns the specified ending offset of the source. Any returned value greater than or equal to
   * {@link #getMaxEndOffset(PipelineOptions)} should be treated as
   * {@link #getMaxEndOffset(PipelineOptions)}.
   */
  public long getEndOffset() {
    return endOffset;
  }

  /**
   * Returns the minimum bundle size that should be used when splitting the source into sub-sources.
   * This value may not be respected if the total range of the source is smaller than the specified
   * {@code minBundleSize}.
   */
  public long getMinBundleSize() {
    return minBundleSize;
  }

  /**
   * Estimates the size of this source in bytes as
   * {@link #getBytesPerOffset} times the length of the offset range.
   *
   * <p>{@link #getMaxEndOffset} is consulted only when the end offset is the
   * {@link Long#MAX_VALUE} sentinel; any other end offset is used as given, without clipping.
   */
  @Override
  public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
    long trueEndOffset = (endOffset == Long.MAX_VALUE) ? getMaxEndOffset(options) : endOffset;
    return getBytesPerOffset() * (trueEndOffset - getStartOffset());
  }

  /**
   * Splits the range {@code [startOffset, min(endOffset, maxEndOffset))} into consecutive
   * sub-sources created through {@link #createSourceForSubrange}.
   *
   * <p>The bundle size in offset units is {@code desiredBundleSizeBytes / getBytesPerOffset()},
   * raised to at least {@code 1} and at least {@code minBundleSize}. An empty range yields an
   * empty list.
   *
   * @param desiredBundleSizeBytes desired size of each bundle, in bytes.
   * @param options pipeline options, passed through to {@link #getMaxEndOffset}.
   * @return the sub-sources in increasing offset order.
   */
  @Override
  public List<? extends OffsetBasedSource<T>> split(
      long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
    // Split the range into bundles based on the desiredBundleSizeBytes. Final bundle is adjusted to
    // make sure that we do not end up with a too small bundle at the end. If the desired bundle
    // size is smaller than the minBundleSize of the source then minBundleSize will be used instead.

    long desiredBundleSizeOffsetUnits = Math.max(
        Math.max(1, desiredBundleSizeBytes / getBytesPerOffset()),
        minBundleSize);

    List<OffsetBasedSource<T>> subSources = new ArrayList<>();
    long start = startOffset;
    long maxEnd = Math.min(endOffset, getMaxEndOffset(options));

    while (start < maxEnd) {
      // For a very large desired size this sum can wrap around to a negative value. With the
      // non-negative offsets accepted by validate(), "remaining" below then comes out negative,
      // so the bundle is still closed at maxEnd.
      long end = start + desiredBundleSizeOffsetUnits;
      end = Math.min(end, maxEnd);
      // Avoid having a too small bundle at the end and ensure that we respect minBundleSize.
      long remaining = maxEnd - end;
      if ((remaining < desiredBundleSizeOffsetUnits / 4) || (remaining < minBundleSize)) {
        // Fold the short tail into the current bundle instead of emitting it separately.
        end = maxEnd;
      }
      subSources.add(createSourceForSubrange(start, end));

      start = end;
    }
    return subSources;
  }

  /**
   * Checks the constructor arguments: both offsets and {@code minBundleSize} must be non-negative,
   * and {@code startOffset} may not exceed {@code endOffset} (an empty range is accepted).
   *
   * @throws IllegalArgumentException if any of these conditions is violated.
   */
  @Override
  public void validate() {
    checkArgument(
        this.startOffset >= 0,
        "Start offset has value %s, must be non-negative", this.startOffset);
    checkArgument(
        this.endOffset >= 0,
        "End offset has value %s, must be non-negative", this.endOffset);
    checkArgument(
        this.startOffset <= this.endOffset,
        "Start offset %s may not be larger than end offset %s",
        this.startOffset, this.endOffset);
    checkArgument(
        this.minBundleSize >= 0,
        "minBundleSize has value %s, must be non-negative",
        this.minBundleSize);
  }

  /**
   * Returns the offset range in half-open interval notation, e.g. {@code [0, 100)}.
   */
  @Override
  public String toString() {
    return "[" + startOffset + ", " + endOffset + ")";
  }

  /**
   * Returns approximately how many bytes of data correspond to a single offset in this source.
   * Used for translation between this source's range and methods defined in terms of bytes, such
   * as {@link #getEstimatedSizeBytes} and {@link #split}.
   *
   * <p>Defaults to {@code 1} byte, which is the common case for, e.g., file sources.
   */
  public long getBytesPerOffset() {
    return 1L;
  }

  /**
   * Returns the actual ending offset of the current source. The value returned by this function
   * will be used to clip the end of the range {@code [startOffset, endOffset)} such that the
   * range used is {@code [startOffset, min(endOffset, maxEndOffset))}.
   *
   * <p>As an example in which {@link OffsetBasedSource} is used to implement a file source, suppose
   * that this source was constructed with an {@code endOffset} of {@link Long#MAX_VALUE} to
   * indicate that a file should be read to the end. Then this function should determine
   * the actual, exact size of the file in bytes and return it.
   */
  public abstract long getMaxEndOffset(PipelineOptions options) throws Exception;

  /**
   * Returns an {@link OffsetBasedSource} for a subrange of the current source. The
   * subrange {@code [start, end)} must be within the range {@code [startOffset, endOffset)} of
   * the current source, i.e. {@code startOffset <= start < end <= endOffset}.
   */
  public abstract OffsetBasedSource<T> createSourceForSubrange(long start, long end);

  /**
   * Registers {@code minBundleSize}, {@code startOffset} and {@code endOffset} as display data,
   * each only when it differs from the default passed alongside it ({@code 1}, {@code 0} and
   * {@link Long#MAX_VALUE} respectively).
   */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    super.populateDisplayData(builder);
    builder
        .addIfNotDefault(DisplayData.item("minBundleSize", minBundleSize)
            .withLabel("Minimum Bundle Size"), 1L)
        .addIfNotDefault(DisplayData.item("startOffset", startOffset)
            .withLabel("Start Read Offset"), 0L)
        .addIfNotDefault(DisplayData.item("endOffset", endOffset)
            .withLabel("End Read Offset"), Long.MAX_VALUE);
  }

  /**
   * A {@link Source.Reader} that implements code common to readers of all
   * {@link OffsetBasedSource}s.
   *
   * <p>Subclasses have to implement:
   * <ul>
   *   <li>The methods {@link #startImpl} and {@link #advanceImpl} for reading the
   *   first or subsequent records.
   *   <li>The methods {@link #getCurrent}, {@link #getCurrentOffset}, and optionally
   *   {@link #isAtSplitPoint} and {@link #getCurrentTimestamp} to access properties of
   *   the last record successfully read by {@link #startImpl} or {@link #advanceImpl}.
   * </ul>
   */
  public abstract static class OffsetBasedReader<T> extends BoundedReader<T> {
    private static final Logger LOG = LoggerFactory.getLogger(OffsetBasedReader.class);

    /**
     * The source currently being read. Replaced by the primary sub-source when
     * {@link #splitAtFraction} succeeds.
     */
    private OffsetBasedSource<T> source;

    /** The {@link OffsetRangeTracker} managing the range and current position of the source. */
    private final OffsetRangeTracker rangeTracker;

    /**
     * Returns true if the last call to {@link #start} or {@link #advance} returned false.
     */
    public final boolean isDone() {
      return rangeTracker.isDone();
    }

    /**
     * Returns true if there has been a call to {@link #start}.
     */
    public final boolean isStarted() {
      return rangeTracker.isStarted();
    }

    /**
     * @param source the {@link OffsetBasedSource} to be read by the current reader.
     */
    public OffsetBasedReader(OffsetBasedSource<T> source) {
      this.source = source;
      this.rangeTracker = new OffsetRangeTracker(source.getStartOffset(), source.getEndOffset());
    }

    /**
     * Returns the <i>starting</i> offset of the {@link Source.Reader#getCurrent current record},
     * which has been read by the last successful {@link Source.Reader#start} or
     * {@link Source.Reader#advance} call.
     *
     * <p>If no such call has been made yet, the return value is unspecified.
     *
     * <p>See {@link RangeTracker} for description of offset semantics.
     */
    protected abstract long getCurrentOffset() throws NoSuchElementException;

    /**
     * Returns whether the current record is at a split point (i.e., whether the current record
     * would be the first record to be read by a source with a specified start offset of
     * {@link #getCurrentOffset}).
     *
     * <p>The default implementation treats every record as a split point.
     *
     * <p>See detailed documentation about split points in {@link RangeTracker}.
     */
    protected boolean isAtSplitPoint() throws NoSuchElementException {
      return true;
    }

    /**
     * Reads the first record via {@link #startImpl} and offers it to the range tracker.
     *
     * @return {@code true} if {@link #startImpl} produced a record and the range tracker accepted
     *         it; otherwise the reader is marked done and the result of that call is returned.
     */
    @Override
    public final boolean start() throws IOException {
      boolean recordAccepted =
          startImpl() && rangeTracker.tryReturnRecordAt(isAtSplitPoint(), getCurrentOffset());
      return recordAccepted || rangeTracker.markDone();
    }

    /**
     * Reads the next record via {@link #advanceImpl} and offers it to the range tracker.
     *
     * @return {@code true} if {@link #advanceImpl} produced a record and the range tracker
     *         accepted it; otherwise the reader is marked done and the result of that call is
     *         returned.
     */
    @Override
    public final boolean advance() throws IOException {
      boolean recordAccepted =
          advanceImpl() && rangeTracker.tryReturnRecordAt(isAtSplitPoint(), getCurrentOffset());
      return recordAccepted || rangeTracker.markDone();
    }

    /**
     * Initializes the {@link OffsetBasedSource.OffsetBasedReader} and advances to the first record,
     * returning {@code true} if there is a record available to be read. This method will be
     * invoked exactly once and may perform expensive setup operations that are needed to
     * initialize the reader.
     *
     * <p>This function is the {@code OffsetBasedReader} implementation of
     * {@link BoundedReader#start}. The key difference is that the implementor can ignore the
     * possibility that it should no longer produce the first record, either because it has exceeded
     * the original {@code endOffset} assigned to the reader, or because a concurrent call to
     * {@link #splitAtFraction} has changed the source to shrink the offset range being read.
     *
     * @see BoundedReader#start
     */
    protected abstract boolean startImpl() throws IOException;

    /**
     * Advances to the next record and returns {@code true}, or returns false if there is no next
     * record.
     *
     * <p>This function is the {@code OffsetBasedReader} implementation of
     * {@link BoundedReader#advance}. The key difference is that the implementor can ignore the
     * possibility that it should no longer produce the next record, either because it has exceeded
     * the original {@code endOffset} assigned to the reader, or because a concurrent call to
     * {@link #splitAtFraction} has changed the source to shrink the offset range being read.
     *
     * @see BoundedReader#advance
     */
    protected abstract boolean advanceImpl() throws IOException;

    /**
     * Returns the source currently being read. Synchronized on this reader, as is
     * {@link #splitAtFraction}, which is the only method that replaces the source.
     */
    @Override
    public synchronized OffsetBasedSource<T> getCurrentSource() {
      return source;
    }

    /** Returns the fraction of the range consumed so far, as reported by the range tracker. */
    @Override
    public Double getFractionConsumed() {
      return rangeTracker.getFractionConsumed();
    }

    /** Returns the number of split points processed so far, as reported by the range tracker. */
    @Override
    public long getSplitPointsConsumed() {
      return rangeTracker.getSplitPointsProcessed();
    }

    /**
     * Returns the number of split points remaining, including the current one, where it can be
     * determined: {@code 0} once done, {@code 1} when the reader cannot be split dynamically or
     * the next element must lie outside the range, and the superclass default otherwise.
     */
    @Override
    public long getSplitPointsRemaining() {
      if (isDone()) {
        return 0;
      } else if (!isStarted()) {
        // Note that even if the current source does not allow splitting, we don't know that
        // it's non-empty so we return UNKNOWN instead of 1.
        return BoundedReader.SPLIT_POINTS_UNKNOWN;
      } else if (!allowsDynamicSplitting()) {
        // Started (so non-empty) and unsplittable, so only the current task.
        return 1;
      } else if (getCurrentOffset() >= rangeTracker.getStopPosition() - 1) {
        // If this is true, the next element is outside the range. Note that even getCurrentOffset()
        // might be larger than the stop position when the current record is not a split point.
        return 1;
      } else {
        // Use the default.
        return super.getSplitPointsRemaining();
      }
    }

    /**
     * Whether this reader should allow dynamic splitting of the offset ranges.
     *
     * <p>True by default. Override this to return false if the reader cannot
     * support dynamic splitting correctly. If this returns false,
     * {@link OffsetBasedReader#splitAtFraction} will refuse all split requests.
     */
    public boolean allowsDynamicSplitting() {
      return true;
    }

    /**
     * Attempts to split the current source at the offset corresponding to {@code fraction} of the
     * range.
     *
     * <p>The split is refused (returning {@code null}) when dynamic splitting is disabled, when
     * the range tracker's stop position is {@link Long#MAX_VALUE}, when the computed split offset
     * would leave an empty primary or residual range, or when the range tracker rejects the
     * offset. On success the reader continues with the primary {@code [start, splitOffset)}.
     *
     * @param fraction fraction of the range at which to split.
     * @return the residual source {@code [splitOffset, end)}, or {@code null} if the split was
     *         refused.
     */
    @Override
    public final synchronized OffsetBasedSource<T> splitAtFraction(double fraction) {
      if (!allowsDynamicSplitting()) {
        return null;
      }
      if (rangeTracker.getStopPosition() == Long.MAX_VALUE) {
        LOG.debug(
            "Refusing to split unbounded OffsetBasedReader {} at fraction {}",
            rangeTracker, fraction);
        return null;
      }
      long splitOffset = rangeTracker.getPositionForFractionConsumed(fraction);
      LOG.debug(
          "Proposing to split OffsetBasedReader {} at fraction {} (offset {})",
          rangeTracker, fraction, splitOffset);
      long start = source.getStartOffset();
      long end = source.getEndOffset();
      // createSourceForSubrange requires start < end for the subrange it is given. A split offset
      // on or outside either boundary would ask it for an empty (or inverted) primary or residual,
      // so refuse such a split instead of handing it an invalid range.
      if (splitOffset <= start || splitOffset >= end) {
        LOG.debug(
            "Refusing to split OffsetBasedReader {} at fraction {}: offset {} would leave an"
                + " empty primary or residual range",
            rangeTracker, fraction, splitOffset);
        return null;
      }
      // Build both sub-sources before touching the tracker, so that an exception from
      // createSourceForSubrange leaves the tracker's range unchanged.
      OffsetBasedSource<T> primary = source.createSourceForSubrange(start, splitOffset);
      OffsetBasedSource<T> residual = source.createSourceForSubrange(splitOffset, end);
      if (!rangeTracker.trySplitAtPosition(splitOffset)) {
        return null;
      }
      this.source = primary;
      return residual;
    }
  }
}