/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.io.UnboundedSource.UnboundedReader;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.SourceMetrics;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import org.joda.time.Instant;

/**
 * Most users should use {@link GenerateSequence} instead.
 *
 * <p>A source that produces longs. When used as a {@link BoundedSource}, {@link CountingSource}
 * starts at {@code 0} and counts up to a specified maximum. When used as an {@link
 * UnboundedSource}, it counts up to {@link Long#MAX_VALUE} and then never produces more output.
 * (In practice, this limit should never be reached.)
 *
 * <p>The bounded {@link CountingSource} is implemented based on {@link OffsetBasedSource} and
 * {@link OffsetBasedSource.OffsetBasedReader}, so it performs efficient initial splitting and it
 * supports dynamic work rebalancing.
 *
 * <p>To produce a bounded source, use {@link #createSourceForSubrange(long, long)}. To produce an
 * unbounded source, use {@link #createUnboundedFrom(long)}.
 */
public class CountingSource {
  /**
   * Creates a {@link BoundedSource} that will produce the specified number of elements,
   * from {@code 0} to {@code numElements - 1}.
   *
   * @param numElements the number of elements to produce; must be {@code >= 0}
   * @return a bounded source over the range {@code [0, numElements)}
   * @throws IllegalArgumentException if {@code numElements} is negative
   * @deprecated use {@link GenerateSequence} instead
   */
  @Deprecated
  public static BoundedSource<Long> upTo(long numElements) {
    checkArgument(
        numElements >= 0, "numElements (%s) must be greater than or equal to 0", numElements);
    return new BoundedCountingSource(0, numElements);
  }

  /**
   * Creates a {@link BoundedSource} that will produce elements
   * starting from {@code startIndex} (inclusive) to {@code endIndex} (exclusive).
   * If {@code startIndex == endIndex}, then no elements will be produced.
   *
   * @param startIndex the first value produced (inclusive)
   * @param endIndex the end of the range (exclusive); must be {@code >= startIndex}
   * @return a bounded source over the range {@code [startIndex, endIndex)}
   * @throws IllegalArgumentException if {@code endIndex < startIndex}
   */
  static BoundedSource<Long> createSourceForSubrange(long startIndex, long endIndex) {
    checkArgument(
        endIndex >= startIndex,
        "endIndex (%s) must be greater than or equal to startIndex (%s)",
        endIndex,
        startIndex);
    return new BoundedCountingSource(startIndex, endIndex);
  }

  /**
   * Create a new {@link UnboundedCountingSource}.
   *
   * <p>The returned source starts at {@code start}, uses a stride of {@code 1}, has a zero period
   * (no rate limiting) and timestamps elements with {@link NowTimestampFn}.
   *
   * @param start the first value the source produces
   */
  // package-private to return a typed UnboundedCountingSource rather than the UnboundedSource type.
  static UnboundedCountingSource createUnboundedFrom(long start) {
    return new UnboundedCountingSource(start, 1, 1L, Duration.ZERO, new NowTimestampFn());
  }

  /**
   * Creates an {@link UnboundedSource} that will produce numbers starting from {@code 0} up to
   * {@link Long#MAX_VALUE}.
   *
   * <p>After {@link Long#MAX_VALUE}, the source never produces more output. (In practice, this
   * limit should never be reached.)
   *
   * <p>Elements in the resulting {@link PCollection PCollection&lt;Long&gt;} will have timestamps
   * corresponding to processing time at element generation, provided by {@link Instant#now}.
   *
   * @deprecated use {@link GenerateSequence} instead
   */
  @Deprecated
  public static UnboundedSource<Long, CounterMark> unbounded() {
    return unboundedWithTimestampFn(new NowTimestampFn());
  }

  /**
   * Creates an {@link UnboundedSource} that will produce numbers starting from {@code 0} up to
   * {@link Long#MAX_VALUE}, with element timestamps supplied by the specified function.
   *
   * <p>After {@link Long#MAX_VALUE}, the source never produces more output. (In practice, this
   * limit should never be reached.)
   *
   * <p>Note that the timestamps produced by {@code timestampFn} may not decrease.
   *
   * @param timestampFn maps each generated value to its element timestamp; must not be null
   * @throws NullPointerException if {@code timestampFn} is null
   * @deprecated use {@link GenerateSequence} and call
   *     {@link GenerateSequence#withTimestampFn(SerializableFunction)} instead
   */
  @Deprecated
  public static UnboundedSource<Long, CounterMark> unboundedWithTimestampFn(
      SerializableFunction<Long, Instant> timestampFn) {
    return new UnboundedCountingSource(0, 1, 1L, Duration.ZERO, timestampFn);
  }

  /////////////////////////////////////////////////////////////////////////////////////////////

  /** Prevent instantiation. */
  private CountingSource() {}

  /**
   * A function that returns {@link Instant#now} as the timestamp for each generated element.
   */
  static class NowTimestampFn implements SerializableFunction<Long, Instant> {
    /** Ignores {@code input} and returns the current time. */
    @Override
    public Instant apply(Long input) {
      return Instant.now();
    }
  }

  /**
   * An implementation of {@link CountingSource} that produces a bounded {@link PCollection}.
   * It is implemented on top of {@link OffsetBasedSource} (with associated reader
   * {@link BoundedCountingReader}) and performs efficient initial splitting and supports dynamic
   * work rebalancing.
   */
  private static class BoundedCountingSource extends OffsetBasedSource<Long> {
    /**
     * Creates a {@link BoundedCountingSource} that generates the numbers in the specified
     * {@code [start, end)} range.
     */
    public BoundedCountingSource(long start, long end) {
      super(start, end, 1 /* can be split every 1 offset */);
    }

    ////////////////////////////////////////////////////////////////////////////////////////////

    /** Each offset is reported as 8 bytes. */
    @Override
    public long getBytesPerOffset() {
      return 8;
    }

    /** The maximum end offset is simply this source's own end offset. */
    @Override
    public long getMaxEndOffset(PipelineOptions options) throws Exception {
      return getEndOffset();
    }

    /** Returns a new {@link BoundedCountingSource} covering {@code [start, end)}. */
    @Override
    public OffsetBasedSource<Long> createSourceForSubrange(long start, long end) {
      return new BoundedCountingSource(start, end);
    }

    /** Returns a new {@link BoundedCountingReader} over this source. */
    @Override
    public org.apache.beam.sdk.io.BoundedSource.BoundedReader<Long> createReader(
        PipelineOptions options) throws IOException {
      return new BoundedCountingReader(this);
    }

    /** Elements are encoded with {@link VarLongCoder}. */
    @Override
    public Coder<Long> getDefaultOutputCoder() {
      return VarLongCoder.of();
    }
  }

  /**
   * The reader associated with {@link BoundedCountingSource}.
   *
   * @see BoundedCountingSource
   */
  private static class BoundedCountingReader extends OffsetBasedSource.OffsetBasedReader<Long> {
    /** The current value; it doubles as the current offset. */
    private long current;

    /** Counter obtained from {@link SourceMetrics#elementsRead()}; bumped in advanceImpl(). */
    private final Counter elementsRead = SourceMetrics.elementsRead();

    public BoundedCountingReader(OffsetBasedSource<Long> source) {
      super(source);
    }

    /** The offset of the current record is the current value itself. */
    @Override
    protected long getCurrentOffset() throws NoSuchElementException {
      return current;
    }

    /** Returns the distance from the current value to the end offset, never less than 0. */
    @Override
    public synchronized long getSplitPointsRemaining() {
      return Math.max(0, getCurrentSource().getEndOffset() - current);
    }

    /** Narrows the return type of the inherited accessor to {@link BoundedCountingSource}. */
    @Override
    public synchronized BoundedCountingSource getCurrentSource() {
      return (BoundedCountingSource) super.getCurrentSource();
    }

    @Override
    public Long getCurrent() throws NoSuchElementException {
      return current;
    }

    /** Positions the reader at the start offset of the current source. */
    @Override
    protected boolean startImpl() throws IOException {
      current = getCurrentSource().getStartOffset();
      return true;
    }

    /** Increments the elements-read counter and moves to the next value; always returns true. */
    @Override
    protected boolean advanceImpl() throws IOException {
      elementsRead.inc();
      current++;
      return true;
    }

    /** Nothing to release. */
    @Override
    public void close() throws IOException {}
  }

  /**
   * An implementation of {@link CountingSource} that produces an unbounded {@link PCollection}.
   */
  static class UnboundedCountingSource extends UnboundedSource<Long, CounterMark> {
    /**
     * The first number generated by this {@link UnboundedCountingSource}. Documented as
     * {@code >= 0}; the constructor does not validate this.
     */
    private final long start;
    /** The interval between numbers generated by this {@link UnboundedCountingSource}. */
    private final long stride;
    /** The number of elements to produce each period. */
    private final long elementsPerPeriod;
    /** The time between producing numbers from this {@link UnboundedCountingSource}. */
    private final Duration period;
    /** The function used to produce timestamps for the generated elements. */
    private final SerializableFunction<Long, Instant> timestampFn;

    /**
     * Creates an {@link UnboundedSource} that will produce numbers starting from {@code start},
     * each {@code stride} larger than the previous one, for as long as the next value does not
     * exceed {@link Long#MAX_VALUE}, with element timestamps supplied by the specified function.
     *
     * <p>After that limit, the source never produces more output. (In practice, this limit
     * should never be reached.)
     *
     * <p>Note that the timestamps produced by {@code timestampFn} may not decrease.
     *
     * @param start the first value to produce
     * @param stride the difference between consecutive values
     * @param elementsPerPeriod the number of elements to produce each period; must be positive
     * @param period the length of a period; must be non-negative, zero disables rate limiting
     * @param timestampFn maps each generated value to its element timestamp; must not be null
     * @throws IllegalArgumentException if {@code elementsPerPeriod <= 0} or {@code period} is
     *     negative
     * @throws NullPointerException if {@code period} or {@code timestampFn} is null
     */
    private UnboundedCountingSource(
        long start,
        long stride,
        long elementsPerPeriod,
        Duration period,
        SerializableFunction<Long, Instant> timestampFn) {
      this.start = start;
      this.stride = stride;
      checkArgument(
          elementsPerPeriod > 0L,
          "Must produce at least one element per period, got %s",
          elementsPerPeriod);
      this.elementsPerPeriod = elementsPerPeriod;
      checkNotNull(period, "period must not be null");
      checkArgument(
          period.getMillis() >= 0L, "Must have a non-negative period length, got %s", period);
      this.period = period;
      // Fail fast here rather than with a bare NullPointerException inside the reader's
      // advance()/getWatermark(); this matches the check already done by withTimestampFn.
      this.timestampFn = checkNotNull(timestampFn, "timestampFn must not be null");
    }

    /**
     * Returns an {@link UnboundedCountingSource} like this one with the specified rate:
     * {@code elementsPerPeriod} elements are expected per {@code period}.
     *
     * @param elementsPerPeriod the number of elements to produce each period; must be positive
     * @param period the length of a period; must be non-negative
     */
    public UnboundedCountingSource withRate(long elementsPerPeriod, Duration period) {
      return new UnboundedCountingSource(start, stride, elementsPerPeriod, period, timestampFn);
    }

    /**
     * Returns an {@link UnboundedCountingSource} like this one where the timestamp of output
     * elements are supplied by the specified function.
     *
     * <p>Note that timestamps produced by {@code timestampFn} may not decrease.
     *
     * @throws NullPointerException if {@code timestampFn} is null
     */
    public UnboundedCountingSource withTimestampFn(
        SerializableFunction<Long, Instant> timestampFn) {
      checkNotNull(timestampFn);
      return new UnboundedCountingSource(start, stride, elementsPerPeriod, period, timestampFn);
    }

    /**
     * Splits an unbounded source {@code desiredNumSplits} ways by giving each split every
     * {@code desiredNumSplits}th element that this {@link UnboundedCountingSource}
     * produces.
     *
     * <p>E.g., if a source produces all even numbers {@code [0, 2, 4, 6, 8, ...)} and we want to
     * split into 3 new sources, then the new sources will produce numbers that are 6 apart and
     * are offset at the start by the original stride: {@code [0, 6, 12, ...)},
     * {@code [2, 8, 14, ...)}, and {@code [4, 10, 16, ...)}.
     *
     * <p>Every split keeps this source's {@code elementsPerPeriod}, {@code period} and
     * {@code timestampFn}.
     */
    @Override
    public List<? extends UnboundedSource<Long, CountingSource.CounterMark>> split(
        int desiredNumSplits, PipelineOptions options) throws Exception {
      // Using Javadoc example, stride 2 with 3 splits becomes stride 6.
      long newStride = stride * desiredNumSplits;

      ImmutableList.Builder<UnboundedCountingSource> splits = ImmutableList.builder();
      for (int i = 0; i < desiredNumSplits; ++i) {
        // Starts offset by the original stride. Using Javadoc example, this generates starts of
        // 0, 2, and 4.
        splits.add(
            new UnboundedCountingSource(
                start + i * stride, newStride, elementsPerPeriod, period, timestampFn));
      }
      return splits.build();
    }

    /**
     * Returns a new {@link UnboundedCountingReader}; a null {@code checkpointMark} makes the
     * reader begin at {@code start}, otherwise it resumes after the mark's last emitted value.
     */
    @Override
    public UnboundedReader<Long> createReader(
        PipelineOptions options, CounterMark checkpointMark) {
      return new UnboundedCountingReader(this, checkpointMark);
    }

    /** Checkpoint marks are encoded with an {@link AvroCoder} for {@link CounterMark}. */
    @Override
    public Coder<CountingSource.CounterMark> getCheckpointMarkCoder() {
      return AvroCoder.of(CountingSource.CounterMark.class);
    }

    /** No additional validation; arguments are checked in the constructor. */
    @Override
    public void validate() {}

    /** Elements are encoded with {@link VarLongCoder}. */
    @Override
    public Coder<Long> getDefaultOutputCoder() {
      return VarLongCoder.of();
    }
  }

  /**
   * The reader associated with {@link UnboundedCountingSource}.
   *
   * @see UnboundedCountingSource
   */
  private static class UnboundedCountingReader extends UnboundedReader<Long> {
    private UnboundedCountingSource source;
    /** The last value emitted, or {@code start - stride} before anything has been emitted. */
    private long current;
    /** The timestamp computed for {@link #current} by the last successful advance. */
    private Instant currentTimestamp;
    /** When reading first began; restored from the checkpoint mark when resuming. */
    private Instant firstStarted;
    private final Counter elementsRead = SourceMetrics.elementsRead();

    public UnboundedCountingReader(UnboundedCountingSource source, CounterMark mark) {
      this.source = source;
      if (mark == null) {
        // Because we have not emitted an element yet, and start() calls advance, we need to
        // "un-advance" so that start() produces the correct output.
        this.current = source.start - source.stride;
      } else {
        this.current = mark.getLastEmitted();
        this.firstStarted = mark.getStartTime();
      }
    }

    /** Records the start time if none was restored from a checkpoint, then advances once. */
    @Override
    public boolean start() throws IOException {
      if (firstStarted == null) {
        this.firstStarted = Instant.now();
      }
      return advance();
    }

    /**
     * Moves to the next value if one is available.
     *
     * @return false if the next value would overflow past {@link Long#MAX_VALUE} or is larger
     *     than {@link #expectedValue()} allows at this moment; true otherwise
     */
    @Override
    public boolean advance() throws IOException {
      // Overflow-safe check that (current + source.stride) <= LONG.MAX_VALUE. Else, stop producing.
      if (Long.MAX_VALUE - source.stride < current) {
        return false;
      }
      long nextValue = current + source.stride;
      // Rate limiting: hold the value back until the elapsed time allows it.
      if (expectedValue() < nextValue) {
        return false;
      }
      elementsRead.inc();
      current = nextValue;
      currentTimestamp = source.timestampFn.apply(current);
      return true;
    }

    /**
     * Returns the largest value that may have been produced by now: {@link Long#MAX_VALUE} when
     * the period is zero, otherwise {@code elementsPerPeriod} times the (fractional) number of
     * periods elapsed since {@link #firstStarted}, truncated to a long.
     */
    private long expectedValue() {
      if (source.period.getMillis() == 0L) {
        return Long.MAX_VALUE;
      }
      double periodsElapsed =
          (Instant.now().getMillis() - firstStarted.getMillis())
              / (double) source.period.getMillis();
      return (long) (source.elementsPerPeriod * periodsElapsed);
    }

    /** The watermark is the timestamp function applied to the current value. */
    @Override
    public Instant getWatermark() {
      return source.timestampFn.apply(current);
    }

    /** Captures the last emitted value together with the original start time. */
    @Override
    public CounterMark getCheckpointMark() {
      return new CounterMark(current, firstStarted);
    }

    @Override
    public UnboundedSource<Long, CounterMark> getCurrentSource() {
      return source;
    }

    @Override
    public Long getCurrent() throws NoSuchElementException {
      return current;
    }

    @Override
    public Instant getCurrentTimestamp() throws NoSuchElementException {
      return currentTimestamp;
    }

    /** Nothing to release. */
    @Override
    public void close() throws IOException {}

    /**
     * Estimates the backlog as 8 bytes for every stride-sized step between the current value and
     * {@link #expectedValue()}, never less than 0.
     */
    @Override
    public long getSplitBacklogBytes() {
      long expected = expectedValue();
      // NOTE(review): with a zero period, expected is Long.MAX_VALUE and this long arithmetic can
      // wrap around; negative results are clamped to 0 below. Left unchanged because altering it
      // would change the backlog reported to runners - confirm the intended behavior.
      return Math.max(0L, 8 * (expected - current) / source.stride);
    }
  }

  /**
   * The checkpoint for an unbounded {@link CountingSource} is simply the last value produced. The
   * associated source object encapsulates the information needed to produce the next value.
   */
  @DefaultCoder(AvroCoder.class)
  public static class CounterMark implements UnboundedSource.CheckpointMark {
    /** The last value emitted. */
    private final long lastEmitted;
    /** The time the reader was started. */
    private final Instant startTime;

    /**
     * Creates a checkpoint mark reflecting the last emitted value.
     *
     * @param lastEmitted the last value emitted by the reader
     * @param startTime the time the reader was started
     */
    public CounterMark(long lastEmitted, Instant startTime) {
      this.lastEmitted = lastEmitted;
      this.startTime = startTime;
    }

    /**
     * Returns the last value emitted by the reader.
     */
    public long getLastEmitted() {
      return lastEmitted;
    }

    /**
     * Returns the time the reader was started.
     */
    public Instant getStartTime() {
      return startTime;
    }

    /////////////////////////////////////////////////////////////////////////////////////

    @SuppressWarnings("unused") // For AvroCoder
    private CounterMark() {
      this.lastEmitted = 0L;
      this.startTime = Instant.now();
    }

    /** Nothing to finalize. */
    @Override
    public void finalizeCheckpoint() throws IOException {}
  }
}