/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.values;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import javax.annotation.Nullable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.annotations.Internal;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.CannotProvideCoderException.ReasonCode;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.io.UnboundedSource.UnboundedReader;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
/**
* A {@link PCollection PCollection&lt;T&gt;} is an immutable collection of values of type
* {@code T}. A {@link PCollection} can contain either a bounded or unbounded
* number of elements. Bounded and unbounded {@link PCollection PCollections} are produced
* as the output of {@link PTransform PTransforms}
* (including root PTransforms like {@link Read} and {@link Create}), and can
* be passed as the inputs of other PTransforms.
*
* <p>Some root transforms produce bounded {@code PCollections} and others
* produce unbounded ones. For example, {@link GenerateSequence#from} with
* {@link GenerateSequence#to} produces a fixed set of integers, so it produces a bounded
* {@link PCollection}. {@link GenerateSequence#from} without a {@link GenerateSequence#to}
* produces all integers as an infinite stream, so it produces an unbounded {@link PCollection}.
*
* <p>Each element in a {@link PCollection} has an associated timestamp. Readers assign timestamps
* to elements when they create {@link PCollection PCollections}, and other
* {@link PTransform PTransforms} propagate these timestamps from their input to their output. See
* the documentation on {@link BoundedReader} and {@link UnboundedReader} for more information on
* how these readers produce timestamps and watermarks.
*
* <p>Additionally, a {@link PCollection} has an associated
* {@link WindowFn} and each element is assigned to a set of windows.
* By default, the windowing function is {@link GlobalWindows}
* and all elements are assigned into a single default window.
* This default can be overridden with the {@link Window}
* {@link PTransform}.
*
* <p>See the individual {@link PTransform} subclasses for specific information
* on how they propagate timestamps and windowing.
*
* @param <T> the type of the elements of this {@link PCollection}
*/
public class PCollection<T> extends PValueBase implements PValue {
/**
* The {@link Coder} used by this {@link PCollection} to encode and decode the values stored in
* it, paired with the message to report when no {@link Coder} is available.
*
* <p>Initially holds no coder and a default failure message. It is replaced by
* {@link #setCoder} and by the inference performed in {@link #finishSpecifyingOutput} and
* {@link #finishSpecifying}.
*/
private CoderOrFailure<T> coderOrFailure =
new CoderOrFailure<>(null, "No Coder was specified, and Coder Inference did not occur");
/**
* Reflective type information about {@code T}; remains {@code null} until
* {@link #setTypeDescriptor} assigns it.
*/
private TypeDescriptor<T> typeDescriptor;
/**
 * Resolves the {@link Coder} of this output before delegating to the superclass.
 *
 * <p>A coder that is already present is kept. Otherwise inference is attempted and its outcome,
 * either a coder or a failure message, is stored; a failed inference is recorded here rather
 * than reported.
 *
 * @param transformName forwarded unchanged to the superclass
 * @param input the input passed on to coder inference
 * @param transform the transform passed on to coder inference
 */
@Override
public void finishSpecifyingOutput(
    String transformName, PInput input, PTransform<?, ?> transform) {
  CoderRegistry registry = getPipeline().getCoderRegistry();
  coderOrFailure = inferCoderOrFail(input, transform, registry);
  super.finishSpecifyingOutput(transformName, input, transform);
}
/**
 * After building, finalizes this {@link PValue} to make it ready for
 * running. Automatically invoked whenever the {@link PValue} is "used"
 * (e.g., when apply() is called on it) and when the Pipeline is
 * run (useful if this is a {@link PValue} with no consumers).
 *
 * <p>Does nothing if this {@link PCollection} has already finished specifying.
 *
 * @throws IllegalStateException if no {@link Coder} was set and none could be inferred
 */
@Override
public void finishSpecifying(PInput input, PTransform<?, ?> transform) {
  if (!isFinishedSpecifying()) {
    CoderRegistry registry = getPipeline().getCoderRegistry();
    coderOrFailure = inferCoderOrFail(input, transform, registry);
    // Fail fast: getCoder() throws when neither an explicit nor an inferred Coder is available.
    getCoder();
    super.finishSpecifying(input, transform);
  }
}
/**
* Returns a {@link TypeDescriptor TypeDescriptor&lt;T&gt;} with some reflective information
* about {@code T}, if possible. May return {@code null} if no information
* is available. Subclasses may override this to enable better
* {@code Coder} inference.
*
* @return the descriptor assigned through {@link #setTypeDescriptor}, or {@code null} if none
*     has been assigned
*/
public TypeDescriptor<T> getTypeDescriptor() {
return typeDescriptor;
}
/**
* Determines the {@link Coder} for this {@link PCollection}, or a message explaining why none
* could be determined.
*
* <p>Sources are tried in this order, and the first one that yields a result wins:
* <ol>
* <li>the coder already held by this {@link PCollection};
* <li>the default output coder reported by {@code transform};
* <li>the {@link CoderRegistry} entry for {@link #getTypeDescriptor()}, skipped when the
* descriptor is {@code null}.
* </ol>
*
* <p>This method does not throw when every source fails; it returns a result whose coder is
* {@code null} and whose failure message lists the causes.
*
* @param input the input handed to the transform's {@code getDefaultOutputCoder}
* @param transform the producing transform that is asked for a default output coder
* @param registry the registry consulted with this collection's {@link TypeDescriptor}
* @return a holder with the coder that was found, or with a {@code null} coder and a failure
*     message
*/
@SuppressWarnings({"unchecked", "rawtypes"})
private CoderOrFailure<T> inferCoderOrFail(
PInput input, PTransform<?, ?> transform, CoderRegistry registry) {
// First option for a coder: use the Coder set on this PValue.
if (coderOrFailure.coder != null) {
return coderOrFailure;
}
// Second option for a coder: use the default Coder from the producing PTransform.
CannotProvideCoderException inputCoderException;
try {
// The raw PTransform cast is what the class-level @SuppressWarnings above covers.
return new CoderOrFailure<>(
((PTransform) transform).getDefaultOutputCoder(input, this), null);
} catch (CannotProvideCoderException exc) {
inputCoderException = exc;
}
// Third option for a coder: Look in the coder registry.
TypeDescriptor<T> token = getTypeDescriptor();
CannotProvideCoderException inferFromTokenException = null;
if (token != null) {
try {
return new CoderOrFailure<>(registry.getCoder(token), null);
} catch (CannotProvideCoderException exc) {
inferFromTokenException = exc;
// Attempt to detect when the token came from a TupleTag used for a ParDo output,
// and provide a better error message if so. Unfortunately, this information is not
// directly available from the TypeDescriptor, so infer based on the type of the PTransform
// and the error message itself.
if (transform instanceof ParDo.MultiOutput
&& exc.getReason() == ReasonCode.TYPE_ERASURE) {
// Only the message of this replacement exception is used below.
inferFromTokenException = new CannotProvideCoderException(exc.getMessage()
+ " If this error occurs for an output of the producing ParDo, verify that the "
+ "TupleTag for this output is constructed with proper type information (see "
+ "TupleTag Javadoc) or explicitly set the Coder to use if this is not possible.");
}
}
}
// Build up the error message and list of causes.
StringBuilder messageBuilder = new StringBuilder()
.append("Unable to return a default Coder for ").append(this)
.append(". Correct one of the following root causes:");
// Not an exception, but remind the user that .setCoder() has not been called.
messageBuilder.append("\n No Coder has been manually specified; ")
.append(" you may do so using .setCoder().");
if (inferFromTokenException != null) {
messageBuilder
.append("\n Inferring a Coder from the CoderRegistry failed: ")
.append(inferFromTokenException.getMessage());
}
// Always non-null at this point: execution only gets here when the try block above threw.
if (inputCoderException != null) {
messageBuilder
.append("\n Using the default output Coder from the producing PTransform failed: ")
.append(inputCoderException.getMessage());
}
// Nothing is thrown here: the collected message is returned as the failure half of the result.
return new CoderOrFailure<>(null, messageBuilder.toString());
}
/**
 * Whether a {@link PCollection} holds a bounded or an unbounded number of elements.
 */
public enum IsBounded {
  /** The {@link PCollection} contains a bounded number of elements. */
  BOUNDED,
  /** The {@link PCollection} contains an unbounded number of elements. */
  UNBOUNDED;

  /**
   * Combines this property with {@code that}.
   *
   * <p>The result is {@link #BOUNDED} only when both operands are {@link #BOUNDED}; in every
   * other case it is {@link #UNBOUNDED}.
   *
   * @param that the property to combine with this one
   * @return the composed property
   */
  public IsBounded and(IsBounded that) {
    boolean bothBounded = this == BOUNDED && that == BOUNDED;
    return bothBounded ? BOUNDED : UNBOUNDED;
  }
}
/**
* Returns the name of this {@link PCollection}.
*
* <p>By default, the name of a {@link PCollection} is based on the name of the
* {@link PTransform} that produces it. It can be specified explicitly by
* calling {@link #setName}.
*
* @return the name reported by the superclass
* @throws IllegalStateException if the name hasn't been set yet
*/
@Override
public String getName() {
// Pure delegation; the override adds no behavior of its own.
return super.getName();
}
/**
* Sets the name of this {@link PCollection}. Returns {@code this}.
*
* @param name the new name, forwarded to the superclass
* @return this {@link PCollection}, so that calls can be chained
* @throws IllegalStateException if this {@link PCollection} has already been
* finalized and may no longer be set.
* Once {@link #apply} has been called, this will be the case.
*/
@Override
public PCollection<T> setName(String name) {
super.setName(name);
return this;
}
/**
 * Returns the {@link Coder} this {@link PCollection} uses to encode and decode its values.
 *
 * @return the coder that was set explicitly or inferred
 * @throws IllegalStateException if the {@link Coder} hasn't been set, and
 *     couldn't be inferred; the message is the recorded failure description
 */
public Coder<T> getCoder() {
  Coder<T> coder = coderOrFailure.coder;
  checkState(coder != null, coderOrFailure.failure);
  return coder;
}
/**
 * Sets the {@link Coder} used by this {@link PCollection} to encode and decode the
 * values stored in it. Returns {@code this}.
 *
 * @param coder the coder to use; must not be {@code null}
 * @return this {@link PCollection}, so that calls can be chained
 * @throws IllegalStateException if this {@link PCollection} has already
 *     been finalized and may no longer be set.
 *     Once {@link #apply} has been called, this will be the case.
 * @throws IllegalArgumentException if {@code coder} is {@code null}
 */
public PCollection<T> setCoder(Coder<T> coder) {
  // The finished-specifying check deliberately runs before the null check.
  boolean stillMutable = !isFinishedSpecifying();
  checkState(stillMutable, "cannot change the Coder of %s once it's been used", this);
  checkArgument(coder != null, "Cannot setCoder(null)");
  coderOrFailure = new CoderOrFailure<>(coder, null);
  return this;
}
/**
* Like {@link #apply(String, PTransform)} but defaulting to the name
* of the {@link PTransform}.
*
* @param t the transform to apply to this {@link PCollection}
* @return the output of the applied {@link PTransform}
*/
public <OutputT extends POutput> OutputT apply(PTransform<? super PCollection<T>, OutputT> t) {
return Pipeline.applyTransform(this, t);
}
/**
* Applies the given {@link PTransform} to this input {@link PCollection},
* using {@code name} to identify this specific application of the transform.
* This name is used in various places, including the monitoring UI, logging,
* and to stably identify this application node in the job graph.
*
* @param name the name identifying this application of the transform
* @param t the transform to apply to this {@link PCollection}
* @return the output of the applied {@link PTransform}
*/
public <OutputT extends POutput> OutputT apply(
String name, PTransform<? super PCollection<T>, OutputT> t) {
return Pipeline.applyTransform(name, this, t);
}
/**
* Returns the {@link WindowingStrategy} of this {@link PCollection}.
*
* @return the strategy last assigned through {@link #setWindowingStrategyInternal}
*/
public WindowingStrategy<?, ?> getWindowingStrategy() {
return windowingStrategy;
}
/**
* Returns whether this {@link PCollection} is bounded or unbounded.
*
* @return the {@link IsBounded} value last assigned through {@link #setIsBoundedInternal}
*/
public IsBounded isBounded() {
return isBounded;
}
/////////////////////////////////////////////////////////////////////////////
// Internal details below here.
/**
* {@link WindowingStrategy} that will be used for merging windows and triggering output in this
* {@link PCollection} and subsequent {@link PCollection PCollections} produced from this one.
*
* <p>By default, no merging is performed.
*/
private WindowingStrategy<?, ?> windowingStrategy;
/**
* Whether this {@link PCollection} is bounded or unbounded; assigned through
* {@link #setIsBoundedInternal}.
*/
private IsBounded isBounded;
/**
* Creates a {@link PCollection} for the given {@link Pipeline}.
*
* <p>Private: within this class, instances are created only by
* {@link #createPrimitiveOutputInternal}.
*
* @param p the pipeline passed to the superclass constructor
*/
private PCollection(Pipeline p) {
super(p);
}
/**
* Sets the {@link TypeDescriptor TypeDescriptor&lt;T&gt;} for this
* {@link PCollection PCollection&lt;T&gt;}. This may allow the enclosing
* {@link PCollectionTuple}, {@link PCollectionList}, or {@code PTransform<?, PCollection<T>>},
* etc., to provide more detailed reflective information.
*
* @param typeDescriptor the descriptor to record; stored as given, without validation
* @return this {@link PCollection}, so that calls can be chained
*/
public PCollection<T> setTypeDescriptor(TypeDescriptor<T> typeDescriptor) {
this.typeDescriptor = typeDescriptor;
return this;
}
/**
* <b><i>For internal use only; no backwards-compatibility guarantees.</i></b>
*
* <p>Replaces the {@link WindowingStrategy} of this {@link PCollection}.
*
* @param windowingStrategy the strategy to record; stored as given, without validation
* @return this {@link PCollection}, so that calls can be chained
*/
@Internal
public PCollection<T> setWindowingStrategyInternal(WindowingStrategy<?, ?> windowingStrategy) {
this.windowingStrategy = windowingStrategy;
return this;
}
/**
* <b><i>For internal use only; no backwards-compatibility guarantees.</i></b>
*
* <p>Records whether this {@link PCollection} is bounded or unbounded.
*
* @param isBounded the value to record; stored as given, without validation
* @return this {@link PCollection}, so that calls can be chained
*/
@Internal
public PCollection<T> setIsBoundedInternal(IsBounded isBounded) {
this.isBounded = isBounded;
return this;
}
/**
 * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b>
 *
 * <p>Creates a new {@link PCollection} in {@code pipeline} and assigns it the given windowing
 * strategy and boundedness. No {@link Coder} or {@link TypeDescriptor} is set on the result.
 *
 * @param pipeline the pipeline the new collection belongs to
 * @param windowingStrategy the windowing strategy to assign
 * @param isBounded whether the new collection is bounded or unbounded
 * @return the newly created {@link PCollection}
 */
@Internal
public static <T> PCollection<T> createPrimitiveOutputInternal(
    Pipeline pipeline,
    WindowingStrategy<?, ?> windowingStrategy,
    IsBounded isBounded) {
  PCollection<T> output = new PCollection<>(pipeline);
  output.setWindowingStrategyInternal(windowingStrategy);
  output.setIsBoundedInternal(isBounded);
  return output;
}
/**
* Immutable pair of an optional {@link Coder} and an optional failure message.
*
* <p>Within this file it is created either with a coder and a {@code null} failure, or with a
* {@code null} coder and a message describing why no coder is available.
*/
private static class CoderOrFailure<T> {
// The coder, or null when none has been set or inferred.
@Nullable private final Coder<T> coder;
// The explanation to report when coder is null.
@Nullable private final String failure;
public CoderOrFailure(@Nullable Coder<T> coder, @Nullable String failure) {
this.coder = coder;
this.failure = failure;
}
}
}