/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.transforms; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.Collections; import java.util.Map; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.display.DisplayData.Builder; import org.apache.beam.sdk.transforms.display.HasDisplayData; import org.apache.beam.sdk.util.NameUtils; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PInput; import org.apache.beam.sdk.values.POutput; import org.apache.beam.sdk.values.PValue; import org.apache.beam.sdk.values.TupleTag; /** * A {@code PTransform<InputT, OutputT>} is an operation that takes an * {@code InputT} (some subtype of {@link PInput}) and produces an * {@code OutputT} (some subtype of {@link POutput}). 
 *
 * <p>Common PTransforms include root PTransforms like
 * {@link org.apache.beam.sdk.io.TextIO.Read},
 * {@link Create}, processing and
 * conversion operations like {@link ParDo},
 * {@link GroupByKey},
 * {@link org.apache.beam.sdk.transforms.join.CoGroupByKey},
 * {@link Combine}, and {@link Count}, and outputting
 * PTransforms like
 * {@link org.apache.beam.sdk.io.TextIO.Write}. Users also
 * define their own application-specific composite PTransforms.
 *
 * <p>Each {@code PTransform<InputT, OutputT>} has a single
 * {@code InputT} type and a single {@code OutputT} type. Many
 * PTransforms conceptually transform one input value to one output
 * value, and in this case {@code InputT} and {@code OutputT} are
 * typically instances of
 * {@link org.apache.beam.sdk.values.PCollection}.
 * A root
 * PTransform conceptually has no input; in this case, conventionally
 * a {@link org.apache.beam.sdk.values.PBegin} object
 * produced by calling {@link Pipeline#begin} is used as the input.
 * An outputting PTransform conceptually has no output; in this case,
 * conventionally {@link org.apache.beam.sdk.values.PDone}
 * is used as its output type. Some PTransforms conceptually have
 * multiple inputs and/or outputs; in these cases special "bundling"
 * classes like
 * {@link org.apache.beam.sdk.values.PCollectionList},
 * {@link org.apache.beam.sdk.values.PCollectionTuple}
 * are used
 * to combine multiple values into a single bundle for passing into or
 * returning from the PTransform.
 *
 * <p>A {@code PTransform<InputT, OutputT>} is invoked by calling
 * {@code apply()} on its {@code InputT}, returning its {@code OutputT}.
 * Calls can be chained to concisely create linear pipeline segments.
* For example: * * <pre> {@code * PCollection<T1> pc1 = ...; * PCollection<T2> pc2 = * pc1.apply(ParDo.of(new MyDoFn<T1,KV<K,V>>())) * .apply(GroupByKey.<K, V>create()) * .apply(Combine.perKey(new MyKeyedCombineFn<K,V>())) * .apply(ParDo.of(new MyDoFn2<KV<K,V>,T2>())); * } </pre> * * <p>PTransform operations have unique names, which are used by the * system when explaining what's going on during optimization and * execution. Each PTransform gets a system-provided default name, * but it's a good practice to specify a more informative explicit * name when applying the transform. For example: * * <pre> {@code * ... * .apply("Step1", ParDo.of(new MyDoFn3())) * ... * } </pre> * * <p>Each PCollection output produced by a PTransform, * either directly or within a "bundling" class, automatically gets * its own name derived from the name of its producing PTransform. * * <p>Each PCollection output produced by a PTransform * also records a {@link org.apache.beam.sdk.coders.Coder} * that specifies how the elements of that PCollection * are to be encoded as a byte string, if necessary. The * PTransform may provide a default Coder for any of its outputs, for * instance by deriving it from the PTransform input's Coder. If the * PTransform does not specify the Coder for an output PCollection, * the system will attempt to infer a Coder for it, based on * what's known at run-time about the Java type of the output's * elements. The enclosing {@link Pipeline}'s * {@link org.apache.beam.sdk.coders.CoderRegistry} * (accessible via {@link Pipeline#getCoderRegistry}) defines the * mapping from Java types to the default Coder to use, for a standard * set of Java types; users can extend this mapping for additional * types, via * {@link org.apache.beam.sdk.coders.CoderRegistry#registerCoderProvider}. 
 * If this inference process fails, either because the Java type was
 * not known at run-time (e.g., due to Java's "erasure" of generic
 * types) or there was no default Coder registered, then the Coder
 * should be specified manually by calling
 * {@link PCollection#setCoder}
 * on the output PCollection. The Coder of every output
 * PCollection must be determined one way or another
 * before that output is used as an input to another PTransform, or
 * before the enclosing Pipeline is run.
 *
 * <p>A small number of PTransforms are implemented natively by the
 * Apache Beam SDK; such PTransforms simply return an
 * output value as their apply implementation.
 * The majority of PTransforms are
 * implemented as composites of other PTransforms. Such a PTransform
 * subclass typically just implements {@link #expand}, computing its
 * {@code OutputT} value from its {@code InputT} value. User programs are encouraged to
 * use this mechanism to modularize their own code. Such composite
 * abstractions get their own name, and navigating through the
 * composition hierarchy of PTransforms is supported by the monitoring
 * interface. Examples of composite PTransforms can be found in this
 * directory and in examples. From the caller's point of view, there
 * is no distinction between a PTransform implemented natively and one
 * implemented in terms of other PTransforms; both kinds of PTransform
 * are invoked in the same way, using {@code apply()}.
 *
 * <h3>Note on Serialization</h3>
 *
 * <p>{@code PTransform} doesn't actually support serialization, despite
 * implementing {@code Serializable}.
 *
 * <p>{@code PTransform} is marked {@code Serializable} solely
 * because it is common for an anonymous {@link DoFn}
 * instance to be created within an
 * {@code apply()} method of a composite {@code PTransform}.
* * <p>Each of those {@code *Fn}s is {@code Serializable}, but * unfortunately its instance state will contain a reference to the * enclosing {@code PTransform} instance, and so attempt to serialize * the {@code PTransform} instance, even though the {@code *Fn} * instance never references anything about the enclosing * {@code PTransform}. * * <p>To allow such anonymous {@code *Fn}s to be written * conveniently, {@code PTransform} is marked as {@code Serializable}, * and includes dummy {@code writeObject()} and {@code readObject()} * operations that do not save or restore any state. * * @see <a href= * "https://beam.apache.org/documentation/programming-guide/#transforms" * >Applying Transformations</a> * * @param <InputT> the type of the input to this PTransform * @param <OutputT> the type of the output of this PTransform */ public abstract class PTransform<InputT extends PInput, OutputT extends POutput> implements Serializable /* See the note above */, HasDisplayData { /** * Applies this {@code PTransform} on the given {@code InputT}, and returns its * {@code Output}. * * <p>Composite transforms, which are defined in terms of other transforms, * should return the output of one of the composed transforms. Non-composite * transforms, which do not apply any transforms internally, should return * a new unbound output and register evaluators (via backend-specific * registration methods). */ public abstract OutputT expand(InputT input); /** * Called before running the Pipeline to verify this transform is fully and correctly * specified. * * <p>By default, does nothing. */ public void validate(PipelineOptions options) {} /** * Returns all {@link PValue PValues} that are consumed as inputs to this {@link PTransform} that * are independent of the expansion of the {@link InputT} within {@link #expand(PInput)}. * * <p>For example, this can contain any side input consumed by this {@link PTransform}. 
*/ public Map<TupleTag<?>, PValue> getAdditionalInputs() { return Collections.emptyMap(); } /** * Returns the transform name. * * <p>This name is provided by the transform creator and is not required to be unique. */ public String getName() { return name != null ? name : getKindString(); } ///////////////////////////////////////////////////////////////////////////// // See the note about about PTransform's fake Serializability, to // understand why all of its instance state is transient. /** * The base name of this {@code PTransform}, e.g., from defaults, or * {@code null} if not yet assigned. */ protected final transient String name; protected PTransform() { this.name = null; } protected PTransform(String name) { this.name = name; } @Override public String toString() { if (name == null) { return getKindString(); } else { return getName() + " [" + getKindString() + "]"; } } /** * Returns the name to use by default for this {@code PTransform} * (not including the names of any enclosing {@code PTransform}s). * * <p>By default, returns the base name of this {@code PTransform}'s class. * * <p>The caller is responsible for ensuring that names of applied * {@code PTransform}s are unique, e.g., by adding a uniquifying * suffix when needed. */ protected String getKindString() { if (getClass().isAnonymousClass()) { return "AnonymousTransform"; } else { return NameUtils.approximatePTransformName(getClass()); } } private void writeObject(ObjectOutputStream oos) { // We don't really want to be serializing this object, but we // often have serializable anonymous DoFns nested within a // PTransform. } private void readObject(ObjectInputStream oos) { // We don't really want to be serializing this object, but we // often have serializable anonymous DoFns nested within a // PTransform. } /** * Returns the default {@code Coder} to use for the output of this * single-output {@code PTransform}. 
* * <p>By default, always throws * * @throws CannotProvideCoderException if no coder can be inferred */ protected Coder<?> getDefaultOutputCoder() throws CannotProvideCoderException { throw new CannotProvideCoderException("PTransform.getOutputCoder called."); } /** * Returns the default {@code Coder} to use for the output of this * single-output {@code PTransform} when applied to the given input. * * <p>By default, always throws. * * @throws CannotProvideCoderException if none can be inferred. */ protected Coder<?> getDefaultOutputCoder(@SuppressWarnings("unused") InputT input) throws CannotProvideCoderException { return getDefaultOutputCoder(); } /** * Returns the default {@code Coder} to use for the given output of * this single-output {@code PTransform} when applied to the given input. * * <p>By default, always throws. * * @throws CannotProvideCoderException if none can be inferred. */ public <T> Coder<T> getDefaultOutputCoder( InputT input, @SuppressWarnings("unused") PCollection<T> output) throws CannotProvideCoderException { @SuppressWarnings("unchecked") Coder<T> defaultOutputCoder = (Coder<T>) getDefaultOutputCoder(input); return defaultOutputCoder; } /** * {@inheritDoc} * * <p>By default, does not register any display data. Implementors may override this method * to provide their own display data. */ @Override public void populateDisplayData(Builder builder) {} }