/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.transforms; import java.util.List; import java.util.Map; import org.apache.beam.sdk.PipelineRunner; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.runners.TransformHierarchy.Node; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.PCollectionViews; /** * Transforms for creating {@link PCollectionView PCollectionViews} from * {@link PCollection PCollections} (to read them as side inputs). * * <p>While a {@link PCollection PCollection<ElemT>} has many values of type {@code ElemT} per * window, a {@link PCollectionView PCollectionView<ViewT>} has a single value of type * {@code ViewT} for each window. It can be thought of as a mapping from windows to values of * type {@code ViewT}. The transforms here represent ways of converting the {@code ElemT} values * in a window into a {@code ViewT} for that window. * * <p>When a {@link ParDo} tranform is processing a main input * element in a window {@code w} and a {@link PCollectionView} is read via * {@link DoFn.ProcessContext#sideInput}, the value of the view for {@code w} is * returned. * * <p>The SDK supports viewing a {@link PCollection}, per window, as a single value, * a {@link List}, an {@link Iterable}, a {@link Map}, or a multimap (iterable-valued {@link Map}). * * <p>For a {@link PCollection} that contains a single value of type {@code T} * per window, such as the output of {@link Combine#globally}, * use {@link View#asSingleton()} to prepare it for use as a side input: * * <pre> * {@code * PCollectionView<T> output = someOtherPCollection * .apply(Combine.globally(...)) * .apply(View.<T>asSingleton()); * } * </pre> * * <p>For a small {@link PCollection} with windows that can fit entirely in memory, * use {@link View#asList()} to prepare it for use as a {@code List}. * When read as a side input, the entire list for a window will be cached in memory. * * <pre> * {@code * PCollectionView<List<T>> output = * smallPCollection.apply(View.<T>asList()); * } * </pre> * * <p>If a {@link PCollection} of {@code KV<K, V>} is known to * have a single value per window for each key, then use {@link View#asMap()} * to view it as a {@code Map<K, V>}: * * <pre> * {@code * PCollectionView<Map<K, V> output = * somePCollection.apply(View.<K, V>asMap()); * } * </pre> * * <p>Otherwise, to access a {@link PCollection} of {@code KV<K, V>} as a * {@code Map<K, Iterable<V>>} side input, use {@link View#asMultimap()}: * * <pre> * {@code * PCollectionView<Map<K, Iterable<V>> output = * somePCollection.apply(View.<K, Iterable<V>>asMap()); * } * </pre> * * <p>To iterate over an entire window of a {@link PCollection} via * side input, use {@link View#asIterable()}: * * <pre> * {@code * PCollectionView<Iterable<T>> output = * somePCollection.apply(View.<T>asIterable()); * } * </pre> * * * <p>Both {@link View#asMultimap()} and {@link View#asMap()} are useful * for implementing lookup based "joins" with the main input, when the * side input is small enough to fit into memory. * * <p>For example, if you represent a page on a website via some {@code Page} object and * have some type {@code UrlVisits} logging that a URL was visited, you could convert these * to more fully structured {@code PageVisit} objects using a side input, something like the * following: * * <pre> * {@code * PCollection<Page> pages = ... // pages fit into memory * PCollection<UrlVisit> urlVisits = ... // very large collection * final PCollectionView<Map<URL, Page>> = urlToPage * .apply(WithKeys.of( ... )) // extract the URL from the page * .apply(View.<URL, Page>asMap()); * * PCollection PageVisits = urlVisits * .apply(ParDo.withSideInputs(urlToPage) * .of(new DoFn<UrlVisit, PageVisit>() { * {@literal @}Override * void processElement(ProcessContext context) { * UrlVisit urlVisit = context.element(); * Page page = urlToPage.get(urlVisit.getUrl()); * c.output(new PageVisit(page, urlVisit.getVisitData())); * } * })); * } * </pre> * * <p>See {@link ParDo.SingleOutput#withSideInputs} for details on how to access * this variable inside a {@link ParDo} over another {@link PCollection}. */ public class View { // Do not instantiate private View() { } /** * Returns a {@link AsSingleton} transform that takes a * {@link PCollection} with a single value per window * as input and produces a {@link PCollectionView} that returns * the value in the main input window when read as a side input. * * <pre> * {@code * PCollection<InputT> input = ... * CombineFn<InputT, OutputT> yourCombineFn = ... * PCollectionView<OutputT> output = input * .apply(Combine.globally(yourCombineFn)) * .apply(View.<OutputT>asSingleton()); * }</pre> * * <p>If the input {@link PCollection} is empty, * throws {@link java.util.NoSuchElementException} in the consuming * {@link DoFn}. * * <p>If the input {@link PCollection} contains more than one * element, throws {@link IllegalArgumentException} in the * consuming {@link DoFn}. */ public static <T> AsSingleton<T> asSingleton() { return new AsSingleton<>(); } /** * Returns a {@link View.AsList} transform that takes a {@link PCollection} and returns a * {@link PCollectionView} mapping each window to a {@link List} containing * all of the elements in the window. * * <p>The resulting list is required to fit in memory. */ public static <T> AsList<T> asList() { return new AsList<>(); } /** * Returns a {@link View.AsIterable} transform that takes a {@link PCollection} as input * and produces a {@link PCollectionView} mapping each window to an * {@link Iterable} of the values in that window. * * <p>The values of the {@link Iterable} for a window are not required to fit in memory, * but they may also not be effectively cached. If it is known that every window fits in memory, * and stronger caching is desired, use {@link #asList}. */ public static <T> AsIterable<T> asIterable() { return new AsIterable<>(); } /** * Returns a {@link View.AsMap} transform that takes a * {@link PCollection PCollection<KV<K, V>>} as * input and produces a {@link PCollectionView} mapping each window to * a {@link Map Map<K, V>}. It is required that each key of the input be * associated with a single value, per window. If this is not the case, precede this * view with {@code Combine.perKey}, as in the example below, or alternatively * use {@link View#asMultimap()}. * * <pre> * {@code * PCollection<KV<K, V>> input = ... * CombineFn<V, OutputT> yourCombineFn = ... * PCollectionView<Map<K, OutputT>> output = input * .apply(Combine.perKey(yourCombineFn)) * .apply(View.<K, OutputT>asMap()); * }</pre> * * <p>Currently, the resulting map is required to fit into memory. */ public static <K, V> AsMap<K, V> asMap() { return new AsMap<K, V>(); } /** * Returns a {@link View.AsMultimap} transform that takes a * {@link PCollection PCollection<KV<K, V>>} * as input and produces a {@link PCollectionView} mapping * each window to its contents as a {@link Map Map<K, Iterable<V>>} * for use as a side input. * In contrast to {@link View#asMap()}, it is not required that the keys in the * input collection be unique. * * <pre> * {@code * PCollection<KV<K, V>> input = ... // maybe more than one occurrence of a some keys * PCollectionView<Map<K, V>> output = input.apply(View.<K, V>asMultimap()); * }</pre> * * <p>Currently, the resulting map is required to fit into memory. */ public static <K, V> AsMultimap<K, V> asMultimap() { return new AsMultimap<K, V>(); } /** * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> * * <p>Public only so a {@link PipelineRunner} may override its behavior. * * <p>See {@link View#asList()}. */ @Internal public static class AsList<T> extends PTransform<PCollection<T>, PCollectionView<List<T>>> { private AsList() { } @Override public PCollectionView<List<T>> expand(PCollection<T> input) { try { GroupByKey.applicableTo(input); } catch (IllegalStateException e) { throw new IllegalStateException("Unable to create a side-input view from input", e); } return input.apply(CreatePCollectionView.<T, List<T>>of(PCollectionViews.listView( input, input.getWindowingStrategy(), input.getCoder()))); } } /** * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> * * <p>Public only so a {@link PipelineRunner} may override its behavior. * * <p>See {@link View#asIterable()}. */ @Internal public static class AsIterable<T> extends PTransform<PCollection<T>, PCollectionView<Iterable<T>>> { private AsIterable() { } @Override public PCollectionView<Iterable<T>> expand(PCollection<T> input) { try { GroupByKey.applicableTo(input); } catch (IllegalStateException e) { throw new IllegalStateException("Unable to create a side-input view from input", e); } return input.apply(CreatePCollectionView.<T, Iterable<T>>of(PCollectionViews.iterableView( input, input.getWindowingStrategy(), input.getCoder()))); } } /** * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> * * <p>Public only so a {@link PipelineRunner} may override its behavior. * * <p>See {@link View#asSingleton()}. */ @Internal public static class AsSingleton<T> extends PTransform<PCollection<T>, PCollectionView<T>> { private final T defaultValue; private final boolean hasDefault; private AsSingleton() { this.defaultValue = null; this.hasDefault = false; } private AsSingleton(T defaultValue) { this.defaultValue = defaultValue; this.hasDefault = true; } /** * Returns whether this transform has a default value. */ public boolean hasDefaultValue() { return hasDefault; } /** * Returns the default value of this transform, or null if there isn't one. */ public T defaultValue() { return defaultValue; } /** * Default value to return for windows with no value in them. */ public AsSingleton<T> withDefaultValue(T defaultValue) { return new AsSingleton<>(defaultValue); } @Override public PCollectionView<T> expand(PCollection<T> input) { try { GroupByKey.applicableTo(input); } catch (IllegalStateException e) { throw new IllegalStateException("Unable to create a side-input view from input", e); } Combine.Globally<T, T> singletonCombine = Combine.globally(new SingletonCombineFn<>(hasDefault, input.getCoder(), defaultValue)); if (!hasDefault) { singletonCombine = singletonCombine.withoutDefaults(); } return input.apply(singletonCombine.asSingletonView()); } } private static class SingletonCombineFn<T> extends Combine.BinaryCombineFn<T> { private final boolean hasDefault; private final Coder<T> valueCoder; private final byte[] defaultValue; private SingletonCombineFn(boolean hasDefault, Coder<T> coder, T defaultValue) { this.hasDefault = hasDefault; if (hasDefault) { if (defaultValue == null) { this.defaultValue = null; this.valueCoder = coder; } else { this.valueCoder = coder; try { this.defaultValue = CoderUtils.encodeToByteArray(coder, defaultValue); } catch (CoderException e) { throw new IllegalArgumentException( String.format( "Could not encode the default value %s with the provided coder %s", defaultValue, coder)); } } } else { this.valueCoder = null; this.defaultValue = null; } } @Override public T apply(T left, T right) { throw new IllegalArgumentException( "PCollection with more than one element " + "accessed as a singleton view. Consider using Combine.globally().asSingleton() to " + "combine the PCollection into a single value"); } public T identity() { if (hasDefault) { if (defaultValue == null) { return null; } try { return CoderUtils.decodeFromByteArray(valueCoder, defaultValue); } catch (CoderException e) { throw new IllegalArgumentException( String.format( "Could not decode the default value with the provided coder %s", valueCoder)); } } else { throw new IllegalArgumentException( "Empty PCollection accessed as a singleton view. " + "Consider setting withDefault to provide a default value"); } } } /** * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> * * <p>Public only so a {@link PipelineRunner} may override its behavior. * * <p>See {@link View#asMultimap()}. */ @Internal public static class AsMultimap<K, V> extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, Iterable<V>>>> { private AsMultimap() { } @Override public PCollectionView<Map<K, Iterable<V>>> expand(PCollection<KV<K, V>> input) { try { GroupByKey.applicableTo(input); } catch (IllegalStateException e) { throw new IllegalStateException("Unable to create a side-input view from input", e); } return input.apply(CreatePCollectionView.<KV<K, V>, Map<K, Iterable<V>>>of( PCollectionViews.multimapView( input, input.getWindowingStrategy(), input.getCoder()))); } } /** * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> * * <p>Public only so a {@link PipelineRunner} may override its behavior. * * <p>See {@link View#asMap()}. */ @Internal public static class AsMap<K, V> extends PTransform<PCollection<KV<K, V>>, PCollectionView<Map<K, V>>> { private AsMap() { } /** * @deprecated this method simply returns this AsMap unmodified */ @Deprecated() public AsMap<K, V> withSingletonValues() { return this; } @Override public PCollectionView<Map<K, V>> expand(PCollection<KV<K, V>> input) { try { GroupByKey.applicableTo(input); } catch (IllegalStateException e) { throw new IllegalStateException("Unable to create a side-input view from input", e); } return input.apply(CreatePCollectionView.<KV<K, V>, Map<K, V>>of( PCollectionViews.mapView( input, input.getWindowingStrategy(), input.getCoder()))); } } //////////////////////////////////////////////////////////////////////////// // Internal details below /** * <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> * * <p>Creates a primitive {@link PCollectionView}. * * @param <ElemT> The type of the elements of the input PCollection * @param <ViewT> The type associated with the {@link PCollectionView} used as a side input */ @Internal public static class CreatePCollectionView<ElemT, ViewT> extends PTransform<PCollection<ElemT>, PCollectionView<ViewT>> { private PCollectionView<ViewT> view; private CreatePCollectionView(PCollectionView<ViewT> view) { this.view = view; } public static <ElemT, ViewT> CreatePCollectionView<ElemT, ViewT> of( PCollectionView<ViewT> view) { return new CreatePCollectionView<>(view); } /** * Return the {@link PCollectionView} that is returned by applying this {@link PTransform}. * * <p>This should not be used to obtain the output of any given application of this * {@link PTransform}. That should be obtained by inspecting the {@link Node} * that contains this {@link CreatePCollectionView}, as this view may have been replaced within * pipeline surgery. */ @Deprecated public PCollectionView<ViewT> getView() { return view; } @Override public PCollectionView<ViewT> expand(PCollection<ElemT> input) { return view; } } }