/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.transforms; import static com.google.common.base.Preconditions.checkState; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.concurrent.ThreadLocalRandom; import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.coders.CoderRegistry; import org.apache.beam.sdk.coders.DelegateCoder; import org.apache.beam.sdk.coders.IterableCoder; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.coders.StructuredCoder; import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.coders.VoidCoder; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.CombineFnBase.AbstractGlobalCombineFn; import org.apache.beam.sdk.transforms.CombineFnBase.GlobalCombineFn; import org.apache.beam.sdk.transforms.CombineWithContext.CombineFnWithContext; import org.apache.beam.sdk.transforms.CombineWithContext.Context; import org.apache.beam.sdk.transforms.CombineWithContext.RequiresContextInternal; import org.apache.beam.sdk.transforms.View.CreatePCollectionView; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.transforms.display.DisplayData.Builder; import org.apache.beam.sdk.transforms.display.HasDisplayData; import org.apache.beam.sdk.transforms.windowing.GlobalWindow; import org.apache.beam.sdk.transforms.windowing.GlobalWindows; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.util.AppliedCombineFn; import org.apache.beam.sdk.util.NameUtils; import org.apache.beam.sdk.util.NameUtils.NameOverride; import org.apache.beam.sdk.util.SerializableUtils; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.PCollectionTuple; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.PCollectionViews; import org.apache.beam.sdk.values.PValue; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TupleTagList; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.sdk.values.WindowingStrategy; /** * {@code PTransform}s for combining {@code PCollection} elements * globally and per-key. * * <p>See the <a href="https://beam.apache.org/documentation/programming-guide/#transforms-combine">documentation</a> * for how to use the operations in this class. */ public class Combine { private Combine() { // do not instantiate } /** * Returns a {@link Globally Combine.Globally} {@code PTransform} * that uses the given {@code SerializableFunction} to combine all * the elements in each window of the input {@code PCollection} into a * single value in the output {@code PCollection}. The types of the input * elements and the output elements must be the same. * * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows}, * a default value in the {@link GlobalWindow} will be output if the input * {@code PCollection} is empty. To use this with inputs with other windowing, * either {@link Globally#withoutDefaults} or {@link Globally#asSingletonView} * must be called. * * <p>See {@link Globally Combine.Globally} for more information. */ public static <V> Globally<V, V> globally( SerializableFunction<Iterable<V>, V> combiner) { return globally(IterableCombineFn.of(combiner), displayDataForFn(combiner)); } /** * Returns a {@link Globally Combine.Globally} {@code PTransform} * that uses the given {@code GloballyCombineFn} to combine all * the elements in each window of the input {@code PCollection} into a * single value in the output {@code PCollection}. The types of the input * elements and the output elements can differ. * * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows}, * a default value in the {@link GlobalWindow} will be output if the input * {@code PCollection} is empty. To use this with inputs with other windowing, * either {@link Globally#withoutDefaults} or {@link Globally#asSingletonView} * must be called. * * <p>See {@link Globally Combine.Globally} for more information. */ public static <InputT, OutputT> Globally<InputT, OutputT> globally( GlobalCombineFn<? super InputT, ?, OutputT> fn) { return globally(fn, displayDataForFn(fn)); } private static <T> DisplayData.ItemSpec<? extends Class<?>> displayDataForFn(T fn) { return DisplayData.item("combineFn", fn.getClass()) .withLabel("Combiner"); } private static <InputT, OutputT> Globally<InputT, OutputT> globally( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData) { return new Globally<>(fn, fnDisplayData, true, 0); } /** * Returns a {@link PerKey Combine.PerKey} {@code PTransform} that * first groups its input {@code PCollection} of {@code KV}s by keys and * windows, then invokes the given function on each of the values lists to * produce a combined value, and then returns a {@code PCollection} * of {@code KV}s mapping each distinct key to its combined value for each * window. * * <p>Each output element is in the window by which its corresponding input * was grouped, and has the timestamp of the end of that window. The output * {@code PCollection} has the same * {@link org.apache.beam.sdk.transforms.windowing.WindowFn} * as the input. * * <p>See {@link PerKey Combine.PerKey} for more information. */ public static <K, V> PerKey<K, V, V> perKey( SerializableFunction<Iterable<V>, V> fn) { return perKey(IterableCombineFn.of(fn), displayDataForFn(fn)); } /** * Returns a {@link PerKey Combine.PerKey} {@code PTransform} that * first groups its input {@code PCollection} of {@code KV}s by keys and * windows, then invokes the given function on each of the values lists to * produce a combined value, and then returns a {@code PCollection} * of {@code KV}s mapping each distinct key to its combined value for each * window. * * <p>Each output element is in the window by which its corresponding input * was grouped, and has the timestamp of the end of that window. The output * {@code PCollection} has the same * {@link org.apache.beam.sdk.transforms.windowing.WindowFn} * as the input. * * <p>See {@link PerKey Combine.PerKey} for more information. */ public static <K, InputT, OutputT> PerKey<K, InputT, OutputT> perKey( GlobalCombineFn<? super InputT, ?, OutputT> fn) { return perKey(fn, displayDataForFn(fn)); } private static <K, InputT, OutputT> PerKey<K, InputT, OutputT> perKey( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData) { return new PerKey<>(fn, fnDisplayData, false /*fewKeys*/); } /** * Returns a {@link PerKey Combine.PerKey}, and set fewKeys * in {@link GroupByKey}. */ private static <K, InputT, OutputT> PerKey<K, InputT, OutputT> fewKeys( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData) { return new PerKey<>(fn, fnDisplayData, true /*fewKeys*/); } /** * Returns a {@link GroupedValues Combine.GroupedValues} * {@code PTransform} that takes a {@code PCollection} of * {@code KV}s where a key maps to an {@code Iterable} of values, e.g., * the result of a {@code GroupByKey}, then uses the given * {@code SerializableFunction} to combine all the values associated * with a key, ignoring the key. The type of the input and * output values must be the same. * * <p>Each output element has the same timestamp and is in the same window * as its corresponding input element, and the output * {@code PCollection} has the same * {@link org.apache.beam.sdk.transforms.windowing.WindowFn} * associated with it as the input. * * <p>See {@link GroupedValues Combine.GroupedValues} for more information. * * <p>Note that {@link #perKey(SerializableFunction)} is typically * more convenient to use than {@link GroupByKey} followed by * {@code groupedValues(...)}. */ public static <K, V> GroupedValues<K, V, V> groupedValues( SerializableFunction<Iterable<V>, V> fn) { return groupedValues(IterableCombineFn.of(fn), displayDataForFn(fn)); } /** * Returns a {@link GroupedValues Combine.GroupedValues} * {@code PTransform} that takes a {@code PCollection} of * {@code KV}s where a key maps to an {@code Iterable} of values, e.g., * the result of a {@code GroupByKey}, then uses the given * {@code CombineFn} to combine all the values associated with a * key, ignoring the key. The types of the input and output values * can differ. * * <p>Each output element has the same timestamp and is in the same window * as its corresponding input element, and the output * {@code PCollection} has the same * {@link org.apache.beam.sdk.transforms.windowing.WindowFn} * associated with it as the input. * * <p>See {@link GroupedValues Combine.GroupedValues} for more information. * * <p>Note that {@link #perKey(CombineFnBase.GlobalCombineFn)} is typically * more convenient to use than {@link GroupByKey} followed by * {@code groupedValues(...)}. */ public static <K, InputT, OutputT> GroupedValues<K, InputT, OutputT> groupedValues( GlobalCombineFn<? super InputT, ?, OutputT> fn) { return groupedValues(fn, displayDataForFn(fn)); } private static <K, InputT, OutputT> GroupedValues<K, InputT, OutputT> groupedValues( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData) { return new GroupedValues<>(fn, fnDisplayData); } ///////////////////////////////////////////////////////////////////////////// /** * A {@code CombineFn<InputT, AccumT, OutputT>} specifies how to combine a * collection of input values of type {@code InputT} into a single * output value of type {@code OutputT}. It does this via one or more * intermediate mutable accumulator values of type {@code AccumT}. * * <p>The overall process to combine a collection of input * {@code InputT} values into a single output {@code OutputT} value is as * follows: * * <ol> * * <li> The input {@code InputT} values are partitioned into one or more * batches. * * <li> For each batch, the {@link #createAccumulator} operation is * invoked to create a fresh mutable accumulator value of type * {@code AccumT}, initialized to represent the combination of zero * values. * * <li> For each input {@code InputT} value in a batch, the * {@link #addInput} operation is invoked to add the value to that * batch's accumulator {@code AccumT} value. The accumulator may just * record the new value (e.g., if {@code AccumT == List<InputT>}, or may do * work to represent the combination more compactly. * * <li> The {@link #mergeAccumulators} operation is invoked to * combine a collection of accumulator {@code AccumT} values into a * single combined output accumulator {@code AccumT} value, once the * merging accumulators have had all all the input values in their * batches added to them. This operation is invoked repeatedly, * until there is only one accumulator value left. * * <li> The {@link #extractOutput} operation is invoked on the final * accumulator {@code AccumT} value to get the output {@code OutputT} value. * * </ol> * * <p>For example: * <pre> {@code * public class AverageFn extends CombineFn<Integer, AverageFn.Accum, Double> { * public static class Accum { * int sum = 0; * int count = 0; * } * public Accum createAccumulator() { * return new Accum(); * } * public Accum addInput(Accum accum, Integer input) { * accum.sum += input; * accum.count++; * return accum; * } * public Accum mergeAccumulators(Iterable<Accum> accums) { * Accum merged = createAccumulator(); * for (Accum accum : accums) { * merged.sum += accum.sum; * merged.count += accum.count; * } * return merged; * } * public Double extractOutput(Accum accum) { * return ((double) accum.sum) / accum.count; * } * } * PCollection<Integer> pc = ...; * PCollection<Double> average = pc.apply(Combine.globally(new AverageFn())); * } </pre> * * <p>Combining functions used by {@link Combine.Globally}, * {@link Combine.PerKey}, {@link Combine.GroupedValues}, and * {@code PTransforms} derived from them should be * <i>associative</i> and <i>commutative</i>. Associativity is * required because input values are first broken up into subgroups * before being combined, and their intermediate results further * combined, in an arbitrary tree structure. Commutativity is * required because any order of the input values is ignored when * breaking up input values into groups. * * @param <InputT> type of input values * @param <AccumT> type of mutable accumulator values * @param <OutputT> type of output values */ public abstract static class CombineFn<InputT, AccumT, OutputT> extends AbstractGlobalCombineFn<InputT, AccumT, OutputT> { /** * Returns a new, mutable accumulator value, representing the accumulation of zero input values. */ public abstract AccumT createAccumulator(); /** * Adds the given input value to the given accumulator, returning the * new accumulator value. * * <p>For efficiency, the input accumulator may be modified and returned. */ public abstract AccumT addInput(AccumT accumulator, InputT input); /** * Returns an accumulator representing the accumulation of all the * input values accumulated in the merging accumulators. * * <p>May modify any of the argument accumulators. May return a * fresh accumulator, or may return one of the (modified) argument * accumulators. */ public abstract AccumT mergeAccumulators(Iterable<AccumT> accumulators); /** * Returns the output value that is the result of combining all * the input values represented by the given accumulator. */ public abstract OutputT extractOutput(AccumT accumulator); /** * Returns an accumulator that represents the same logical value as the * input accumulator, but may have a more compact representation. * * <p>For most CombineFns this would be a no-op, but should be overridden * by CombineFns that (for example) buffer up elements and combine * them in batches. * * <p>For efficiency, the input accumulator may be modified and returned. * * <p>By default returns the original accumulator. */ public AccumT compact(AccumT accumulator) { return accumulator; } /** * Applies this {@code CombineFn} to a collection of input values * to produce a combined output value. * * <p>Useful when using a {@code CombineFn} separately from a * {@code Combine} transform. Does not invoke the * {@link #mergeAccumulators} operation. */ public OutputT apply(Iterable<? extends InputT> inputs) { AccumT accum = createAccumulator(); for (InputT input : inputs) { accum = addInput(accum, input); } return extractOutput(accum); } /** * {@inheritDoc} * * <p>By default returns the extract output of an empty accumulator. */ @Override public OutputT defaultValue() { return extractOutput(createAccumulator()); } /** * Returns a {@link TypeDescriptor} capturing what is known statically * about the output type of this {@code CombineFn} instance's * most-derived class. * * <p>In the normal case of a concrete {@code CombineFn} subclass with * no generic type parameters of its own, this will be a complete * non-generic type. */ public TypeDescriptor<OutputT> getOutputType() { return new TypeDescriptor<OutputT>(getClass()) {}; } } ///////////////////////////////////////////////////////////////////////////// /** * An abstract subclass of {@link CombineFn} for implementing combiners that are more * easily expressed as binary operations. */ public abstract static class BinaryCombineFn<V> extends CombineFn<V, Holder<V>, V> { /** * Applies the binary operation to the two operands, returning the result. */ public abstract V apply(V left, V right); /** * Returns the value that should be used for the combine of the empty set. */ public V identity() { return null; } @Override public Holder<V> createAccumulator() { return new Holder<>(); } @Override public Holder<V> addInput(Holder<V> accumulator, V input) { if (accumulator.present) { accumulator.set(apply(accumulator.value, input)); } else { accumulator.set(input); } return accumulator; } @Override public Holder<V> mergeAccumulators(Iterable<Holder<V>> accumulators) { Iterator<Holder<V>> iter = accumulators.iterator(); if (!iter.hasNext()) { return createAccumulator(); } else { Holder<V> running = iter.next(); while (iter.hasNext()) { Holder<V> accum = iter.next(); if (accum.present) { if (running.present) { running.set(apply(running.value, accum.value)); } else { running.set(accum.value); } } } return running; } } @Override public V extractOutput(Holder<V> accumulator) { if (accumulator.present) { return accumulator.value; } else { return identity(); } } @Override public Coder<Holder<V>> getAccumulatorCoder(CoderRegistry registry, Coder<V> inputCoder) { return new HolderCoder<>(inputCoder); } @Override public Coder<V> getDefaultOutputCoder(CoderRegistry registry, Coder<V> inputCoder) { return inputCoder; } } /** * Holds a single value value of type {@code V} which may or may not be present. * * <p>Used only as a private accumulator class. */ public static class Holder<V> { private V value; private boolean present; private Holder() { } private Holder(V value) { set(value); } private void set(V value) { this.present = true; this.value = value; } } /** * A {@link Coder} for a {@link Holder}. */ private static class HolderCoder<V> extends StructuredCoder<Holder<V>> { private Coder<V> valueCoder; public HolderCoder(Coder<V> valueCoder) { this.valueCoder = valueCoder; } @Override public void encode(Holder<V> accumulator, OutputStream outStream) throws CoderException, IOException { encode(accumulator, outStream, Context.NESTED); } @Override public void encode(Holder<V> accumulator, OutputStream outStream, Context context) throws CoderException, IOException { if (accumulator.present) { outStream.write(1); valueCoder.encode(accumulator.value, outStream, context); } else { outStream.write(0); } } @Override public Holder<V> decode(InputStream inStream) throws CoderException, IOException { return decode(inStream, Context.NESTED); } @Override public Holder<V> decode(InputStream inStream, Context context) throws CoderException, IOException { if (inStream.read() == 1) { return new Holder<>(valueCoder.decode(inStream, context)); } else { return new Holder<>(); } } @Override public List<? extends Coder<?>> getCoderArguments() { return Collections.singletonList(valueCoder); } @Override public void verifyDeterministic() throws NonDeterministicException { valueCoder.verifyDeterministic(); } } /** * An abstract subclass of {@link CombineFn} for implementing combiners that are more * easily and efficiently expressed as binary operations on <code>int</code>s * * <p>It uses {@code int[0]} as the mutable accumulator. */ public abstract static class BinaryCombineIntegerFn extends CombineFn<Integer, int[], Integer> { /** * Applies the binary operation to the two operands, returning the result. */ public abstract int apply(int left, int right); /** * Returns the identity element of this operation, i.e. an element {@code e} * such that {@code apply(e, x) == apply(x, e) == x} for all values of {@code x}. */ public abstract int identity(); @Override public int[] createAccumulator() { return wrap(identity()); } @Override public int[] addInput(int[] accumulator, Integer input) { accumulator[0] = apply(accumulator[0], input); return accumulator; } @Override public int[] mergeAccumulators(Iterable<int[]> accumulators) { Iterator<int[]> iter = accumulators.iterator(); if (!iter.hasNext()) { return createAccumulator(); } else { int[] running = iter.next(); while (iter.hasNext()) { running[0] = apply(running[0], iter.next()[0]); } return running; } } @Override public Integer extractOutput(int[] accumulator) { return accumulator[0]; } @Override public Coder<int[]> getAccumulatorCoder(CoderRegistry registry, Coder<Integer> inputCoder) { return DelegateCoder.of( inputCoder, new ToIntegerCodingFunction(), new FromIntegerCodingFunction()); } @Override public Coder<Integer> getDefaultOutputCoder(CoderRegistry registry, Coder<Integer> inputCoder) { return inputCoder; } private static int[] wrap(int value) { return new int[] { value }; } private static final class ToIntegerCodingFunction implements DelegateCoder.CodingFunction<int[], Integer> { @Override public Integer apply(int[] accumulator) { return accumulator[0]; } @Override public boolean equals(Object o) { return o instanceof ToIntegerCodingFunction; } @Override public int hashCode() { return this.getClass().hashCode(); } } private static final class FromIntegerCodingFunction implements DelegateCoder.CodingFunction<Integer, int[]> { @Override public int[] apply(Integer value) { return wrap(value); } @Override public boolean equals(Object o) { return o instanceof FromIntegerCodingFunction; } @Override public int hashCode() { return this.getClass().hashCode(); } } } /** * An abstract subclass of {@link CombineFn} for implementing combiners that are more * easily and efficiently expressed as binary operations on <code>long</code>s. * * <p>It uses {@code long[0]} as the mutable accumulator. */ public abstract static class BinaryCombineLongFn extends CombineFn<Long, long[], Long> { /** * Applies the binary operation to the two operands, returning the result. */ public abstract long apply(long left, long right); /** * Returns the identity element of this operation, i.e. an element {@code e} * such that {@code apply(e, x) == apply(x, e) == x} for all values of {@code x}. */ public abstract long identity(); @Override public long[] createAccumulator() { return wrap(identity()); } @Override public long[] addInput(long[] accumulator, Long input) { accumulator[0] = apply(accumulator[0], input); return accumulator; } @Override public long[] mergeAccumulators(Iterable<long[]> accumulators) { Iterator<long[]> iter = accumulators.iterator(); if (!iter.hasNext()) { return createAccumulator(); } else { long[] running = iter.next(); while (iter.hasNext()) { running[0] = apply(running[0], iter.next()[0]); } return running; } } @Override public Long extractOutput(long[] accumulator) { return accumulator[0]; } @Override public Coder<long[]> getAccumulatorCoder(CoderRegistry registry, Coder<Long> inputCoder) { return DelegateCoder.of(inputCoder, new ToLongCodingFunction(), new FromLongCodingFunction()); } @Override public Coder<Long> getDefaultOutputCoder(CoderRegistry registry, Coder<Long> inputCoder) { return inputCoder; } private static long[] wrap(long value) { return new long[] { value }; } private static final class ToLongCodingFunction implements DelegateCoder.CodingFunction<long[], Long> { @Override public Long apply(long[] accumulator) { return accumulator[0]; } @Override public boolean equals(Object o) { return o instanceof ToLongCodingFunction; } @Override public int hashCode() { return this.getClass().hashCode(); } } private static final class FromLongCodingFunction implements DelegateCoder.CodingFunction<Long, long[]> { @Override public long[] apply(Long value) { return wrap(value); } @Override public boolean equals(Object o) { return o instanceof FromLongCodingFunction; } @Override public int hashCode() { return this.getClass().hashCode(); } } } /** * An abstract subclass of {@link CombineFn} for implementing combiners that are more * easily and efficiently expressed as binary operations on <code>double</code>s. * * <p>It uses {@code double[0]} as the mutable accumulator. */ public abstract static class BinaryCombineDoubleFn extends CombineFn<Double, double[], Double> { /** * Applies the binary operation to the two operands, returning the result. */ public abstract double apply(double left, double right); /** * Returns the identity element of this operation, i.e. an element {@code e} * such that {@code apply(e, x) == apply(x, e) == x} for all values of {@code x}. */ public abstract double identity(); @Override public double[] createAccumulator() { return wrap(identity()); } @Override public double[] addInput(double[] accumulator, Double input) { accumulator[0] = apply(accumulator[0], input); return accumulator; } @Override public double[] mergeAccumulators(Iterable<double[]> accumulators) { Iterator<double[]> iter = accumulators.iterator(); if (!iter.hasNext()) { return createAccumulator(); } else { double[] running = iter.next(); while (iter.hasNext()) { running[0] = apply(running[0], iter.next()[0]); } return running; } } @Override public Double extractOutput(double[] accumulator) { return accumulator[0]; } @Override public Coder<double[]> getAccumulatorCoder(CoderRegistry registry, Coder<Double> inputCoder) { return DelegateCoder.of( inputCoder, new ToDoubleCodingFunction(), new FromDoubleCodingFunction()); } @Override public Coder<Double> getDefaultOutputCoder(CoderRegistry registry, Coder<Double> inputCoder) { return inputCoder; } private static double[] wrap(double value) { return new double[] { value }; } private static final class ToDoubleCodingFunction implements DelegateCoder.CodingFunction<double[], Double> { @Override public Double apply(double[] accumulator) { return accumulator[0]; } @Override public boolean equals(Object o) { return o instanceof ToDoubleCodingFunction; } @Override public int hashCode() { return this.getClass().hashCode(); } } private static final class FromDoubleCodingFunction implements DelegateCoder.CodingFunction<Double, double[]> { @Override public double[] apply(Double value) { return wrap(value); } @Override public boolean equals(Object o) { return o instanceof FromDoubleCodingFunction; } @Override public int hashCode() { return this.getClass().hashCode(); } } } ///////////////////////////////////////////////////////////////////////////// /** * A {@code CombineFn} that uses a subclass of * {@link AccumulatingCombineFn.Accumulator} as its accumulator * type. By defining the operations of the {@code Accumulator} * helper class, the operations of the enclosing {@code CombineFn} * are automatically provided. This can reduce the code required to * implement a {@code CombineFn}. * * <p>For example, the example from {@link CombineFn} above can be * expressed using {@code AccumulatingCombineFn} more concisely as * follows: * * <pre> {@code * public class AverageFn * extends AccumulatingCombineFn<Integer, AverageFn.Accum, Double> { * public Accum createAccumulator() { * return new Accum(); * } * public class Accum * extends AccumulatingCombineFn<Integer, AverageFn.Accum, Double> * .Accumulator { * private int sum = 0; * private int count = 0; * public void addInput(Integer input) { * sum += input; * count++; * } * public void mergeAccumulator(Accum other) { * sum += other.sum; * count += other.count; * } * public Double extractOutput() { * return ((double) sum) / count; * } * } * } * PCollection<Integer> pc = ...; * PCollection<Double> average = pc.apply(Combine.globally(new AverageFn())); * } </pre> * * @param <InputT> type of input values * @param <AccumT> type of mutable accumulator values * @param <OutputT> type of output values */ public abstract static class AccumulatingCombineFn< InputT, AccumT extends AccumulatingCombineFn.Accumulator<InputT, AccumT, OutputT>, OutputT> extends CombineFn<InputT, AccumT, OutputT> { /** * The type of mutable accumulator values used by this * {@code AccumulatingCombineFn}. */ public interface Accumulator<InputT, AccumT, OutputT> { /** * Adds the given input value to this accumulator, modifying * this accumulator. */ void addInput(InputT input); /** * Adds the input values represented by the given accumulator * into this accumulator. */ void mergeAccumulator(AccumT other); /** * Returns the output value that is the result of combining all * the input values represented by this accumulator. */ OutputT extractOutput(); } @Override public final AccumT addInput(AccumT accumulator, InputT input) { accumulator.addInput(input); return accumulator; } @Override public final AccumT mergeAccumulators(Iterable<AccumT> accumulators) { AccumT accumulator = createAccumulator(); for (AccumT partial : accumulators) { accumulator.mergeAccumulator(partial); } return accumulator; } @Override public final OutputT extractOutput(AccumT accumulator) { return accumulator.extractOutput(); } } ///////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// /** * {@code Combine.Globally<InputT, OutputT>} takes a {@code PCollection<InputT>} * and returns a {@code PCollection<OutputT>} whose elements are the result of * combining all the elements in each window of the input {@code PCollection}, * using a specified {@link CombineFn CombineFn<InputT, AccumT, OutputT>}. * It is common for {@code InputT == OutputT}, but not required. Common combining * functions include sums, mins, maxes, and averages of numbers, * conjunctions and disjunctions of booleans, statistical * aggregations, etc. * * <p>Example of use: * <pre> {@code * PCollection<Integer> pc = ...; * PCollection<Integer> sum = pc.apply( * Combine.globally(new Sum.SumIntegerFn())); * } </pre> * * <p>Combining can happen in parallel, with different subsets of the * input {@code PCollection} being combined separately, and their * intermediate results combined further, in an arbitrary tree * reduction pattern, until a single result value is produced. * * <p>If the input {@code PCollection} is windowed into {@link GlobalWindows}, * a default value in the {@link GlobalWindow} will be output if the input * {@code PCollection} is empty. To use this with inputs with other windowing, * either {@link #withoutDefaults} or {@link #asSingletonView} must be called, * as the default value cannot be automatically assigned to any single window. * * <p>By default, the {@code Coder} of the output {@code PValue<OutputT>} * is inferred from the concrete type of the * {@code CombineFn<InputT, AccumT, OutputT>}'s output type {@code OutputT}. * * <p>See also {@link #perKey}/{@link PerKey Combine.PerKey} and * {@link #groupedValues}/{@link GroupedValues Combine.GroupedValues}, which * are useful for combining values associated with each key in * a {@code PCollection} of {@code KV}s. * * @param <InputT> type of input values * @param <OutputT> type of output values */ public static class Globally<InputT, OutputT> extends PTransform<PCollection<InputT>, PCollection<OutputT>> { private final GlobalCombineFn<? super InputT, ?, OutputT> fn; private final DisplayData.ItemSpec<? extends Class<?>> fnDisplayData; private final boolean insertDefault; private final int fanout; private final List<PCollectionView<?>> sideInputs; private Globally(GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, boolean insertDefault, int fanout) { this.fn = fn; this.fnDisplayData = fnDisplayData; this.insertDefault = insertDefault; this.fanout = fanout; this.sideInputs = ImmutableList.of(); } private Globally(GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, boolean insertDefault, int fanout, List<PCollectionView<?>> sideInputs) { this.fn = fn; this.fnDisplayData = fnDisplayData; this.insertDefault = insertDefault; this.fanout = fanout; this.sideInputs = sideInputs; } @Override protected String getKindString() { return String.format("Combine.globally(%s)", NameUtils.approximateSimpleName(fn)); } /** * Returns a {@link PTransform} that produces a {@code PCollectionView} * whose elements are the result of combining elements per-window in * the input {@code PCollection}. If a value is requested from the view * for a window that is not present, the result of applying the {@code CombineFn} * to an empty input set will be returned. */ public GloballyAsSingletonView<InputT, OutputT> asSingletonView() { return new GloballyAsSingletonView<>(fn, fnDisplayData, insertDefault, fanout); } /** * Returns a {@link PTransform} identical to this, but that does not attempt to * provide a default value in the case of empty input. Required when the input * is not globally windowed and the output is not being used as a side input. */ public Globally<InputT, OutputT> withoutDefaults() { return new Globally<>(fn, fnDisplayData, false, fanout); } /** * Returns a {@link PTransform} identical to this, but that uses an intermediate node * to combine parts of the data to reduce load on the final global combine step. * * <p>The {@code fanout} parameter determines the number of intermediate keys * that will be used. */ public Globally<InputT, OutputT> withFanout(int fanout) { return new Globally<>(fn, fnDisplayData, insertDefault, fanout); } /** * Returns a {@link PTransform} identical to this, but with the specified side inputs to use * in {@link CombineFnWithContext}. */ public Globally<InputT, OutputT> withSideInputs(PCollectionView<?>... sideInputs) { return withSideInputs(Arrays.asList(sideInputs)); } /** * Returns a {@link PTransform} identical to this, but with the specified side inputs to use * in {@link CombineFnWithContext}. */ public Globally<InputT, OutputT> withSideInputs( Iterable<? extends PCollectionView<?>> sideInputs) { checkState(fn instanceof RequiresContextInternal); return new Globally<>(fn, fnDisplayData, insertDefault, fanout, ImmutableList.copyOf(sideInputs)); } /** * Returns the {@link GlobalCombineFn} used by this Combine operation. */ public GlobalCombineFn<? super InputT, ?, OutputT> getFn() { return fn; } /** * Returns the side inputs used by this Combine operation. */ public List<PCollectionView<?>> getSideInputs() { return sideInputs; } /** * Returns the side inputs of this {@link Combine}, tagged with the tag of the * {@link PCollectionView}. The values of the returned map will be equal to the result of * {@link #getSideInputs()}. */ @Override public Map<TupleTag<?>, PValue> getAdditionalInputs() { ImmutableMap.Builder<TupleTag<?>, PValue> additionalInputs = ImmutableMap.builder(); for (PCollectionView<?> sideInput : sideInputs) { additionalInputs.put(sideInput.getTagInternal(), sideInput.getPCollection()); } return additionalInputs.build(); } /** * Returns whether or not this transformation applies a default value. */ public boolean isInsertDefault() { return insertDefault; } @Override public PCollection<OutputT> expand(PCollection<InputT> input) { PCollection<KV<Void, InputT>> withKeys = input .apply(WithKeys.<Void, InputT>of((Void) null)) .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder())); Combine.PerKey<Void, InputT, OutputT> combine = Combine.fewKeys(fn, fnDisplayData); if (!sideInputs.isEmpty()) { combine = combine.withSideInputs(sideInputs); } PCollection<KV<Void, OutputT>> combined; if (fanout >= 2) { combined = withKeys.apply(combine.withHotKeyFanout(fanout)); } else { combined = withKeys.apply(combine); } PCollection<OutputT> output = combined.apply(Values.<OutputT>create()); if (insertDefault) { if (!output.getWindowingStrategy().getWindowFn().isCompatible(new GlobalWindows())) { throw new IllegalStateException(fn.getIncompatibleGlobalWindowErrorMessage()); } return insertDefaultValueIfEmpty(output); } else { return output; } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); Combine.populateDisplayData(builder, fn, fnDisplayData); Combine.populateGlobalDisplayData(builder, fanout, insertDefault); } private PCollection<OutputT> insertDefaultValueIfEmpty(PCollection<OutputT> maybeEmpty) { final PCollectionView<Iterable<OutputT>> maybeEmptyView = maybeEmpty.apply( View.<OutputT>asIterable()); final OutputT defaultValue = fn.defaultValue(); PCollection<OutputT> defaultIfEmpty = maybeEmpty.getPipeline() .apply("CreateVoid", Create.of((Void) null).withCoder(VoidCoder.of())) .apply("ProduceDefault", ParDo.of( new DoFn<Void, OutputT>() { @ProcessElement public void processElement(ProcessContext c) { Iterator<OutputT> combined = c.sideInput(maybeEmptyView).iterator(); if (!combined.hasNext()) { c.output(defaultValue); } } }).withSideInputs(maybeEmptyView)) .setCoder(maybeEmpty.getCoder()) .setWindowingStrategyInternal(maybeEmpty.getWindowingStrategy()); return PCollectionList.of(maybeEmpty).and(defaultIfEmpty) .apply(Flatten.<OutputT>pCollections()); } } private static void populateDisplayData( DisplayData.Builder builder, HasDisplayData fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayItem) { builder .include("combineFn", fn) .add(fnDisplayItem); } private static void populateGlobalDisplayData( DisplayData.Builder builder, int fanout, boolean insertDefault) { builder .addIfNotDefault(DisplayData.item("fanout", fanout) .withLabel("Key Fanout Size"), 0) .add(DisplayData.item("emitDefaultOnEmptyInput", insertDefault) .withLabel("Emit Default On Empty Input")); } /** * {@code Combine.GloballyAsSingletonView<InputT, OutputT>} takes a {@code PCollection<InputT>} * and returns a {@code PCollectionView<OutputT>} whose elements are the result of * combining all the elements in each window of the input {@code PCollection}, * using a specified {@link CombineFn CombineFn<InputT, AccumT, OutputT>}. * It is common for {@code InputT == OutputT}, but not required. Common combining * functions include sums, mins, maxes, and averages of numbers, * conjunctions and disjunctions of booleans, statistical * aggregations, etc. * * <p>Example of use: * <pre> {@code * PCollection<Integer> pc = ...; * PCollection<Integer> sum = pc.apply( * Combine.globally(new Sum.SumIntegerFn())); * } </pre> * * <p>Combining can happen in parallel, with different subsets of the * input {@code PCollection} being combined separately, and their * intermediate results combined further, in an arbitrary tree * reduction pattern, until a single result value is produced. * * <p>If a value is requested from the view for a window that is not present * and {@code insertDefault} is true, the result of calling the {@code CombineFn} * on empty input will returned. If {@code insertDefault} is false, an * exception will be thrown instead. * * <p>By default, the {@code Coder} of the output {@code PValue<OutputT>} * is inferred from the concrete type of the * {@code CombineFn<InputT, AccumT, OutputT>}'s output type {@code OutputT}. * * <p>See also {@link #perKey}/{@link PerKey Combine.PerKey} and * {@link #groupedValues}/{@link GroupedValues Combine.GroupedValues}, which * are useful for combining values associated with each key in * a {@code PCollection} of {@code KV}s. * * @param <InputT> type of input values * @param <OutputT> type of output values */ public static class GloballyAsSingletonView<InputT, OutputT> extends PTransform<PCollection<InputT>, PCollectionView<OutputT>> { private final GlobalCombineFn<? super InputT, ?, OutputT> fn; private final DisplayData.ItemSpec<? extends Class<?>> fnDisplayData; private final boolean insertDefault; private final int fanout; private GloballyAsSingletonView( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, boolean insertDefault, int fanout) { this.fn = fn; this.fnDisplayData = fnDisplayData; this.insertDefault = insertDefault; this.fanout = fanout; } @Override public PCollectionView<OutputT> expand(PCollection<InputT> input) { PCollection<OutputT> combined = input.apply(Combine.<InputT, OutputT>globally(fn).withoutDefaults().withFanout(fanout)); return combined.apply( CreatePCollectionView.<OutputT, OutputT>of( PCollectionViews.singletonView( combined, input.getWindowingStrategy(), insertDefault, insertDefault ? fn.defaultValue() : null, combined.getCoder()))); } public int getFanout() { return fanout; } public boolean getInsertDefault() { return insertDefault; } public GlobalCombineFn<? super InputT, ?, OutputT> getCombineFn() { return fn; } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); Combine.populateDisplayData(builder, fn, fnDisplayData); Combine.populateGlobalDisplayData(builder, fanout, insertDefault); } } /** * Converts a {@link SerializableFunction} from {@code Iterable<V>}s * to {@code V}s into a simple {@link CombineFn} over {@code V}s. * * <p>Used in the implementation of convenience methods like * {@link #globally(SerializableFunction)}, * {@link #perKey(SerializableFunction)}, and * {@link #groupedValues(SerializableFunction)}. */ public static class IterableCombineFn<V> extends CombineFn<V, List<V>, V> implements NameOverride { /** * Returns a {@code CombineFn} that uses the given * {@code SerializableFunction} to combine values. */ public static <V> IterableCombineFn<V> of( SerializableFunction<Iterable<V>, V> combiner) { return of(combiner, DEFAULT_BUFFER_SIZE); } /** * Returns a {@code CombineFn} that uses the given * {@code SerializableFunction} to combine values, * attempting to buffer at least {@code bufferSize} * values between invocations. */ public static <V> IterableCombineFn<V> of( SerializableFunction<Iterable<V>, V> combiner, int bufferSize) { return new IterableCombineFn<>(combiner, bufferSize); } private static final int DEFAULT_BUFFER_SIZE = 20; /** The combiner function. */ private final SerializableFunction<Iterable<V>, V> combiner; /** * The number of values to accumulate before invoking the combiner * function to combine them. */ private final int bufferSize; private IterableCombineFn( SerializableFunction<Iterable<V>, V> combiner, int bufferSize) { this.combiner = combiner; this.bufferSize = bufferSize; } @Override public List<V> createAccumulator() { return new ArrayList<>(); } @Override public List<V> addInput(List<V> accumulator, V input) { accumulator.add(input); if (accumulator.size() > bufferSize) { return mergeToSingleton(accumulator); } else { return accumulator; } } @Override public List<V> mergeAccumulators(Iterable<List<V>> accumulators) { return mergeToSingleton(Iterables.concat(accumulators)); } @Override public V extractOutput(List<V> accumulator) { return combiner.apply(accumulator); } @Override public List<V> compact(List<V> accumulator) { return accumulator.size() > 1 ? mergeToSingleton(accumulator) : accumulator; } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("combineFn", combiner.getClass()) .withLabel("Combiner")); } private List<V> mergeToSingleton(Iterable<V> values) { List<V> singleton = new ArrayList<>(); singleton.add(combiner.apply(values)); return singleton; } @Override public String getNameOverride() { return NameUtils.approximateSimpleName(combiner); } } /** * Converts a {@link SerializableFunction} from {@code Iterable<V>}s * to {@code V}s into a simple {@link CombineFn} over {@code V}s. * * <p>@deprecated Use {@link IterableCombineFn} or the more space efficient * {@link BinaryCombineFn} instead (which avoids buffering values). */ @Deprecated public static class SimpleCombineFn<V> extends IterableCombineFn<V> { /** * Returns a {@code CombineFn} that uses the given * {@code SerializableFunction} to combine values. */ @Deprecated public static <V> SimpleCombineFn<V> of( SerializableFunction<Iterable<V>, V> combiner) { return new SimpleCombineFn<>(combiner); } protected SimpleCombineFn(SerializableFunction<Iterable<V>, V> combiner) { super(combiner, IterableCombineFn.DEFAULT_BUFFER_SIZE); } } ///////////////////////////////////////////////////////////////////////////// /** * {@code PerKey<K, InputT, OutputT>} takes a * {@code PCollection<KV<K, InputT>>}, groups it by key, applies a * combining function to the {@code InputT} values associated with each * key to produce a combined {@code OutputT} value, and returns a * {@code PCollection<KV<K, OutputT>>} representing a map from each * distinct key of the input {@code PCollection} to the corresponding * combined value. {@code InputT} and {@code OutputT} are often the same. * * <p>This is a concise shorthand for an application of * {@link GroupByKey} followed by an application of * {@link GroupedValues Combine.GroupedValues}. See those * operations for more details on how keys are compared for equality * and on the default {@code Coder} for the output. * * <p>Example of use: * <pre> {@code * PCollection<KV<String, Double>> salesRecords = ...; * PCollection<KV<String, Double>> totalSalesPerPerson = * salesRecords.apply(Combine.<String, Double, Double>perKey( * Sum.ofDoubles())); * } </pre> * * <p>Each output element is in the window by which its corresponding input * was grouped, and has the timestamp of the end of that window. The output * {@code PCollection} has the same * {@link org.apache.beam.sdk.transforms.windowing.WindowFn} * as the input. * * @param <K> the type of the keys of the input and output * {@code PCollection}s * @param <InputT> the type of the values of the input {@code PCollection} * @param <OutputT> the type of the values of the output {@code PCollection} */ public static class PerKey<K, InputT, OutputT> extends PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> { private final GlobalCombineFn<? super InputT, ?, OutputT> fn; private final DisplayData.ItemSpec<? extends Class<?>> fnDisplayData; private final boolean fewKeys; private final List<PCollectionView<?>> sideInputs; private PerKey( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, boolean fewKeys) { this.fn = fn; this.fnDisplayData = fnDisplayData; this.fewKeys = fewKeys; this.sideInputs = ImmutableList.of(); } private PerKey( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, boolean fewKeys, List<PCollectionView<?>> sideInputs) { this.fn = fn; this.fnDisplayData = fnDisplayData; this.fewKeys = fewKeys; this.sideInputs = sideInputs; } @Override protected String getKindString() { return String.format("Combine.perKey(%s)", NameUtils.approximateSimpleName(fn)); } /** * Returns a {@link PTransform} identical to this, but with the specified side inputs to use * in {@link CombineFnWithContext}. */ public PerKey<K, InputT, OutputT> withSideInputs(PCollectionView<?>... sideInputs) { return withSideInputs(Arrays.asList(sideInputs)); } /** * Returns a {@link PTransform} identical to this, but with the specified side inputs to use * in {@link CombineFnWithContext}. */ public PerKey<K, InputT, OutputT> withSideInputs( Iterable<? extends PCollectionView<?>> sideInputs) { checkState(fn instanceof RequiresContextInternal); return new PerKey<>(fn, fnDisplayData, fewKeys, ImmutableList.copyOf(sideInputs)); } /** * If a single key has disproportionately many values, it may become a * bottleneck, especially in streaming mode. This returns a new per-key * combining transform that inserts an intermediate node to combine "hot" * keys partially before performing the full combine. * * @param hotKeyFanout a function from keys to an integer N, where the key * will be spread among N intermediate nodes for partial combining. * If N is less than or equal to 1, this key will not be sent through an * intermediate node. */ public PerKeyWithHotKeyFanout<K, InputT, OutputT> withHotKeyFanout( SerializableFunction<? super K, Integer> hotKeyFanout) { return new PerKeyWithHotKeyFanout<>(fn, fnDisplayData, hotKeyFanout); } /** * Like {@link #withHotKeyFanout(SerializableFunction)}, but returning the given * constant value for every key. */ public PerKeyWithHotKeyFanout<K, InputT, OutputT> withHotKeyFanout(final int hotKeyFanout) { return new PerKeyWithHotKeyFanout<>(fn, fnDisplayData, new SimpleFunction<K, Integer>() { @Override public void populateDisplayData(Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("fanout", hotKeyFanout) .withLabel("Key Fanout Size")); } @Override public Integer apply(K unused) { return hotKeyFanout; } }); } /** * Returns the {@link GlobalCombineFn} used by this Combine operation. */ public GlobalCombineFn<? super InputT, ?, OutputT> getFn() { return fn; } /** * Returns the side inputs used by this Combine operation. */ public List<PCollectionView<?>> getSideInputs() { return sideInputs; } /** * Returns the side inputs of this {@link Combine}, tagged with the tag of the * {@link PCollectionView}. The values of the returned map will be equal to the result of * {@link #getSideInputs()}. */ @Override public Map<TupleTag<?>, PValue> getAdditionalInputs() { ImmutableMap.Builder<TupleTag<?>, PValue> additionalInputs = ImmutableMap.builder(); for (PCollectionView<?> sideInput : sideInputs) { additionalInputs.put(sideInput.getTagInternal(), sideInput.getPCollection()); } return additionalInputs.build(); } @Override public PCollection<KV<K, OutputT>> expand(PCollection<KV<K, InputT>> input) { return input .apply( fewKeys ? GroupByKey.<K, InputT>createWithFewKeys() : GroupByKey.<K, InputT>create()) .apply( Combine.<K, InputT, OutputT>groupedValues(fn, fnDisplayData) .withSideInputs(sideInputs)); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); Combine.populateDisplayData(builder, fn, fnDisplayData); } } /** * Like {@link PerKey}, but sharding the combining of hot keys. */ public static class PerKeyWithHotKeyFanout<K, InputT, OutputT> extends PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> { private final GlobalCombineFn<? super InputT, ?, OutputT> fn; private final DisplayData.ItemSpec<? extends Class<?>> fnDisplayData; private final SerializableFunction<? super K, Integer> hotKeyFanout; private PerKeyWithHotKeyFanout( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, SerializableFunction<? super K, Integer> hotKeyFanout) { this.fn = fn; this.fnDisplayData = fnDisplayData; this.hotKeyFanout = hotKeyFanout; } @Override protected String getKindString() { return String.format("Combine.perKeyWithFanout(%s)", NameUtils.approximateSimpleName(fn)); } @Override public PCollection<KV<K, OutputT>> expand(PCollection<KV<K, InputT>> input) { return applyHelper(input); } private <AccumT> PCollection<KV<K, OutputT>> applyHelper(PCollection<KV<K, InputT>> input) { // Name the accumulator type. @SuppressWarnings("unchecked") final GlobalCombineFn<InputT, AccumT, OutputT> typedFn = (GlobalCombineFn<InputT, AccumT, OutputT>) this.fn; if (!(input.getCoder() instanceof KvCoder)) { throw new IllegalStateException( "Expected input coder to be KvCoder, but was " + input.getCoder()); } @SuppressWarnings("unchecked") final KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) input.getCoder(); final Coder<AccumT> accumCoder; try { accumCoder = typedFn.getAccumulatorCoder( input.getPipeline().getCoderRegistry(), inputCoder.getValueCoder()); } catch (CannotProvideCoderException e) { throw new IllegalStateException("Unable to determine accumulator coder.", e); } Coder<InputOrAccum<InputT, AccumT>> inputOrAccumCoder = new InputOrAccum.InputOrAccumCoder<InputT, AccumT>( inputCoder.getValueCoder(), accumCoder); // A CombineFn's mergeAccumulator can be applied in a tree-like fashion. // Here we shard the key using an integer nonce, combine on that partial // set of values, then drop the nonce and do a final combine of the // aggregates. We do this by splitting the original CombineFn into two, // on that does addInput + merge and another that does merge + extract. GlobalCombineFn<InputT, AccumT, AccumT> hotPreCombine; GlobalCombineFn<InputOrAccum<InputT, AccumT>, AccumT, OutputT> postCombine; if (typedFn instanceof CombineFn) { final CombineFn<InputT, AccumT, OutputT> fn = (CombineFn<InputT, AccumT, OutputT>) typedFn; hotPreCombine = new CombineFn<InputT, AccumT, AccumT>() { @Override public AccumT createAccumulator() { return fn.createAccumulator(); } @Override public AccumT addInput(AccumT accumulator, InputT value) { return fn.addInput(accumulator, value); } @Override public AccumT mergeAccumulators(Iterable<AccumT> accumulators) { return fn.mergeAccumulators(accumulators); } @Override public AccumT compact(AccumT accumulator) { return fn.compact(accumulator); } @Override public AccumT extractOutput(AccumT accumulator) { return accumulator; } @Override @SuppressWarnings("unchecked") public Coder<AccumT> getAccumulatorCoder( CoderRegistry registry, Coder<InputT> inputCoder) throws CannotProvideCoderException { return accumCoder; } @Override public void populateDisplayData(DisplayData.Builder builder) { builder.delegate(PerKeyWithHotKeyFanout.this); } }; postCombine = new CombineFn<InputOrAccum<InputT, AccumT>, AccumT, OutputT>() { @Override public AccumT createAccumulator() { return fn.createAccumulator(); } @Override public AccumT addInput(AccumT accumulator, InputOrAccum<InputT, AccumT> value) { if (value.accum == null) { return fn.addInput(accumulator, value.input); } else { return fn.mergeAccumulators(ImmutableList.of(accumulator, value.accum)); } } @Override public AccumT mergeAccumulators(Iterable<AccumT> accumulators) { return fn.mergeAccumulators(accumulators); } @Override public AccumT compact(AccumT accumulator) { return fn.compact(accumulator); } @Override public OutputT extractOutput(AccumT accumulator) { return fn.extractOutput(accumulator); } @Override public Coder<OutputT> getDefaultOutputCoder( CoderRegistry registry, Coder<InputOrAccum<InputT, AccumT>> accumulatorCoder) throws CannotProvideCoderException { return fn.getDefaultOutputCoder(registry, inputCoder.getValueCoder()); } @Override public Coder<AccumT> getAccumulatorCoder( CoderRegistry registry, Coder<InputOrAccum<InputT, AccumT>> inputCoder) throws CannotProvideCoderException { return accumCoder; } @Override public void populateDisplayData(DisplayData.Builder builder) { builder.delegate(PerKeyWithHotKeyFanout.this); } }; } else if (typedFn instanceof CombineFnWithContext) { final CombineFnWithContext<InputT, AccumT, OutputT> fnWithContext = (CombineFnWithContext<InputT, AccumT, OutputT>) typedFn; hotPreCombine = new CombineFnWithContext<InputT, AccumT, AccumT>() { @Override public AccumT createAccumulator(Context c) { return fnWithContext.createAccumulator(c); } @Override public AccumT addInput(AccumT accumulator, InputT value, Context c) { return fnWithContext.addInput(accumulator, value, c); } @Override public AccumT mergeAccumulators(Iterable<AccumT> accumulators, Context c) { return fnWithContext.mergeAccumulators(accumulators, c); } @Override public AccumT compact(AccumT accumulator, Context c) { return fnWithContext.compact(accumulator, c); } @Override public AccumT extractOutput(AccumT accumulator, Context c) { return accumulator; } @Override @SuppressWarnings("unchecked") public Coder<AccumT> getAccumulatorCoder( CoderRegistry registry, Coder<InputT> inputCoder) throws CannotProvideCoderException { return accumCoder; } @Override public void populateDisplayData(DisplayData.Builder builder) { builder.delegate(PerKeyWithHotKeyFanout.this); } }; postCombine = new CombineFnWithContext<InputOrAccum<InputT, AccumT>, AccumT, OutputT>() { @Override public AccumT createAccumulator(Context c) { return fnWithContext.createAccumulator(c); } @Override public AccumT addInput( AccumT accumulator, InputOrAccum<InputT, AccumT> value, Context c) { if (value.accum == null) { return fnWithContext.addInput(accumulator, value.input, c); } else { return fnWithContext.mergeAccumulators( ImmutableList.of(accumulator, value.accum), c); } } @Override public AccumT mergeAccumulators(Iterable<AccumT> accumulators, Context c) { return fnWithContext.mergeAccumulators(accumulators, c); } @Override public AccumT compact(AccumT accumulator, Context c) { return fnWithContext.compact(accumulator, c); } @Override public OutputT extractOutput(AccumT accumulator, Context c) { return fnWithContext.extractOutput(accumulator, c); } @Override public Coder<OutputT> getDefaultOutputCoder( CoderRegistry registry, Coder<InputOrAccum<InputT, AccumT>> accumulatorCoder) throws CannotProvideCoderException { return fnWithContext.getDefaultOutputCoder(registry, inputCoder.getValueCoder()); } @Override public Coder<AccumT> getAccumulatorCoder( CoderRegistry registry, Coder<InputOrAccum<InputT, AccumT>> inputCoder) throws CannotProvideCoderException { return accumCoder; } @Override public void populateDisplayData(DisplayData.Builder builder) { builder.delegate(PerKeyWithHotKeyFanout.this); } }; } else { throw new IllegalStateException( String.format("Unknown type of CombineFn: %s", typedFn.getClass())); } // Use the provided hotKeyFanout fn to split into "hot" and "cold" keys, // augmenting the hot keys with a nonce. final TupleTag<KV<KV<K, Integer>, InputT>> hot = new TupleTag<>(); final TupleTag<KV<K, InputT>> cold = new TupleTag<>(); PCollectionTuple split = input.apply("AddNonce", ParDo.of( new DoFn<KV<K, InputT>, KV<K, InputT>>() { transient int counter; @StartBundle public void startBundle() { counter = ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE); } @ProcessElement public void processElement(ProcessContext c) { KV<K, InputT> kv = c.element(); int spread = Math.max(1, hotKeyFanout.apply(kv.getKey())); if (spread <= 1) { c.output(kv); } else { int nonce = counter++ % spread; c.output(hot, KV.of(KV.of(kv.getKey(), nonce), kv.getValue())); } } }) .withOutputTags(cold, TupleTagList.of(hot))); // The first level of combine should never use accumulating mode. WindowingStrategy<?, ?> preCombineStrategy = input.getWindowingStrategy(); if (preCombineStrategy.getMode() == WindowingStrategy.AccumulationMode.ACCUMULATING_FIRED_PANES) { preCombineStrategy = preCombineStrategy.withMode( WindowingStrategy.AccumulationMode.DISCARDING_FIRED_PANES); } // Combine the hot and cold keys separately. PCollection<KV<K, InputOrAccum<InputT, AccumT>>> precombinedHot = split .get(hot) .setCoder( KvCoder.of( KvCoder.of(inputCoder.getKeyCoder(), VarIntCoder.of()), inputCoder.getValueCoder())) .setWindowingStrategyInternal(preCombineStrategy) .apply( "PreCombineHot", Combine.<KV<K, Integer>, InputT, AccumT>perKey(hotPreCombine, fnDisplayData)) .apply( "StripNonce", MapElements.via( new SimpleFunction< KV<KV<K, Integer>, AccumT>, KV<K, InputOrAccum<InputT, AccumT>>>() { @Override public KV<K, InputOrAccum<InputT, AccumT>> apply( KV<KV<K, Integer>, AccumT> elem) { return KV.of( elem.getKey().getKey(), InputOrAccum.<InputT, AccumT>accum(elem.getValue())); } })) .setCoder(KvCoder.of(inputCoder.getKeyCoder(), inputOrAccumCoder)) .apply(Window.<KV<K, InputOrAccum<InputT, AccumT>>>remerge()) .setWindowingStrategyInternal(input.getWindowingStrategy()); PCollection<KV<K, InputOrAccum<InputT, AccumT>>> preprocessedCold = split .get(cold) .setCoder(inputCoder) .apply("PrepareCold", MapElements.via( new SimpleFunction<KV<K, InputT>, KV<K, InputOrAccum<InputT, AccumT>>>() { @Override public KV<K, InputOrAccum<InputT, AccumT>> apply(KV<K, InputT> element) { return KV.of(element.getKey(), InputOrAccum.<InputT, AccumT>input(element.getValue())); } })) .setCoder(KvCoder.of(inputCoder.getKeyCoder(), inputOrAccumCoder)); // Combine the union of the pre-processed hot and cold key results. return PCollectionList.of(precombinedHot) .and(preprocessedCold) .apply(Flatten.<KV<K, InputOrAccum<InputT, AccumT>>>pCollections()) .apply( "PostCombine", Combine.<K, InputOrAccum<InputT, AccumT>, OutputT>perKey(postCombine, fnDisplayData)); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); Combine.populateDisplayData(builder, fn, fnDisplayData); if (hotKeyFanout instanceof HasDisplayData) { builder.include("hotKeyFanout", (HasDisplayData) hotKeyFanout); } builder.add(DisplayData.item("fanoutFn", hotKeyFanout.getClass()) .withLabel("Fanout Function")); } /** * Used to store either an input or accumulator value, for flattening * the hot and cold key paths. */ private static class InputOrAccum<InputT, AccumT> { public final InputT input; public final AccumT accum; private InputOrAccum(InputT input, AccumT aggr) { this.input = input; this.accum = aggr; } public static <InputT, AccumT> InputOrAccum<InputT, AccumT> input(InputT input) { return new InputOrAccum<>(input, null); } public static <InputT, AccumT> InputOrAccum<InputT, AccumT> accum(AccumT aggr) { return new InputOrAccum<>(null, aggr); } private static class InputOrAccumCoder<InputT, AccumT> extends StructuredCoder<InputOrAccum<InputT, AccumT>> { private final Coder<InputT> inputCoder; private final Coder<AccumT> accumCoder; public InputOrAccumCoder(Coder<InputT> inputCoder, Coder<AccumT> accumCoder) { this.inputCoder = inputCoder; this.accumCoder = accumCoder; } @Override public void encode(InputOrAccum<InputT, AccumT> value, OutputStream outStream) throws CoderException, IOException { encode(value, outStream, Coder.Context.NESTED); } @Override public void encode( InputOrAccum<InputT, AccumT> value, OutputStream outStream, Coder.Context context) throws CoderException, IOException { if (value.input != null) { outStream.write(0); inputCoder.encode(value.input, outStream, context); } else { outStream.write(1); accumCoder.encode(value.accum, outStream, context); } } @Override public InputOrAccum<InputT, AccumT> decode(InputStream inStream) throws CoderException, IOException { return decode(inStream, Coder.Context.NESTED); } @Override public InputOrAccum<InputT, AccumT> decode(InputStream inStream, Coder.Context context) throws CoderException, IOException { if (inStream.read() == 0) { return InputOrAccum.<InputT, AccumT>input(inputCoder.decode(inStream, context)); } else { return InputOrAccum.<InputT, AccumT>accum(accumCoder.decode(inStream, context)); } } @Override public List<? extends Coder<?>> getCoderArguments() { return ImmutableList.of(inputCoder, accumCoder); } @Override public void verifyDeterministic() throws Coder.NonDeterministicException { inputCoder.verifyDeterministic(); accumCoder.verifyDeterministic(); } } } } ///////////////////////////////////////////////////////////////////////////// /** * {@code GroupedValues<K, InputT, OutputT>} takes a {@code PCollection<KV<K, Iterable<InputT>>>}, * such as the result of {@link GroupByKey}, applies a specified {@link CombineFn * CombineFn<InputT, AccumT, OutputT>} to each of the input {@code KV<K, * Iterable<InputT>>} elements to produce a combined output {@code KV<K, OutputT>} element, and * returns a {@code PCollection<KV<K, OutputT>>} containing all the combined output elements. It * is common for {@code InputT == OutputT}, but not required. Common combining functions include * sums, mins, maxes, and averages of numbers, conjunctions and disjunctions of booleans, * statistical aggregations, etc. * * <p>Example of use: * * <pre>{@code * PCollection<KV<String, Integer>> pc = ...; * PCollection<KV<String, Iterable<Integer>>> groupedByKey = pc.apply( * new GroupByKey<String, Integer>()); * PCollection<KV<String, Integer>> sumByKey = groupedByKey.apply( * Combine.<String, Integer>groupedValues( * new Sum.SumIntegerFn())); * } * </pre> * * <p>See also {@link #perKey}/{@link PerKey Combine.PerKey}, which captures the common pattern of * "combining by key" in a single easy-to-use {@code PTransform}. * * <p>Combining for different keys can happen in parallel. Moreover, combining of the {@code * Iterable<InputT>} values associated a single key can happen in parallel, with different subsets * of the values being combined separately, and their intermediate results combined further, in an * arbitrary tree reduction pattern, until a single result value is produced for each key. * * <p>By default, the {@code Coder} of the keys of the output {@code PCollection<KV<K, OutputT>>} * is that of the keys of the input {@code PCollection<KV<K, InputT>>}, and the {@code Coder} of * the values of the output {@code PCollection<KV<K, OutputT>>} is inferred from the concrete type * of the {@code CombineFn<InputT, AccumT, OutputT>}'s output type {@code OutputT}. * * <p>Each output element has the same timestamp and is in the same window as its corresponding * input element, and the output {@code PCollection} has the same {@link * org.apache.beam.sdk.transforms.windowing.WindowFn} associated with it as the input. * * <p>See also {@link #globally}/{@link Globally Combine.Globally}, which combines all the values * in a {@code PCollection} into a single value in a {@code PCollection}. * * @param <K> type of input and output keys * @param <InputT> type of input values * @param <OutputT> type of output values */ public static class GroupedValues<K, InputT, OutputT> extends PTransform< PCollection<? extends KV<K, ? extends Iterable<InputT>>>, PCollection<KV<K, OutputT>>> { private final GlobalCombineFn<? super InputT, ?, OutputT> fn; private final DisplayData.ItemSpec<? extends Class<?>> fnDisplayData; private final List<PCollectionView<?>> sideInputs; private GroupedValues( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData) { this.fn = SerializableUtils.clone(fn); this.fnDisplayData = fnDisplayData; this.sideInputs = ImmutableList.of(); } private GroupedValues( GlobalCombineFn<? super InputT, ?, OutputT> fn, DisplayData.ItemSpec<? extends Class<?>> fnDisplayData, List<PCollectionView<?>> sideInputs) { this.fn = SerializableUtils.clone(fn); this.fnDisplayData = fnDisplayData; this.sideInputs = sideInputs; } public GroupedValues<K, InputT, OutputT> withSideInputs(PCollectionView<?>... sideInputs) { return withSideInputs(Arrays.asList(sideInputs)); } public GroupedValues<K, InputT, OutputT> withSideInputs( Iterable<? extends PCollectionView<?>> sideInputs) { return new GroupedValues<>(fn, fnDisplayData, ImmutableList.copyOf(sideInputs)); } /** * Returns the {@link GlobalCombineFn} used by this Combine operation. */ public GlobalCombineFn<? super InputT, ?, OutputT> getFn() { return fn; } public List<PCollectionView<?>> getSideInputs() { return sideInputs; } @Override public PCollection<KV<K, OutputT>> expand( PCollection<? extends KV<K, ? extends Iterable<InputT>>> input) { PCollection<KV<K, OutputT>> output = input.apply(ParDo.of( new DoFn<KV<K, ? extends Iterable<InputT>>, KV<K, OutputT>>() { @ProcessElement public void processElement(final ProcessContext c) { K key = c.element().getKey(); OutputT output; if (fn instanceof CombineFnWithContext) { output = ((CombineFnWithContext<? super InputT, ?, OutputT>) fn) .apply(c.element().getValue(), new CombineWithContext.Context() { @Override public PipelineOptions getPipelineOptions() { return c.getPipelineOptions(); } @Override public <T> T sideInput(PCollectionView<T> view) { return c.sideInput(view); } }); } else if (fn instanceof CombineFn) { output = ((CombineFn<? super InputT, ?, OutputT>) fn) .apply(c.element().getValue()); } else { throw new IllegalStateException( String.format("Unknown type of CombineFn: %s", fn.getClass())); } c.output(KV.of(key, output)); } @Override public void populateDisplayData(DisplayData.Builder builder) { builder.delegate(Combine.GroupedValues.this); } }).withSideInputs(sideInputs)); try { Coder<KV<K, OutputT>> outputCoder = getDefaultOutputCoder(input); output.setCoder(outputCoder); } catch (CannotProvideCoderException exc) { // let coder inference happen later, if it can } return output; } /** * Returns the {@link CombineFn} bound to its coders. * * <p>For internal use. */ public AppliedCombineFn<? super K, ? super InputT, ?, OutputT> getAppliedFn( CoderRegistry registry, Coder<? extends KV<K, ? extends Iterable<InputT>>> inputCoder, WindowingStrategy<?, ?> windowingStrategy) { KvCoder<K, InputT> kvCoder = getKvCoder(inputCoder); return AppliedCombineFn.withInputCoder( fn, registry, kvCoder, sideInputs, windowingStrategy); } private KvCoder<K, InputT> getKvCoder( Coder<? extends KV<K, ? extends Iterable<InputT>>> inputCoder) { if (!(inputCoder instanceof KvCoder)) { throw new IllegalStateException( "Combine.GroupedValues requires its input to use KvCoder"); } @SuppressWarnings({"unchecked", "rawtypes"}) KvCoder<K, ? extends Iterable<InputT>> kvCoder = (KvCoder) inputCoder; Coder<K> keyCoder = kvCoder.getKeyCoder(); Coder<? extends Iterable<InputT>> kvValueCoder = kvCoder.getValueCoder(); if (!(kvValueCoder instanceof IterableCoder)) { throw new IllegalStateException( "Combine.GroupedValues requires its input values to use " + "IterableCoder"); } @SuppressWarnings("unchecked") IterableCoder<InputT> inputValuesCoder = (IterableCoder<InputT>) kvValueCoder; Coder<InputT> inputValueCoder = inputValuesCoder.getElemCoder(); return KvCoder.of(keyCoder, inputValueCoder); } @Override public Coder<KV<K, OutputT>> getDefaultOutputCoder( PCollection<? extends KV<K, ? extends Iterable<InputT>>> input) throws CannotProvideCoderException { KvCoder<K, InputT> kvCoder = getKvCoder(input.getCoder()); @SuppressWarnings("unchecked") Coder<OutputT> outputValueCoder = ((GlobalCombineFn<InputT, ?, OutputT>) fn) .getDefaultOutputCoder( input.getPipeline().getCoderRegistry(), kvCoder.getValueCoder()); return KvCoder.of(kvCoder.getKeyCoder(), outputValueCoder); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); Combine.populateDisplayData(builder, fn, fnDisplayData); } } }