/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.common.aggregators;

import java.io.Serializable;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.types.Value;

/**
 * Aggregators are a means of aggregating values across parallel instances of a function. Aggregators
 * collect simple statistics (such as the number of processed elements) about the actual work performed
 * in a function. Aggregators are specific to iterations and are commonly used to check the convergence
 * of an iteration by using a {@link ConvergenceCriterion}. In contrast to the
 * {@link org.apache.flink.api.common.accumulators.Accumulator} (whose result is available at the end of
 * a job), the aggregators are computed once per iteration superstep. Their value can be used to check
 * for convergence (at the end of the iteration superstep) and it can be accessed in the next iteration
 * superstep.
 *
 * <p>Aggregators must be registered at the iteration inside which they are used via the function. In
 * the Java API, the method is {@code IterativeDataSet.registerAggregator(...)}, or
 * {@code IterativeDataSet.registerAggregationConvergenceCriterion(...)} when using the aggregator
 * together with a convergence criterion. Aggregators are always registered under a name. That name can
 * be used to access the aggregator at runtime from within a function. The following code snippet shows
 * a typical case. Here, it counts across all parallel instances how many elements are filtered out by
 * a function.
 *
 * <pre>{@code
 * // the user-defined function
 * public class MyFilter extends FilterFunction<Double> {
 *     private LongSumAggregator agg;
 *
 *     public void open(Configuration parameters) {
 *         agg = getIterationRuntimeContext().getIterationAggregator("numFiltered");
 *     }
 *
 *     public boolean filter (Double value) {
 *         if (value > 1000000.0) {
 *             agg.aggregate(1);
 *             return false;
 *         }
 *
 *         return true;
 *     }
 * }
 *
 * // the iteration where the aggregator is registered
 * IterativeDataSet<Double> iteration = input.iterate(100).registerAggregator("numFiltered", LongSumAggregator.class);
 * ...
 * DataSet<Double> filtered = someIntermediateResult.filter(new MyFilter());
 * ...
 * DataSet<Double> result = iteration.closeWith(filtered);
 * ...
 * }</pre>
 *
 * <p>Aggregators must be <i>distributive</i>: An aggregator must be able to pre-aggregate values and it
 * must be able to aggregate these pre-aggregated values to form the final aggregate. Many aggregation
 * functions fulfill this condition (sum, min, max) and others can be brought into that form: One can
 * express <i>count</i> as a sum over values of one, and one can express <i>average</i> through a sum
 * and a count.
 *
 * @param <T> The type of the aggregated value.
 */
@PublicEvolving
public interface Aggregator<T extends Value> extends Serializable {

	/**
	 * Gets the aggregator's current aggregate.
	 *
	 * @return The aggregator's current aggregate.
	 */
	T getAggregate();

	/**
	 * Aggregates the given element. In the case of a <i>sum</i> aggregator, this method adds the given
	 * value to the sum.
	 *
	 * @param element The element to aggregate.
	 */
	void aggregate(T element);

	/**
	 * Resets the internal state of the aggregator. This must bring the aggregator into the same
	 * state as if it was newly initialized.
	 */
	void reset();
}