// UnsortedGrouping.java example
//
// Explorer
// flink-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java.operators;

import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.functions.GroupCombineFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.operators.Keys;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.Utils;
import org.apache.flink.api.java.aggregation.Aggregations;
import org.apache.flink.api.java.functions.FirstReducer;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.functions.SelectByMaxFunction;
import org.apache.flink.api.java.functions.SelectByMinFunction;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.util.Preconditions;

/**
 * A {@link Grouping} of a {@link DataSet} on which no order among the elements of a group
 * has been requested yet.
 *
 * <p>The grouping offers the grouped transformations (aggregate, reduce, reduceGroup,
 * combineGroup, first, minBy, maxBy) and can be turned into a {@link SortedGrouping}
 * through one of the {@code sortGroup} methods.
 *
 * @param <T> The type of the elements of the grouped DataSet.
 */
@Public
public class UnsortedGrouping<T> extends Grouping<T> {

	/**
	 * Creates a grouping of the given DataSet on the given keys.
	 *
	 * @param set The DataSet that is grouped.
	 * @param keys The keys on which the DataSet is grouped.
	 */
	public UnsortedGrouping(DataSet<T> set, Keys<T> keys) {
		super(set, keys);
	}

	/**
	 * Uses a custom partitioner for the grouping.
	 *
	 * @param partitioner The custom partitioner.
	 * @return The grouping object itself, to allow for method chaining.
	 * @throws NullPointerException If the partitioner is null.
	 */
	public UnsortedGrouping<T> withPartitioner(Partitioner<?> partitioner) {
		Preconditions.checkNotNull(partitioner);
		// let the keys decide whether this partitioner is applicable to them
		getKeys().validateCustomPartitioner(partitioner, null);

		this.customPartitioner = partitioner;
		return this;
	}

	// --------------------------------------------------------------------------------------------
	//  Operations / Transformations
	// --------------------------------------------------------------------------------------------

	/**
	 * Applies an Aggregate transformation on a grouped {@link org.apache.flink.api.java.tuple.Tuple} {@link DataSet}.<br>
	 * <b>Note: Only Tuple DataSets can be aggregated.</b>
	 * The transformation applies a built-in {@link Aggregations Aggregation} on a specified field
	 *   of a Tuple group. Additional aggregation functions can be added to the resulting
	 *   {@link AggregateOperator} by calling {@link AggregateOperator#and(Aggregations, int)}.
	 *
	 * @param agg The built-in aggregation function that is computed.
	 * @param field The index of the Tuple field on which the aggregation function is applied.
	 * @return An AggregateOperator that represents the aggregated DataSet.
	 *
	 * @see org.apache.flink.api.java.tuple.Tuple
	 * @see Aggregations
	 * @see AggregateOperator
	 * @see DataSet
	 */
	public AggregateOperator<T> aggregate(Aggregations agg, int field) {
		return aggregate(agg, field, Utils.getCallLocationName());
	}

	/**
	 * Private helper that allows to set a different call location name, so that the
	 * convenience methods ({@link #sum(int)}, {@link #max(int)}, {@link #min(int)}) can pass
	 * the location name they determined themselves.
	 *
	 * @param agg The built-in aggregation function that is computed.
	 * @param field The index of the Tuple field on which the aggregation function is applied.
	 * @param callLocationName The call location name handed to the created operator.
	 * @return An AggregateOperator that represents the aggregated DataSet.
	 */
	private AggregateOperator<T> aggregate(Aggregations agg, int field, String callLocationName) {
		return new AggregateOperator<T>(this, agg, field, callLocationName);
	}

	/**
	 * Syntactic sugar for aggregate (SUM, field).
	 *
	 * @param field The index of the Tuple field on which the aggregation function is applied.
	 * @return An AggregateOperator that represents the summed DataSet.
	 *
	 * @see org.apache.flink.api.java.operators.AggregateOperator
	 */
	public AggregateOperator<T> sum(int field) {
		return this.aggregate(Aggregations.SUM, field, Utils.getCallLocationName());
	}

	/**
	 * Syntactic sugar for aggregate (MAX, field).
	 *
	 * @param field The index of the Tuple field on which the aggregation function is applied.
	 * @return An AggregateOperator that represents the max'ed DataSet.
	 *
	 * @see org.apache.flink.api.java.operators.AggregateOperator
	 */
	public AggregateOperator<T> max(int field) {
		return this.aggregate(Aggregations.MAX, field, Utils.getCallLocationName());
	}

	/**
	 * Syntactic sugar for aggregate (MIN, field).
	 *
	 * @param field The index of the Tuple field on which the aggregation function is applied.
	 * @return An AggregateOperator that represents the min'ed DataSet.
	 *
	 * @see org.apache.flink.api.java.operators.AggregateOperator
	 */
	public AggregateOperator<T> min(int field) {
		return this.aggregate(Aggregations.MIN, field, Utils.getCallLocationName());
	}

	/**
	 * Applies a Reduce transformation on a grouped {@link DataSet}.<br>
	 * For each group, the transformation consecutively calls a {@link ReduceFunction}
	 *   until only a single element for each group remains.
	 * A ReduceFunction combines two elements into one new element of the same type.
	 *
	 * @param reducer The ReduceFunction that is applied on each group of the DataSet.
	 * @return A ReduceOperator that represents the reduced DataSet.
	 * @throws NullPointerException If the reducer is null.
	 *
	 * @see ReduceFunction
	 * @see org.apache.flink.api.common.functions.RichReduceFunction
	 * @see ReduceOperator
	 * @see DataSet
	 */
	public ReduceOperator<T> reduce(ReduceFunction<T> reducer) {
		Preconditions.checkNotNull(reducer, "Reduce function must not be null.");
		return new ReduceOperator<T>(this, inputDataSet.clean(reducer), Utils.getCallLocationName());
	}

	/**
	 * Applies a GroupReduce transformation on a grouped {@link DataSet}.<br>
	 * The transformation calls a {@link GroupReduceFunction} for each group of the DataSet.
	 * A GroupReduceFunction can iterate over all elements of a group and emit any
	 *   number of output elements including none.
	 *
	 * @param reducer The GroupReduceFunction that is applied on each group of the DataSet.
	 * @param <R> The type of the elements emitted by the GroupReduceFunction.
	 * @return A GroupReduceOperator that represents the reduced DataSet.
	 * @throws NullPointerException If the reducer is null.
	 *
	 * @see GroupReduceFunction
	 * @see org.apache.flink.api.common.functions.RichGroupReduceFunction
	 * @see GroupReduceOperator
	 * @see DataSet
	 */
	public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) {
		Preconditions.checkNotNull(reducer, "GroupReduce function must not be null.");

		TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer,
				this.getInputDataSet().getType(), Utils.getCallLocationName(), true);

		return new GroupReduceOperator<T, R>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName());
	}

	/**
	 * Applies a GroupCombineFunction on a grouped {@link DataSet}.
	 * A GroupCombineFunction is similar to a GroupReduceFunction but does not perform a full data exchange. Instead, the
	 * CombineFunction calls the combine method once per partition for combining a group of results. This
	 * operator is suitable for combining values into an intermediate format before doing a proper groupReduce where
	 * the data is shuffled across the node for further reduction. The GroupReduce operator can also be supplied with
	 * a combiner by implementing the RichGroupReduce function. The combine method of the RichGroupReduce function
	 * demands input and output type to be the same. The CombineFunction, on the other side, can have an arbitrary
	 * output type.
	 *
	 * @param combiner The GroupCombineFunction that is applied on the DataSet.
	 * @param <R> The type of the elements emitted by the GroupCombineFunction.
	 * @return A GroupCombineOperator which represents the combined DataSet.
	 * @throws NullPointerException If the combiner is null.
	 */
	public <R> GroupCombineOperator<T, R> combineGroup(GroupCombineFunction<T, R> combiner) {
		Preconditions.checkNotNull(combiner, "GroupCombine function must not be null.");

		TypeInformation<R> resultType = TypeExtractor.getGroupCombineReturnTypes(combiner,
				this.getInputDataSet().getType(), Utils.getCallLocationName(), true);

		return new GroupCombineOperator<T, R>(this, resultType, inputDataSet.clean(combiner), Utils.getCallLocationName());
	}

	/**
	 * Returns a new set containing the first n elements of each group in this grouped {@link DataSet}.<br>
	 * Implemented as a {@link #reduceGroup(GroupReduceFunction)} with a {@link FirstReducer}.
	 *
	 * @param n The desired number of elements for each group.
	 * @return A GroupReduceOperator that represents the DataSet containing the elements.
	 * @throws InvalidProgramException If n is smaller than 1.
	 */
	public GroupReduceOperator<T, T> first(int n) {
		if (n < 1) {
			throw new InvalidProgramException("Parameter n of first(n) must be at least 1.");
		}

		return reduceGroup(new FirstReducer<T>(n));
	}

	/**
	 * Applies a special case of a reduce transformation (minBy) on a grouped {@link DataSet}.<br>
	 * The transformation consecutively calls a {@link ReduceFunction}
	 * until only a single element remains which is the result of the transformation.
	 * A ReduceFunction combines two elements into one new element of the same type.
	 *
	 * @param fields Keys taken into account for finding the minimum.
	 * @return A {@link ReduceOperator} representing the minimum.
	 * @throws InvalidProgramException If the grouped DataSet is not of a tuple type.
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	public ReduceOperator<T> minBy(int... fields)  {

		// Check for using a tuple; this also guards the raw TupleTypeInfo cast below
		if (!this.inputDataSet.getType().isTupleType()) {
			throw new InvalidProgramException("Method minBy(int) only works on tuples.");
		}

		return new ReduceOperator<T>(this, new SelectByMinFunction(
				(TupleTypeInfo) this.inputDataSet.getType(), fields), Utils.getCallLocationName());
	}

	/**
	 * Applies a special case of a reduce transformation (maxBy) on a grouped {@link DataSet}.<br>
	 * The transformation consecutively calls a {@link ReduceFunction}
	 * until only a single element remains which is the result of the transformation.
	 * A ReduceFunction combines two elements into one new element of the same type.
	 *
	 * @param fields Keys taken into account for finding the maximum.
	 * @return A {@link ReduceOperator} representing the maximum.
	 * @throws InvalidProgramException If the grouped DataSet is not of a tuple type.
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	public ReduceOperator<T> maxBy(int... fields)  {

		// Check for using a tuple; this also guards the raw TupleTypeInfo cast below
		if (!this.inputDataSet.getType().isTupleType()) {
			throw new InvalidProgramException("Method maxBy(int) only works on tuples.");
		}

		return new ReduceOperator<T>(this, new SelectByMaxFunction(
				(TupleTypeInfo) this.inputDataSet.getType(), fields), Utils.getCallLocationName());
	}

	// --------------------------------------------------------------------------------------------
	//  Group Operations
	// --------------------------------------------------------------------------------------------

	/**
	 * Sorts {@link org.apache.flink.api.java.tuple.Tuple} elements within a group on the specified field in the specified {@link Order}.<br>
	 * <b>Note: Only groups of Tuple elements and Pojos can be sorted.</b><br>
	 * Groups can be sorted by multiple fields by chaining {@link #sortGroup(int, Order)} calls.
	 *
	 * @param field The Tuple field on which the group is sorted.
	 * @param order The Order in which the specified Tuple field is sorted.
	 * @return A SortedGrouping with specified order of group element.
	 * @throws InvalidProgramException If the grouping keys were defined by a KeySelector.
	 *
	 * @see org.apache.flink.api.java.tuple.Tuple
	 * @see Order
	 */
	public SortedGrouping<T> sortGroup(int field, Order order) {
		if (this.getKeys() instanceof Keys.SelectorFunctionKeys) {
			throw new InvalidProgramException("KeySelector grouping keys and field index group-sorting keys cannot be used together.");
		}

		return copyPartitionerTo(new SortedGrouping<T>(this.inputDataSet, this.keys, field, order));
	}

	/**
	 * Sorts Pojos within a group on the specified field in the specified {@link Order}.<br>
	 * <b>Note: Only groups of Tuple elements and Pojos can be sorted.</b><br>
	 * Groups can be sorted by multiple fields by chaining {@link #sortGroup(String, Order)} calls.
	 *
	 * @param field The Tuple or Pojo field on which the group is sorted.
	 * @param order The Order in which the specified field is sorted.
	 * @return A SortedGrouping with specified order of group element.
	 * @throws InvalidProgramException If the grouping keys were defined by a KeySelector.
	 *
	 * @see Order
	 */
	public SortedGrouping<T> sortGroup(String field, Order order) {
		if (this.getKeys() instanceof Keys.SelectorFunctionKeys) {
			throw new InvalidProgramException("KeySelector grouping keys and field expression group-sorting keys cannot be used together.");
		}

		return copyPartitionerTo(new SortedGrouping<T>(this.inputDataSet, this.keys, field, order));
	}

	/**
	 * Sorts elements within a group on a key extracted by the specified {@link org.apache.flink.api.java.functions.KeySelector}
	 * in the specified {@link Order}.<br>
	 * Chaining {@link #sortGroup(KeySelector, Order)} calls is not supported.
	 *
	 * @param keySelector The KeySelector with which the group is sorted.
	 * @param order The Order in which the extracted key is sorted.
	 * @param <K> The type of the key extracted by the KeySelector.
	 * @return A SortedGrouping with specified order of group element.
	 * @throws InvalidProgramException If the grouping keys were not defined by a KeySelector.
	 *
	 * @see Order
	 */
	public <K> SortedGrouping<T> sortGroup(KeySelector<T, K> keySelector, Order order) {
		if (!(this.getKeys() instanceof Keys.SelectorFunctionKeys)) {
			throw new InvalidProgramException("KeySelector group-sorting keys can only be used with KeySelector grouping keys.");
		}

		TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keySelector, this.inputDataSet.getType());
		Keys.SelectorFunctionKeys<T, K> sortKeys =
				new Keys.SelectorFunctionKeys<T, K>(keySelector, this.inputDataSet.getType(), keyType);

		return copyPartitionerTo(new SortedGrouping<T>(this.inputDataSet, this.keys, sortKeys, order));
	}

	/**
	 * Hands the custom partitioner of this grouping over to a sorted grouping derived from it.
	 *
	 * @param sorted The sorted grouping created from this grouping.
	 * @return The given sorted grouping, to allow returning it directly.
	 */
	private SortedGrouping<T> copyPartitionerTo(SortedGrouping<T> sorted) {
		sorted.customPartitioner = getCustomPartitioner();
		return sorted;
	}

}