/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.test.javaApiOperators;
import org.apache.flink.api.common.functions.GroupCombineFunction;
import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.test.javaApiOperators.util.CollectionDataSets;
import org.apache.flink.test.util.MultipleProgramsTestBase;
import org.apache.flink.util.Collector;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import java.util.Arrays;
import java.util.List;
@SuppressWarnings("serial")
@RunWith(Parameterized.class)
/**
* The GroupCombine operator is not easy to test because it is essentially just a combiner. The result can be
* the result of a normal groupReduce at any stage of its execution. The basic idea is to preserve the grouping key
* in the partial result, so that we can do a reduceGroup afterwards to finalize the results for verification.
* In addition, we can use hashPartition to partition the data and check that no shuffling (just combining) has
* been performed.
*/
public class GroupCombineITCase extends MultipleProgramsTestBase {
/**
 * Creates the test case for one execution mode; the mode is passed straight on to the
 * {@code MultipleProgramsTestBase} constructor.
 *
 * @param mode the execution mode this test instance runs in
 */
public GroupCombineITCase(TestExecutionMode mode) {
super(mode);
}
/**
 * Expected output of the identity tests: one {@code f0,f1,f2} line per record.
 * Presumably this mirrors the records produced by {@code CollectionDataSets.get3TupleDataSet}
 * (verify against that helper). Declared {@code final} because it is shared, read-only test data.
 */
private static final String identityResult = "1,1,Hi\n" +
        "2,2,Hello\n" +
        "3,2,Hello world\n" +
        "4,3,Hello world, how are you?\n" +
        "5,3,I am fine.\n" +
        "6,3,Luke Skywalker\n" +
        "7,4,Comment#1\n" +
        "8,4,Comment#2\n" +
        "9,4,Comment#3\n" +
        "10,4,Comment#4\n" +
        "11,5,Comment#5\n" +
        "12,5,Comment#6\n" +
        "13,5,Comment#7\n" +
        "14,5,Comment#8\n" +
        "15,5,Comment#9\n" +
        "16,6,Comment#10\n" +
        "17,6,Comment#11\n" +
        "18,6,Comment#12\n" +
        "19,6,Comment#13\n" +
        "20,6,Comment#14\n" +
        "21,6,Comment#15\n";
/**
 * Sends the ungrouped 3-tuple data set through an identity combine and then an identity
 * reduce, and compares the collected output with {@code identityResult}.
 */
@Test
public void testAllGroupCombineIdentity() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    // combine step: forwards every record
    final DataSet<Tuple3<Integer, Long, String>> combined = input.combineGroup(new IdentityFunction());
    // full reduce step: forwards every record once more
    final DataSet<Tuple3<Integer, Long, String>> reduced = combined.reduceGroup(new IdentityFunction());

    final List<Tuple3<Integer, Long, String>> collected = reduced.collect();
    compareResultAsTuples(collected, identityResult);
}
/**
 * Identity combine followed by identity reduce on the ungrouped input; the collected
 * records are compared with {@code identityResult}.
 */
@Test
public void testIdentity() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final DataSet<Tuple3<Integer, Long, String>> source = CollectionDataSets.get3TupleDataSet(env);

    // partial (combine) stage
    final DataSet<Tuple3<Integer, Long, String>> afterCombine = source.combineGroup(new IdentityFunction());
    // final (reduce) stage
    final DataSet<Tuple3<Integer, Long, String>> afterReduce = afterCombine.reduceGroup(new IdentityFunction());

    compareResultAsTuples(afterReduce.collect(), identityResult);
}
/**
 * Groups the input on field 1, applies an identity combine per group and then an identity
 * reduce on the combined output; the collected records are compared with {@code identityResult}.
 */
@Test
public void testIdentityWithGroupBy() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    // combine within each group of field 1
    final DataSet<Tuple3<Integer, Long, String>> combinedPerGroup = input
            .groupBy(1)
            .combineGroup(new IdentityFunction());

    // fully reduce the (ungrouped) combined output
    final DataSet<Tuple3<Integer, Long, String>> fullyReduced =
            combinedPerGroup.reduceGroup(new IdentityFunction());

    final List<Tuple3<Integer, Long, String>> collected = fullyReduced.collect();
    compareResultAsTuples(collected, identityResult);
}
/**
 * Same identity round trip, but on sorted groups: both the combine and the reduce stage
 * group on field 1 and sort each group on field 1 in descending order. The collected
 * records are compared with {@code identityResult}.
 */
@Test
public void testIdentityWithGroupByAndSort() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    // partial reduction on sorted groups
    final DataSet<Tuple3<Integer, Long, String>> partiallyReduced = input
            .groupBy(1)
            .sortGroup(1, Order.DESCENDING)
            .combineGroup(new IdentityFunction());

    // full reduction, grouped and sorted the same way
    final DataSet<Tuple3<Integer, Long, String>> fullyReduced = partiallyReduced
            .groupBy(1)
            .sortGroup(1, Order.DESCENDING)
            .reduceGroup(new IdentityFunction());

    compareResultAsTuples(fullyReduced.collect(), identityResult);
}
/**
 * Wraps every record as a (grouping key, record) pair, sums the records per key first in a
 * combine and then in a reduce (both via {@code Tuple3toTuple3GroupReduce}), unwraps the
 * result and compares it with the expected per-key sums.
 */
@Test
public void testPartialReduceWithIdenticalInputOutputType() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // input records, each wrapped as a key/value pair keyed by the grouping field
    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);
    final DataSet<Tuple2<Long, Tuple3<Integer, Long, String>>> keyed = input.map(new Tuple3KvWrapper());

    // partial reduction per key
    final DataSet<Tuple2<Long, Tuple3<Integer, Long, String>>> partial = keyed
            .groupBy(0)
            .combineGroup(new Tuple3toTuple3GroupReduce());

    // full reduction per key so the outcome can be verified
    final DataSet<Tuple2<Long, Tuple3<Integer, Long, String>>> complete = partial
            .groupBy(0)
            .reduceGroup(new Tuple3toTuple3GroupReduce());

    // drop the key again
    final List<Tuple3<Integer, Long, String>> result = complete
            .map(new MapFunction<Tuple2<Long, Tuple3<Integer, Long, String>>, Tuple3<Integer, Long, String>>() {
                @Override
                public Tuple3<Integer, Long, String> map(Tuple2<Long, Tuple3<Integer, Long, String>> keyedValue) throws Exception {
                    return keyedValue.f1;
                }
            })
            .collect();

    final String expected = "1,1,combined\n" +
            "5,4,combined\n" +
            "15,9,combined\n" +
            "34,16,combined\n" +
            "65,25,combined\n" +
            "111,36,combined\n";

    compareResultAsTuples(result, expected);
}
/**
 * Like the identical-type variant, but the combine stage ({@code Tuple3toTuple2GroupReduce})
 * turns the 3-tuples into 2-tuples and the reduce stage ({@code Tuple2toTuple2GroupReduce})
 * finishes the aggregation on those 2-tuples.
 */
@Test
public void testPartialReduceWithDifferentInputOutputType() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // input records, each wrapped as a key/value pair keyed by the grouping field
    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);
    final DataSet<Tuple2<Long, Tuple3<Integer, Long, String>>> keyed = input.map(new Tuple3KvWrapper());

    // partial reduction per key: 3-tuples in, 2-tuples out
    final DataSet<Tuple2<Long, Tuple2<Integer, Long>>> partial = keyed
            .groupBy(0)
            .combineGroup(new Tuple3toTuple2GroupReduce());

    // full reduction per key so the outcome can be verified
    final DataSet<Tuple2<Long, Tuple2<Integer, Long>>> complete = partial
            .groupBy(0)
            .reduceGroup(new Tuple2toTuple2GroupReduce());

    // drop the key again
    final List<Tuple2<Integer, Long>> result = complete
            .map(new MapFunction<Tuple2<Long,Tuple2<Integer,Long>>, Tuple2<Integer,Long>>() {
                @Override
                public Tuple2<Integer, Long> map(Tuple2<Long, Tuple2<Integer, Long>> keyedValue) throws Exception {
                    return keyedValue.f1;
                }
            })
            .collect();

    final String expected = "1,3\n" +
            "5,20\n" +
            "15,58\n" +
            "34,52\n" +
            "65,70\n" +
            "111,96\n";

    compareResultAsTuples(result, expected);
}
/**
 * Checks that {@code combineGroup} does not shuffle: the input is hash-partitioned on
 * field 0 and then grouped on field 1, so a pure combine is expected to emit partial
 * per-group counts that differ from the counts of a fully shuffled reduce.
 *
 * <p>The test is skipped in {@code COLLECTION} mode. Both arrays are sorted before the
 * comparison so that the assertion can only pass because the contents differ, not because
 * of element order or array length.
 */
@Test
public void testCheckPartitionShuffleGroupBy() throws Exception {
    org.junit.Assume.assumeTrue(mode != TestExecutionMode.COLLECTION);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);

    // partition on field 0, then group on field 1
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);

    List<Tuple2<Long, Integer>> result = partitionedDS
            .combineGroup(
                    new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                        @Override
                        public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                            // emit (group key, number of records seen in this call)
                            int count = 0;
                            long key = 0;
                            for (Tuple3<Integer, Long, String> value : values) {
                                key = value.f1;
                                count++;
                            }
                            out.collect(new Tuple2<>(key, count));
                        }
                    }).collect();

    // (key, count) pairs a complete per-key reduction would yield, one entry per key
    String[] localExpected = new String[] { "(1,1)", "(2,2)", "(3,3)", "(4,4)", "(5,5)", "(6,6)" };
    Arrays.sort(localExpected);

    String[] resultAsStringArray = new String[result.size()];
    for (int i = 0; i < resultAsStringArray.length; ++i) {
        resultAsStringArray[i] = result.get(i).toString();
    }
    Arrays.sort(resultAsStringArray);

    Assert.assertFalse("The two arrays were identical.", Arrays.equals(localExpected, resultAsStringArray));
}
/**
 * With the parallelism set to 1, combining the hash-partitioned and grouped data is
 * expected to give the same per-key counts as a shuffle would.
 */
@Test
public void testCheckPartitionShuffleDOP1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // input, hash-partitioned on field 0 and grouped on field 1
    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);
    final UnsortedGrouping<Tuple3<Integer, Long, String>> grouped = input.partitionByHash(0).groupBy(1);

    // emits (group key, number of records in the group)
    final GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>> countPerGroup =
            new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    long groupKey = 0;
                    int seen = 0;
                    for (Tuple3<Integer, Long, String> record : values) {
                        groupKey = record.f1;
                        seen++;
                    }
                    out.collect(new Tuple2<>(groupKey, seen));
                }
            };

    final List<Tuple2<Long, Integer>> result = grouped.combineGroup(countPerGroup).collect();

    final String expected = "6,6\n" +
            "5,5\n" +
            "4,4\n" +
            "3,3\n" +
            "2,2\n" +
            "1,1\n";

    compareResultAsTuples(result, expected);
}
/**
 * Smoke test: calls {@code combineGroup} on a plain data set, on an unsorted grouping and
 * on a sorted grouping, discards all output and executes the program.
 */
@Test
public void testAPI() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // wrap every string into a Tuple1 so that field positions can be used as keys
    final MapFunction<String, Tuple1<String>> wrapInTuple = new MapFunction<String, Tuple1<String>>() {
        @Override
        public Tuple1<String> map(String text) throws Exception {
            return new Tuple1<>(text);
        }
    };
    final DataSet<Tuple1<String>> tuples = CollectionDataSets.getStringDataSet(env).map(wrapInTuple);

    // on DataSet
    tuples
            .combineGroup(new GroupCombineFunctionExample())
            .output(new DiscardingOutputFormat<Tuple1<String>>());

    // on UnsortedGrouping
    tuples
            .groupBy(0)
            .combineGroup(new GroupCombineFunctionExample())
            .output(new DiscardingOutputFormat<Tuple1<String>>());

    // on SortedGrouping
    tuples
            .groupBy(0)
            .sortGroup(0, Order.ASCENDING)
            .combineGroup(new GroupCombineFunctionExample())
            .output(new DiscardingOutputFormat<Tuple1<String>>());

    env.execute();
}
/**
 * Combine function over Flink {@code Tuple1<String>} records that forwards every input
 * record unchanged.
 */
public static class GroupCombineFunctionExample implements GroupCombineFunction<Tuple1<String>, Tuple1<String>> {

    @Override
    public void combine(Iterable<Tuple1<String>> values, Collector<Tuple1<String>> out) throws Exception {
        // pass each record through, in iteration order
        final java.util.Iterator<Tuple1<String>> records = values.iterator();
        while (records.hasNext()) {
            out.collect(records.next());
        }
    }
}
/**
 * Combine function over {@code scala.Tuple1<String>} records that forwards every input
 * record unchanged.
 */
public static class ScalaGroupCombineFunctionExample implements GroupCombineFunction<scala.Tuple1<String>, scala.Tuple1<String>> {

    @Override
    public void combine(Iterable<scala.Tuple1<String>> values, Collector<scala.Tuple1<String>> out) throws Exception {
        // pass each record through, in iteration order
        final java.util.Iterator<scala.Tuple1<String>> records = values.iterator();
        while (records.hasNext()) {
            out.collect(records.next());
        }
    }
}
/**
 * Identity function usable both as combiner and as reducer: for every input record it
 * emits a new {@code Tuple3} holding the same three field values.
 */
public static class IdentityFunction implements GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple3<Integer, Long, String>>,
        GroupReduceFunction<Tuple3<Integer, Long, String>, Tuple3<Integer, Long, String>> {

    @Override
    public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple3<Integer, Long, String>> out) throws Exception {
        emitCopies(values, out);
    }

    @Override
    public void reduce(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple3<Integer, Long, String>> out) throws Exception {
        emitCopies(values, out);
    }

    /** Emits one freshly created copy per input record, in iteration order. */
    private static void emitCopies(Iterable<Tuple3<Integer, Long, String>> records, Collector<Tuple3<Integer, Long, String>> out) {
        for (Tuple3<Integer, Long, String> record : records) {
            Tuple3<Integer, Long, String> copy = new Tuple3<>(record.f0, record.f1, record.f2);
            out.collect(copy);
        }
    }
}
/**
 * Collapses a group of keyed 3-tuples into a single keyed 3-tuple: field 0 and field 1 of
 * the values are summed, the string field is set to {@code "combined"}. The emitted key is
 * the key of the last record seen. Combining and reducing do exactly the same.
 */
public static class Tuple3toTuple3GroupReduce implements KvGroupReduce<Long, Tuple3<Integer, Long, String>,
        Tuple3<Integer, Long, String>, Tuple3<Integer, Long, String>> {

    @Override
    public void combine(Iterable<Tuple2<Long, Tuple3<Integer, Long, String>>> values, Collector<Tuple2<Long,
            Tuple3<Integer, Long, String>>> out) throws Exception {

        long groupKey = 0;
        int intSum = 0;
        long longSum = 0;

        // fold the whole group into the two running sums
        for (Tuple2<Long, Tuple3<Integer, Long, String>> entry : values) {
            groupKey = entry.f0;
            final Tuple3<Integer, Long, String> payload = entry.f1;
            intSum += payload.f0;
            longSum += payload.f1;
        }

        out.collect(new Tuple2<>(groupKey, new Tuple3<>(intSum, longSum, "combined")));
    }

    @Override
    public void reduce(Iterable<Tuple2<Long, Tuple3<Integer, Long, String>>> values,
            Collector<Tuple2<Long, Tuple3<Integer, Long, String>>> out) throws Exception {
        // the final reduction is the same fold as the partial one
        combine(values, out);
    }
}
/**
 * Combiner that collapses a group of keyed 3-tuples into a single keyed 2-tuple: the first
 * field is the sum of the values' field 0, the second is the sum of field 1 plus the length
 * of the string field. The reduce side works on the 2-tuples and hands over to
 * {@code Tuple2toTuple2GroupReduce}.
 */
public static class Tuple3toTuple2GroupReduce implements KvGroupReduce<Long, Tuple3<Integer, Long, String>,
        Tuple2<Integer, Long>, Tuple2<Integer, Long>> {

    @Override
    public void combine(Iterable<Tuple2<Long, Tuple3<Integer, Long, String>>> values, Collector<Tuple2<Long,
            Tuple2<Integer, Long>>> out) throws Exception {

        long groupKey = 0;
        int intSum = 0;
        long longSum = 0;

        // fold the whole group into the two running sums
        for (Tuple2<Long, Tuple3<Integer, Long, String>> entry : values) {
            groupKey = entry.f0;
            final Tuple3<Integer, Long, String> payload = entry.f1;
            intSum += payload.f0;
            longSum += payload.f1 + payload.f2.length();
        }

        out.collect(new Tuple2<>(groupKey, new Tuple2<>(intSum, longSum)));
    }

    @Override
    public void reduce(Iterable<Tuple2<Long, Tuple2<Integer, Long>>> values, Collector<Tuple2<Long,
            Tuple2<Integer, Long>>> out) throws Exception {
        // 2-tuples are finished by the dedicated 2-tuple reducer
        final Tuple2toTuple2GroupReduce delegate = new Tuple2toTuple2GroupReduce();
        delegate.reduce(values, out);
    }
}
/**
 * Collapses a group of keyed 2-tuples into a single keyed 2-tuple by summing both value
 * fields. The emitted key is the key of the last record seen. Combining and reducing do
 * exactly the same.
 */
public static class Tuple2toTuple2GroupReduce implements KvGroupReduce<Long, Tuple2<Integer, Long>,
        Tuple2<Integer, Long>, Tuple2<Integer, Long>> {

    @Override
    public void combine(Iterable<Tuple2<Long, Tuple2<Integer, Long>>> values, Collector<Tuple2<Long, Tuple2<Integer,
            Long>>> out) throws Exception {

        long groupKey = 0;
        int intSum = 0;
        long longSum = 0;

        // fold the whole group into the two running sums
        for (Tuple2<Long, Tuple2<Integer, Long>> entry : values) {
            groupKey = entry.f0;
            final Tuple2<Integer, Long> payload = entry.f1;
            intSum += payload.f0;
            longSum += payload.f1;
        }

        out.collect(new Tuple2<>(groupKey, new Tuple2<>(intSum, longSum)));
    }

    @Override
    public void reduce(Iterable<Tuple2<Long, Tuple2<Integer, Long>>> values, Collector<Tuple2<Long,
            Tuple2<Integer, Long>>> out) throws Exception {
        // the final reduction is the same fold as the partial one
        combine(values, out);
    }
}
public class Tuple3KvWrapper implements MapFunction<Tuple3<Integer, Long, String>, Tuple2<Long,
Tuple3<Integer, Long, String>>> {
@Override
public Tuple2<Long, Tuple3<Integer, Long, String>> map(Tuple3<Integer, Long, String> value) throws Exception {
return new Tuple2<>(value.f1, value);
}
}
/**
 * Convenience interface for functions that act as both combiner and reducer: it joins
 * {@code GroupCombineFunction<IN, INT>} and {@code GroupReduceFunction<INT, OUT>} and
 * declares no methods of its own.
 *
 * @param <IN> input type of the combine step
 * @param <INT> intermediate type: output of the combine step and input of the reduce step
 * @param <OUT> output type of the reduce step
 */
public interface CombineAndReduceGroup <IN, INT, OUT> extends GroupCombineFunction<IN, INT>,
GroupReduceFunction<INT, OUT> {
}
/**
 * {@code CombineAndReduceGroup} specialised to key/value pairs: input, intermediate and
 * output records are all {@code Tuple2}s that share the key type {@code K} in field 0.
 *
 * @param <K> key type carried in field 0 of every pair
 * @param <V> value type of the combine input
 * @param <INT> value type of the intermediate (combined) pairs
 * @param <OUT> value type of the reduce output
 */
public interface KvGroupReduce<K, V, INT, OUT> extends CombineAndReduceGroup<Tuple2<K, V>, Tuple2<K, INT>,
Tuple2<K, OUT>> {
}
}