/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.api.java.operators.translation; import org.apache.flink.api.common.Plan; import org.apache.flink.api.common.operators.GenericDataSinkBase; import org.apache.flink.api.common.operators.GenericDataSourceBase; import org.apache.flink.api.common.operators.base.MapOperatorBase; import org.apache.flink.api.common.operators.base.ReduceOperatorBase; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.DataSet; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.functions.KeySelector; import org.apache.flink.api.java.io.DiscardingOutputFormat; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.tuple.Tuple3; import org.apache.flink.api.java.typeutils.TupleTypeInfo; import org.apache.flink.api.java.typeutils.ValueTypeInfo; import org.apache.flink.types.LongValue; import org.apache.flink.types.StringValue; import org.junit.Test; import java.io.Serializable; import java.util.ArrayList; import java.util.List; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @SuppressWarnings("serial") public class DistinctTranslationTest { @Test public void translateDistinctPlain() { try { final int parallelism = 8; ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism); DataSet<Tuple3<Double, StringValue, LongValue>> initialData = getSourceDataSet(env); initialData.distinct().output(new DiscardingOutputFormat<Tuple3<Double, StringValue, LongValue>>()); Plan p = env.createProgramPlan(); GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next(); // currently distinct is translated to a Reduce ReduceOperatorBase<?, ?> reducer = (ReduceOperatorBase<?, ?>) sink.getInput(); // check types assertEquals(initialData.getType(), reducer.getOperatorInfo().getInputType()); assertEquals(initialData.getType(), reducer.getOperatorInfo().getOutputType()); // check keys assertArrayEquals(new int[] {0, 1, 2}, reducer.getKeyColumns(0)); // parallelism was not configured on the operator assertTrue(reducer.getParallelism() == 1 || reducer.getParallelism() == -1); assertTrue(reducer.getInput() instanceof GenericDataSourceBase<?, ?>); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); fail("Test caused an error: " + e.getMessage()); } } @Test public void translateDistinctPlain2() { try { final int parallelism = 8; ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism); DataSet<CustomType> initialData = getSourcePojoDataSet(env); initialData.distinct().output(new DiscardingOutputFormat<CustomType>()); Plan p = env.createProgramPlan(); GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next(); // currently distinct is translated to a Reduce ReduceOperatorBase<?, ?> reducer = (ReduceOperatorBase<?, ?>) sink.getInput(); // check types assertEquals(initialData.getType(), reducer.getOperatorInfo().getInputType()); assertEquals(initialData.getType(), reducer.getOperatorInfo().getOutputType()); // check keys assertArrayEquals(new int[] {0}, reducer.getKeyColumns(0)); // parallelism was not configured on the operator assertTrue(reducer.getParallelism() == 1 || reducer.getParallelism() == -1); assertTrue(reducer.getInput() instanceof GenericDataSourceBase<?, ?>); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); fail("Test caused an error: " + e.getMessage()); } } @Test public void translateDistinctPosition() { try { final int parallelism = 8; ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism); DataSet<Tuple3<Double, StringValue, LongValue>> initialData = getSourceDataSet(env); initialData.distinct(1, 2).output(new DiscardingOutputFormat<Tuple3<Double, StringValue, LongValue>>()); Plan p = env.createProgramPlan(); GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next(); // currently distinct is translated to a Reduce ReduceOperatorBase<?, ?> reducer = (ReduceOperatorBase<?, ?>) sink.getInput(); // check types assertEquals(initialData.getType(), reducer.getOperatorInfo().getInputType()); assertEquals(initialData.getType(), reducer.getOperatorInfo().getOutputType()); // check keys assertArrayEquals(new int[] {1, 2}, reducer.getKeyColumns(0)); // parallelism was not configured on the operator assertTrue(reducer.getParallelism() == 1 || reducer.getParallelism() == -1); assertTrue(reducer.getInput() instanceof GenericDataSourceBase<?, ?>); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); fail("Test caused an error: " + e.getMessage()); } } @Test public void translateDistinctKeySelector() { try { final int parallelism = 8; ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism); DataSet<Tuple3<Double, StringValue, LongValue>> initialData = getSourceDataSet(env); initialData.distinct(new KeySelector<Tuple3<Double,StringValue,LongValue>, StringValue>() { public StringValue getKey(Tuple3<Double, StringValue, LongValue> value) { return value.f1; } }).setParallelism(4).output(new DiscardingOutputFormat<Tuple3<Double, StringValue, LongValue>>()); Plan p = env.createProgramPlan(); GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next(); MapOperatorBase<?, ?, ?> keyRemover = (MapOperatorBase<?, ?, ?>) sink.getInput(); PlanUnwrappingReduceOperator<?, ?> reducer = (PlanUnwrappingReduceOperator<?, ?>) keyRemover.getInput(); MapOperatorBase<?, ?, ?> keyExtractor = (MapOperatorBase<?, ?, ?>) reducer.getInput(); // check the parallelisms assertEquals(1, keyExtractor.getParallelism()); assertEquals(4, reducer.getParallelism()); // check types TypeInformation<?> keyValueInfo = new TupleTypeInfo<Tuple2<StringValue, Tuple3<Double,StringValue,LongValue>>>( new ValueTypeInfo<StringValue>(StringValue.class), initialData.getType()); assertEquals(initialData.getType(), keyExtractor.getOperatorInfo().getInputType()); assertEquals(keyValueInfo, keyExtractor.getOperatorInfo().getOutputType()); assertEquals(keyValueInfo, reducer.getOperatorInfo().getInputType()); assertEquals(keyValueInfo, reducer.getOperatorInfo().getOutputType()); assertEquals(keyValueInfo, keyRemover.getOperatorInfo().getInputType()); assertEquals(initialData.getType(), keyRemover.getOperatorInfo().getOutputType()); // check keys assertEquals(KeyExtractingMapper.class, keyExtractor.getUserCodeWrapper().getUserCodeClass()); assertTrue(keyExtractor.getInput() instanceof GenericDataSourceBase<?, ?>); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); fail("Test caused an error: " + e.getMessage()); } } @Test public void translateDistinctExpressionKey() { try { final int parallelism = 8; ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism); DataSet<CustomType> initialData = getSourcePojoDataSet(env); initialData.distinct("myInt").output(new DiscardingOutputFormat<CustomType>()); Plan p = env.createProgramPlan(); GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next(); // currently distinct is translated to a Reduce ReduceOperatorBase<?, ?> reducer = (ReduceOperatorBase<?, ?>) sink.getInput(); // check types assertEquals(initialData.getType(), reducer.getOperatorInfo().getInputType()); assertEquals(initialData.getType(), reducer.getOperatorInfo().getOutputType()); // check keys assertArrayEquals(new int[] {0}, reducer.getKeyColumns(0)); // parallelism was not configured on the operator assertTrue(reducer.getParallelism() == 1 || reducer.getParallelism() == -1); assertTrue(reducer.getInput() instanceof GenericDataSourceBase<?, ?>); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); fail("Test caused an error: " + e.getMessage()); } } @SuppressWarnings("unchecked") private static final DataSet<Tuple3<Double, StringValue, LongValue>> getSourceDataSet(ExecutionEnvironment env) { return env.fromElements(new Tuple3<Double, StringValue, LongValue>(3.141592, new StringValue("foobar"), new LongValue(77))) .setParallelism(1); } private static DataSet<CustomType> getSourcePojoDataSet(ExecutionEnvironment env) { List<CustomType> data = new ArrayList<CustomType>(); data.add(new CustomType(1)); return env.fromCollection(data); } public static class CustomType implements Serializable { private static final long serialVersionUID = 1L; public int myInt; public CustomType() {} public CustomType(int i) { myInt = i; } @Override public String toString() { return ""+myInt; } } }