/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.test.javaApiOperators;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.MapPartitionOperator;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.DataSetUtils;
import org.apache.flink.test.javaApiOperators.util.CollectionDataSets;
import org.apache.flink.test.util.MultipleProgramsTestBase;
import org.apache.flink.util.Collector;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import java.util.List;
import java.util.Random;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Integration tests for {@link DataSetUtils#sample} and {@link DataSetUtils#sampleWithSize},
 * run once per {@code TestExecutionMode} through the JUnit {@link Parameterized} runner.
 *
 * <p>Each case samples the string field of the 3-tuple test data set and passes the collected
 * sample, together with the full list of expected source strings, to
 * {@code containsResultAsText}. The fixed-size cases additionally assert the sample size.
 */
@SuppressWarnings("serial")
@RunWith(Parameterized.class)
public class SampleITCase extends MultipleProgramsTestBase {

    /** Source of the sampler seeds; unseeded, so every run draws different seeds. */
    private static final Random RNG = new Random();

    public SampleITCase(TestExecutionMode mode) {
        super(mode);
    }

    /**
     * Sets the parallelism to 5 on the environment returned by
     * {@link ExecutionEnvironment#getExecutionEnvironment()} before each test.
     */
    @Before
    public void initiate() {
        // NOTE(review): the verify helpers below obtain their environment with a separate
        // getExecutionEnvironment() call; whether this setting reaches them depends on that
        // call handing back shared state -- confirm.
        ExecutionEnvironment.getExecutionEnvironment().setParallelism(5);
    }

    /** Fraction-based sampling without replacement for fractions 0, 0.2 and 1.0. */
    @Test
    public void testSamplerWithFractionWithoutReplacement() throws Exception {
        final double[] fractions = {0d, 0.2d, 1.0d};
        for (double fraction : fractions) {
            verifySamplerWithFractionWithoutReplacement(fraction);
        }
    }

    /** Fraction-based sampling with replacement for fractions 0, 0.2, 1.0 and 2.0. */
    @Test
    public void testSamplerWithFractionWithReplacement() throws Exception {
        final double[] fractions = {0d, 0.2d, 1.0d, 2.0d};
        for (double fraction : fractions) {
            verifySamplerWithFractionWithReplacement(fraction);
        }
    }

    /** Fixed-size sampling without replacement for sizes 0, 2 and 21. */
    @Test
    public void testSamplerWithSizeWithoutReplacement() throws Exception {
        final int[] sizes = {0, 2, 21};
        for (int size : sizes) {
            verifySamplerWithFixedSizeWithoutReplacement(size);
        }
    }

    /** Fixed-size sampling with replacement for sizes 0, 2 and 21. */
    @Test
    public void testSamplerWithSizeWithReplacement() throws Exception {
        final int[] sizes = {0, 2, 21};
        for (int size : sizes) {
            verifySamplerWithFixedSizeWithReplacement(size);
        }
    }

    /** Same as the two-argument overload, with a seed drawn from {@link #RNG}. */
    private void verifySamplerWithFractionWithoutReplacement(double fraction) throws Exception {
        final long randomSeed = RNG.nextLong();
        verifySamplerWithFractionWithoutReplacement(fraction, randomSeed);
    }

    /** Verifies fraction-based sampling without replacement for the given seed. */
    private void verifySamplerWithFractionWithoutReplacement(double fraction, long seed) throws Exception {
        verifySamplerWithFraction(false, fraction, seed);
    }

    /** Same as the two-argument overload, with a seed drawn from {@link #RNG}. */
    private void verifySamplerWithFractionWithReplacement(double fraction) throws Exception {
        final long randomSeed = RNG.nextLong();
        verifySamplerWithFractionWithReplacement(fraction, randomSeed);
    }

    /** Verifies fraction-based sampling with replacement for the given seed. */
    private void verifySamplerWithFractionWithReplacement(double fraction, long seed) throws Exception {
        verifySamplerWithFraction(true, fraction, seed);
    }

    /**
     * Samples the source strings by fraction, collects the sample and hands it to
     * {@code containsResultAsText} together with the expected source strings.
     *
     * @param withReplacement forwarded to {@link DataSetUtils#sample}
     * @param fraction forwarded to {@link DataSetUtils#sample}
     * @param seed forwarded to {@link DataSetUtils#sample}
     */
    private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
        final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
        FlatMapOperator<Tuple3<Integer, Long, String>, String> sourceStrings = getSourceDataSet(environment);

        MapPartitionOperator<String, String> sampleOperator =
            DataSetUtils.sample(sourceStrings, withReplacement, fraction, seed);
        List<String> sampledStrings = sampleOperator.collect();

        // No size assertion here: only the contents of the sample are checked.
        containsResultAsText(sampledStrings, getSourceStrings());
    }

    /** Same as the two-argument overload, with a seed drawn from {@link #RNG}. */
    private void verifySamplerWithFixedSizeWithoutReplacement(int numSamples) throws Exception {
        final long randomSeed = RNG.nextLong();
        verifySamplerWithFixedSizeWithoutReplacement(numSamples, randomSeed);
    }

    /** Verifies fixed-size sampling without replacement for the given seed. */
    private void verifySamplerWithFixedSizeWithoutReplacement(int numSamples, long seed) throws Exception {
        verifySamplerWithFixedSize(false, numSamples, seed);
    }

    /** Same as the two-argument overload, with a seed drawn from {@link #RNG}. */
    private void verifySamplerWithFixedSizeWithReplacement(int numSamples) throws Exception {
        final long randomSeed = RNG.nextLong();
        verifySamplerWithFixedSizeWithReplacement(numSamples, randomSeed);
    }

    /** Verifies fixed-size sampling with replacement for the given seed. */
    private void verifySamplerWithFixedSizeWithReplacement(int numSamples, long seed) throws Exception {
        verifySamplerWithFixedSize(true, numSamples, seed);
    }

    /**
     * Samples a fixed number of source strings, asserts that exactly {@code numSamples}
     * elements were collected, and hands the sample to {@code containsResultAsText}
     * together with the expected source strings.
     *
     * @param withReplacement forwarded to {@link DataSetUtils#sampleWithSize}
     * @param numSamples requested and expected sample size
     * @param seed forwarded to {@link DataSetUtils#sampleWithSize}
     */
    private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
        final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
        FlatMapOperator<Tuple3<Integer, Long, String>, String> sourceStrings = getSourceDataSet(environment);

        DataSet<String> sampleSet =
            DataSetUtils.sampleWithSize(sourceStrings, withReplacement, numSamples, seed);
        List<String> sampledStrings = sampleSet.collect();

        assertEquals(numSamples, sampledStrings.size());
        containsResultAsText(sampledStrings, getSourceStrings());
    }

    /**
     * Builds the data set under test: the third field ({@code f2}) of every tuple produced by
     * {@link CollectionDataSets#get3TupleDataSet}.
     */
    private FlatMapOperator<Tuple3<Integer, Long, String>, String> getSourceDataSet(ExecutionEnvironment env) {
        FlatMapFunction<Tuple3<Integer, Long, String>, String> extractText =
            new FlatMapFunction<Tuple3<Integer, Long, String>, String>() {
                @Override
                public void flatMap(Tuple3<Integer, Long, String> tuple, Collector<String> collector)
                        throws Exception {
                    collector.collect(tuple.f2);
                }
            };
        return CollectionDataSets.get3TupleDataSet(env).flatMap(extractText);
    }

    /** Returns the expected source strings, one per line, each terminated by a newline. */
    private String getSourceStrings() {
        return "Hi\n"
            + "Hello\n"
            + "Hello world\n"
            + "Hello world, how are you?\n"
            + "I am fine.\n"
            + "Luke Skywalker\n"
            + "Comment#1\n"
            + "Comment#2\n"
            + "Comment#3\n"
            + "Comment#4\n"
            + "Comment#5\n"
            + "Comment#6\n"
            + "Comment#7\n"
            + "Comment#8\n"
            + "Comment#9\n"
            + "Comment#10\n"
            + "Comment#11\n"
            + "Comment#12\n"
            + "Comment#13\n"
            + "Comment#14\n"
            + "Comment#15\n";
    }
}