/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.test.hadoop.mapred;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.test.util.JavaProgramTestBase;
import org.apache.flink.test.util.TestBaseUtils;
import org.apache.flink.util.OperatingSystem;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.junit.Assume;
import org.junit.Before;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.LinkedList;

/**
 * Integration tests for the mapred {@link HadoopInputFormat} wrapper, reading
 * SequenceFiles with LongWritable/Text and NullWritable/LongWritable records.
 */
@RunWith(Parameterized.class)
public class HadoopIOFormatsITCase extends JavaProgramTestBase {

	private static final int NUM_PROGRAMS = 2;

	private int curProgId = config.getInteger("ProgramId", -1);
	private String[] resultPath;
	private String[] expectedResult;
	private String sequenceFileInPath;
	private String sequenceFileInPathNull;

	public HadoopIOFormatsITCase(Configuration config) {
		super(config);
	}

	@Before
	public void checkOperatingSystem() {
		// FLINK-5164 - see https://wiki.apache.org/hadoop/WindowsProblems
		Assume.assumeTrue("This test can't run successfully on Windows.", !OperatingSystem.isWindows());
	}

	@Override
	protected void preSubmit() throws Exception {
		resultPath = new String[] {getTempDirPath("result0"), getTempDirPath("result1")};

		File sequenceFile = createAndRegisterTempFile("seqFile");
		sequenceFileInPath = sequenceFile.toURI().toString();

		// Create a sequence file
		org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
		FileSystem fs = FileSystem.get(URI.create(sequenceFile.getAbsolutePath()), conf);
		Path path = new Path(sequenceFile.getAbsolutePath());

		// ------------------ LongWritable / Text key-value pairs: ------------

		int kvCount = 4;

		LongWritable key = new LongWritable();
		Text value = new Text();
		SequenceFile.Writer writer = null;
		try {
			writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
			for (int i = 0; i < kvCount; i++) {
				if (i == 1) {
					// write key = 1 a bit more often.
					for (int a = 0; a < 15; a++) {
						key.set(i);
						value.set(i + " - somestring");
						writer.append(key, value);
					}
				}
				key.set(i);
				value.set(i + " - somestring");
				writer.append(key, value);
			}
		} finally {
			IOUtils.closeStream(writer);
		}

		// ------------------ NullWritable key / LongWritable value pairs: ------------

		File sequenceFileNull = createAndRegisterTempFile("seqFileNullKey");
		sequenceFileInPathNull = sequenceFileNull.toURI().toString();
		path = new Path(sequenceFileInPathNull);

		LongWritable value1 = new LongWritable();
		SequenceFile.Writer writer1 = null;
		try {
			writer1 = SequenceFile.createWriter(fs, conf, path, NullWritable.class, value1.getClass());
			for (int i = 0; i < kvCount; i++) {
				value1.set(i);
				writer1.append(NullWritable.get(), value1);
			}
		} finally {
			IOUtils.closeStream(writer1);
		}
	}

	@Override
	protected void testProgram() throws Exception {
		expectedResult = HadoopIOFormatPrograms.runProgram(curProgId, resultPath, sequenceFileInPath, sequenceFileInPathNull);
	}

	@Override
	protected void postSubmit() throws Exception {
		for (int i = 0; i < resultPath.length; i++) {
			compareResultsByLinesInMemory(expectedResult[i], resultPath[i]);
		}
	}

	@Parameters
	public static Collection<Object[]> getConfigurations() throws FileNotFoundException, IOException {
		LinkedList<Configuration> tConfigs = new LinkedList<Configuration>();

		for (int i = 1; i <= NUM_PROGRAMS; i++) {
			Configuration config = new Configuration();
			config.setInteger("ProgramId", i);
			tConfigs.add(config);
		}

		return TestBaseUtils.toParameterList(tConfigs);
	}

	public static class HadoopIOFormatPrograms {

		public static String[] runProgram(int progId, String[] resultPath, String sequenceFileInPath, String sequenceFileInPathNull) throws Exception {

			switch (progId) {
			case 1: {
				// Test the sequence file with LongWritable keys, including key access.
				final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

				SequenceFileInputFormat<LongWritable, Text> sfif = new SequenceFileInputFormat<LongWritable, Text>();
				JobConf hdconf = new JobConf();
				SequenceFileInputFormat.addInputPath(hdconf, new Path(sequenceFileInPath));
				HadoopInputFormat<LongWritable, Text> hif = new HadoopInputFormat<LongWritable, Text>(sfif, LongWritable.class, Text.class, hdconf);
				DataSet<Tuple2<LongWritable, Text>> ds = env.createInput(hif);

				// sum the keys; key = 1 occurs 16 times, so the sum is 0 + 16 * 1 + 2 + 3 = 21
				DataSet<Tuple2<Long, Text>> summed = ds.map(new MapFunction<Tuple2<LongWritable, Text>, Tuple2<Long, Text>>() {
					@Override
					public Tuple2<Long, Text> map(Tuple2<LongWritable, Text> value) throws Exception {
						return new Tuple2<Long, Text>(value.f0.get(), value.f1);
					}
				}).sum(0);
				summed.writeAsText(resultPath[0]);

				DataSet<String> res = ds.distinct(0).map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
					@Override
					public String map(Tuple2<LongWritable, Text> value) throws Exception {
						return value.f1 + " - " + value.f0.get();
					}
				});
				res.writeAsText(resultPath[1]);
				env.execute();

				// return expected result
				return new String[] {"(21,3 - somestring)", "0 - somestring - 0\n" +
						"1 - somestring - 1\n" +
						"2 - somestring - 2\n" +
						"3 - somestring - 3\n"};
			}
			case 2: {
				// Test the sequence file with NullWritable keys.
				final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

				SequenceFileInputFormat<NullWritable, LongWritable> sfif = new SequenceFileInputFormat<NullWritable, LongWritable>();
				JobConf hdconf = new JobConf();
				SequenceFileInputFormat.addInputPath(hdconf, new Path(sequenceFileInPathNull));
				HadoopInputFormat<NullWritable, LongWritable> hif = new HadoopInputFormat<NullWritable, LongWritable>(sfif, NullWritable.class, LongWritable.class, hdconf);
				DataSet<Tuple2<NullWritable, LongWritable>> ds = env.createInput(hif);
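
				// Replace each NullWritable key with a Java null so the tuples can be written out as (null, value).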
				DataSet<Tuple2<Void, Long>> res = ds.map(new MapFunction<Tuple2<NullWritable, LongWritable>, Tuple2<Void, Long>>() {
					@Override
					public Tuple2<Void, Long> map(Tuple2<NullWritable, LongWritable> value) throws Exception {
						return new Tuple2<Void, Long>(null, value.f1.get());
					}
				});

				DataSet<Tuple2<Void, Long>> res1 = res.groupBy(1).sum(1);
				res1.writeAsText(resultPath[1]);
				res.writeAsText(resultPath[0]);
				env.execute();

				// return expected result
				return new String[] {"(null,2)\n" +
						"(null,0)\n" +
						"(null,1)\n" +
						"(null,3)", "(null,0)\n" +
						"(null,1)\n" +
						"(null,2)\n" +
						"(null,3)"};
			}
			default:
				throw new IllegalArgumentException("Invalid program id");
			}
		}
	}
}