/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.test.hadoop.mapred;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.test.util.JavaProgramTestBase;
import org.apache.flink.test.util.TestBaseUtils;
import org.apache.flink.util.OperatingSystem;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.junit.Assume;
import org.junit.Before;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.LinkedList;

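/**
 * Integration tests for reading Hadoop SequenceFiles into Flink DataSets via the
 * mapred {@link HadoopInputFormat} wrapper.
 */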
@RunWith(Parameterized.class)
public class HadoopIOFormatsITCase extends JavaProgramTestBase {
private static final int NUM_PROGRAMS = 2;
private int curProgId = config.getInteger("ProgramId", -1);
private String[] resultPath;
private String[] expectedResult;
private String sequenceFileInPath;
private String sequenceFileInPathNull;
public HadoopIOFormatsITCase(Configuration config) {
super(config);
}
@Before
public void checkOperatingSystem() {
// FLINK-5164 - see https://wiki.apache.org/hadoop/WindowsProblems
Assume.assumeTrue("This test can't run successfully on Windows.", !OperatingSystem.isWindows());
}
@Override
protected void preSubmit() throws Exception {
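// Prepare the temporary result directories and the two input SequenceFiles.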
resultPath = new String[] {getTempDirPath("result0"), getTempDirPath("result1")};
File sequenceFile = createAndRegisterTempFile("seqFile");
sequenceFileInPath = sequenceFile.toURI().toString();
// Create a sequence file
org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
FileSystem fs = FileSystem.get(URI.create(sequenceFile.getAbsolutePath()), conf);
Path path = new Path(sequenceFile.getAbsolutePath());
// ------------------ Long / Text Key Value pair: ------------
int kvCount = 4;
LongWritable key = new LongWritable();
Text value = new Text();
SequenceFile.Writer writer = null;
try {
writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
for (int i = 0; i < kvCount; i++) {
if (i == 1) {
// write key = 1 a bit more often.
for (int a = 0; a < 15; a++) {
key.set(i);
value.set(i + " - somestring");
writer.append(key, value);
}
}
key.set(i);
value.set(i + " - somestring");
writer.append(key, value);
}
} finally {
IOUtils.closeStream(writer);
}
// ------------------ NullWritable Key / Long Value pair: ------------
File sequenceFileNull = createAndRegisterTempFile("seqFileNullKey");
sequenceFileInPathNull = sequenceFileNull.toURI().toString();
path = new Path(sequenceFileInPathNull);
LongWritable value1 = new LongWritable();
SequenceFile.Writer writer1 = null;
try {
writer1 = SequenceFile.createWriter(fs, conf, path, NullWritable.class, value1.getClass());
for (int i = 0; i < kvCount; i++) {
value1.set(i);
writer1.append(NullWritable.get(), value1);
}
} finally {
IOUtils.closeStream(writer1);
}
}
@Override
protected void testProgram() throws Exception {
expectedResult = HadoopIOFormatPrograms.runProgram(curProgId, resultPath, sequenceFileInPath, sequenceFileInPathNull);
}
@Override
protected void postSubmit() throws Exception {
for (int i = 0; i < resultPath.length; i++) {
compareResultsByLinesInMemory(expectedResult[i], resultPath[i]);
}
}
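// Create one configuration per test program so that each program runs as a separate parameterized test.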
@Parameters
public static Collection<Object[]> getConfigurations() throws FileNotFoundException, IOException {
LinkedList<Configuration> tConfigs = new LinkedList<Configuration>();
for (int i = 1; i <= NUM_PROGRAMS; i++) {
Configuration config = new Configuration();
config.setInteger("ProgramId", i);
tConfigs.add(config);
}
return TestBaseUtils.toParameterList(tConfigs);
}
public static class HadoopIOFormatPrograms {
public static String[] runProgram(int progId, String[] resultPath, String sequenceFileInPath, String sequenceFileInPathNull) throws Exception {
switch (progId) {
case 1: {
/*
 * Test reading the sequence file, including access to the key.
 */
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
SequenceFileInputFormat<LongWritable, Text> sfif = new SequenceFileInputFormat<LongWritable, Text>();
JobConf hdconf = new JobConf();
SequenceFileInputFormat.addInputPath(hdconf, new Path(sequenceFileInPath));
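// Wrap the Hadoop SequenceFileInputFormat so it can be used as a Flink InputFormat.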
HadoopInputFormat<LongWritable, Text> hif = new HadoopInputFormat<LongWritable, Text>(sfif, LongWritable.class, Text.class, hdconf);
DataSet<Tuple2<LongWritable, Text>> ds = env.createInput(hif);
DataSet<Tuple2<Long, Text>> summed = ds.map(new MapFunction<Tuple2<LongWritable, Text>, Tuple2<Long, Text>>() {
@Override
public Tuple2<Long, Text> map(Tuple2<LongWritable, Text> value) throws Exception {
return new Tuple2<Long, Text>(value.f0.get(), value.f1);
}
}).sum(0);
summed.writeAsText(resultPath[0]);
DataSet<String> res = ds.distinct(0).map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
@Override
public String map(Tuple2<LongWritable, Text> value) throws Exception {
return value.f1 + " - " + value.f0.get();
}
});
res.writeAsText(resultPath[1]);
env.execute();
// return expected result
return new String [] {"(21,3 - somestring)", "0 - somestring - 0\n" +
"1 - somestring - 1\n" +
"2 - somestring - 2\n" +
"3 - somestring - 3\n"};
}
case 2: {
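/*
 * Test reading the sequence file with NullWritable keys and LongWritable values.
 */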
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
SequenceFileInputFormat<NullWritable, LongWritable> sfif = new SequenceFileInputFormat<NullWritable, LongWritable>();
JobConf hdconf = new JobConf();
SequenceFileInputFormat.addInputPath(hdconf, new Path(sequenceFileInPathNull));
HadoopInputFormat<NullWritable, LongWritable> hif = new HadoopInputFormat<NullWritable, LongWritable>(sfif, NullWritable.class, LongWritable.class, hdconf);
DataSet<Tuple2<NullWritable, LongWritable>> ds = env.createInput(hif);
DataSet<Tuple2<Void, Long>> res = ds.map(new MapFunction<Tuple2<NullWritable, LongWritable>, Tuple2<Void, Long>>() {
@Override
public Tuple2<Void, Long> map(Tuple2<NullWritable, LongWritable> value) throws Exception {
return new Tuple2<Void, Long>(null, value.f1.get());
}
});
DataSet<Tuple2<Void, Long>> res1 = res.groupBy(1).sum(1);
res1.writeAsText(resultPath[1]);
res.writeAsText(resultPath[0]);
env.execute();
// return expected result
return new String [] {"(null,2)\n" +
"(null,0)\n" +
"(null,1)\n" +
"(null,3)",
"(null,0)\n" +
"(null,1)\n" +
"(null,2)\n" +
"(null,3)"};
}
default:
throw new IllegalArgumentException("Invalid program id");
}
}
}
}