/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.junit.Before;
import org.junit.Test;
import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.example.ExampleOutputFormat;
import parquet.hadoop.example.GroupReadSupport;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.mapred.Container;
import parquet.hadoop.mapred.DeprecatedParquetInputFormat;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.util.ContextUtil;
import parquet.schema.MessageTypeParser;
import java.io.IOException;
import static java.lang.Thread.sleep;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * DeprecatedParquetInputFormat is used by Cascading. It initializes its record reader via an
 * initialize method whose parameters differ from those of ParquetInputFormat.
 *
 * @author Tianshuo Deng
 */
public class DeprecatedInputFormatTest {
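  // The input is a checked-in Java source file from this project, used simply as arbitrary text input.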
final Path parquetPath = new Path("target/test/example/TestInputOutputFormat/parquet");
final Path inputPath = new Path("src/test/java/parquet/hadoop/example/TestInputOutputFormat.java");
final Path outputPath = new Path("target/test/example/TestInputOutputFormat/out");
Job writeJob;
JobConf jobConf;
RunningJob mapRedJob;
private String writeSchema;
private String readSchema;
private Configuration conf;
@Before
public void setUp() {
conf = new Configuration();
jobConf = new JobConf();
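    // Write and read schemas are identical here, so no column projection is exercised.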
writeSchema = "message example {\n" +
"required int32 line;\n" +
"required binary content;\n" +
"}";
readSchema = "message example {\n" +
"required int32 line;\n" +
"required binary content;\n" +
"}";
}
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {
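    // Remove any output left over from a previous run.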
final FileSystem fileSystem = parquetPath.getFileSystem(conf);
fileSystem.delete(parquetPath, true);
fileSystem.delete(outputPath, true);
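
    // Write phase: a map-only job using the new mapreduce API converts the text input into a Parquet file.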
{
writeJob = new Job(conf, "write");
TextInputFormat.addInputPath(writeJob, inputPath);
writeJob.setInputFormatClass(TextInputFormat.class);
writeJob.setNumReduceTasks(0);
ExampleOutputFormat.setCompression(writeJob, codec);
ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
writeJob.setOutputFormatClass(ExampleOutputFormat.class);
writeJob.setMapperClass(ReadMapper.class);
ExampleOutputFormat.setSchema(
writeJob,
MessageTypeParser.parseMessageType(
writeSchema));
writeJob.submit();
waitForJob(writeJob);
}
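
    // Read phase: a map-only job using the deprecated mapred API reads the Parquet file back and writes plain text.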
{
jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
jobConf.setInputFormat(MyDeprecatedInputFormat.class);
MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
jobConf.setMapperClass(DeprecatedWriteMapper.class);
jobConf.setNumReduceTasks(0);
mapRedJob = JobClient.runJob(jobConf);
}
}
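
  /**
   * With the benchmark counters enabled (the default), the parquet counter group should report
   * nonzero bytes read, matching the total bytes.
   */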
@Test
public void testReadWriteWithCountDeprecated() throws Exception {
runMapReduceJob(CompressionCodecName.GZIP);
    Counters.Group parquetCounters = mapRedJob.getCounters().getGroup("parquet");
    assertTrue(parquetCounters.getCounterForName("bytesread").getValue() > 0L);
    assertTrue(parquetCounters.getCounterForName("bytestotal").getValue() > 0L);
    assertEquals(parquetCounters.getCounterForName("bytestotal").getValue(),
        parquetCounters.getCounterForName("bytesread").getValue());
    // not asserting on the time-read counter, since it can legitimately be zero when the input is this small
}
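
  /**
   * With the benchmark counters disabled via configuration, all parquet counters should stay at zero.
   */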
@Test
public void testReadWriteWithoutCounter() throws Exception {
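    // Turn off all three benchmark counters before running the job.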
jobConf.set("parquet.benchmark.time.read", "false");
jobConf.set("parquet.benchmark.bytes.total", "false");
jobConf.set("parquet.benchmark.bytes.read", "false");
runMapReduceJob(CompressionCodecName.GZIP);
    Counters.Group parquetCounters = mapRedJob.getCounters().getGroup("parquet");
    assertEquals(0L, parquetCounters.getCounterForName("bytesread").getValue());
    assertEquals(0L, parquetCounters.getCounterForName("bytestotal").getValue());
    assertEquals(0L, parquetCounters.getCounterForName("timeread").getValue());
}
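
  // Polls a mapreduce-API job until it completes, failing fast if it did not succeed.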
private void waitForJob(Job job) throws InterruptedException, IOException {
while (!job.isComplete()) {
System.out.println("waiting for job " + job.getJobName());
sleep(100);
}
System.out.println("status for job " + job.getJobName() + ": " + (job.isSuccessful() ? "SUCCESS" : "FAILURE"));
if (!job.isSuccessful()) {
throw new RuntimeException("job failed " + job.getJobName());
}
}
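
  /**
   * Writes one Group per input line: the byte offset of the line (narrowed to int) becomes the
   * "line" field and the line text becomes the "content" field.
   */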
public static class ReadMapper extends Mapper<LongWritable, Text, Void, Group> {
private SimpleGroupFactory factory;
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
}
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Group group = factory.newGroup()
.append("line", (int) key.get())
.append("content", value.toString());
context.write(null, group);
}
}
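
  /**
   * Old-API mapper that unwraps each Container produced by DeprecatedParquetInputFormat and emits
   * the group's fields back out as a (line, content) text pair.
   */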
public static class DeprecatedWriteMapper implements org.apache.hadoop.mapred.Mapper<Void, Container<Group>, LongWritable, Text> {
    @Override
    public void map(Void aVoid, Container<Group> valueContainer, OutputCollector<LongWritable, Text> collector, Reporter reporter) throws IOException {
      Group value = valueContainer.get();
      collector.collect(new LongWritable(value.getInteger("line", 0)), new Text(value.getString("content", 0)));
    }

    @Override
    public void close() throws IOException {
      // no resources to release
    }

    @Override
    public void configure(JobConf entries) {
      // no configuration needed
    }
}
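
  /**
   * Concrete subclass that fixes the value type to Group, since JobConf.setInputFormat needs a
   * class literal rather than a parameterized type.
   */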
static class MyDeprecatedInputFormat extends DeprecatedParquetInputFormat<Group> {
}
}