/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.junit.Before;
import org.junit.Test;
import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.example.ExampleOutputFormat;
import parquet.hadoop.example.GroupReadSupport;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.mapred.Container;
import parquet.hadoop.mapred.DeprecatedParquetInputFormat;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.util.ContextUtil;
import parquet.schema.MessageTypeParser;
import java.io.IOException;
import static java.lang.Thread.sleep;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * DeprecatedParquetInputFormat is used by Cascading. It initializes its record reader via an
 * initialize method whose parameters differ from those of ParquetInputFormat.
 *
 * @author Tianshuo Deng
 */
public class DeprecatedInputFormatTest {
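  // The input is a checked-in Java source file from this project, used simply as arbitrary text input.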
final Path parquetPath = new Path("target/test/example/TestInputOutputFormat/parquet");
final Path inputPath = new Path("src/test/java/parquet/hadoop/example/TestInputOutputFormat.java");
final Path outputPath = new Path("target/test/example/TestInputOutputFormat/out");
Job writeJob;
JobConf jobConf;
RunningJob mapRedJob;
private String writeSchema;
private String readSchema;
private Configuration conf;
@Before
public void setUp() {
conf = new Configuration();
jobConf = new JobConf();
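    // Write and read schemas are identical here, so no column projection is exercised.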
writeSchema = "message example {\n" +
"required int32 line;\n" +
"required binary content;\n" +
"}";
readSchema = "message example {\n" +
"required int32 line;\n" +
"required binary content;\n" +
"}";
}
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {
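    // Remove any output left over from a previous run.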
final FileSystem fileSystem = parquetPath.getFileSystem(conf);
fileSystem.delete(parquetPath, true);
fileSystem.delete(outputPath, true);
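
    // Write phase: a map-only job using the new mapreduce API converts the text input into a Parquet file.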
{
writeJob = new Job(conf, "write");
TextInputFormat.addInputPath(writeJob, inputPath);
writeJob.setInputFormatClass(TextInputFormat.class);
writeJob.setNumReduceTasks(0);
ExampleOutputFormat.setCompression(writeJob, codec);
ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
writeJob.setOutputFormatClass(ExampleOutputFormat.class);
writeJob.setMapperClass(ReadMapper.class);
ExampleOutputFormat.setSchema(
writeJob,
MessageTypeParser.parseMessageType(
writeSchema));
writeJob.submit();
waitForJob(writeJob);
}
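
    // Read phase: a map-only job using the deprecated mapred API reads the Parquet file back and writes plain text.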
{
jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
jobConf.setInputFormat(MyDeprecatedInputFormat.class);
MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
jobConf.setMapperClass(DeprecatedWriteMapper.class);
jobConf.setNumReduceTasks(0);
mapRedJob = JobClient.runJob(jobConf);
}
}
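
  /**
   * With the benchmark counters enabled (the default), the parquet counter group should report
   * nonzero bytes read, matching the total bytes.
   */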
@Test
public void testReadWriteWithCountDeprecated() throws Exception {
runMapReduceJob(CompressionCodecName.GZIP);
    Counters.Group parquetCounters = mapRedJob.getCounters().getGroup("parquet");
    assertTrue(parquetCounters.getCounterForName("bytesread").getValue() > 0L);
    assertTrue(parquetCounters.getCounterForName("bytestotal").getValue() > 0L);
    assertEquals(parquetCounters.getCounterForName("bytestotal").getValue(),
        parquetCounters.getCounterForName("bytesread").getValue());
    // not asserting on the time-read counter, since it can legitimately be zero when the input is this small
}
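
  /**
   * With the benchmark counters disabled via configuration, all parquet counters should stay at zero.
   */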
@Test
public void testReadWriteWithoutCounter() throws Exception {
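    // Turn off all three benchmark counters before running the job.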
jobConf.set("parquet.benchmark.time.read", "false");
jobConf.set("parquet.benchmark.bytes.total", "false");
jobConf.set("parquet.benchmark.bytes.read", "false");
runMapReduceJob(CompressionCodecName.GZIP);
    Counters.Group parquetCounters = mapRedJob.getCounters().getGroup("parquet");
    assertEquals(0L, parquetCounters.getCounterForName("bytesread").getValue());
    assertEquals(0L, parquetCounters.getCounterForName("bytestotal").getValue());
    assertEquals(0L, parquetCounters.getCounterForName("timeread").getValue());
}
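
  // Polls a mapreduce-API job until it completes, failing fast if it did not succeed.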
private void waitForJob(Job job) throws InterruptedException, IOException {
while (!job.isComplete()) {
System.out.println("waiting for job " + job.getJobName());
sleep(100);
}
System.out.println("status for job " + job.getJobName() + ": " + (job.isSuccessful() ? "SUCCESS" : "FAILURE"));
if (!job.isSuccessful()) {
throw new RuntimeException("job failed " + job.getJobName());
}
}
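
  /**
   * Writes one Group per input line: the byte offset of the line (narrowed to int) becomes the
   * "line" field and the line text becomes the "content" field.
   */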
public static class ReadMapper extends Mapper<LongWritable, Text, Void, Group> {
private SimpleGroupFactory factory;
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
}
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Group group = factory.newGroup()
.append("line", (int) key.get())
.append("content", value.toString());
context.write(null, group);
}
}
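
  /**
   * Old-API mapper that unwraps each Container produced by DeprecatedParquetInputFormat and emits
   * the group's fields back out as a (line, content) text pair.
   */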
public static class DeprecatedWriteMapper implements org.apache.hadoop.mapred.Mapper<Void, Container<Group>, LongWritable, Text> {
    @Override
    public void map(Void aVoid, Container<Group> valueContainer, OutputCollector<LongWritable, Text> collector, Reporter reporter) throws IOException {
      Group value = valueContainer.get();
      collector.collect(new LongWritable(value.getInteger("line", 0)), new Text(value.getString("content", 0)));
    }

    @Override
    public void close() throws IOException {
      // no resources to release
    }

    @Override
    public void configure(JobConf entries) {
      // no configuration needed
    }
}
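
  /**
   * Concrete subclass that fixes the value type to Group, since JobConf.setInputFormat needs a
   * class literal rather than a parameterized type.
   */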
static class MyDeprecatedInputFormat extends DeprecatedParquetInputFormat<Group> {
}
}