/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop.example;

import static java.lang.Thread.sleep;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Method;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Before;
import org.junit.Test;

import parquet.Log;
import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.hadoop.ParquetInputFormat;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.api.DelegatingReadSupport;
import parquet.hadoop.api.DelegatingWriteSupport;
import parquet.hadoop.api.InitContext;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.util.ContextUtil;
import parquet.schema.MessageTypeParser;

/**
 * Round-trips this test's own source file through Parquet with MapReduce:
 * a write job turns each text line into a Group, a read job turns the
 * Groups back into text, and the output is compared line by line with the
 * original input.
 */
public class TestInputOutputFormat {
  private static final Log LOG = Log.getLog(TestInputOutputFormat.class);

  final Path parquetPath = new Path("target/test/example/TestInputOutputFormat/parquet");
  final Path inputPath = new Path("src/test/java/parquet/hadoop/example/TestInputOutputFormat.java");
  final Path outputPath = new Path("target/test/example/TestInputOutputFormat/out");

  Job writeJob;
  Job readJob;
  private String writeSchema;
  private String readSchema;
  private String partialSchema;
  private Configuration conf;

  private Class<? extends Mapper<?, ?, ?, ?>> readMapperClass;
  private Class<? extends Mapper<?, ?, ?, ?>> writeMapperClass;
  @Before
  public void setUp() {
    conf = new Configuration();
    writeSchema = "message example {\n" +
        "required int32 line;\n" +
        "required binary content;\n" +
        "}";
    readSchema = "message example {\n" +
        "required int32 line;\n" +
        "required binary content;\n" +
        "}";
    partialSchema = "message example {\n" +
        "required int32 line;\n" +
        "}";
    readMapperClass = ReadMapper.class;
    writeMapperClass = WriteMapper.class;
  }

  public static final class MyWriteSupport extends DelegatingWriteSupport<Group> {
    private long count = 0;

    public MyWriteSupport() {
      super(new GroupWriteSupport(null));
    }

    @Override
    public void write(Group record) {
      super.write(record);
      ++count;
    }

    @Override
    public parquet.hadoop.api.WriteSupport.FinalizedWriteContext finalizeWrite() {
      // expose the number of written records as extra key/value metadata in the file footer
      Map<String, String> extraMetadata = new HashMap<String, String>();
      extraMetadata.put("my.count", String.valueOf(count));
      return new FinalizedWriteContext(extraMetadata);
    }
  }

  public static final class MyReadSupport extends DelegatingReadSupport<Group> {
    public MyReadSupport() {
      super(new GroupReadSupport());
    }

    @Override
    public parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
      // verify that the metadata written by MyWriteSupport is visible at read time
      Set<String> counts = context.getKeyValueMetadata().get("my.count");
      assertTrue("counts: " + counts, counts.size() > 0);
      return super.init(context);
    }
  }

  public static class ReadMapper extends Mapper<LongWritable, Text, Void, Group> {
    private SimpleGroupFactory factory;

    @Override
    protected void setup(Mapper<LongWritable, Text, Void, Group>.Context context) throws IOException, InterruptedException {
      factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Void, Group>.Context context) throws IOException, InterruptedException {
      Group group = factory.newGroup()
          .append("line", (int) key.get())
          .append("content", value.toString());
      context.write(null, group);
    }
  }

  public static class WriteMapper extends Mapper<Void, Group, LongWritable, Text> {
    @Override
    protected void map(Void key, Group value, Mapper<Void, Group, LongWritable, Text>.Context context) throws IOException, InterruptedException {
      context.write(new LongWritable(value.getInteger("line", 0)), new Text(value.getString("content", 0)));
    }
  }

  public static class PartialWriteMapper extends Mapper<Void, Group, LongWritable, Text> {
    @Override
    protected void map(Void key, Group value, Mapper<Void, Group, LongWritable, Text>.Context context) throws IOException, InterruptedException {
      context.write(new LongWritable(value.getInteger("line", 0)), new Text("dummy"));
    }
  }
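  // Illustrative sketch (not exercised by the tests below): the Parquet file produced by
  // the write job could also be inspected directly, without a second MapReduce job, by
  // reading it with ParquetReader and the same GroupReadSupport used above. The
  // constructor shown here is the simple (Path, ReadSupport) form of the pre-builder
  // parquet.hadoop.ParquetReader API; if the surrounding version exposes a different
  // signature, treat this as an assumption and adapt accordingly.
  @SuppressWarnings("unused")
  private void dumpParquetFile(Path path) throws IOException {
    parquet.hadoop.ParquetReader<Group> reader =
        new parquet.hadoop.ParquetReader<Group>(path, new GroupReadSupport());
    try {
      // read() returns null once the last record has been consumed
      for (Group group = reader.read(); group != null; group = reader.read()) {
        LOG.debug(group);
      }
    } finally {
      reader.close();
    }
  }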
  private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {
    runMapReduceJob(codec, Collections.<String, String>emptyMap());
  }

  private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration(this.conf);
    for (Map.Entry<String, String> entry : extraConf.entrySet()) {
      conf.set(entry.getKey(), entry.getValue());
    }
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
      // write job: text lines in, Parquet Groups out
      writeJob = new Job(conf, "write");
      TextInputFormat.addInputPath(writeJob, inputPath);
      writeJob.setInputFormatClass(TextInputFormat.class);
      writeJob.setNumReduceTasks(0);
      ParquetOutputFormat.setCompression(writeJob, codec);
      ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
      writeJob.setOutputFormatClass(ParquetOutputFormat.class);
      writeJob.setMapperClass(readMapperClass);
      ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
      GroupWriteSupport.setSchema(
          MessageTypeParser.parseMessageType(writeSchema),
          writeJob.getConfiguration());
      writeJob.submit();
      waitForJob(writeJob);
    }
    {
      // read job: Parquet Groups in, text lines out
      conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
      readJob = new Job(conf, "read");
      readJob.setInputFormatClass(ParquetInputFormat.class);
      ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);
      ParquetInputFormat.setInputPaths(readJob, parquetPath);
      readJob.setOutputFormatClass(TextOutputFormat.class);
      TextOutputFormat.setOutputPath(readJob, outputPath);
      readJob.setMapperClass(writeMapperClass);
      readJob.setNumReduceTasks(0);
      readJob.submit();
      waitForJob(readJob);
    }
  }

  private void testReadWrite(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {
    testReadWrite(codec, Collections.<String, String>emptyMap());
  }

  private void testReadWrite(CompressionCodecName codec, Map<String, String> conf) throws IOException, ClassNotFoundException, InterruptedException {
    runMapReduceJob(codec, conf);
    final BufferedReader in = new BufferedReader(new FileReader(new File(inputPath.toString())));
    final BufferedReader out = new BufferedReader(new FileReader(new File(outputPath.toString(), "part-m-00000")));
    String lineIn;
    String lineOut = null;
    int lineNumber = 0;
    while ((lineIn = in.readLine()) != null && (lineOut = out.readLine()) != null) {
      ++lineNumber;
      // strip the key and the tab separator added by TextOutputFormat before comparing
      lineOut = lineOut.substring(lineOut.indexOf("\t") + 1);
      assertEquals("line " + lineNumber, lineIn, lineOut);
    }
    assertNull("line " + lineNumber, out.readLine());
    assertNull("line " + lineNumber, lineIn);
    in.close();
    out.close();
  }

  @Test
  public void testReadWrite() throws IOException, ClassNotFoundException, InterruptedException {
    // TODO: LZO requires additional external setup steps, so leave it out for now
    testReadWrite(CompressionCodecName.GZIP);
    testReadWrite(CompressionCodecName.UNCOMPRESSED);
    testReadWrite(CompressionCodecName.SNAPPY);
  }

  @Test
  public void testReadWriteTaskSideMD() throws IOException, ClassNotFoundException, InterruptedException {
    testReadWrite(CompressionCodecName.UNCOMPRESSED, new HashMap<String, String>() {{
      put("parquet.task.side.metadata", "true");
    }});
  }

  @Test
  public void testProjection() throws Exception {
    readSchema = partialSchema;
    writeMapperClass = PartialWriteMapper.class;
    runMapReduceJob(CompressionCodecName.GZIP);
  }

  private static long value(Job job, String groupName, String name) throws Exception {
    // access counters reflectively so the same code works against both old and new Hadoop APIs
    // getGroup moved to AbstractCounters
    Method getGroup = org.apache.hadoop.mapreduce.Counters.class.getMethod("getGroup", String.class);
    // CounterGroup changed to an interface
    Method findCounter = org.apache.hadoop.mapreduce.CounterGroup.class.getMethod("findCounter", String.class);
    // Counter changed to an interface
    Method getValue = org.apache.hadoop.mapreduce.Counter.class.getMethod("getValue");
    CounterGroup group = (CounterGroup) getGroup.invoke(job.getCounters(), groupName);
    Counter counter = (Counter) findCounter.invoke(group, name);
    return (Long) getValue.invoke(counter);
  }

  @Test
  public void testReadWriteWithCounter() throws Exception {
    runMapReduceJob(CompressionCodecName.GZIP);
    assertTrue(value(readJob, "parquet", "bytesread") > 0L);
    assertTrue(value(readJob, "parquet", "bytestotal") > 0L);
    assertTrue(value(readJob, "parquet", "bytesread") == value(readJob, "parquet", "bytestotal"));
    // not testing the time-read counter since it could be zero because the test data is so small
  }

  @Test
  public void testReadWriteWithoutCounter() throws Exception {
    conf.set("parquet.benchmark.time.read", "false");
    conf.set("parquet.benchmark.bytes.total", "false");
    conf.set("parquet.benchmark.bytes.read", "false");
    runMapReduceJob(CompressionCodecName.GZIP);
    assertTrue(value(readJob, "parquet", "bytesread") == 0L);
    assertTrue(value(readJob, "parquet", "bytestotal") == 0L);
    assertTrue(value(readJob, "parquet", "timeread") == 0L);
  }

  private void waitForJob(Job job) throws InterruptedException, IOException {
    while (!job.isComplete()) {
      LOG.debug("waiting for job " + job.getJobName());
      sleep(100);
    }
    LOG.info("status for job " + job.getJobName() + ": " + (job.isSuccessful() ? "SUCCESS" : "FAILURE"));
    if (!job.isSuccessful()) {
      throw new RuntimeException("job failed " + job.getJobName());
    }
  }
}