package net.iponweb.hadoop.streaming.parquet;

import net.iponweb.hadoop.streaming.dummyReporter;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

public class ParquetInOutFormatsTest {

    private final String schema =
            "message test { required int32 x; required binary y; optional binary z; repeated int32 a; }";

    private static String tsv = "25\twtf\t\t[1, 3, 4]";
    private static String json = "{\"x\": 25, \"y\": \"wtf\", \"z\": null, \"a\": [1, 3, 4]}";

    private static JobConf defaultConf = new JobConf();
    private static Path workDir =
            new Path("file:///tmp/iow-hadoop-streaming-" + Thread.currentThread().getId());
    private static String fname = "parquetastexttest";
    private static String fname2 = "parquetasjsontest";
    private static Path file = new Path(workDir, fname);
    private static Path file2 = new Path(workDir, fname2);

    @Before
    public void setup() {
        defaultConf.set("iow.streaming.output.schema", schema);
        defaultConf.set("mapreduce.task.partition", "0");
        defaultConf.set("mapreduce.task.attempt.id", "attempt_200707121733_0003_m_000005_0");
        defaultConf.set("parquet.read.support.class",
                "net.iponweb.hadoop.streaming.parquet.GroupReadSupport");
    }

    @After
    public void cleanup() throws IOException {
        FileUtils.deleteDirectory(new File(workDir.toUri()));
    }

    @Test
    public void testParquetAsTextFmt() throws IOException {

        // Write one TSV record to a Parquet file ...
        ParquetAsTextOutputFormat outfmt = new ParquetAsTextOutputFormat();
        FileOutputFormat.setOutputPath(defaultConf, file);
        String outpath = FileOutputFormat.getTaskOutputPath(defaultConf, "wtf").toString();
        defaultConf.set("mapreduce.task.output.dir", outpath);

        RecordWriter<Text, Text> writer = outfmt.getRecordWriter(file.getFileSystem(defaultConf),
                defaultConf, fname, new dummyReporter());
        writer.write(new Text(tsv), null);
        writer.close(null);

        // ... then read it back and check the round trip
        FileInputFormat.setInputPaths(defaultConf, outpath + "/" + fname + "-m-00000.parquet");
        ParquetAsTextInputFormat informat = new ParquetAsTextInputFormat();
        RecordReader<Text, Text> reader = informat.getRecordReader(
                informat.getSplits(defaultConf, 1)[0], defaultConf, new dummyReporter());

        Text k = new Text();
        Text v = new Text();
        reader.next(k, v);

        Assert.assertEquals("read back tsv", tsv, k.toString() + "\t" + v.toString());
    }

    @Test
    public void testParquetAsJsonFmt() throws IOException {

        // Write one JSON record to a Parquet file ...
        ParquetAsJsonOutputFormat outfmt = new ParquetAsJsonOutputFormat();
        FileOutputFormat.setOutputPath(defaultConf, file2);
        String outpath = FileOutputFormat.getTaskOutputPath(defaultConf, "wtf").toString();
        defaultConf.set("mapreduce.task.output.dir", outpath);

        RecordWriter<Text, Text> writer = outfmt.getRecordWriter(file.getFileSystem(defaultConf),
                defaultConf, fname2, new dummyReporter());
        writer.write(new Text(json), null);
        writer.close(null);

        // ... then read it back and check the round trip
        FileInputFormat.setInputPaths(defaultConf, outpath + "/" + fname2 + "-m-00000.parquet");
        ParquetAsJsonInputFormat informat = new ParquetAsJsonInputFormat();
        RecordReader<Text, Text> reader = informat.getRecordReader(
                informat.getSplits(defaultConf, 1)[0], defaultConf, new dummyReporter());

        Text k = new Text();
        Text v = new Text();
        reader.next(k, v);

        // Compare parsed JSON trees so field ordering does not affect the assertion
        ObjectMapper mapper = new ObjectMapper();
        JsonNode n0 = mapper.readTree(k.toString());
        JsonNode n1 = mapper.readTree(json);

        Assert.assertEquals("read back json", n0, n1);
    }
}