package net.iponweb.hadoop.streaming.parquet;
import net.iponweb.hadoop.streaming.dummyReporter;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
public class ParquetInOutFormatsTest {
private final String schema = "message test { required int32 x; required binary y; optional binary z; repeated int32 a; }";
private static String tsv = "25\twtf\t\t[1, 3, 4]";
private static String json = "{\"x\": 25, \"y\": \"wtf\", \"z\": null, \"a\": [1, 3, 4]}";
private static JobConf defaultConf = new JobConf();
private static Path workDir = new Path("file:///tmp/iow-hadoop-streaming-" + Thread.currentThread().getId());
private static String fname = "parquetastexttest";
private static String fname2 = "parquetasjsontest";
private static Path file = new Path(workDir, fname);
private static Path file2 = new Path(workDir, fname2);
@Before
public void setup() {
defaultConf.set("iow.streaming.output.schema", schema);
defaultConf.set("mapreduce.task.partition", "0");
defaultConf.set("mapreduce.task.attempt.id", "attempt_200707121733_0003_m_000005_0");
defaultConf.set("parquet.read.support.class","net.iponweb.hadoop.streaming.parquet.GroupReadSupport");
}
@After
public void cleanup() throws IOException {
FileUtils.deleteDirectory(new File(workDir.toUri()));
}
@Test
public void testParquetAsTextFmt() throws IOException {
ParquetAsTextOutputFormat outfmt = new ParquetAsTextOutputFormat();
FileOutputFormat.setOutputPath(defaultConf, file);
String outpath = FileOutputFormat.getTaskOutputPath(defaultConf, "wtf").toString();
defaultConf.set("mapreduce.task.output.dir", outpath);
RecordWriter<Text, Text> writer = outfmt.getRecordWriter(file.getFileSystem(defaultConf),
defaultConf, fname, new dummyReporter());
writer.write(new Text(tsv), null);
writer.close(null);
FileInputFormat.setInputPaths(defaultConf, outpath + "/" + fname + "-m-00000.parquet");
ParquetAsTextInputFormat informat = new ParquetAsTextInputFormat();
RecordReader<Text, Text> reader = informat.getRecordReader(informat.getSplits(defaultConf, 1)[0],
defaultConf, new dummyReporter());
Text k = new Text();
Text v = new Text();
reader.next(k, v);
Assert.assertEquals("read back tsv", tsv, k.toString() + "\t" + v.toString());
}
@Test
public void testParquetAsJsonFmt() throws IOException {
ParquetAsJsonOutputFormat outfmt = new ParquetAsJsonOutputFormat();
FileOutputFormat.setOutputPath(defaultConf, file2);
String outpath = FileOutputFormat.getTaskOutputPath(defaultConf, "wtf").toString();
defaultConf.set("mapreduce.task.output.dir", outpath);
RecordWriter<Text, Text> writer = outfmt.getRecordWriter(file.getFileSystem(defaultConf),
defaultConf, fname2, new dummyReporter());
writer.write(new Text(json), null);
writer.close(null);
FileInputFormat.setInputPaths(defaultConf, outpath + "/" + fname2 + "-m-00000.parquet");
ParquetAsJsonInputFormat informat = new ParquetAsJsonInputFormat();
RecordReader<Text, Text> reader = informat.getRecordReader(informat.getSplits(defaultConf, 1)[0],
defaultConf, new dummyReporter());
Text k = new Text();
Text v = new Text();
reader.next(k, v);
ObjectMapper mapper = new ObjectMapper();
JsonNode n0 = mapper.readTree(k.toString());
JsonNode n1 = mapper.readTree(json);
Assert.assertEquals("read back json", n0, n1);
}
}