package com.linkedin.thirdeye.hadoop.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;

/**
 * Map output value that wraps an Avro {@link GenericRecord} together with the
 * name of its schema. The reducer receives records from multiple mappers (and
 * hence potentially multiple schemas), so the schema name is carried along
 * with the serialized bytes to select the right schema when decoding.
 */
public class MapOutputValue {

  private static BinaryDecoder binaryDecoder;

  private String schemaName;
  private GenericRecord record;
  private GenericDatumWriter<GenericRecord> writer;
  private EncoderFactory factory = EncoderFactory.get();
  private BinaryEncoder binaryEncoder;

  public MapOutputValue(String schemaName, GenericRecord record) {
    this.schemaName = schemaName;
    this.record = record;
  }

  public String getSchemaName() {
    return schemaName;
  }

  public GenericRecord getRecord() {
    return record;
  }

  /**
   * Serializes this value as: schema-name length, schema-name bytes, record
   * length, Avro-encoded record bytes. The schema/source name is written first
   * because the reducer needs it to look up the correct schema before it can
   * decode the record bytes.
   */
  public byte[] toBytes() throws IOException {
    ByteArrayOutputStream dataStream = new ByteArrayOutputStream();
    Schema schema = record.getSchema();
    if (writer == null) {
      writer = new GenericDatumWriter<GenericRecord>(schema);
    }
    binaryEncoder = factory.directBinaryEncoder(dataStream, binaryEncoder);
    writer.write(record, binaryEncoder);
    binaryEncoder.flush();

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(out);
    // Use an explicit charset so encoding does not depend on the JVM default.
    byte[] schemaNameBytes = schema.getName().getBytes("UTF-8");
    dos.writeInt(schemaNameBytes.length);
    dos.write(schemaNameBytes);
    byte[] dataBytes = dataStream.toByteArray();
    dos.writeInt(dataBytes.length);
    dos.write(dataBytes);
    return out.toByteArray();
  }

  public static MapOutputValue fromBytes(byte[] bytes, Map<String, Schema> schemaMap)
      throws IOException {
    DataInputStream dataInputStream = new DataInputStream(new ByteArrayInputStream(bytes));
    int length = dataInputStream.readInt();
    byte[] sourceNameBytes = new byte[length];
    // readFully (rather than read) guarantees the buffer is completely filled.
    dataInputStream.readFully(sourceNameBytes);
    String schemaName = new String(sourceNameBytes, "UTF-8");

    int recordDataLength = dataInputStream.readInt();
    byte[] recordBytes = new byte[recordDataLength];
    dataInputStream.readFully(recordBytes);

    Schema schema = schemaMap.get(schemaName);
    if (schema == null) {
      throw new IOException("No schema found for name: " + schemaName);
    }
    GenericRecord record = new GenericData.Record(schema);
    binaryDecoder = DecoderFactory.get().binaryDecoder(recordBytes, binaryDecoder);
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>(schema);
    gdr.read(record, binaryDecoder);
    return new MapOutputValue(schemaName, record);
  }
}
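
// A minimal round-trip sketch of the wire format above, kept in comment form
// so this file stays compilable. The "clicks" schema and the schemaMap are
// illustrative assumptions, not part of this class:
//
//   Schema schema = new Schema.Parser().parse(
//       "{\"type\":\"record\",\"name\":\"clicks\",\"fields\":"
//           + "[{\"name\":\"count\",\"type\":\"long\"}]}");
//   GenericRecord rec = new GenericData.Record(schema);
//   rec.put("count", 42L);
//
//   // Mapper side: schema name + Avro bytes packed into one byte array.
//   byte[] bytes = new MapOutputValue("clicks", rec).toBytes();
//
//   // Reducer side: the schema name read from the bytes selects the schema.
//   Map<String, Schema> schemaMap = new java.util.HashMap<String, Schema>();
//   schemaMap.put("clicks", schema);
//   MapOutputValue decoded = MapOutputValue.fromBytes(bytes, schemaMap);
//   assert decoded.getRecord().get("count").equals(42L);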