package com.linkedin.camus.etl.kafka.common;

import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.etl.IEtlKey;
import com.linkedin.camus.etl.RecordWriterProvider;
import com.linkedin.camus.etl.kafka.mapred.EtlMultiOutputFormat;

import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Provides a RecordWriter that uses FSDataOutputStream to write
 * a String record as bytes to HDFS, optionally compressed, without
 * any reformatting of the record itself.
 */
public class StringRecordWriterProvider implements RecordWriterProvider {
  public static final String ETL_OUTPUT_RECORD_DELIMITER = "etl.output.record.delimiter";
  public static final String DEFAULT_RECORD_DELIMITER = "\n";

  protected String recordDelimiter = null;

  private String extension = "";
  private boolean isCompressed = false;
  private CompressionCodec codec = null;

  public StringRecordWriterProvider(TaskAttemptContext context) {
    Configuration conf = context.getConfiguration();

    if (recordDelimiter == null) {
      recordDelimiter = conf.get(ETL_OUTPUT_RECORD_DELIMITER, DEFAULT_RECORD_DELIMITER);
    }

    isCompressed = FileOutputFormat.getCompressOutput(context);
    if (isCompressed) {
      // Map the configured codec name to a Hadoop CompressionCodec class,
      // falling back to DefaultCodec (deflate) for any unrecognized value.
      Class<? extends CompressionCodec> codecClass;
      if ("snappy".equals(EtlMultiOutputFormat.getEtlOutputCodec(context))) {
        codecClass = SnappyCodec.class;
      } else if ("gzip".equals(EtlMultiOutputFormat.getEtlOutputCodec(context))) {
        codecClass = GzipCodec.class;
      } else {
        codecClass = DefaultCodec.class;
      }
      codec = ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }
  }

  // TODO: Make this configurable somehow.
  // To do this, we'd have to make RecordWriterProvider have an
  // init(JobContext context) method signature that EtlMultiOutputFormat would always call.
  @Override
  public String getFilenameExtension() {
    return extension;
  }

  @Override
  public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(TaskAttemptContext context, String fileName,
      CamusWrapper camusWrapper, FileOutputCommitter committer) throws IOException, InterruptedException {

    // If recordDelimiter hasn't been initialized, do so now.
    if (recordDelimiter == null) {
      recordDelimiter = context.getConfiguration().get(ETL_OUTPUT_RECORD_DELIMITER, DEFAULT_RECORD_DELIMITER);
    }

    // Build the output path for this RecordWriter under the committer's work directory.
    Path path =
        new Path(committer.getWorkPath(), EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension()));

    FileSystem fs = path.getFileSystem(context.getConfiguration());
    FSDataOutputStream fileOut = fs.create(path, false);
    if (isCompressed) {
      return new ByteRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut)), recordDelimiter);
    }
    return new ByteRecordWriter(fileOut, recordDelimiter);
  }
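  /*
   * A minimal sketch (not from the original source) of the job properties that
   * exercise this provider. "etl.record.writer.provider.class" and
   * "etl.output.codec" are the property names I'd expect EtlMultiOutputFormat
   * to read, and the compression flag is whichever key
   * FileOutputFormat.getCompressOutput() consults on your Hadoop version
   * (mapred.output.compress on older releases,
   * mapreduce.output.fileoutputformat.compress on newer ones). Treat the exact
   * names as assumptions to verify against your Camus/Hadoop build.
   *
   *   etl.record.writer.provider.class=com.linkedin.camus.etl.kafka.common.StringRecordWriterProvider
   *   etl.output.record.delimiter=\n
   *   mapred.output.compress=true
   *   etl.output.codec=gzip     # "snappy", "gzip", or anything else for DefaultCodec
   */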
  /**
   * Writes each record's String payload to the underlying stream, followed by
   * the record delimiter. The key is ignored.
   */
  protected static class ByteRecordWriter extends RecordWriter<IEtlKey, CamusWrapper> {
    private final DataOutputStream out;
    private final String recordDelimiter;

    public ByteRecordWriter(DataOutputStream out, String recordDelimiter) {
      this.out = out;
      this.recordDelimiter = recordDelimiter;
    }

    @Override
    public void write(IEtlKey ignore, CamusWrapper value) throws IOException {
      if (value != null) {
        String record = (String) value.getRecord() + recordDelimiter;
        // Encode explicitly as UTF-8 rather than relying on the platform default charset.
        out.write(record.getBytes(StandardCharsets.UTF_8));
      }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
      out.close();
    }
  }
}
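/*
 * A quick, self-contained sketch (not part of the original class) showing how
 * ByteRecordWriter frames output: each record's String payload is written,
 * followed by the configured delimiter, and the key is ignored. It assumes
 * CamusWrapper can be built directly from a String payload via a
 * single-argument constructor; verify that against your Camus version.
 */
class ByteRecordWriterDemo {
  public static void main(String[] args) throws Exception {
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    StringRecordWriterProvider.ByteRecordWriter writer =
        new StringRecordWriterProvider.ByteRecordWriter(new DataOutputStream(buffer), "\n");

    // The key argument is unused by ByteRecordWriter, so null is safe here.
    writer.write(null, new CamusWrapper<String>("first record"));
    writer.write(null, new CamusWrapper<String>("second record"));
    writer.close(null);

    // Prints: first record\nsecond record\n
    System.out.print(buffer.toString("UTF-8"));
  }
}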