package org.archive.hadoop.mapreduce;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.archive.format.gzip.GZIPConstants;
import org.archive.format.gzip.GZIPFooter;
import org.archive.format.gzip.GZIPHeader;
import org.archive.util.io.CRCOutputStream;
/**
 * Writes "zipnum" output: the main stream is a sequence of independently
 * readable gzip members, each holding up to {@code limit} records, while the
 * summary stream receives one line per gzip member of the form
 * {@code "<offset>\t<length>\t<first-record-line>\n"}. The summary allows a
 * reader to binary-search the index and decompress only the relevant member.
 *
 * <p>Not thread-safe: a single task should own an instance.
 */
public class ZipNumRecordWriterOld extends RecordWriter<Text, Text> {
	/** Destination for the concatenated gzip members. */
	protected DataOutputStream outMain;
	/** Destination for the per-member summary/index lines. */
	protected DataOutputStream outSummary;
	/** Maximum number of records per gzip member. */
	protected int limit;
	/** Records accumulated in the current, not-yet-flushed member. */
	private int count;
	/** Byte offset in outMain where the next gzip member will begin. */
	private long offset;
	/** Uncompressed record bytes for the current member. */
	private ByteArrayOutputStream mainBuffer;
	/** First record line of the current member (the summary payload). */
	private ByteArrayOutputStream summaryBuffer;
	/** Scratch buffer holding the deflated form of mainBuffer. */
	private ByteArrayOutputStream gzBuffer;

	// Initial buffer capacities only — ByteArrayOutputStream grows as needed.
	public static final int DEFAULT_MAX_GZ_BUFFER = 1024 * 1024 * 2;
	public static final int DEFAULT_MAX_BUFFER = 1024 * 1024 * 10;
	/** Default key/value separator: ASCII space. */
	public static final int DEFAULT_DELIM = 32;
	/** Record terminator: ASCII '\n'. */
	public static final int newline = 10;
	/** Key/value separator actually used; see {@link #setDelim(int)}. */
	public int delim = DEFAULT_DELIM;
	private final static Charset UTF8 = Charset.forName("utf-8");

	/**
	 * @param limit maximum number of records per gzip member
	 * @param outMain stream receiving the concatenated gzip members
	 * @param outSummary stream receiving the offset/length index lines
	 */
	public ZipNumRecordWriterOld(int limit,
			DataOutputStream outMain, DataOutputStream outSummary) {
		this.outMain = outMain;
		this.outSummary = outSummary;
		this.limit = limit;
		count = 0;
		offset = 0;
		mainBuffer = new ByteArrayOutputStream(DEFAULT_MAX_BUFFER);
		summaryBuffer = new ByteArrayOutputStream(DEFAULT_MAX_BUFFER);
		gzBuffer = new ByteArrayOutputStream(DEFAULT_MAX_GZ_BUFFER);
	}

	/**
	 * Flushes any partially filled member and closes both output streams.
	 * The nested try/finally guarantees both streams are closed even if
	 * flushing (or closing the main stream) throws.
	 */
	@Override
	public void close(TaskAttemptContext arg0) throws IOException,
			InterruptedException {
		try {
			finishCurrent();
		} finally {
			try {
				outMain.close();
			} finally {
				outSummary.close();
			}
		}
	}

	/**
	 * Buffers one record as {@code key + delim + value + '\n'}. The first
	 * record of each member is also captured for the summary line. When
	 * {@code limit} records have accumulated, the member is flushed.
	 *
	 * @param key record key bytes
	 * @param delim single-byte separator written between key and value
	 * @param value record value bytes
	 */
	public void writeBytes(byte[] key, int delim, byte[] value) throws IOException {
		if (count == 0) {
			// First record of this member: remember it for the index line.
			summaryBuffer.write(key);
			summaryBuffer.write(delim);
			summaryBuffer.write(value);
			summaryBuffer.write(newline);
		}
		mainBuffer.write(key);
		mainBuffer.write(delim);
		mainBuffer.write(value);
		mainBuffer.write(newline);
		count++;
		if (count == limit) {
			finishCurrent();
		}
	}

	/**
	 * Buffers one pre-joined record line (without trailing newline), applying
	 * the same first-record/summary and limit-flush logic as
	 * {@link #writeBytes(byte[], int, byte[])}.
	 */
	public void writeLineBytes(byte[] line) throws IOException {
		if (count == 0) {
			summaryBuffer.write(line);
			summaryBuffer.write(newline);
		}
		mainBuffer.write(line);
		mainBuffer.write(newline);
		count++;
		if (count == limit) {
			finishCurrent();
		}
	}

	/** Hadoop entry point: encodes key and value as UTF-8 and buffers them. */
	@Override
	public void write(Text key, Text val) throws IOException,
			InterruptedException {
		writeBytes(key.toString().getBytes(UTF8), delim,
				val.toString().getBytes(UTF8));
	}

	/**
	 * Compresses the buffered records into one gzip member, appends it to
	 * outMain, writes the matching "offset\tlength\tfirstline" summary record,
	 * and resets state for the next member. No-op when nothing is buffered.
	 */
	private void finishCurrent() throws IOException {
		if (count == 0) {
			return;
		}
		gzBuffer.reset();
		// Deflate the main buffer into the temp gzBuffer, tracking the CRC32
		// and uncompressed length required by the gzip footer.
		Deflater deflater = new Deflater(Deflater.DEFAULT_COMPRESSION, true);
		CRCOutputStream crcOut;
		long compressedLen;
		try {
			DeflaterOutputStream deflateOut = new DeflaterOutputStream(gzBuffer, deflater);
			crcOut = new CRCOutputStream(deflateOut);
			mainBuffer.writeTo(crcOut);
			deflateOut.finish();
			compressedLen = deflater.getBytesWritten();
		} finally {
			// Release the Deflater's native zlib memory immediately; the
			// original code leaked one Deflater per member until finalization.
			deflater.end();
		}
		// Build the gzip header (with the SL extra record carrying the member
		// length) and the footer (CRC32 + uncompressed size).
		GZIPHeader gzHeader = new GZIPHeader();
		gzHeader.addRecord(GZIPConstants.SL_RECORD,
				compressedLen + GZIPConstants.GZIP_FOOTER_BYTES);
		GZIPFooter gzFooter = new GZIPFooter(crcOut.getCRCValue(),
				crcOut.getBytesWritten());
		// Total on-disk size of this member: header + deflated data + footer.
		int len = gzHeader.getLength() + gzBuffer.size()
				+ GZIPConstants.GZIP_FOOTER_BYTES;
		long startOffset = offset;
		offset += len;
		gzHeader.writeBytes(outMain);
		gzBuffer.writeTo(outMain);
		gzFooter.writeBytes(outMain);
		outMain.flush();
		// Emit the index line for this member. writeBytes() truncates chars to
		// their low byte, which is safe here: the prefix is ASCII digits/tabs.
		String offsetAndLength = String.format("%d\t%d\t", startOffset, len);
		outSummary.writeBytes(offsetAndLength);
		summaryBuffer.writeTo(outSummary);
		outSummary.flush();
		// Reset the main and summary buffers for the next member.
		mainBuffer.reset();
		summaryBuffer.reset();
		count = 0;
	}

	/**
	 * @return the key/value delimiter byte (default: ASCII space)
	 */
	public int getDelim() {
		return delim;
	}

	/**
	 * @param delim the key/value delimiter byte to use for subsequent records
	 */
	public void setDelim(int delim) {
		this.delim = delim;
	}
}