package org.archive.hadoop.mapreduce;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.archive.format.gzip.GZIPConstants;
import org.archive.format.gzip.GZIPFooter;
import org.archive.format.gzip.GZIPHeader;
import org.archive.util.io.CRCOutputStream;
public class ZipNumAllShardRecordWriter extends RecordWriter<Text, Text>{
protected DataOutputStream outMain;
protected DataOutputStream outSummary;
protected int limit;
private int count;
private long offset;
private ByteArrayOutputStream mainBuffer;
//private ByteArrayOutputStream summaryBuffer;
private ByteArrayOutputStream gzBuffer;
public static int DEFAULT_MAX_GZ_BUFFER = 1024 * 1024 * 2;
public static int DEFAULT_MAX_BUFFER = 1024 * 1024 * 10;
public static char DEFAULT_DELIM = ' ';
public static int newline = 10;
public char delim = DEFAULT_DELIM;
private final static Charset UTF8 = Charset.forName("utf-8");
protected String partName;
public ZipNumAllShardRecordWriter(int limit,
DataOutputStream outMain, DataOutputStream outSummary, String partName) {
this.outMain = outMain;
this.outSummary = outSummary;
this.limit = limit;
count = 0;
offset = 0;
mainBuffer = new ByteArrayOutputStream(DEFAULT_MAX_BUFFER);
//summaryBuffer = new ByteArrayOutputStream(DEFAULT_MAX_BUFFER);
gzBuffer = new ByteArrayOutputStream(DEFAULT_MAX_GZ_BUFFER);
this.partName = partName;
}
@Override
public void close(TaskAttemptContext arg0) throws IOException,
InterruptedException {
finishCurrent();
outMain.close();
outSummary.close();
}
// public void writeBytes(byte[] key, int delim, byte[] value) throws IOException {
//// if(count == 0) {
//// summaryBuffer.write(key);
//// summaryBuffer.write(delim);
//// summaryBuffer.write(value);
//// summaryBuffer.write(newline);
//// }
// mainBuffer.write(key);
// mainBuffer.write(delim);
// mainBuffer.write(value);
// mainBuffer.write(newline);
// count++;
// if(count == limit) {
// finishCurrent();
// }
// }
public void writeLine(String line) throws IOException {
if(count == 0) {
int spaceIndex = line.indexOf(delim);
// Include 2nd field (timestamp)
if (spaceIndex >= 0) {
spaceIndex = line.indexOf(delim, spaceIndex + 1);
}
String urlkey = ((spaceIndex >= 0) ? line.substring(0, spaceIndex) : line);
if (spaceIndex < 0) {
System.err.println("POSSIBLY INVALID CDX LINE: " + line);
}
outSummary.writeBytes(urlkey);
// summaryBuffer.write(line);
//summaryBuffer.write(newline);
}
mainBuffer.write(line.toString().getBytes(UTF8));
mainBuffer.write(newline);
count++;
if(count == limit) {
finishCurrent();
}
}
@Override
public void write(Text key, Text val) throws IOException,
InterruptedException {
if (key.getLength() == 0) {
writeLine(val.toString());
} else if (val.getLength() == 0) {
writeLine(key.toString());
} else {
writeLine(key.toString() + delim + val.toString());
}
}
private void finishCurrent() throws IOException {
if(count == 0) {
return;
}
gzBuffer.reset();
// deflate the main buffer into the temp gzBuffer:
Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION, true);
DeflaterOutputStream deflateOut = new DeflaterOutputStream(gzBuffer,deflater);
CRCOutputStream crcOut = new CRCOutputStream(deflateOut);
mainBuffer.writeTo(crcOut);
deflateOut.finish();
// now calculate the gzip header and footer:
GZIPHeader gzHeader = new GZIPHeader();
gzHeader.addRecord(GZIPConstants.SL_RECORD,
deflater.getBytesWritten() + GZIPConstants.GZIP_FOOTER_BYTES);
GZIPFooter gzFooter = new GZIPFooter(crcOut.getCRCValue(),
crcOut.getBytesWritten());
// write the header, the deflated bytes, and the footer:
int len = gzHeader.getLength() + gzBuffer.size()
+ GZIPConstants.GZIP_FOOTER_BYTES;
long startOffset = offset;
offset += len;
gzHeader.writeBytes(outMain);
gzBuffer.writeTo(outMain);
gzFooter.writeBytes(outMain);
outMain.flush();
// write the summary buffer:
String offsetAndLength = String.format("\t%s\t%d\t%d\n", partName, startOffset, len);
outSummary.writeBytes(offsetAndLength);
//summaryBuffer.writeTo(outSummary);
outSummary.flush();
// reset the main and summary buffers for the next block:
mainBuffer.reset();
//summaryBuffer.reset();
count = 0;
}
/**
* @return the delim
*/
public int getDelim() {
return delim;
}
/**
* @param delim the delim to set
*/
public void setDelim(char delim) {
this.delim = delim;
}
}