package org.archive.hadoop.storage;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.PigException;
import org.apache.pig.StoreFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.archive.hadoop.mapreduce.AlphaPartitioner;
import org.archive.hadoop.mapreduce.ZipNumOutputFormat;
import org.archive.hadoop.mapreduce.ZipNumRecordWriter;

/**
 * Pig StoreFunc that serializes tuples as tab-delimited lines and writes them
 * through a {@link ZipNumOutputFormat}. The optional constructor argument is
 * the line count passed to the output format (one compressed chunk per
 * {@code count} lines); it defaults to {@link #DEFAULT_COUNT}.
 */
public class ZipNumStorage extends StoreFunc {

	private static final int DEFAULT_COUNT = 5000;
	private static final Charset UTF8 = Charset.forName("UTF-8");
	// 5 MB reusable serialization buffer:
	private static final int BUFFER_SIZE = 1024 * 1024 * 5;

	ZipNumRecordWriter writer;
	private int count;
	private byte fieldDel = '\t';
	private ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);

	public ZipNumStorage() {
		this(DEFAULT_COUNT);
	}

	public ZipNumStorage(String count) {
		try {
			this.count = Integer.parseInt(count);
		} catch (NumberFormatException e) {
			this.count = DEFAULT_COUNT;
		}
	}

	public ZipNumStorage(int count) {
		this.count = count;
	}

	@SuppressWarnings("rawtypes")
	@Override
	public OutputFormat getOutputFormat() throws IOException {
		return new ZipNumOutputFormat(count);
	}

	@SuppressWarnings("rawtypes")
	@Override
	public void prepareToWrite(RecordWriter writer) throws IOException {
		this.writer = (ZipNumRecordWriter) writer;
	}

	@Override
	public void putNext(Tuple tuple) throws IOException {
		// Serialize each field into the reusable buffer, tab-delimited,
		// then hand the complete line to the ZipNum record writer.
		// (ExecException extends IOException, so tuple.get() needs no
		// catch-and-rethrow here.)
		int sz = tuple.size();
		for (int i = 0; i < sz; i++) {
			putField(tuple.get(i));
			if (i != sz - 1) {
				mOut.write(fieldDel);
			}
		}
		writer.write(mOut.toByteArray());
		mOut.reset();
	}

	@SuppressWarnings("unchecked")
	private void putField(Object field) throws IOException {
		// String constants for each delimiter
		String tupleBeginDelim = "(";
		String tupleEndDelim = ")";
		String bagBeginDelim = "{";
		String bagEndDelim = "}";
		String mapBeginDelim = "[";
		String mapEndDelim = "]";
		String fieldDelim = ",";
		String mapKeyValueDelim = "#";

		switch (DataType.findType(field)) {
		case DataType.NULL:
			break; // just leave it empty

		case DataType.BOOLEAN:
			mOut.write(((Boolean) field).toString().getBytes(UTF8));
			break;

		case DataType.INTEGER:
			mOut.write(((Integer) field).toString().getBytes(UTF8));
			break;

		case DataType.LONG:
			mOut.write(((Long) field).toString().getBytes(UTF8));
			break;

		case DataType.FLOAT:
			mOut.write(((Float) field).toString().getBytes(UTF8));
			break;

		case DataType.DOUBLE:
			mOut.write(((Double) field).toString().getBytes(UTF8));
			break;

		case DataType.BYTEARRAY: {
			byte[] b = ((DataByteArray) field).get();
			mOut.write(b, 0, b.length);
			break;
		}

		case DataType.CHARARRAY:
			mOut.write(((String) field).getBytes(UTF8));
			break;

		case DataType.MAP:
			boolean mapHasNext = false;
			Map<String, Object> m = (Map<String, Object>) field;
			mOut.write(mapBeginDelim.getBytes(UTF8));
			for (Map.Entry<String, Object> e : m.entrySet()) {
				if (mapHasNext) {
					mOut.write(fieldDelim.getBytes(UTF8));
				} else {
					mapHasNext = true;
				}
				putField(e.getKey());
				mOut.write(mapKeyValueDelim.getBytes(UTF8));
				putField(e.getValue());
			}
			mOut.write(mapEndDelim.getBytes(UTF8));
			break;

		case DataType.TUPLE:
			boolean tupleHasNext = false;
			Tuple t = (Tuple) field;
			mOut.write(tupleBeginDelim.getBytes(UTF8));
			for (int i = 0; i < t.size(); ++i) {
				if (tupleHasNext) {
					mOut.write(fieldDelim.getBytes(UTF8));
				} else {
					tupleHasNext = true;
				}
				putField(t.get(i));
			}
			mOut.write(tupleEndDelim.getBytes(UTF8));
			break;

		case DataType.BAG:
			boolean bagHasNext = false;
			mOut.write(bagBeginDelim.getBytes(UTF8));
			Iterator<Tuple> tupleIter = ((DataBag) field).iterator();
			while (tupleIter.hasNext()) {
				if (bagHasNext) {
					mOut.write(fieldDelim.getBytes(UTF8));
				} else {
					bagHasNext = true;
				}
				putField(tupleIter.next());
			}
			mOut.write(bagEndDelim.getBytes(UTF8));
			break;

		default: {
			int errCode = 2108;
			String msg = "Could not determine data type of field: " + field;
			throw new ExecException(msg, errCode, PigException.BUG);
		}
		}
	}

	@Override
	public void setStoreLocation(String path, Job job) throws IOException {
		// Set the AlphaPartitioner via the raw config key rather than
		// job.setPartitionerClass(), so reducer output is partitioned
		// alphabetically by key.
		job.getConfiguration().set("mapreduce.partitioner.class",
				AlphaPartitioner.class.getCanonicalName());
		// Suppress the key/value separator in the text output.
		job.getConfiguration().set("mapred.textoutputformat.separator", "");
		FileOutputFormat.setOutputPath(job, new Path(path));
	}
}
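// Usage sketch: a minimal Pig Latin example of how this StoreFunc might be
// invoked. The jar name, paths, and chunk-size argument below are
// illustrative assumptions, not part of this source, and AlphaPartitioner
// may require additional job configuration (e.g. a partition split file)
// that is not shown here.
//
//   REGISTER archive-hadoop-tools.jar;
//   lines  = LOAD '/input/cdx' USING TextLoader() AS (line:chararray);
//   sorted = ORDER lines BY line;
//   STORE sorted INTO '/output/zipnum'
//       USING org.archive.hadoop.storage.ZipNumStorage('3000');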