package org.archive.hadoop.streaming; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.FileOutputCommitter; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.TaskAttemptID; import org.apache.hadoop.mapred.TaskID; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.ReflectionUtils; import org.archive.hadoop.util.PartitionName; public class NativeZipNumOutputFormat extends FileOutputFormat<Text, Text> { public static void setZipNumLineCount(Configuration conf, int count) { org.archive.hadoop.mapreduce.ZipNumOutputFormat.setZipNumLineCount(conf, count); } @Override public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException { int count = org.archive.hadoop.mapreduce.ZipNumOutputFormat.getZipNumLineCount(conf); String partMod = org.archive.hadoop.mapreduce.ZipNumOutputFormat.getPartMod(conf); String partitionName = getPartitionName(conf, partMod); // Obtain the compression codec from the Hadoop environment. Class<? extends CompressionCodec> codecClass = getOutputCompressorClass( conf, GzipCodec.class ); CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance( codecClass, conf ); // System.err.println( "Using codec:" + codec.toString() ); // Use a file extension basd on the codec, don't hard-code it. Path mainFile = getWorkFile(conf, partitionName + codec.getDefaultExtension() ); Path summaryFile = getWorkFile(conf, partitionName + "-idx"); FileSystem mainFs = mainFile.getFileSystem(conf); FileSystem summaryFs = summaryFile.getFileSystem(conf); int buffSize = conf.getInt("io.file.buffer.size", 4096); FSDataOutputStream mainOut = mainFs.create(mainFile, false, buffSize, progress); FSDataOutputStream summaryOut = summaryFs.create(summaryFile, false, buffSize, progress); //return new ZipNumRecordWriter(count, mainOut, summaryOut, partitionName); return new org.archive.hadoop.streaming.NativeZipNumRecordWriter( codec, mainOut, summaryOut, partitionName, count ); } /** * Get the path and filename for the output format. * * @param conf * the task context * @param extension * an extension to add to the filename * @return a full path $output/_temporary/$taskid/part-[mr]-$id * @throws IOException */ public Path getWorkFile(JobConf conf, String partWithExt) throws IOException { //FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context); //return new Path(this.getWorkOutputPath(context.getJobConf()), partWithExt); if (conf.getBoolean("conf.zipnum.noTmp", false)) { Path outputPath = getOutputPath(conf); Path directPath = new Path(outputPath, partWithExt); return directPath; } return FileOutputFormat.getTaskOutputPath(conf, partWithExt); } public String getPartitionName(JobConf conf, String partMod) { //TaskID taskId = conf.getTaskAttemptID().getTaskID(); TaskID taskId = TaskAttemptID.forName(conf.get("mapred.task.id")).getTaskID(); int partition = taskId.getId(); String basename = PartitionName .getPartitionOutputName(conf, partition); if (basename == null) { // use default name: basename = String.format("part-%s%05d", partMod, partition); } return basename; } }