package org.archive.hadoop.streaming;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.util.Progressable;
import org.archive.hadoop.util.PartitionName;

/**
 * Output format that writes each task's records through a
 * {@link ZipNumRecordWriter} into two task output files: a main file named
 * <code>&lt;partition&gt;.gz</code> and a summary file named
 * <code>&lt;partition&gt;-idx</code>.
 *
 * <p>
 * Instances carry mutable state ({@code count} and {@code partMod}) that is
 * refreshed from the job configuration on every call to
 * {@link #getRecordWriter(FileSystem, JobConf, String, Progressable)}.
 */
public class ZipNumOutputFormat extends FileOutputFormat<Text, Text> {

	private static final int DEFAULT_ZIP_NUM_LINES = 3000;

	private static final String ZIP_NUM_LINES_CONFIGURATION = "conf.zipnum.count";
	private static final String ZIP_NUM_OVERCRAWL_CONFIGURATION = "conf.zipnum.overcrawl.daycount";
	private static final String ZIP_NUM_PART_MOD = "conf.zipnum.partmod";
	private static final String ZIP_NUM_CDX_HEADER = "conf.zipnum.cdxheader";

	private static final String DEFAULT_PART_MOD = "a-";

	/** Configuration key holding the id of the running task attempt. */
	private static final String TASK_ATTEMPT_ID_KEY = "mapred.task.id";

	/** Line count handed to the record writer; see {@link #getRecordWriter}. */
	private int count;

	/**
	 * Infix used in the default partition name. Empty until
	 * {@link #getRecordWriter} loads it from the configuration.
	 */
	private String partMod = "";

	/** Creates a format using the default line count. */
	public ZipNumOutputFormat() {
		this(DEFAULT_ZIP_NUM_LINES);
	}

	/**
	 * Creates a format with an explicit line count.
	 *
	 * @param count
	 *            line count passed to the record writer when the job
	 *            configuration does not set {@code conf.zipnum.count}
	 */
	public ZipNumOutputFormat(int count) {
		this.count = count;
	}

	/**
	 * Stores the line count under {@code conf.zipnum.count}.
	 *
	 * @param conf
	 *            configuration to update
	 * @param count
	 *            value to store
	 */
	public static void setZipNumLineCount(Configuration conf, int count) {
		conf.setInt(ZIP_NUM_LINES_CONFIGURATION, count);
	}

	/**
	 * Stores the overcrawl day count under
	 * {@code conf.zipnum.overcrawl.daycount}. This class only writes the
	 * value; it does not read it back.
	 *
	 * @param conf
	 *            configuration to update
	 * @param count
	 *            value to store
	 */
	public static void setZipNumOvercrawlDayCount(Configuration conf, int count) {
		conf.setInt(ZIP_NUM_OVERCRAWL_CONFIGURATION, count);
	}

	/**
	 * Opens the main and summary output streams for the current task and
	 * wraps them in a {@link ZipNumRecordWriter}.
	 *
	 * <p>
	 * Both files are created with overwrite disabled. If anything fails after
	 * a stream has been opened, every stream opened so far is closed before
	 * the failure propagates.
	 *
	 * @param ignored
	 *            not used; file systems are resolved from the output paths
	 * @param conf
	 *            job configuration supplying the zipnum settings and task id
	 * @param name
	 *            not used; the file name is derived from the partition
	 * @param progress
	 *            progress callback handed to the file system on create
	 * @return a writer over the newly created main and summary streams
	 * @throws IOException
	 *             if a path cannot be resolved or a file cannot be created
	 */
	@Override
	public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored,
			JobConf conf, String name, Progressable progress)
			throws IOException {

		// Fall back to the instance's own count (constructor argument or the
		// default) rather than always to the hard-coded default.
		count = conf.getInt(ZIP_NUM_LINES_CONFIGURATION, count);
		partMod = conf.get(ZIP_NUM_PART_MOD, DEFAULT_PART_MOD);

		String partitionName = getPartitionName(conf);

		Path mainFile = getWorkFile(conf, partitionName + ".gz");
		Path summaryFile = getWorkFile(conf, partitionName + "-idx");
		FileSystem mainFs = mainFile.getFileSystem(conf);
		FileSystem summaryFs = summaryFile.getFileSystem(conf);

		int buffSize = conf.getInt("io.file.buffer.size", 4096);
		String cdxHeader = conf.get(ZIP_NUM_CDX_HEADER);

		FSDataOutputStream mainOut = null;
		FSDataOutputStream summaryOut = null;
		boolean handedOff = false;
		try {
			mainOut = mainFs.create(mainFile, false, buffSize, progress);
			summaryOut = summaryFs.create(summaryFile, false, buffSize,
					progress);
			RecordWriter<Text, Text> writer = new ZipNumRecordWriter(count,
					mainOut, summaryOut, partitionName, cdxHeader);
			handedOff = true;
			return writer;
		} finally {
			// Until the writer exists nobody else owns the streams, so a
			// failure here would otherwise leak them.
			if (!handedOff) {
				closeQuietly(summaryOut);
				closeQuietly(mainOut);
			}
		}
	}

	/**
	 * Resolves a file name to a path in the task's output location by
	 * delegating to {@link FileOutputFormat#getTaskOutputPath(JobConf, String)}.
	 *
	 * @param conf
	 *            job configuration of the running task
	 * @param partWithExt
	 *            file name, including any extension
	 * @return the path returned by {@code getTaskOutputPath}
	 * @throws IOException
	 *             if the task output path cannot be determined
	 */
	public Path getWorkFile(JobConf conf, String partWithExt)
			throws IOException {
		return FileOutputFormat.getTaskOutputPath(conf, partWithExt);
	}

	/**
	 * Determines the base name of this task's output files.
	 *
	 * <p>
	 * The partition number is the id of the task named by
	 * {@code mapred.task.id}. A name supplied by
	 * {@link PartitionName#getPartitionOutputName} wins; otherwise the name
	 * is {@code part-<partMod><5-digit partition>}, where {@code partMod} is
	 * whatever this instance currently holds.
	 *
	 * @param conf
	 *            job configuration of the running task
	 * @return the base name, without extension
	 * @throws IllegalStateException
	 *             if {@code mapred.task.id} does not yield a task attempt id
	 */
	public String getPartitionName(JobConf conf) {
		String attemptName = conf.get(TASK_ATTEMPT_ID_KEY);
		TaskAttemptID attemptId = (attemptName == null) ? null
				: TaskAttemptID.forName(attemptName);
		if (attemptId == null) {
			// Fail with a descriptive message instead of a bare
			// NullPointerException on the chained call.
			throw new IllegalStateException("Configuration property '"
					+ TASK_ATTEMPT_ID_KEY
					+ "' is not set; cannot determine the output partition");
		}

		TaskID taskId = attemptId.getTaskID();
		int partition = taskId.getId();

		String basename = PartitionName.getPartitionOutputName(conf, partition);
		if (basename == null) {
			// use default name:
			basename = String.format("part-%s%05d", partMod, partition);
		}
		return basename;
	}

	/**
	 * Closes a stream during failure cleanup, discarding any close error so
	 * that it cannot replace the exception already propagating.
	 */
	private static void closeQuietly(FSDataOutputStream stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException ignored) {
			// The original failure is the one the caller needs to see.
		}
	}
}