package org.archive.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.archive.hadoop.util.PartitionName;
public class ZipNumOutputFormat extends FileOutputFormat<Text, Text> {
private int count;
private static final int DEFAULT_ZIP_NUM_LINES = 5000;
private static final String ZIP_NUM_LINES_CONFIGURATION = "conf.zipnum.count";
private static final String ZIP_NUM_OVERCRAWL_CONFIGURATION = "conf.zipnum.overcrawl.daycount";
public ZipNumOutputFormat() {
this(DEFAULT_ZIP_NUM_LINES);
}
public ZipNumOutputFormat(int count) {
this.count = count;
}
public static void setZipNumLineCount(Configuration conf, int count) {
conf.setInt(ZIP_NUM_LINES_CONFIGURATION, count);
}
public static void setZipNumOvercrawlDayCount(Configuration conf, int count) {
conf.setInt(ZIP_NUM_OVERCRAWL_CONFIGURATION, count);
}
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
count = conf.getInt(ZIP_NUM_LINES_CONFIGURATION, DEFAULT_ZIP_NUM_LINES);
int dayLimit = conf.getInt(ZIP_NUM_OVERCRAWL_CONFIGURATION, -1);
Path mainFile = getWorkFile(context, ".gz");
Path summaryFile = getWorkFile(context, ".summary");
FileSystem mainFs = mainFile.getFileSystem(conf);
FileSystem summaryFs = summaryFile.getFileSystem(conf);
FSDataOutputStream mainOut = mainFs.create(mainFile, false);
FSDataOutputStream summaryOut = summaryFs.create(summaryFile, false);
if(dayLimit == -1) {
// This (should be) a better implementation, but appears to have a
// bug - summary files are empty in some cases.. Should track it down
// return new ZipNumRecordWriter(count, mainOut, summaryOut);
return new ZipNumRecordWriterOld(count, mainOut, summaryOut);
} else {
return new OvercrawlZipNumRecordWriter(count,dayLimit, mainOut, summaryOut);
}
}
/**
* Get the path and filename for the output format.
*
* @param context
* the task context
* @param extension
* an extension to add to the filename
* @return a full path $output/_temporary/$taskid/part-[mr]-$id
* @throws IOException
*/
public Path getWorkFile(TaskAttemptContext context, String extension)
throws IOException {
FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
TaskID taskId = context.getTaskAttemptID().getTaskID();
int partition = taskId.getId();
String basename =
PartitionName
.getPartitionOutputName(context.getConfiguration(), partition);
if(basename == null) {
// use default name:
basename = String.format("part-%05d", partition);
}
basename = basename + extension;
return new Path(committer.getWorkPath(), basename);
}
}