// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.demux;

import it.crs4.seal.common.SealToolParser; // for OUTPUT_FORMAT_CONF

import org.seqdoop.hadoop_bam.QseqOutputFormat.QseqRecordWriter;
import org.seqdoop.hadoop_bam.FastqOutputFormat.FastqRecordWriter;
import org.seqdoop.hadoop_bam.SequencedFragment;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;

/**
 * Output format that routes each {@link SequencedFragment} to a file chosen by
 * its key: a record written with key <code>K</code> ends up in
 * <code>&lt;work dir&gt;/K/&lt;default work file name&gt;</code>.
 */
public class DemuxOutputFormat extends FileOutputFormat<Text, SequencedFragment>
{
    /**
     * Record writer that lazily opens one underlying Qseq or Fastq record
     * writer per distinct key and keeps all of them open until
     * {@link #close(TaskAttemptContext)} is called.
     */
    protected static class DemuxMultiFileLineRecordWriter extends RecordWriter<Text,SequencedFragment> implements Configurable
    {
        /** Format used when {@link SealToolParser#OUTPUT_FORMAT_CONF} is not set. */
        protected static final String DEFAULT_OUTPUT_FORMAT = "qseq";

        /** Open writers, indexed by a private copy of the key that selected them. */
        protected HashMap<Text,RecordWriter<Text, SequencedFragment>> outputs;
        protected FileSystem fs;
        /** Default work file (plus the codec extension when compressing); used as a path template. */
        protected Path outputPath;
        protected Configuration conf;
        protected boolean isCompressed;
        /** Only set when {@link #isCompressed} is true. */
        protected CompressionCodec codec;

        protected enum OutputFormatType {
            Qseq,
            Fastq;
        };

        protected OutputFormatType outputFormat;

        /**
         * Sets up the writer from the task configuration.
         *
         * @param task        task attempt whose configuration supplies the
         *                    compression settings and the output format name.
         * @param fs          not used: the file system is resolved from
         *                    <code>defaultFile</code> and the task configuration.
         * @param defaultFile default work file; its parent is the directory under
         *                    which the per-key directories are created and its name
         *                    is the name given to every per-key file.
         * @throws IOException if the file system cannot be resolved.
         * @throws IllegalArgumentException if the configured output format is
         *                    neither "qseq" nor "fastq" (case-insensitive).
         */
        public DemuxMultiFileLineRecordWriter(TaskAttemptContext task, FileSystem fs, Path defaultFile) throws IOException
        {
            conf = task.getConfiguration();
            outputPath = defaultFile;
            this.fs = outputPath.getFileSystem(conf);

            isCompressed = FileOutputFormat.getCompressOutput(task);
            if (isCompressed)
            {
                // Gzip is the fallback when compression is on but no codec was configured.
                Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(task, GzipCodec.class);
                codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
                outputPath = outputPath.suffix(codec.getDefaultExtension());
            }

            outputs = new HashMap<Text,RecordWriter<Text,SequencedFragment>>(20);

            // XXX: I don't think there's a better way to pass the desired output format
            // into this object. If we go through the configuration object, we might as
            // well re-use the OUTPUT_FORMAT_CONF property set by the SealToolParser.
            String oformatName = conf.get(SealToolParser.OUTPUT_FORMAT_CONF, DEFAULT_OUTPUT_FORMAT);
            if ("qseq".equalsIgnoreCase(oformatName))
                this.outputFormat = OutputFormatType.Qseq;
            else if ("fastq".equalsIgnoreCase(oformatName))
                this.outputFormat = OutputFormatType.Fastq;
            else
                throw new IllegalArgumentException("Unexpected output format " + oformatName);
        }

        @Override
        public void setConf(Configuration conf)
        {
            this.conf = conf;
        }

        @Override
        public Configuration getConf()
        {
            return conf;
        }

        /**
         * Writes <code>value</code> to the file selected by <code>key</code>.
         * A null <code>value</code> is silently ignored; the key itself is not
         * passed on to the underlying writer.
         *
         * @throws IllegalArgumentException if <code>value</code> is non-null and
         *                                  <code>key</code> is null.
         */
        @Override
        public void write(Text key, SequencedFragment value) throws IOException , InterruptedException
        {
            if (value == null)
                return;

            if (key == null)
                throw new IllegalArgumentException("trying to output a null key. I don't know where to put that.");

            RecordWriter<Text, SequencedFragment> writer = getOutputStream(key);
            writer.write(null, value);
        }

        /**
         * Creates the file at <code>outputPath</code> and wraps it in a record
         * writer of the configured format, compressing if so configured.
         *
         * @param outputPath file to create; creation is requested without overwrite.
         * @return a new record writer that owns the opened stream.
         * @throws IOException if the file cannot be created or wrapped.
         */
        protected RecordWriter<Text, SequencedFragment> makeWriter(Path outputPath) throws IOException
        {
            FSDataOutputStream fileOut = fs.create(outputPath, false);
            DataOutputStream ostream = fileOut;
            boolean handedOff = false;

            try
            {
                if (isCompressed)
                    ostream = new DataOutputStream(codec.createOutputStream(fileOut));

                RecordWriter<Text, SequencedFragment> writer;
                if (outputFormat == OutputFormatType.Qseq)
                    writer = new QseqRecordWriter(conf, ostream);
                else if (outputFormat == OutputFormatType.Fastq)
                    writer = new FastqRecordWriter(conf, ostream);
                else
                    throw new IllegalStateException("BUG! Unexpected outputFormat value " + outputFormat);

                handedOff = true;
                return writer;
            }
            finally
            {
                // If anything failed before a writer took ownership of the stream,
                // nobody else will ever close it: release it here.
                if (!handedOff)
                {
                    try {
                        ostream.close();
                    }
                    catch (IOException ignored) {
                        // the failure that brought us here is the one worth reporting
                    }
                }
            }
        }

        /**
         * Returns the record writer associated with <code>key</code>, creating
         * the per-key directory and file on first use.
         *
         * @param key selects the output directory (its string form is the directory name).
         * @return the cached or newly created writer for <code>key</code>.
         */
        protected RecordWriter<Text, SequencedFragment> getOutputStream(Text key) throws IOException, InterruptedException
        {
            RecordWriter<Text, SequencedFragment> writer = outputs.get(key);
            if (writer == null)
            {
                // create it
                Path dir = new Path(outputPath.getParent(), key.toString());
                Path file = new Path(dir, outputPath.getName());
                if (!fs.exists(dir))
                    fs.mkdirs(dir);
                // now create a new writer that will write to the desired file path
                // (which should not already exist, since we didn't find it in our hash map)
                writer = makeWriter(file);
                // Text is mutable and the caller keeps its own reference (Hadoop code
                // commonly reuses Writable instances), so index the writer under a
                // private copy; otherwise a later change to the caller's object would
                // silently corrupt the map.
                outputs.put(new Text(key), writer);
            }
            return writer;
        }

        /**
         * Closes every writer opened so far. All writers are attempted even if
         * some fail with an {@link IOException}; the first such exception is
         * rethrown once the loop has finished. An {@link InterruptedException}
         * propagates immediately. After an orderly pass the writer map is emptied,
         * so calling this method again does nothing.
         */
        @Override
        public synchronized void close(TaskAttemptContext context) throws IOException, InterruptedException
        {
            IOException firstFailure = null;

            for (RecordWriter<Text, SequencedFragment> out: outputs.values())
            {
                try {
                    out.close(context);
                }
                catch (IOException e) {
                    if (firstFailure == null)
                        firstFailure = e;
                }
            }
            outputs.clear();

            if (firstFailure != null)
                throw firstFailure;
        }
    }

    /**
     * Builds the demultiplexing record writer around the task's default work file.
     */
    @Override
    public RecordWriter<Text,SequencedFragment> getRecordWriter(TaskAttemptContext job) throws IOException
    {
        Path defaultFile = getDefaultWorkFile(job, "");
        FileSystem fs = defaultFile.getFileSystem(job.getConfiguration());
        return new DemuxMultiFileLineRecordWriter(job, fs, defaultFile);
    }
}