// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.demux;
import it.crs4.seal.common.ClusterUtils;
import it.crs4.seal.common.ContextAdapter;
import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.GroupByLocationComparator;
import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.SealToolRunner;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.common.SequenceIdLocationPartitioner;
import it.crs4.seal.demux.TwoOneThreeSortComparator;

import org.seqdoop.hadoop_bam.QseqInputFormat;
import org.seqdoop.hadoop_bam.SequencedFragment;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
/**
 * Hadoop MapReduce driver that demultiplexes sequenced reads into
 * per-sample output directories according to a sample sheet.
 *
 * <p>The sample sheet is shipped to every task via the DistributedCache
 * (symlinked into the task working directory as {@code sample_sheet.csv}).
 * The mapper emits reads keyed by {@link SequenceId}; a custom partitioner,
 * grouping comparator and sort comparator arrange reads from the same
 * cluster location together before they reach {@link DemuxReducer}, which
 * writes each read under its sample's directory via DemuxOutputFormat.
 */
public class Demux extends Configured implements Tool
{
    private static final Log LOG = LogFactory.getLog(Demux.class);

    /** Name under which the sample sheet is symlinked into each task's working directory. */
    private static final String LocalSampleSheetName = "sample_sheet.csv";

    /** Number of lanes per flow cell; one LaneContent line is written per lane. */
    private static final int NumLanes = 8;

    public static final String DEFAULT_PROJECT = "DefaultProject";
    public static final int DEFAULT_MAX_MISMATCHES = 0;
    public static final String CONF_MAX_MISMATCHES = "seal.demux.max-mismatches";
    public static final String CONF_NO_INDEX_READS = "seal.demux.no-index";
    public static final String CONF_SEPARATE_READS = "seal.demux.separate-reads";

    /**
     * Hadoop Mapper adapter: delegates all work to {@link DemuxMapper}
     * through the framework-independent {@link IMRContext} abstraction.
     */
    public static class Map extends Mapper<Text, SequencedFragment, SequenceId, SequencedFragment>
    {
        private DemuxMapper impl;
        private IMRContext<SequenceId,SequencedFragment> contextAdapter;

        @Override
        public void setup(Context context)
        {
            impl = new DemuxMapper();
            contextAdapter = new ContextAdapter<SequenceId,SequencedFragment>(context);
        }

        @Override
        public void map(Text qseqKey, SequencedFragment seq, Context context) throws java.io.IOException, InterruptedException
        {
            impl.map(qseqKey, seq, contextAdapter);
        }
    }

    /**
     * Hadoop Reducer adapter: loads the sample sheet from the task's working
     * directory (placed there by the DistributedCache symlink created in
     * {@code distributeSampleSheet}) and delegates to {@link DemuxReducer}.
     */
    public static class Red extends Reducer<SequenceId, SequencedFragment, Text, SequencedFragment>
    {
        private DemuxReducer impl;
        private IMRContext<Text,SequencedFragment> contextAdapter;

        @Override
        public void setup(Context context) throws IOException
        {
            impl = new DemuxReducer();
            impl.setup(new java.io.File(".", LocalSampleSheetName).getCanonicalPath(), context.getConfiguration());
            contextAdapter = new ContextAdapter<Text,SequencedFragment>(context);
            LOG.info("DemuxReducer setup. Sample sheet loaded");
        }

        @Override
        public void reduce(SequenceId key, Iterable<SequencedFragment> values, Context context) throws IOException, InterruptedException
        {
            impl.reduce(key, values, contextAdapter);
        }
    }

    /** Builds a human-readable job name from the first input path. */
    private String makeJobName(Path firstInputPath)
    {
        // TODO: if the path is too long look at some smart way to trim the name
        return "Demux " + firstInputPath.toString();
    }

    /**
     * Registers the sample sheet with the DistributedCache so that each task
     * finds it as {@code sample_sheet.csv} in its working directory.
     * Must be called before the Job is constructed, because Job copies the
     * Configuration it is given.
     *
     * @param sampleSheetPath path to the sample sheet on a job-visible filesystem
     * @throws java.net.URISyntaxException if the path cannot form a valid cache URI
     */
    private void distributeSampleSheet(Path sampleSheetPath) throws java.net.URISyntaxException
    {
        Configuration conf = getConf();
        DistributedCache.createSymlink(conf); // create symlinks in each task's working directory for the distributed files
        // The "#name" fragment names the symlink the task will see.
        String distPath = sampleSheetPath.toString() + "#" + LocalSampleSheetName;
        DistributedCache.addCacheFile(new URI(distPath), conf);
    }

    /**
     * Opens a buffered writer for {@code LaneContent.<sampleName>} under
     * outputPath, overwriting any existing file.
     */
    private Writer makeLaneContentWriter(Path outputPath, String sampleName) throws IOException
    {
        Path destPath = new Path(outputPath, "LaneContent." + sampleName);
        FileSystem destFs = destPath.getFileSystem(getConf());
        OutputStream rawOut = destFs.create(destPath, true); // create and overwrite if it exists
        // Use an explicit charset rather than the platform default, so the
        // LaneContent files are byte-identical regardless of the host locale.
        return new BufferedWriter(new OutputStreamWriter(rawOut, StandardCharsets.UTF_8));
    }

    /**
     * Writes one {@code LaneContent.<sample>} file per sample found in the
     * sample sheet.  Each file has one line per lane of the form
     * {@code <lane-1>:}, followed by the sample name when that sample appears
     * in the lane.
     *
     * @throws RuntimeException (with the original exception as cause) if the
     *         sample sheet is malformed
     */
    private void createLaneContentFiles(Path outputPath, Path sampleSheetPath) throws IOException
    {
        StringBuilder builder = new StringBuilder(100);
        try
        {
            Path qualifiedPath = sampleSheetPath.makeQualified(sampleSheetPath.getFileSystem(getConf()));
            SampleSheet sheet = DemuxUtils.loadSampleSheet(qualifiedPath, getConf());
            Collection<String> samples = sheet.getSamples();
            // we have one output directory per sample, thus we need one LaneContent file per sample.
            for (String sample: samples)
            {
                Writer out = makeLaneContentWriter(outputPath, sample);
                try
                {
                    for (int lane = 1; lane <= NumLanes; ++lane)
                    {
                        builder.setLength(0); // reuse the buffer across iterations
                        builder.append(lane - 1).append(":");
                        if (sheet.getSamplesInLane(lane).contains(sample))
                            builder.append(sample);
                        builder.append("\n");
                        out.write(builder.toString());
                    }
                }
                finally {
                    out.close();
                }
            }
        }
        catch (SampleSheet.FormatException e) {
            // Preserve the original exception as the cause instead of
            // discarding its stack trace and keeping only the message.
            throw new RuntimeException("Error in sample sheet. " + e.getMessage(), e);
        }
    }

    /**
     * Parses the command line, configures and submits the demux job, and —
     * on success — optionally writes the LaneContent files.
     *
     * @return 0 if the job completes successfully, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception
    {
        LOG.info("starting");

        Configuration conf = getConf();
        DemuxOptionParser parser = new DemuxOptionParser();
        parser.parse(conf, args);

        conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
        conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

        LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
        if (parser.getNoIndexReads())
            LOG.info("Not expecting to find any index reads. Will demultiplex based only on lane.");

        // load sample sheet to fail early in case of problems
        DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

        // must be called before creating the job, since the job
        // *copies* the Configuration.
        distributeSampleSheet(parser.getSampleSheetPath());

        // Create a Job using the processed conf
        Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));
        job.setJarByClass(Demux.class);

        // input paths
        for (Path p: parser.getInputPaths())
            FileInputFormat.addInputPath(job, p);
        job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(SequenceId.class);
        job.setMapOutputValueClass(SequencedFragment.class);

        // Secondary-sort plumbing: partition and group by cluster location,
        // with reads within a group ordered by TwoOneThreeSortComparator
        // (presumably so the index read arrives first — see that class).
        job.setPartitionerClass(SequenceIdLocationPartitioner.class);
        job.setGroupingComparatorClass(GroupByLocationComparator.class);
        job.setSortComparatorClass(TwoOneThreeSortComparator.class);

        job.setReducerClass(Red.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SequencedFragment.class);

        // output
        job.setOutputFormatClass(DemuxOutputFormat.class);
        FileOutputFormat.setOutputPath(job, parser.getOutputPath());

        // Submit the job, then poll for progress until the job is complete
        boolean result = job.waitForCompletion(true);
        if (result)
        {
            LOG.info("done");
            if (parser.getCreateLaneContent())
                createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
            return 0;
        }
        else
        {
            LOG.fatal(this.getClass().getName() + " failed!");
            return 1;
        }
    }

    /** Command-line entry point; delegates to {@link SealToolRunner}. */
    public static void main(String[] args) throws Exception {
        int res = new SealToolRunner().run(new Demux(), args);
        System.exit(res);
    }
}