// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.demux;
import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.common.Utils;
import org.seqdoop.hadoop_bam.SequencedFragment;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * Reduce-side logic of the demultiplexer.
 *
 * For each sequencing location it receives the group of fragments read at
 * that location, optionally consumes the index (barcode) read, looks the
 * barcode up in the sample sheet and writes every data read under a key of
 * the form <code>project/sample</code> (or <code>project/sample/N</code>,
 * where N is the read number, when reads are being separated).
 *
 * {@link #setup} must be called before {@link #reduce}.
 */
public class DemuxReducer
{
    private static final Log LOG = LogFactory.getLog(DemuxReducer.class);

    // Suffix appended to the output key when reads are separated; the 'X' is a
    // placeholder that is overwritten with the read number for each fragment.
    private static final byte[] SLASH_X = "/X".getBytes();

    private BarcodeLookup barcodeLookup;
    // Reused across calls to reduce() to avoid allocating a new key per group.
    private Text outputKey = new Text();
    private boolean expectIndexRead = true;
    private boolean separatesReads = false;

    /**
     * Loads the sample sheet and reads the reducer's settings from the configuration.
     *
     * @param localSampleSheetPath path of the sample sheet on the local file system
     * @param conf configuration providing Demux.CONF_MAX_MISMATCHES,
     *             Demux.CONF_NO_INDEX_READS and Demux.CONF_SEPARATE_READS
     * @throws IOException propagated from the file system or the sample sheet loader
     * @throws RuntimeException if the sample sheet is malformed; the original
     *             SampleSheet.FormatException is attached as the cause
     */
    public void setup(String localSampleSheetPath, Configuration conf) throws IOException
    {
        // load the sample sheet
        Path path = new Path(localSampleSheetPath).makeQualified(FileSystem.getLocal(conf));
        SampleSheet sampleSheet;
        try {
            sampleSheet = DemuxUtils.loadSampleSheet(path, conf);
        }
        catch (SampleSheet.FormatException e) {
            // Keep the cause so the stack trace of the parsing failure is not lost.
            throw new RuntimeException("Error loading sample sheet. Message: " + e.getMessage(), e);
        }

        barcodeLookup = new BarcodeLookup(sampleSheet, conf.getInt(Demux.CONF_MAX_MISMATCHES, Demux.DEFAULT_MAX_MISMATCHES));
        expectIndexRead = !conf.getBoolean(Demux.CONF_NO_INDEX_READS, false);
        separatesReads = conf.getBoolean(Demux.CONF_SEPARATE_READS, false);
    }

    /**
     * Demultiplexes the fragments sequenced at one location.
     *
     * Fragments should all have non-null Read and Lane, as verified by the
     * Mapper. They should be ordered read 2, read 1, read 3 and over. When an
     * index read is expected, read 2 is consumed as the barcode and the read
     * numbers of the reads above 2 are decremented by one before being written.
     *
     * @param key location shared by all the fragments in the group
     * @param sequences fragments read at that location
     * @param context destination for output records and counters
     * @throws NoSuchElementException if the group is empty, or contains only the index read
     * @throws RuntimeException if the index read is missing, the barcode has an
     *             unexpected length, or the last output read number is greater than 2
     */
    public void reduce(SequenceId key, Iterable<SequencedFragment> sequences, IMRContext<Text,SequencedFragment> context) throws IOException, InterruptedException
    {
        Iterator<SequencedFragment> seqs_it = sequences.iterator();

        String flowcellId = "";
        String indexSeq = ""; // default index is blank
        String sampleId;
        String project;

        SequencedFragment fragment = nextFragment(seqs_it, key, "reads");

        if (expectIndexRead)
        {
            // The first fragment of the group should be the index sequence
            if (fragment.getRead() != 2)
                throw new RuntimeException("Missing read 2 in multiplexed input for location " + key.getLocation() + ".  Record: " + fragment);

            indexSeq = fragment.getSequence().toString();
            checkIndexLength(indexSeq);

            // We've consumed the index read.  Advance to the first data read.
            fragment = nextFragment(seqs_it, key, "data reads after the index read");
        }

        // From here on, they should be all data reads.
        int lane = fragment.getLane();
        BarcodeLookup.Match m = barcodeLookup.getSampleId(lane, indexSeq);
        if (m == null)
        {
            // No sample sheet entry matches this lane/barcode pair.
            sampleId = "unknown";
            project = ".";
        }
        else
        {
            sampleId = m.getEntry().getSampleId();
            flowcellId = m.getEntry().getFlowcellId();
            project = m.getEntry().getProject();
            if (project == null)
                project = Demux.DEFAULT_PROJECT;
            context.increment("Barcode base mismatches", String.valueOf(m.getMismatches()), 1);
        }

        // Project/sample results in that directory structure. The key is the same for all reads in iterator
        // TODO: profile!  We're sanitizing and rebuilding the file name for
        // each set of reads.  It may be a significant waste of CPU that could be fixed by a caching mechanism.
        String keyString = Utils.sanitizeFilename(project) + '/' + Utils.sanitizeFilename(sampleId);
        outputKey.set(keyString);
        if (separatesReads) {
            // append a slash and an 'X' (the latter to make a space for the read number)
            outputKey.append(SLASH_X, 0, SLASH_X.length);
        }

        boolean done = false;
        do {
            fragment.setIndexSequence(indexSeq);

            // When we read qseq, the flowcell id isn't set (the file format doesn't include that data).
            // Since we have the chance here, we'll extract the flowcell id from the sample sheet
            // and set it on the outgoing SequencedFragment.
            if (fragment.getFlowcellId() == null)
                fragment.setFlowcellId(flowcellId);

            // The index read occupied read number 2, so shift the later reads down by one.
            if (expectIndexRead && fragment.getRead() > 2)
                fragment.setRead( fragment.getRead() - 1);

            if (separatesReads)
            {
                // Overwrite the last character of the key with the read number.
                // This technique only supports single digit read numbers
                outputKey.getBytes()[outputKey.getLength() - 1] = (byte)(fragment.getRead().byteValue() + '0');
            }

            context.write(outputKey, fragment);
            context.increment("Sample reads", keyString, 1);

            if (seqs_it.hasNext())
                fragment = seqs_it.next();
            else
                done = true;
        } while (!done);

        // At this point 'fragment' is the last fragment that was written.
        if (fragment.getRead() > 2)
        {   // although the code above is generic and will handle any number of reads,
            // in our current use cases any more than 2 data reads (non-index) indicate
            // a problem with the data.
            // XXX: if someone removes this check, verify the "separatesReads" section above.
            throw new RuntimeException("Unexpected output read number " + fragment.getRead() +
                " at location " + key.getLocation() +
                " (note that if read number may have been decremented by 1 if an index sequence was present).");
        }
    }

    /**
     * Returns the next fragment of the group, failing with a message that names the location when there is none.
     */
    private static SequencedFragment nextFragment(Iterator<SequencedFragment> it, SequenceId key, String expected)
    {
        if (!it.hasNext())
            throw new NoSuchElementException("Missing " + expected + " for location " + key.getLocation());
        return it.next();
    }

    /**
     * Verifies that the sequenced barcode has an acceptable length.
     *
     * Sequenced tags have an additional 'A' base that separates them from the
     * read.  For this reason we check against BAR_CODE_{MIN,MAX}_LENGTH + 1.
     */
    private static void checkIndexLength(String indexSeq)
    {
        if ( indexSeq.length() < (SampleSheet.BAR_CODE_MIN_LENGTH + 1)
          || indexSeq.length() > (SampleSheet.BAR_CODE_MAX_LENGTH + 1) )
        {
            throw new RuntimeException(
                String.format("Unexpected barcode sequence of length %d (expected in interval [%d, %d])",
                    indexSeq.length(), (SampleSheet.BAR_CODE_MIN_LENGTH + 1), (SampleSheet.BAR_CODE_MAX_LENGTH + 1)));
        }
    }
}