// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.prq;
import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.common.WritableMapping;
import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
public class PairReadsQSeqReducer
{
private static final Log LOG = LogFactory.getLog(PairReadsQSeqReducer.class);
private Text outputKey = new Text();
private ReadPair outputValue = new ReadPair();
private ByteBuffer[] sequence;
private ByteBuffer[] quality;
private WritableMapping[] mapping;
private int minBasesThreshold = 0;
private boolean dropFailedFilter = true;
private boolean warnOnlyIfUnpaired = false;
private int nReadsPerTemplate = 2;
private static final byte[] delimByte = { 9 }; // tab character
private static final String delim = "\t";
private static final char UnknownBase = 'N';
private static final int INIT_BUF_SIZE = 150;
public static enum ReadCounters {
NotEnoughBases,
FailedFilter,
Unpaired,
Dropped
}
public void setMinBasesThreshold(int v) { minBasesThreshold = v; }
public void setDropFailedFilter(boolean v) { dropFailedFilter = v; }
public void setWarnOnlyIfUnpaired(boolean v) { warnOnlyIfUnpaired = v; }
public void setNumReadsPerTemplate(int v) {
if (v <= 0)
throw new IllegalArgumentException("number of reads per template must be 0 or 1");
if (v > 2)
throw new UnsupportedOperationException("number of reads per template must be 0 or 1");
nReadsPerTemplate = v;
}
public void setup(IMRContext<Text, ReadPair> context)
{
sequence = new ByteBuffer[nReadsPerTemplate];
quality = new ByteBuffer[nReadsPerTemplate];
mapping = new WritableMapping[nReadsPerTemplate];
for (int i = 0; i < sequence.length; ++i)
{
sequence[i] = ByteBuffer.allocate(INIT_BUF_SIZE);
sequence[i].limit(0).position(0);
quality[i] = ByteBuffer.allocate(INIT_BUF_SIZE);
quality[i].limit(0).position(0);
mapping[i] = new WritableMapping();
}
// create counters with a value of 0.
context.increment(ReadCounters.NotEnoughBases, 0);
context.increment(ReadCounters.FailedFilter, 0);
context.increment(ReadCounters.Dropped, 0);
LOG.debug("Set up prq reducer for templates with " + nReadsPerTemplate + " reads");
}
public void reduce(SequenceId key, Iterable<Text> values, IMRContext<Text,ReadPair> context)
throws IOException, InterruptedException
{
outputKey.set( key.getLocation() );
outputValue.clear();
int nReads = 0;
int nBadReads = 0;
for (Text read: values)
{
++nReads;
if (nReads > nReadsPerTemplate)
throw new RuntimeException("got more than " + nReadsPerTemplate + " reads for sequence key " + key + ". Record: " + read);
int[] fieldsPos = findFields(read);
// filtered read?
// If dropFailedFilter is false it shortcuts the test and sets filterPassed directly to true.
// If it's true then we check whether the field is equal to '1'
boolean filterPassed = !dropFailedFilter || read.getBytes()[fieldsPos[2]] == (byte)'1';
if (!filterPassed)
{
context.increment(ReadCounters.FailedFilter, 1);
++nBadReads;
}
else if (!checkReadQuality(read, fieldsPos))
{
context.increment(ReadCounters.NotEnoughBases, 1);
++nBadReads;
}
// In here we do all the work to prepare the read for output. It will be written to the
// appropriate WritableMapping, which will in turn be inserted into the ReadPair outputValue.
prepMapping(read.getBytes(), fieldsPos, nReads - 1);
}
if (nReads < nReadsPerTemplate)
{
context.increment(ReadCounters.Unpaired, nReads);
String msg = String.format("Too few reads for template! (found %s). Key: %s", nReads, key);
if (warnOnlyIfUnpaired)
LOG.warn(msg);
else
throw new RuntimeException(msg + "\nread: " + outputValue.toString());
}
// nReads can't be > nReadsPerTemplate since that should be caught in the loop above.
// If they're a complete template and they're not all bad write. Unpaired are dropped
if (nReads == nReadsPerTemplate && nBadReads < nReads)
context.write(outputKey, outputValue);
else
context.increment(ReadCounters.Dropped, nReads);
context.progress();
}
/**
* Set a mapping from the mapper-serialized data.
*
* @param data The byte array from the Text object where the data was serialized by the mapper.
* @param fieldPositions Array of positions indexing the byte array, as produced by findFields.
* @param index Index of the Mapping we're creating, whether 0 or 1. We use this to index into
* the sequence, quality and mapping arrays and to set the read number in the WritableMapping.
*
* @post The WritableMapping in mapping[index] will be reset with the data from the input byte
* array data. It will use the ByteBuffers in sequence and quality to store its
* sequence and base quality data. The reset mapping will be set as read 1 or 2 of the
* outputValue ReadPair.
*/
private void prepMapping(byte[] data, int[] fieldPositions, int index)
{
WritableMapping map = mapping[index];
int length = fieldPositions[1] - 1;
ensureCapacity(length);
map.clear();
// ByteBuffer.clear() resets position to 0 and limit to capacity()
sequence[index].clear();
sequence[index].put(data, 0, length).rewind().mark().limit(length);
map.setSequence(sequence[index]);
quality[index].clear();
quality[index].put(data, fieldPositions[1], length).rewind().mark().limit(length);
map.setBaseQualities(quality[index]);
if (index == 0)
{
map.setIsRead1(true);
outputValue.setRead1(map);
}
else if (index == 1)
{
map.setIsRead2(true);
// set both reads as paired
mapping[0].setIsPaired(true);
map.setIsPaired(true);
outputValue.setRead2(map);
}
else
throw new UnsupportedOperationException("working with more than two reads per template is not supported");
}
private void ensureCapacity(int required)
{
int current = sequence[0].capacity();
if (required > current)
{
int newSize = required*2;
// grow
for (int i = 0; i < sequence.length; ++i)
{
ByteBuffer temp;
// allocate and copy sequence first
temp = ByteBuffer.allocate(newSize);
temp.put(sequence[i]).rewind().mark().limit(sequence[i].limit());
sequence[i] = temp;
if (mapping[i].getSequence() != null)
mapping[i].setSequence(sequence[i]);
// repeat for quality
temp = ByteBuffer.allocate(newSize);
temp.put(quality[i]).rewind().mark().limit(quality[i].limit());
quality[i] = temp;
if (mapping[i].getBaseQualities() != null)
mapping[i].setBaseQualities(quality[i]);
}
}
}
// read format:
// read1 <tab> quality1 <tab> filter flag
// field idx 0 1 2
private int[] findFields(Text read)
{
int[] fieldsPos = new int[3];
fieldsPos[0] = 0;
for (int i = 1; i <= 2; ++i)
{
fieldsPos[i] = read.find(delim, fieldsPos[i-1]) + 1; // +1 since we get the position of the delimiter
if (fieldsPos[i] <= 0)
throw new RuntimeException("invalid read/quality format: " + read.toString());
}
int seqLength = fieldsPos[1] - 1;
int qualLength = fieldsPos[2] - fieldsPos[1] - 1;
if (seqLength != qualLength)
throw new RuntimeException("sequence and quality lengths don't match! (got " + seqLength + " and " + qualLength + ")");
return fieldsPos;
}
/**
* Verify whether a read satisfies quality standards.
* For now this method verifies whether the read has at least
* minBasesThreshold known bases (ignoring unknown bases N).
*/
protected boolean checkReadQuality(Text read, int[] fieldsPos)
{
/* The read's delimiter is at the bytes before the second field starts */
int readEnd = fieldsPos[1] - 1;
// The condition is "min number of valid bases". However, we consider
// the inverse condition "max number of unknowns".
// readEnd is also the length of the read fragment
// readEnd - minBasesThreshold gives us the maximum number of unknowns acceptable.
int nAcceptableUnknowns = readEnd - minBasesThreshold;
if (nAcceptableUnknowns < 0) // the fragment is shorter than minBasesThreshold
return false;
int nUnknownBases = 0;
byte[] data = read.getBytes(); // we can work directly in bytes as long as we only have ASCII characters
for (int pos = 0; pos < readEnd; ++pos)
{
if (data[pos] == UnknownBase)
{
++nUnknownBases;
if (nUnknownBases > nAcceptableUnknowns)
return false;
}
}
return true;
}
}