PairReadsQSeqReducer.java example

Explorer

seal-master
- src
  - it
    - crs4
      - seal
        common
        AbstractSamMapping.java
        AbstractTaggedMapping.java
        AlignFlags.java
        AlignOp.java
        BamInputFormat.java
        BwaRefAnnotation.java
        ClusterUtils.java
        ConfigFileParser.java
        ContextAdapter.java
        CutString.java
        CutText.java
        FormatException.java
        FormatNameMap.java
        GroupByLocationComparator.java
        IMRContext.java
        LineReader.java
        MdOp.java
        OutputStreamFactory.java
        ReadPair.java
        SamInputFormat.java
        SealToolParser.java
        SealToolRunner.java
        SequenceId.java
        SequenceIdLocationPartitioner.java
        TestContext.java
        TextSamMapping.java
        UnknownItemException.java
        Utils.java
        WritableMapping.java
        demux
        BarcodeLookup.java
        Demux.java
        DemuxMapper.java
        DemuxOptionParser.java
        DemuxOutputFormat.java
        DemuxReducer.java
        DemuxUtils.java
        SampleSheet.java
        TwoOneThreeSortComparator.java
        prq
        PairReadsQSeq.java
        PairReadsQSeqMapper.java
        PairReadsQSeqReducer.java
        PrqOptionParser.java
        PrqOutputFormat.java
        read_sort
        FastaChecksummer.java
        MergeAlignments.java
        ReadSort.java
        ReadSortOptionParser.java
        recab
        ArrayListVariantTable.java
        ArrayVariantTable.java
        BytePacking.java
        Covariate.java
        CycleCovariate.java
        DinucCovariate.java
        HashSetVariantTable.java
        ObservationCount.java
        PackedSequence.java
        QualityCovariate.java
        ReadGroupCovariate.java
        RecabTable.java
        RecabTableCombiner.java
        RecabTableMapper.java
        RecabTableOptionParser.java
        RecabTableReducer.java
        RodFileVariantReader.java
        VariantReader.java
        VariantRegion.java
        VariantTable.java
        VcfVariantReader.java
        tsv_sort
        TextSampler.java
        TextValueOutputFormat.java
        TsvInputFormat.java
        TsvSort.java
        TsvSortOptionParser.java
        usort
        USort.java
        USortOptionParser.java
- tests
  - it
    - crs4
      - seal
        common
        TestAbstractSamMapping.java
        TestAlignFlags.java
        TestAlignOp.java
        TestBamInputFormat.java
        TestBwaRefAnnotation.java
        TestConfigFileParser.java
        TestCutString.java
        TestCutText.java
        TestGroupByLocationComparator.java
        TestLineReader.java
        TestMdOp.java
        TestSamInputFormat.java
        TestSealToolParser.java
        TestSequenceId.java
        TestSequenceIdLocationPartitioner.java
        TestTextSamMapping.java
        TestUtils.java
        TestWritableMapping.java
        demux
        TestBarcodeLookup.java
        TestDemuxMapper.java
        TestDemuxReducer.java
        TestSampleSheet.java
        TestTwoOneThreeSortComparator.java
        prq
        TestPairReadsQseqMapper.java
        TestPairReadsQseqReducer.java
        TestPrqOutputFormat.java
        read_sort
        TestFastaChecksummer.java
        recab
        BenchmarkVariantTable.java
        TestArrayListVariantTable.java
        TestArrayVariantTable.java
        TestBytePacking.java
        TestCycleCovariate.java
        TestDinucCovariate.java
        TestHashSetVariantTable.java
        TestObservationCount.java
        TestPackedSequence.java
        TestQualityCovariate.java
        TestReadGroupCovariate.java
        TestRecabTableCombiner.java
        TestRecabTableMapper.java
        TestRecabTableReducer.java
        TestRodFileVariantReader.java
        TestVcfVariantReader.java

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.prq;

import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.common.WritableMapping;

import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.IOException;
import java.nio.ByteBuffer;

public class PairReadsQSeqReducer
{
	private static final Log LOG = LogFactory.getLog(PairReadsQSeqReducer.class);

	private Text     outputKey   = new Text();
	private ReadPair outputValue = new ReadPair();

	private ByteBuffer[]      sequence;
	private ByteBuffer[]      quality;
	private WritableMapping[] mapping;

	private int     minBasesThreshold  = 0;
	private boolean dropFailedFilter   = true;
	private boolean warnOnlyIfUnpaired = false;
	private int     nReadsPerTemplate  = 2;

	private static final byte[] delimByte     = { 9 }; // tab character
	private static final String delim         = "\t";
	private static final char   UnknownBase   = 'N';
	private static final int    INIT_BUF_SIZE = 150;

	public static enum ReadCounters {
		NotEnoughBases,
		FailedFilter,
		Unpaired,
		Dropped
	}

	public void setMinBasesThreshold(int v) { minBasesThreshold = v; }
	public void setDropFailedFilter(boolean v) { dropFailedFilter = v; }
	public void setWarnOnlyIfUnpaired(boolean v) { warnOnlyIfUnpaired = v; }
	public void setNumReadsPerTemplate(int v) {
		if (v <= 0)
			throw new IllegalArgumentException("number of reads per template must be 0 or 1");
		if (v > 2)
			throw new UnsupportedOperationException("number of reads per template must be 0 or 1");
		nReadsPerTemplate = v;
	}

	public void setup(IMRContext<Text, ReadPair> context)
	{
		sequence = new ByteBuffer[nReadsPerTemplate];
		quality  = new ByteBuffer[nReadsPerTemplate];
		mapping  = new WritableMapping[nReadsPerTemplate];

		for (int i = 0; i < sequence.length; ++i)
		{
			sequence[i] = ByteBuffer.allocate(INIT_BUF_SIZE);
			sequence[i].limit(0).position(0);
			quality[i] = ByteBuffer.allocate(INIT_BUF_SIZE);
			quality[i].limit(0).position(0);
			mapping[i] = new WritableMapping();
		}

		// create counters with a value of 0.
		context.increment(ReadCounters.NotEnoughBases, 0);
		context.increment(ReadCounters.FailedFilter, 0);
		context.increment(ReadCounters.Dropped, 0);

		LOG.debug("Set up prq reducer for templates with " + nReadsPerTemplate + " reads");
	}

	public void reduce(SequenceId key, Iterable<Text> values, IMRContext<Text,ReadPair> context)
		throws IOException, InterruptedException
	{
		outputKey.set( key.getLocation() );
		outputValue.clear();

		int nReads = 0;
		int nBadReads = 0;
		for (Text read: values)
		{
			++nReads;
			if (nReads > nReadsPerTemplate)
				throw new RuntimeException("got more than " + nReadsPerTemplate + " reads for sequence key " + key + ". Record: " + read);

			int[] fieldsPos = findFields(read);
			// filtered read?
			// If dropFailedFilter is false it shortcuts the test and sets filterPassed directly to true.
			// If it's true then we check whether the field is equal to '1'
			boolean filterPassed = !dropFailedFilter || read.getBytes()[fieldsPos[2]] == (byte)'1';

			if (!filterPassed)
			{
				context.increment(ReadCounters.FailedFilter, 1);
				++nBadReads;
			}
			else if (!checkReadQuality(read, fieldsPos))
			{
				context.increment(ReadCounters.NotEnoughBases, 1);
				++nBadReads;
			}

			// In here we do all the work to prepare the read for output.  It will be written to the
			// appropriate WritableMapping, which will in turn be inserted into the ReadPair outputValue.
			prepMapping(read.getBytes(), fieldsPos, nReads - 1);
		}

		if (nReads < nReadsPerTemplate)
		{
			context.increment(ReadCounters.Unpaired, nReads);
			String msg = String.format("Too few reads for template! (found %s). Key: %s", nReads, key);
			if (warnOnlyIfUnpaired)
				LOG.warn(msg);
			else
				throw new RuntimeException(msg + "\nread: " + outputValue.toString());
		}
		// nReads can't be > nReadsPerTemplate since that should be caught in the loop above.

		// If they're a complete template and they're not all bad write. Unpaired are dropped
		if (nReads == nReadsPerTemplate && nBadReads < nReads)
			context.write(outputKey, outputValue);
		else
			context.increment(ReadCounters.Dropped, nReads);

		context.progress();
	}

	/**
	 * Set a mapping from the mapper-serialized data.
	 *
	 * @param data The byte array from the Text object where the data was serialized by the mapper.
	 * @param fieldPositions Array of positions indexing the byte array, as produced by findFields.
	 * @param index Index of the Mapping we're creating, whether 0 or 1.  We use this to index into
	 * the sequence, quality and mapping arrays and to set the read number in the WritableMapping.
	 *
	 * @post The WritableMapping in mapping[index] will be reset with the data from the input byte
	 * array data.  It will use the ByteBuffers in sequence and quality to store its
	 * sequence and base quality data.  The reset mapping will be set as read 1 or 2 of the
	 * outputValue ReadPair.
	 */
	private void prepMapping(byte[] data, int[] fieldPositions, int index)
	{
		WritableMapping map = mapping[index];
		int length = fieldPositions[1] - 1;
		ensureCapacity(length);

		map.clear();

		// ByteBuffer.clear() resets position to 0 and limit to capacity()
		sequence[index].clear();
		sequence[index].put(data, 0, length).rewind().mark().limit(length);
		map.setSequence(sequence[index]);

		quality[index].clear();
		quality[index].put(data, fieldPositions[1], length).rewind().mark().limit(length);
		map.setBaseQualities(quality[index]);

		if (index == 0)
		{
			map.setIsRead1(true);
			outputValue.setRead1(map);
		}
		else if (index == 1)
		{
			map.setIsRead2(true);
			// set both reads as paired
			mapping[0].setIsPaired(true);
			map.setIsPaired(true);
			outputValue.setRead2(map);
		}
		else
			throw new UnsupportedOperationException("working with more than two reads per template is not supported");
	}

	private void ensureCapacity(int required)
	{
		int current = sequence[0].capacity();
		if (required > current)
		{
			int newSize = required*2;
			// grow
			for (int i = 0; i < sequence.length; ++i)
			{
				ByteBuffer temp;
				// allocate and copy sequence first
				temp = ByteBuffer.allocate(newSize);
				temp.put(sequence[i]).rewind().mark().limit(sequence[i].limit());
				sequence[i] = temp;
				if (mapping[i].getSequence() != null)
					mapping[i].setSequence(sequence[i]);

				// repeat for quality
				temp = ByteBuffer.allocate(newSize);
				temp.put(quality[i]).rewind().mark().limit(quality[i].limit());
				quality[i] = temp;
				if (mapping[i].getBaseQualities() != null)
					mapping[i].setBaseQualities(quality[i]);
			}
		}
	}

	// read format:
	//             read1 <tab> quality1 <tab> filter flag
	// field idx   0           1              2
	private int[] findFields(Text read)
	{
		int[] fieldsPos = new int[3];
		fieldsPos[0] = 0;

		for (int i = 1; i <= 2; ++i)
		{
			fieldsPos[i] = read.find(delim, fieldsPos[i-1]) + 1; // +1 since we get the position of the delimiter
			if (fieldsPos[i] <= 0)
				throw new RuntimeException("invalid read/quality format: " + read.toString());
		}

		int seqLength = fieldsPos[1] - 1;
		int qualLength = fieldsPos[2] - fieldsPos[1] - 1;
		if (seqLength != qualLength)
			throw new RuntimeException("sequence and quality lengths don't match! (got " + seqLength + " and " + qualLength + ")");

		return fieldsPos;
	}

	/**
	 * Verify whether a read satisfies quality standards.
	 * For now this method verifies whether the read has at least
	 * minBasesThreshold known bases (ignoring unknown bases N).
	 */
	protected boolean checkReadQuality(Text read, int[] fieldsPos)
	{
		/* The read's delimiter is at the bytes before the second field starts */
		int readEnd = fieldsPos[1] - 1;

		// The condition is "min number of valid bases".  However, we consider
		// the inverse condition "max number of unknowns".
		// readEnd is also the length of the read fragment
		// readEnd - minBasesThreshold gives us the maximum number of unknowns acceptable.
		int nAcceptableUnknowns = readEnd - minBasesThreshold;

		if (nAcceptableUnknowns < 0) // the fragment is shorter than minBasesThreshold
			return false;

		int nUnknownBases = 0;
		byte[] data = read.getBytes(); // we can work directly in bytes as long as we only have ASCII characters
		for (int pos = 0; pos < readEnd; ++pos)
		{
			if (data[pos] == UnknownBase)
			{
				++nUnknownBases;
				if (nUnknownBases > nAcceptableUnknowns)
					return false;
			}
		}
		return true;
	}
}