/*
* Copyright (C) 2011-2012 CRS4.
*
* This file is part of Seal.
*
* Seal is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option)
* any later version.
*
* Seal is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with Seal. If not, see <http://www.gnu.org/licenses/>.
*/
package it.crs4.seal.prq;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.SealToolRunner;
import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.ContextAdapter;
import it.crs4.seal.common.GroupByLocationComparator;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SequenceId;
import org.seqdoop.hadoop_bam.FastqInputFormat;
import org.seqdoop.hadoop_bam.QseqInputFormat;
import org.seqdoop.hadoop_bam.SequencedFragment;
/**
* Trasform data from qseq or fastq format to prq format. In detail, at the moment it matches
* separate read from the same location in the flowcell (head and tail sections of a
* single DNA fragment) and puts them in a single output record so that they may be
* more easily aligned.
*
* The algorithm is analogous to the one used in the SecondarySort Hadoop example.
* We use Hadoop to sort all read sections by their location on the flow cell and
* read number. We group fragments by only their location on the flow cell, so that
* all reads are presented to the reducer together, and they may be output as a
* single record.
*
*/
public class PairReadsQSeq extends Configured implements Tool
{
public static final String PRQ_CONF_TRADITIONAL_IDS = "seal.prq.traditional_id_style";
/**
* Partition based only on the sequence location.
*/
public static class FirstPartitioner extends Partitioner<SequenceId,Text>
{
@Override
public int getPartition(SequenceId key, Text value, int numPartitions)
{
// clear the sign bit with & Integer.MAX_VALUE instead of calling Math.abs,
// which will return a negative number for Math.abs(Integer.MIN_VALUE).
return (key.getLocation().hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
public static class PrqMapper extends Mapper<Text,SequencedFragment,SequenceId,Text>
{
private PairReadsQSeqMapper impl;
private IMRContext<SequenceId,Text> contextAdapter;
@Override
public void setup(Context context)
{
impl = new PairReadsQSeqMapper();
impl.setMakeTraditionalIds(context.getConfiguration().getBoolean(PairReadsQSeq.PRQ_CONF_TRADITIONAL_IDS, false));
impl.setup();
contextAdapter = new ContextAdapter<SequenceId,Text>(context);
}
@Override
public void map(Text key, SequencedFragment fragment, Context context)
throws IOException, InterruptedException
{
impl.map(key, fragment, contextAdapter);
}
}
public static class PrqReducer extends Reducer<SequenceId,Text,Text,ReadPair>
{
private IMRContext<Text,ReadPair> contextAdapter;
private PairReadsQSeqReducer impl;
@Override
public void setup(Context context)
{
contextAdapter = new ContextAdapter<Text,ReadPair>(context);
impl = new PairReadsQSeqReducer();
impl.setMinBasesThreshold(
context.getConfiguration().getInt(PrqOptionParser.MinBasesThresholdConfigName,
PrqOptionParser.DefaultMinBasesThreshold));
impl.setDropFailedFilter(
context.getConfiguration().getBoolean(PrqOptionParser.DropFailedFilterConfigName,
PrqOptionParser.DropFailedFilterDefault));
impl.setWarnOnlyIfUnpaired(
context.getConfiguration().getBoolean(PrqOptionParser.WarningOnlyIfUnpairedConfigName,
PrqOptionParser.WarningOnlyIfUnpairedDefault));
impl.setNumReadsPerTemplate(
context.getConfiguration().getInt(PrqOptionParser.NumReadsExpectedConfigName,
PrqOptionParser.NumReadsExpectedDefault));
impl.setup(contextAdapter);
}
@Override
public void reduce(SequenceId key, Iterable<Text> values, Context context)
throws IOException, InterruptedException
{
impl.reduce(key, values, contextAdapter);
}
}
@Override
public int run(String[] args) throws Exception
{
Configuration conf = getConf();
// defaults
conf.set(PrqOptionParser.INPUT_FORMAT_CONF, PrqOptionParser.InputFormatDefault);
// parse command line
PrqOptionParser parser = new PrqOptionParser();
parser.parse(conf, args);
Job job = new Job(conf, "PairReadsQSeq " + parser.getInputPaths().get(0));
job.setJarByClass(PairReadsQSeq.class);
job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName("prq")));
job.setMapperClass(PrqMapper.class);
job.setMapOutputKeyClass(SequenceId.class);
job.setMapOutputValueClass(Text.class);
job.setPartitionerClass(FirstPartitioner.class);
job.setGroupingComparatorClass(GroupByLocationComparator.class);
job.setReducerClass(PrqReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ReadPair.class);
for (Path p: parser.getInputPaths())
FileInputFormat.addInputPath(job, p);
FileOutputFormat.setOutputPath(job, parser.getOutputPath());
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
int res = new SealToolRunner().run(new PairReadsQSeq(), args);
System.exit(res);
}
}