/*
* Copyright (C) 2011-2012 CRS4.
*
* This file is part of Seal.
*
* Seal is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option)
* any later version.
*
* Seal is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with Seal. If not, see <http://www.gnu.org/licenses/>.
*/
package it.crs4.seal.usort;
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.SealToolParser;
import it.crs4.seal.common.SealToolRunner;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.demux.Demux;
import org.seqdoop.hadoop_bam.SequencedFragment;
public class USort extends Configured implements Tool
{
    private static final Log LOG = LogFactory.getLog(USort.class);

    /** Required number of reduce tasks: one partition per (lane, tile, read) triple. */
    public static final int NUM_REDUCE_TASKS = USortPartitioner.EXPECTED_NUM_PARTITIONS;

    /**
     * Partitioner sending each read number from each tile of each lane to its own reducer.
     *
     * In an Illumina flowcell we currently have 768 tiles split over 8 lanes.
     * Though the actual number of these is specified in the xml files generated by
     * the sequencer, they should be pretty constant so, given the nature of this
     * tool, we'll hard code them.
     *
     * We'll create 768 x 2 reduce tasks -- i.e., one per tile per read number.
     *
     * Reads 1 are sent to odd partitions; reads 2 to even partitions.
     * Specifically, read 2 of a tile goes to partition x and read 1 to x+1.
     *
     * Tiles are numbered in blocks of 16:
     *   block 1: 1101, 1102, ..., 1116
     *   block 2: 1201, 1202, ..., 1216
     *   block 3: 1301, 1302, ..., 1316
     *   block 4: 2101, 2102, ..., 2116
     *   block 5: 2201, 2202, ..., 2216
     *   block 6: 2301, 2302, ..., 2316
     *
     * Lanes range 1-8.  Each lane contains 16*6 = 96 tiles and, since every tile
     * takes two partitions (one per read number), spans 192 consecutive partitions.
     *
     * Within each tile there are many reads 1 and 2.
     */
    public static class USortPartitioner extends Partitioner<SequenceId,SequencedFragment>
    {
        public static final int EXPECTED_NUM_PARTITIONS = 768 * 2;

        protected static final int NUM_LANES = 8;
        protected static final int BLOCKS_PER_LANE = 6;
        protected static final int TILES_PER_BLOCK = 16;
        protected static final int TILES_PER_LANE = BLOCKS_PER_LANE * TILES_PER_BLOCK;
        // Each tile occupies two partitions (one per read number), so a lane
        // covers twice as many partitions as it has tiles.
        protected static final int PARTITIONS_PER_LANE = 2 * TILES_PER_LANE;

        // flow cell composition
        protected static final int NUM_SURFACES = 2;
        protected static final int NUM_SWATHS = 3; // per surface
        protected static final int NUM_TILES = 16; // per swath

        // Maps a tile id (e.g., 2103) to its even, within-lane partition offset.
        protected static HashMap<Integer, Integer> Lookup;

        static {
            // Fill the look-up table that maps a tile id to a partition number.  We
            // multiply the partition number by 2, thus leaving a gap between
            // consecutive tiles.  We'll stick the read-1 records in those gaps; i.e.,
            //
            //   tile 1101 -> element 0
            //   read 2 from tile 1101 will go to partition 0
            //   read 1 from tile 1101 will go to partition 0+1
            Lookup = new HashMap<Integer, Integer>(EXPECTED_NUM_PARTITIONS);
            int elementNumber = 0;
            for (int surface = 1; surface <= NUM_SURFACES; ++surface) {
                for (int swath = 1; swath <= NUM_SWATHS; ++swath) {
                    for (int tile = 1; tile <= NUM_TILES; ++tile) {
                        Lookup.put(surface*1000 + swath*100 + tile, 2*elementNumber);
                        elementNumber += 1;
                    }
                }
            }
        }

        /**
         * Compute the partition for a read from its lane, tile and read number.
         *
         * @throws RuntimeException if the job isn't running with exactly
         *         EXPECTED_NUM_PARTITIONS reducers, or if the record carries a
         *         lane, tile or read number outside the hard-coded flowcell layout.
         */
        @Override
        public int getPartition(SequenceId key, SequencedFragment read, int numPartitions)
        {
            if (numPartitions != EXPECTED_NUM_PARTITIONS)
                throw new RuntimeException("Expecting " + EXPECTED_NUM_PARTITIONS + " reduce tasks but we have " + numPartitions);

            Integer lane = read.getLane();
            Integer tile = read.getTile();
            Integer read_num = read.getRead();

            if (lane == null || lane < 1 || lane > NUM_LANES)
                throw new RuntimeException("Invalid lane number '" + lane + "' in partitioner!");
            // Also reject tile ids outside the hard-coded layout; without this
            // check Lookup.get(tile) would return null and throw an obscure
            // NullPointerException when auto-unboxed below.
            if (tile == null || !Lookup.containsKey(tile))
                throw new RuntimeException("Invalid tile number '" + tile + "' in partitioner!");
            if (read_num == null || read_num < 1 || read_num > 2)
                throw new RuntimeException("Invalid read number '" + read_num + "' in partitioner!");

            // The lane stride is PARTITIONS_PER_LANE (192), not TILES_PER_LANE (96):
            // each tile takes two partitions, so using the tile count here would make
            // tiles from different lanes collide on the same partition and leave the
            // upper half of the partition space unused.
            int partition = (lane - 1) * PARTITIONS_PER_LANE + Lookup.get(tile);
            if (read_num == 1)
                partition += 1; // read 1 goes in the odd slot above its tile's even base

            if (partition >= EXPECTED_NUM_PARTITIONS)
                throw new RuntimeException("BUG! Calculated partition index " + partition + " but the maximum expected is " + (EXPECTED_NUM_PARTITIONS - 1));
            return partition;
        }
    }

    /**
     * Identity reducer: emits each fragment unchanged.
     *
     * The key is written as null so the output format emits only the record
     * itself; the grouping/ordering work was done by the partitioner and the
     * framework's sort.
     */
    public static class Reduce extends Reducer<SequenceId,SequencedFragment,Text,SequencedFragment>
    {
        @Override
        public void reduce(SequenceId key, Iterable<SequencedFragment> values, Context context)
            throws IOException, InterruptedException
        {
            for (SequencedFragment f: values)
                context.write(null, f);
        }
    }

    /**
     * Configure and run the usort job.
     *
     * @param args command line arguments, parsed by USortOptionParser.
     * @return 0 on success, 1 if the job failed.
     */
    @Override
    public int run(String[] args) throws Exception
    {
        Configuration conf = getConf();

        // defaults
        conf.set(SealToolParser.INPUT_FORMAT_CONF, USortOptionParser.InputFormatDefault);
        conf.set(SealToolParser.OUTPUT_FORMAT_CONF, USortOptionParser.OutputFormatDefault);

        // parse command line
        USortOptionParser parser = new USortOptionParser();
        parser.parse(conf, args);

        Job job = new Job(conf, "USort " + parser.getInputPaths().get(0));
        job.setJarByClass(USort.class);

        job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
        job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName()));

        job.setMapperClass(Demux.Map.class);
        job.setMapOutputKeyClass(SequenceId.class);
        job.setMapOutputValueClass(SequencedFragment.class);

        job.setPartitionerClass(USortPartitioner.class);
        // USortPartitioner refuses to run with any other number of reducers,
        // so set it explicitly rather than relying on the cluster default.
        job.setNumReduceTasks(NUM_REDUCE_TASKS);

        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SequencedFragment.class);

        for (Path p: parser.getInputPaths())
            FileInputFormat.addInputPath(job, p);
        FileOutputFormat.setOutputPath(job, parser.getOutputPath());

        boolean result = job.waitForCompletion(true);
        if (!result) {
            LOG.fatal(this.getClass().getName() + " failed!");
            return 1;
        }
        else
            return 0;
    }

    /** Tool entry point; exits with the job's return code. */
    public static void main(String[] args) throws Exception {
        int res = new SealToolRunner().run(new USort(), args);
        System.exit(res);
    }
}