/*
 * Copyright (C) 2011-2012 CRS4.
 *
 * This file is part of Seal.
 *
 * Seal is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * Seal is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Seal. If not, see <http://www.gnu.org/licenses/>.
 */

package it.crs4.seal.usort;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.SealToolParser;
import it.crs4.seal.common.SealToolRunner;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.demux.Demux;

import org.seqdoop.hadoop_bam.SequencedFragment;

public class USort extends Configured implements Tool
{
	private static final Log LOG = LogFactory.getLog(USort.class);

	public static final int NUM_REDUCE_TASKS = USortPartitioner.EXPECTED_NUM_PARTITIONS;

	/**
	 * Partitioner that sends all reads with a given read number from the same tile
	 * to the same reducer.
	 *
	 * In an Illumina flowcell we currently have 768 tiles split over 8 lanes.
	 * Though the actual number of these is specified in the XML files generated by
	 * the sequencer, they should be fairly constant, so, given the nature of this
	 * tool, we hard-code them.
	 *
	 * We create 768 x 2 reduce tasks--i.e., one per tile per read.
	 *
	 * Reads 1 are sent to odd partitions; reads 2 to even partitions.
	 * Specifically, we send them to (x+1) and x.
	 *
	 * Tiles are numbered in blocks of 16:
	 *   block 1: 1101, 1102, ..., 1116
	 *   block 2: 1201, 1202, ..., 1216
	 *   block 3: 1301, 1302, ..., 1316
	 *   block 4: 2101, 2102, ..., 2116
	 *   block 5: 2201, 2202, ..., 2216
	 *   block 6: 2301, 2302, ..., 2316
	 *
	 * Lanes range from 1 to 8. Each lane contains 16*6 = 96 tiles.
	 *
	 * Within each tile there are many read 1 and read 2 sequences.
	 */
	public static class USortPartitioner extends Partitioner<SequenceId,SequencedFragment>
	{
		public static final int EXPECTED_NUM_PARTITIONS = 768 * 2;

		protected static final int NUM_LANES = 8;
		protected static final int BLOCKS_PER_LANE = 6;
		protected static final int TILES_PER_BLOCK = 16;
		protected static final int TILES_PER_LANE = BLOCKS_PER_LANE * TILES_PER_BLOCK;

		// flow cell composition
		protected static final int NUM_SURFACES = 2;
		protected static final int NUM_SWATHS = 3; // per surface
		protected static final int NUM_TILES = 16; // per swath

		protected static HashMap<Integer, Integer> Lookup;

		static {
			// Fill the look-up table that maps a tile id to a partition number.
			// We multiply the partition number by 2, thus leaving a gap between
			// consecutive tiles.
			// We'll stick the odd reads in those gaps; i.e.:
			//
			//   tile 1101 -> element 0
			//   read 2 from tile 1101 will go to partition 0
			//   read 1 from tile 1101 will go to partition 0+1
			Lookup = new HashMap<Integer, Integer>(EXPECTED_NUM_PARTITIONS);
			int elementNumber = 0;
			for (int surface = 1; surface <= NUM_SURFACES; ++surface) {
				for (int swath = 1; swath <= NUM_SWATHS; ++swath) {
					for (int tile = 1; tile <= NUM_TILES; ++tile) {
						Lookup.put(surface*1000 + swath*100 + tile, 2*elementNumber);
						elementNumber += 1;
					}
				}
			}
		}

		@Override
		public int getPartition(SequenceId key, SequencedFragment read, int numPartitions)
		{
			if (numPartitions != EXPECTED_NUM_PARTITIONS)
				throw new RuntimeException("Expecting " + EXPECTED_NUM_PARTITIONS + " reduce tasks but we have " + numPartitions);

			Integer lane = read.getLane();
			Integer tile = read.getTile();
			Integer read_num = read.getRead();

			if (lane == null || lane < 1 || lane > NUM_LANES)
				throw new RuntimeException("Invalid lane number '" + lane + "' in partitioner!");
			if (tile == null)
				throw new RuntimeException("Invalid tile number '" + tile + "' in partitioner!");
			if (read_num == null || read_num < 1 || read_num > 2)
				throw new RuntimeException("Invalid read number '" + read_num + "' in partitioner!");

			// Each lane occupies 2 * TILES_PER_LANE consecutive partitions (a pair per
			// tile); within a pair, read 2 takes the even slot and read 1 the odd one.
			int tileIndex = (lane - 1) * 2 * TILES_PER_LANE + Lookup.get(tile);
			if (read_num == 1)
				tileIndex += 1;

			if (tileIndex >= EXPECTED_NUM_PARTITIONS)
				throw new RuntimeException("BUG! Calculated partition index " + tileIndex + " but the maximum expected is " + EXPECTED_NUM_PARTITIONS);

			return tileIndex;
		}
	}

	public static class Reduce extends Reducer<SequenceId,SequencedFragment,Text,SequencedFragment>
	{
		@Override
		public void reduce(SequenceId key, Iterable<SequencedFragment> values, Context context)
			throws IOException, InterruptedException
		{
			for (SequencedFragment f: values)
				context.write(null, f);
		}
	}

	@Override
	public int run(String[] args) throws Exception
	{
		Configuration conf = getConf();

		// defaults
		conf.set(SealToolParser.INPUT_FORMAT_CONF, USortOptionParser.InputFormatDefault);
		conf.set(SealToolParser.OUTPUT_FORMAT_CONF, USortOptionParser.OutputFormatDefault);

		// parse command line
		USortOptionParser parser = new USortOptionParser();
		parser.parse(conf, args);

		Job job = new Job(conf, "USort " + parser.getInputPaths().get(0));
		job.setJarByClass(USort.class);

		job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
		job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName()));

		job.setMapperClass(Demux.Map.class);
		job.setMapOutputKeyClass(SequenceId.class);
		job.setMapOutputValueClass(SequencedFragment.class);

		job.setPartitionerClass(USortPartitioner.class);

		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(SequencedFragment.class);

		for (Path p: parser.getInputPaths())
			FileInputFormat.addInputPath(job, p);

		FileOutputFormat.setOutputPath(job, parser.getOutputPath());

		boolean result = job.waitForCompletion(true);
		if (!result) {
			LOG.fatal(this.getClass().getName() + " failed!");
			return 1;
		}
		else
			return 0;
	}

	public static void main(String[] args) throws Exception
	{
		int res = new SealToolRunner().run(new USort(), args);
		System.exit(res);
	}
}