// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.read_sort;

import it.crs4.seal.common.BwaRefAnnotation;
import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.ClusterUtils;
import it.crs4.seal.common.SealToolRunner;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.InetSocketAddress;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.cli.*;

import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.GenericOptionsParser;

public class ReadSort extends Configured implements Tool
{
	public static final String REF_ANN_PROP_NAME = "readsort.reference.ann";

	private static final Log LOG = LogFactory.getLog(ReadSort.class);

	public static Path getAnnotationPath(Configuration conf) throws IOException
	{
		String annotationName = conf.get(ReadSort.REF_ANN_PROP_NAME);
		if (annotationName == null)
			throw new RuntimeException("missing property " + REF_ANN_PROP_NAME);

		LOG.info("reading reference annotation from " + annotationName);

		Path annPath = new Path(annotationName);
		FileSystem srcFs;
		if (conf.get("mapred.cache.archives") != null)
		{
			// we're using the distributed cache for the reference,
			// so it's on the local file system
			srcFs = FileSystem.getLocal(conf);
		}
		else
			srcFs = annPath.getFileSystem(conf);

		return annPath.makeQualified(srcFs);
	}

	public static class ReadSortSamMapper extends Mapper<LongWritable, Text, LongWritable, Text>
	{
		private static final String delim = "\t";

		private BwaRefAnnotation annotation;
		private LongWritable outputKey;

		@Override
		public void setup(Context context) throws IOException, FormatException
		{
			Configuration conf = context.getConfiguration();
			Path annPath = getAnnotationPath(conf);
			FSDataInputStream in = annPath.getFileSystem(conf).open(annPath);
			annotation = new BwaRefAnnotation(new InputStreamReader(in));
			LOG.info("ReadSortSamMapper successfully read reference annotations");
			in.close();

			outputKey = new LongWritable();
		}
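		/*
		 * Tab-separated SAM layout that map() below relies on (first four
		 * standard SAM columns; the example values are illustrative only):
		 *
		 *   QNAME    FLAG   RNAME   POS    ...
		 *   read_1   0      chr2    1234   ...
		 *
		 * map() skips QNAME and FLAG, reads RNAME and POS, and asks the
		 * BwaRefAnnotation to turn them into a single absolute coordinate
		 * over the concatenated reference.  An RNAME of "*" marks an
		 * unmapped read.
		 */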
		/**
		 * Map (file offset, SAM record) to (absolute coordinate, SAM record).
		 */
		@Override
		public void map(LongWritable ignored, Text sam, Context context) throws IOException, InterruptedException
		{
			try
			{
				int pos = 0;
				for (int i = 1; i <= 2; ++i)
					pos = sam.find(delim, pos) + 1; // +1 since we get the position of the delimiter

				int seq_pos = pos;
				int coord_pos = sam.find(delim, pos) + 1;
				int coord_end = sam.find(delim, coord_pos); // pos of coordinate delimiter
				if (seq_pos <= 0 || coord_pos <= 0 || coord_end <= 0)
					throw new RuntimeException("Invalid SAM record: " + sam.toString());

				String seq_name = Text.decode(sam.getBytes(), seq_pos, coord_pos - seq_pos - 1);
				if (seq_name.equals("*"))
					outputKey.set(Long.MAX_VALUE); // unmapped read.  Send it to the end
				else
				{
					long coord = Long.parseLong( Text.decode(sam.getBytes(), coord_pos, coord_end - coord_pos) );
					outputKey.set(annotation.getAbsCoord(seq_name, coord));
				}

				context.write(outputKey, sam);
			}
			catch (java.nio.charset.CharacterCodingException e) {
				throw new RuntimeException("Character coding error in SAM: " + e.getMessage());
			}
		}
	}

	/**
	 * Partition the input reads assuming that they cover the entire reference uniformly.
	 * This partitioner needs to know the reference length, and then divides it into
	 * regions of equal length.
	 */
	public static class WholeReferencePartitioner extends Partitioner<LongWritable, Text> implements Configurable
	{
		private long partitionSize;
		private long referenceSize;
		Configuration conf;

		public WholeReferencePartitioner()
		{
			partitionSize = 0;
			referenceSize = 0;
			conf = null;
		}

		@Override
		public void setConf(Configuration c)
		{
			conf = c;
			/* Read the reference annotation from the file provided in REF_ANN_PROP_NAME.
			 * The file can be on a mounted filesystem or HDFS, but it has to be accessible
			 * from every node.
			 */
			FSDataInputStream in = null;
			Path annPath = null;
			try
			{
				annPath = getAnnotationPath(conf);
				System.err.println("WholeReferencePartitioner: annotation path: " + annPath);
			}
			catch (IOException e) {
				throw new RuntimeException("WholeReferencePartitioner: error getting annotation file path. " + e.getMessage());
			}

			try
			{
				in = annPath.getFileSystem(conf).open(annPath);
				BwaRefAnnotation annotation = new BwaRefAnnotation(new InputStreamReader(in));
				LOG.info("Partitioner successfully read reference annotations");

				referenceSize = annotation.getReferenceLength();
				if (referenceSize <= 0)
					throw new RuntimeException("WholeReferencePartitioner could not get reference length.");

				int nReducers = conf.getInt(ClusterUtils.NUM_RED_TASKS_PROPERTY, 1);
				if (nReducers == 1)
				{
					partitionSize = referenceSize;
				}
				else if (nReducers >= 2)
				{
					// leave one reducer for the unmapped reads
					partitionSize = (long)Math.ceil( referenceSize / ((double)nReducers - 1) );
					if (LOG.isInfoEnabled())
						LOG.info("Reference size: " + referenceSize + "; n reducers: " + nReducers + ".  Set partition size to " + partitionSize);
				}
				else
					throw new RuntimeException("Invalid (non-positive) number of reducers (" + nReducers + ")");
			}
			catch (IOException e) {
				// We can't throw IOException since it's not in the setConf specification.
				String msg = "WholeReferencePartitioner: error reading BWA annotation. " + e.getMessage();
				if (annPath.toString().startsWith("hdfs://"))
					msg += " Maybe you forgot to specify 'file://' for a local path?";
				throw new RuntimeException(msg);
			}
			finally
			{
				if (in != null)
				{
					try {
						in.close();
					}
					catch (IOException e) {
						LOG.warn("Error closing annotations file.  Message: " + e.getMessage());
					}
				}
			}
		}
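		/*
		 * Worked example of the sizing above (numbers are illustrative only):
		 * with referenceSize = 3,000,000,000 and nReducers = 7, one reducer
		 * is reserved for unmapped reads, so partitionSize = ceil(3e9 / 6) =
		 * 500,000,000.  getPartition() below then sends coordinates
		 * 1..500,000,000 to partition 0, 500,000,001..1,000,000,000 to
		 * partition 1, and so on; unmapped reads (key == Long.MAX_VALUE)
		 * go to partition 6.
		 */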
Message: " + e.getMessage()); } } } } @Override public Configuration getConf() { return conf; } @Override public int getPartition(LongWritable key, Text value, int numPartitions) { if (conf == null) throw new RuntimeException("WholeReferencePartitioner isn't configured!"); if (partitionSize <= 0) throw new RuntimeException("WholeReferencePartitioner can't partition with partitionSize " + partitionSize); if (numPartitions == 1 || key.get() == Long.MAX_VALUE) { // If we only have one partition, obviously we return partition 0. // Otherwise, reserve the last partition for the unmapped reads. return numPartitions - 1; } else { int partition = (int)( (key.get() - 1) / partitionSize); // the key coordinate starts at 1 if (partition == numPartitions - 1) // the last partition is reserved for unmapped reads. Something went wrong. { throw new RuntimeException("WholeReferencePartitioner: partition index too big! referenceSize: " + referenceSize + "; key: " + key + "; partitionSize: " + partitionSize + "; numPartitions: " + numPartitions + "; partition: " + partition); } return partition; } } } public static class ReadSortSamReducer extends Reducer<LongWritable, Text, Text, Text> { private Text outputValue = new Text(); @Override public void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { // We can get more than one read per position. We could sort them by name, but // it's probably not worth the effort. for (Text record: values) { int delim_pos = record.find("\t"); // copy the part after the name field to outputValue outputValue.clear(); outputValue.append(record.getBytes(), delim_pos+1, record.getLength() - delim_pos - 1); // XXX: now set record to itself, truncating the part from delim_pos onwards (make sure this works!) record.set(record.getBytes(), 0, delim_pos); // the default output formatter joins key and value with a tab. context.write(record, outputValue); } } } private String makeJobName(Path firstInputPath) { // TODO: if the path is too long look at some smart way to trim the name return "ReadSort " + firstInputPath.toString(); } public int run(String[] args) throws Exception { LOG.info("starting"); Configuration conf = getConf(); ReadSortOptionParser parser = new ReadSortOptionParser(); parser.parse(conf, args); LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks"); // Create a Job using the processed conf Job job = new Job(conf, makeJobName(parser.getInputPaths().get(0))); job.setJarByClass(ReadSort.class); // input paths for (Path p: parser.getInputPaths()) FileInputFormat.addInputPath(job, p); job.setMapperClass(ReadSortSamMapper .class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(WholeReferencePartitioner.class); job.setReducerClass(ReadSortSamReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // output path FileOutputFormat.setOutputPath(job, parser.getOutputPath()); // Submit the job, then poll for progress until the job is complete boolean result = job.waitForCompletion(true); if (result) { LOG.info("done"); return 0; } else { LOG.fatal("ReadSort failed!"); return 1; } } /** * @param args */ public static void main(String[] args) throws Exception { int res = new SealToolRunner().run(new ReadSort(), args); System.exit(res); } }