// Copyright (C) 2011-2012 CRS4. // // This file is part of Seal. // // Seal is free software: you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // Seal is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // // You should have received a copy of the GNU General Public License along // with Seal. If not, see <http://www.gnu.org/licenses/>. package it.crs4.seal.read_sort; import it.crs4.seal.common.BwaRefAnnotation; import it.crs4.seal.common.SealToolRunner; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.InputStream; import java.io.InputStreamReader; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.HashMap; import java.util.Map; import org.apache.commons.cli.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.util.Tool; public class MergeAlignments extends Configured implements Tool { private String userInput; private String userOutput; private String userReferenceRoot; private String userAnnotation; private String sortOrder = "coordinate"; private String genomeAssemblyId; private BwaRefAnnotation refAnnotation; private Path referenceRootPath; private Path annotationPath; private Path[] inputPaths; private Path outputPath; private boolean generatedMd5 = false; private boolean headerOnly = false; private FastaChecksummer checksums; private Map<String, String> readGroupFields; private static final Log log = LogFactory.getLog(MergeAlignments.class); // log must not go to standard output. private Path getQualifiedPath(String simplePath) throws IOException { Path path = new Path(simplePath); return path.makeQualified(path.getFileSystem(getConf())); } @SuppressWarnings("static") // for OptionBuilder private Map<String, Option> defineRGOptions() { Map<String, Option> readGroupOptions = new HashMap<String, Option>(); readGroupOptions.put("ID", OptionBuilder .withDescription("Read group id") .hasArg() .withArgName("ID") .withLongOpt("rg-id") .create("rgid")); readGroupOptions.put("SM", OptionBuilder .withDescription("Read group sample") .hasArg() .withArgName("sample") .withLongOpt("rg-sm") .create("rgsm")); readGroupOptions.put("LB", OptionBuilder .withDescription("Read group library") .hasArg() .withArgName("library") .withLongOpt("rg-lb") .create("rglb")); readGroupOptions.put("PU", OptionBuilder .withDescription("Read group platform unit") .hasArg() .withArgName("pu") .withLongOpt("rg-pu") .create("rgpu")); readGroupOptions.put("CN", OptionBuilder .withDescription("Read group center") .hasArg() .withArgName("center") .withLongOpt("rg-cn") .create("rgcn")); readGroupOptions.put("DT", OptionBuilder .withDescription("Read group date") .hasArg() .withArgName("date") .withLongOpt("rg-dt") .create("rgdt")); readGroupOptions.put("PL", OptionBuilder .withDescription("Read group platform") .hasArg() .withArgName("platform") .withLongOpt("rg-pl") .create("rgpl")); return readGroupOptions; } private Map<String, String> parseReadGroupOptions(Map<String, Option> readGroupOptions, CommandLine args) throws ParseException { HashMap<String, String> fields = new HashMap<String, String>(); for (Map.Entry<String, Option> pair: readGroupOptions.entrySet()) { String fieldName = pair.getKey(); Option opt = pair.getValue(); if (args.hasOption(opt.getOpt())) { fields.put(fieldName, args.getOptionValue( opt.getOpt() )); } } if (!fields.isEmpty()) { if (!fields.containsKey("ID") || !fields.containsKey("SM")) throw new ParseException("If you specify read group tags (RG) you must specify at least id and sample"); } return fields; } /** * Scan command line and set configuration values appropriately. * Calls System.exit in case of a command line error. */ @SuppressWarnings("static") // for OptionBuilder private void scanOptions(String[] args) { Options options = new Options(); Option ref = OptionBuilder .withDescription("root path to the reference used to create the SAM data") .hasArg() .withArgName("REF_PATH") .withLongOpt("reference") .create("ref"); options.addOption(ref); Option ann = OptionBuilder .withDescription("annotation file (.ann) of the BWA reference used to create the SAM data (not required if you specify " + ref.getOpt() + ")") .hasArg() .withArgName("ref.ann") .withLongOpt("annotations") .create("ann"); options.addOption(ann); Option md5 = OptionBuilder .withDescription("generated MD5 checksums for reference contigs") .withLongOpt("md5") .create("md5"); options.addOption(md5); Option optSortOrder = OptionBuilder .withDescription("Sort order. Can be one of: unknown, unsorted, queryname, coordinate. Default: coordinate") .hasArg() .withArgName("sort order") .withLongOpt("sort-order") .create("so"); options.addOption(optSortOrder); Option as = OptionBuilder .withDescription("Genome assembly identifier (@SQ AS:xxxx tag)") .hasArg() .withArgName("ASSEMBLY_ID") .withLongOpt("sq-assembly") .create("sqas"); options.addOption(as); Option optHeaderOnly = OptionBuilder .withDescription("Only output the SAM header, then exit.") .withLongOpt("header-only") .create("ho"); options.addOption(optHeaderOnly); // read group options Map<String, Option> readGroupOptions = defineRGOptions(); for (Option opt: readGroupOptions.values()) options.addOption(opt); CommandLineParser parser = new GnuParser(); try { CommandLine line = parser.parse( options, args ); if (line.hasOption(ref.getOpt())) userReferenceRoot = line.getOptionValue(ref.getOpt()); if (line.hasOption(md5.getOpt())) generatedMd5 = true; if (line.hasOption(ann.getOpt())) userAnnotation = line.getOptionValue(ann.getOpt()); if (line.hasOption(as.getOpt())) // TODO: validate this input genomeAssemblyId = line.getOptionValue(as.getOpt()); if (line.hasOption(optSortOrder.getOpt())) { String value = line.getOptionValue(optSortOrder.getOpt()); if ( "unknown".equals(value) || "unsorted".equals(value) || "queryname".equals(value) || "coordinate".equals(value) ) sortOrder = value; else throw new ParseException("Invalid sort order. Sort order must be one of: unknown, unsorted, queryname, coordinate."); } if (line.hasOption(optHeaderOnly.getOpt())) headerOnly = true; // remaining args String[] otherArgs = line.getArgs(); if (headerOnly) { // We're only generating the header, so we don't expect any input reads. if (otherArgs.length > 1) throw new ParseException("You can't provide an input path with --header-only. Provide an output path or let the output go to stdout."); if (otherArgs.length == 0) userOutput = null; else userOutput = otherArgs[0]; } else // not headerOnly { if (otherArgs.length <= 0 || otherArgs.length > 2) throw new ParseException("You must provide an HDFS input path and, optionally, an output path."); userOutput = null; userInput = otherArgs[0]; if (otherArgs.length >= 2) userOutput = otherArgs[1]; } readGroupFields = parseReadGroupOptions(readGroupOptions, line); // option validation if (generatedMd5 && userReferenceRoot == null) throw new ParseException("You must specify the path the reference if you want to generate MD5 checksums"); if (userReferenceRoot == null && userAnnotation == null) throw new ParseException("You must provide the path to the reference or at least its annotation file (<ref>.ann)"); } catch( ParseException e ) { System.err.println("Usage error: " + e.getMessage()); // XXX: redirect System.out to System.err since the simple version of // HelpFormatter.printHelp prints to System.out, and we're on a way to // a fatal exit. System.setOut(System.err); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( "MergeAlignments [options] -ann <ref>.ann [<in>] [<out>]", options); System.exit(1); } } private void configureMerge() throws ParseException, IOException { // convert user-provided paths to Path objects and make them qualified if (userAnnotation != null) annotationPath = getQualifiedPath(userAnnotation); if (userReferenceRoot != null) { referenceRootPath = getQualifiedPath(userReferenceRoot); if (annotationPath == null) annotationPath = referenceRootPath.suffix(".ann"); else { if (!annotationPath.equals(referenceRootPath.suffix(".ann"))) log.warn("specified annotation file does not follow the same naming pattern as the specified reference"); } } // here we load the reference annotations loadAnnotations(); log.info("Reference annotations read"); } private void calculateChecksums() throws IOException { if (generatedMd5) { log.info("Calculating reference checksum..."); log.info("Reference fasta path: " + referenceRootPath.toString()); checksums = new FastaChecksummer(); FileReader reader = new FileReader( new File(referenceRootPath.toUri()) ); try { checksums.setInput(reader); checksums.calculate(); } finally { reader.close(); } log.info("checksum complete"); } } private void loadAnnotations() throws IOException { FileSystem fs = annotationPath.getFileSystem(getConf()); log.info("Reading reference annotations from " + annotationPath); try { FSDataInputStream in = fs.open(annotationPath); refAnnotation = new BwaRefAnnotation(new InputStreamReader(in)); } catch (IOException e) { log.fatal("Can't read annotation file " + annotationPath + " on filesystem " + fs.getUri()); throw e; } } private static class SourcePathFilter implements org.apache.hadoop.fs.PathFilter { public boolean accept(Path p) { boolean decision = true; if (p.getName().toString().startsWith("_")) decision = false; return decision; } } private Path[] getSourcePaths() throws Exception { Path srcPath = new Path(userInput); FileSystem srcFs = srcPath.getFileSystem(getConf()); if (srcFs.exists(srcPath)) { FileStatus stat = srcFs.getFileStatus(srcPath); if (stat.isDir()) { String msg = "source path " + srcPath + " is a directory. Globbing with "; srcPath = new Path(srcPath, "*"); log.info(msg + srcPath); } } // Glob source path. The returned paths are already sorted. We filter out paths starting // with '_' (see SourcePathFilter). // If the path doesn't contain a glob patter, and it doesn't exist, the function will return null. Path[] sources = FileUtil.stat2Paths(srcFs.globStatus(srcPath, new SourcePathFilter())); if (sources == null) throw new IllegalArgumentException("Source path " + srcPath.makeQualified(srcFs) + " doesn't exist"); if (log.isDebugEnabled()) { log.debug("Sources:"); for (int i = 0; i < sources.length; ++i) log.debug(sources[i]); } if (sources.length == 0) throw new IllegalArgumentException("no source files selected"); log.info("Merging " + sources.length + " files."); return sources; } private void writeSamHeader(OutputStream rawOut) throws IOException { Writer out = new BufferedWriter( new OutputStreamWriter(rawOut)); out.write("@HD\tVN:1.0\tSO:" + sortOrder + "\n"); // prep @SQ format string StringBuilder formatBuilder = new StringBuilder(50); formatBuilder.append("@SQ\tSN:%s\tLN:%d"); if (genomeAssemblyId != null) formatBuilder.append("\tAS:").append(genomeAssemblyId); if (referenceRootPath != null) formatBuilder.append("\tUR:").append(referenceRootPath.toUri()); if (generatedMd5) formatBuilder.append("\tM5:%s"); formatBuilder.append("\n"); String format = formatBuilder.toString(); if (generatedMd5) { for (BwaRefAnnotation.Contig c: refAnnotation) out.write( String.format(format, c.getName(), c.getLength(), checksums.getChecksum(c.getName())) ); } else { for (BwaRefAnnotation.Contig c: refAnnotation) out.write( String.format(format, c.getName(), c.getLength()) ); } // @PG: Seal name and version String version = "not available"; Package pkg = this.getClass().getPackage(); if (pkg != null && pkg.getImplementationVersion() != null) version = pkg.getImplementationVersion(); else log.warn("Could not get package version"); out.write("@PG\tID:seal\tVN:" + version + "\n"); // @RG, if provided if (!readGroupFields.isEmpty()) { StringBuilder rgBuilder = new StringBuilder("@RG"); for (Map.Entry<String, String> pair: readGroupFields.entrySet()) rgBuilder.append("\t").append(pair.getKey()).append(":").append(pair.getValue()); rgBuilder.append("\n"); out.write( rgBuilder.toString() ); } out.flush(); } private void copyMerge(Path[] sources, OutputStream out) throws IOException { Configuration conf = getConf(); for (int i = 0; i < sources.length; ++i) { FileSystem fs = sources[i].getFileSystem(conf); InputStream in = fs.open(sources[i]); try { IOUtils.copyBytes(in, out, conf, false); } finally { in.close(); } } } /** * Make an OutputStream to write to the user selected userOutput. */ private OutputStream getOutputStream() throws IOException { OutputStream out; if (userOutput == null) // Write to stdout { log.info("writing to standard out"); out = System.out; } else { Path destPath = getQualifiedPath(userOutput); FileSystem destFs = destPath.getFileSystem(getConf()); if (destFs.exists(destPath)) throw new RuntimeException("Output destination " + destPath + " exists. Please remove it or change the output path."); log.info("Merging sources to " + destPath); out = destFs.create(destPath); } return out; } public int run(String[] args) throws Exception { scanOptions(args); // ensure that annotation options make sense and load the reference annotations configureMerge(); // Get the output stream. This make create a new output file. // Do this before calculateChecksums so that any errors wrt to the output // path are found before spending minutes checksumming the reference. OutputStream destFile = getOutputStream(); try { Path[] sources = null; if (!headerOnly) { // Get input paths. Check for paths that don't exist. // As for getting the output stream, we verify the inputs now to fail early // in case of an invocation error. sources = getSourcePaths(); } // calculate the reference checksums, if necessary calculateChecksums(); writeSamHeader(destFile); if (!headerOnly) copyMerge(sources, destFile); } finally { destFile.close(); } log.info("Finished"); return 0; } public static void main(String[] args) { int res = 0; try { res = new SealToolRunner().run(new MergeAlignments(), args); } catch (Exception e) { System.err.println("Error executing MergeAlignments: " + e.getMessage()); res = 1; } System.exit(res); } }