/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMUtils; import htsjdk.samtools.SAMValidationError; import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.fastq.FastqWriter; import htsjdk.samtools.fastq.FastqWriterFactory; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Lazy; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.SamOrBam; import java.io.File; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * <p/> * Extracts read sequences and qualities from the input SAM/BAM file and writes them into * the output file in Sanger fastq format. * See <a href="http://maq.sourceforge.net/fastq.shtml">MAQ FastQ specification</a> for details. * In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, * the read's sequence from input sam file will be reverse-complemented prior to writing it to fastq in order restore correctly * the original read sequence as it was generated by the sequencer. */ @CommandLineProgramProperties( usage = "Extracts read sequences and qualities from the input SAM/BAM file and writes them into " + "the output file in Sanger fastq format. In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, " + "the read's sequence from input SAM file will be reverse-complemented prior to writing it to fastq in order restore correctly" + "the original read sequence as it was generated by the sequencer.", usageShort = "Converts a SAM/BAM into a FASTQ", programGroup = SamOrBam.class ) public class SamToFastq extends CommandLineProgram { @Option(doc = "Input SAM/BAM file to extract reads from", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) public File INPUT; @Option(shortName = "F", doc = "Output fastq file (single-end fastq or, if paired, first end of the pair fastq).", mutex = {"OUTPUT_PER_RG"}) public File FASTQ; @Option(shortName = "F2", doc = "Output fastq file (if paired, second end of the pair fastq).", optional = true, mutex = {"OUTPUT_PER_RG"}) public File SECOND_END_FASTQ; @Option(shortName = "FU", doc = "Output fastq file for unpaired reads; may only be provided in paired-fastq mode", optional = true, mutex = {"OUTPUT_PER_RG"}) public File UNPAIRED_FASTQ; @Option(shortName = "OPRG", doc = "Output a fastq file per read group (two fastq files per read group if the group is paired).", optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) public boolean OUTPUT_PER_RG; @Option(shortName = "ODIR", doc = "Directory in which to output the fastq file(s). Used only when OUTPUT_PER_RG is true.", optional = true) public File OUTPUT_DIR; @Option(shortName = "RC", doc = "Re-reverse bases and qualities of reads with negative strand flag set before writing them to fastq", optional = true) public boolean RE_REVERSE = true; @Option(shortName = "INTER", doc = "Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from") public boolean INTERLEAVE = false; @Option(shortName = "NON_PF", doc = "Include non-PF reads from the SAM file into the output " + "FASTQ files. PF means 'passes filtering'. Reads whose 'not passing quality controls' " + "flag is set are non-PF reads.") public boolean INCLUDE_NON_PF_READS = false; @Option(shortName = "CLIP_ATTR", doc = "The attribute that stores the position at which " + "the SAM record should be clipped", optional = true) public String CLIPPING_ATTRIBUTE; @Option(shortName = "CLIP_ACT", doc = "The action that should be taken with clipped reads: " + "'X' means the reads and qualities should be trimmed at the clipped position; " + "'N' means the bases should be changed to Ns in the clipped region; and any " + "integer means that the base qualities should be set to that value in the " + "clipped region.", optional = true) public String CLIPPING_ACTION; @Option(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.") public int READ1_TRIM = 0; @Option(shortName = "R1_MAX_BASES", doc = "The maximum number of bases to write from read 1 after trimming. " + "If there are fewer than this many bases left after trimming, all will be written. If this " + "value is null then all bases left after trimming will be written.", optional = true) public Integer READ1_MAX_BASES_TO_WRITE; @Option(shortName = "R2_TRIM", doc = "The number of bases to trim from the beginning of read 2.") public int READ2_TRIM = 0; @Option(shortName = "R2_MAX_BASES", doc = "The maximum number of bases to write from read 2 after trimming. " + "If there are fewer than this many bases left after trimming, all will be written. If this " + "value is null then all bases left after trimming will be written.", optional = true) public Integer READ2_MAX_BASES_TO_WRITE; @Option(doc = "If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq " + "is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.") public boolean INCLUDE_NON_PRIMARY_ALIGNMENTS = false; private final Log log = Log.getInstance(SamToFastq.class); public static void main(final String[] argv) { System.exit(new SamToFastq().instanceMain(argv)); } protected int doWork() { IOUtil.assertFileIsReadable(INPUT); final SAMFileReader reader = new SAMFileReader(IOUtil.openFileForReading(INPUT)); final Map<String, SAMRecord> firstSeenMates = new HashMap<String, SAMRecord>(); final FastqWriterFactory factory = new FastqWriterFactory(); factory.setCreateMd5(CREATE_MD5_FILE); final Map<SAMReadGroupRecord, FastqWriters> writers = generateWriters(reader.getFileHeader().getReadGroups(), factory); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord currentRecord : reader) { if (currentRecord.isSecondaryOrSupplementary() && !INCLUDE_NON_PRIMARY_ALIGNMENTS) continue; // Skip non-PF reads as necessary if (currentRecord.getReadFailsVendorQualityCheckFlag() && !INCLUDE_NON_PF_READS) continue; final FastqWriters fq = writers.get(currentRecord.getReadGroup()); if (currentRecord.getReadPairedFlag()) { final String currentReadName = currentRecord.getReadName(); final SAMRecord firstRecord = firstSeenMates.remove(currentReadName); if (firstRecord == null) { firstSeenMates.put(currentReadName, currentRecord); } else { assertPairedMates(firstRecord, currentRecord); final SAMRecord read1 = currentRecord.getFirstOfPairFlag() ? currentRecord : firstRecord; final SAMRecord read2 = currentRecord.getFirstOfPairFlag() ? firstRecord : currentRecord; writeRecord(read1, 1, fq.getFirstOfPair(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE); final FastqWriter secondOfPairWriter = fq.getSecondOfPair(); if (secondOfPairWriter == null) { throw new PicardException("Input contains paired reads but no SECOND_END_FASTQ specified."); } writeRecord(read2, 2, secondOfPairWriter, READ2_TRIM, READ2_MAX_BASES_TO_WRITE); } } else { writeRecord(currentRecord, null, fq.getUnpaired(), READ1_TRIM, READ1_MAX_BASES_TO_WRITE); } progress.record(currentRecord); } reader.close(); // Close all the fastq writers being careful to close each one only once! for (final FastqWriters writerMapping : new HashSet<FastqWriters>(writers.values())) { writerMapping.closeAll(); } if (firstSeenMates.size() > 0) { SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.MATE_NOT_FOUND, "Found " + firstSeenMates.size() + " unpaired mates", null), VALIDATION_STRINGENCY); } return 0; } /** * Generates the writers for the given read groups or, if we are not emitting per-read-group, just returns the single set of writers. */ private Map<SAMReadGroupRecord, FastqWriters> generateWriters(final List<SAMReadGroupRecord> samReadGroupRecords, final FastqWriterFactory factory) { final Map<SAMReadGroupRecord, FastqWriters> writerMap = new HashMap<SAMReadGroupRecord, FastqWriters>(); final FastqWriters fastqWriters; if (!OUTPUT_PER_RG) { IOUtil.assertFileIsWritable(FASTQ); final FastqWriter firstOfPairWriter = factory.newWriter(FASTQ); final FastqWriter secondOfPairWriter; if (INTERLEAVE) { secondOfPairWriter = firstOfPairWriter; } else if (SECOND_END_FASTQ != null) { IOUtil.assertFileIsWritable(SECOND_END_FASTQ); secondOfPairWriter = factory.newWriter(SECOND_END_FASTQ); } else { secondOfPairWriter = null; } /** Prepare the writer that will accept unpaired reads. If we're emitting a single fastq - and assuming single-ended reads - * then this is simply that one fastq writer. Otherwise, if we're doing paired-end, we emit to a third new writer, since * the other two fastqs are accepting only paired end reads. */ final FastqWriter unpairedWriter = UNPAIRED_FASTQ == null ? firstOfPairWriter : factory.newWriter(UNPAIRED_FASTQ); fastqWriters = new FastqWriters(firstOfPairWriter, secondOfPairWriter, unpairedWriter); // For all read groups we may find in the bam, register this single set of writers for them. writerMap.put(null, fastqWriters); for (final SAMReadGroupRecord rg : samReadGroupRecords) { writerMap.put(rg, fastqWriters); } } else { // When we're creating a fastq-group per readgroup, by convention we do not emit a special fastq for unpaired reads. for (final SAMReadGroupRecord rg : samReadGroupRecords) { final FastqWriter firstOfPairWriter = factory.newWriter(makeReadGroupFile(rg, "_1")); // Create this writer on-the-fly; if we find no second-of-pair reads, don't bother making a writer (or delegating, // if we're interleaving). final Lazy<FastqWriter> lazySecondOfPairWriter = new Lazy<FastqWriter>(new Lazy.LazyInitializer<FastqWriter>() { @Override public FastqWriter make() { return INTERLEAVE ? firstOfPairWriter : factory.newWriter(makeReadGroupFile(rg, "_2")); } }); writerMap.put(rg, new FastqWriters(firstOfPairWriter, lazySecondOfPairWriter, firstOfPairWriter)); } } return writerMap; } private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String preExtSuffix) { String fileName = readGroup.getPlatformUnit(); if (fileName == null) fileName = readGroup.getReadGroupId(); fileName = IOUtil.makeFileNameSafe(fileName); if (preExtSuffix != null) fileName += preExtSuffix; fileName += ".fastq"; final File result = (OUTPUT_DIR != null) ? new File(OUTPUT_DIR, fileName) : new File(fileName); IOUtil.assertFileIsWritable(result); return result; } void writeRecord(final SAMRecord read, final Integer mateNumber, final FastqWriter writer, final int basesToTrim, final Integer maxBasesToWrite) { final String seqHeader = mateNumber == null ? read.getReadName() : read.getReadName() + "/" + mateNumber; String readString = read.getReadString(); String baseQualities = read.getBaseQualityString(); // If we're clipping, do the right thing to the bases or qualities if (CLIPPING_ATTRIBUTE != null) { final Integer clipPoint = (Integer) read.getAttribute(CLIPPING_ATTRIBUTE); if (clipPoint != null) { if (CLIPPING_ACTION.equalsIgnoreCase("X")) { readString = clip(readString, clipPoint, null, !read.getReadNegativeStrandFlag()); baseQualities = clip(baseQualities, clipPoint, null, !read.getReadNegativeStrandFlag()); } else if (CLIPPING_ACTION.equalsIgnoreCase("N")) { readString = clip(readString, clipPoint, 'N', !read.getReadNegativeStrandFlag()); } else { final char newQual = SAMUtils.phredToFastq( new byte[]{(byte) Integer.parseInt(CLIPPING_ACTION)}).charAt(0); baseQualities = clip(baseQualities, clipPoint, newQual, !read.getReadNegativeStrandFlag()); } } } if (RE_REVERSE && read.getReadNegativeStrandFlag()) { readString = SequenceUtil.reverseComplement(readString); baseQualities = StringUtil.reverseString(baseQualities); } if (basesToTrim > 0) { readString = readString.substring(basesToTrim); baseQualities = baseQualities.substring(basesToTrim); } if (maxBasesToWrite != null && maxBasesToWrite < readString.length()) { readString = readString.substring(0, maxBasesToWrite); baseQualities = baseQualities.substring(0, maxBasesToWrite); } writer.write(new FastqRecord(seqHeader, readString, "", baseQualities)); } /** * Utility method to handle the changes required to the base/quality strings by the clipping * parameters. * * @param src The string to clip * @param point The 1-based position of the first clipped base in the read * @param replacement If non-null, the character to replace in the clipped positions * in the string (a quality score or 'N'). If null, just trim src * @param posStrand Whether the read is on the positive strand * @return String The clipped read or qualities */ private String clip(final String src, final int point, final Character replacement, final boolean posStrand) { final int len = src.length(); String result = posStrand ? src.substring(0, point - 1) : src.substring(len - point + 1); if (replacement != null) { if (posStrand) { for (int i = point; i <= len; i++) { result += replacement; } } else { for (int i = 0; i <= len - point; i++) { result = replacement + result; } } } return result; } private void assertPairedMates(final SAMRecord record1, final SAMRecord record2) { if (!(record1.getFirstOfPairFlag() && record2.getSecondOfPairFlag() || record2.getFirstOfPairFlag() && record1.getSecondOfPairFlag())) { throw new PicardException("Illegal mate state: " + record1.getReadName()); } } /** * Put any custom command-line validation in an override of this method. * clp is initialized at this point and can be used to print usage and access argv. * Any options set by command-line parser can be validated. * * @return null if command line is valid. If command line is invalid, returns an array of error * messages to be written to the appropriate place. */ protected String[] customCommandLineValidation() { if (INTERLEAVE && SECOND_END_FASTQ != null) { return new String[]{ "Cannot set INTERLEAVE to true and pass in a SECOND_END_FASTQ" }; } if (UNPAIRED_FASTQ != null && SECOND_END_FASTQ == null) { return new String[]{ "UNPAIRED_FASTQ may only be set when also emitting read1 and read2 fastqs (so SECOND_END_FASTQ must also be set)." }; } if ((CLIPPING_ATTRIBUTE != null && CLIPPING_ACTION == null) || (CLIPPING_ATTRIBUTE == null && CLIPPING_ACTION != null)) { return new String[]{ "Both or neither of CLIPPING_ATTRIBUTE and CLIPPING_ACTION should be set."}; } if (CLIPPING_ACTION != null) { if (CLIPPING_ACTION.equals("N") || CLIPPING_ACTION.equals("X")) { // Do nothing, this is fine } else { try { Integer.parseInt(CLIPPING_ACTION); } catch (NumberFormatException nfe) { return new String[]{"CLIPPING ACTION must be one of: N, X, or an integer"}; } } } if ((OUTPUT_PER_RG && OUTPUT_DIR == null) || ((!OUTPUT_PER_RG) && OUTPUT_DIR != null)) { return new String[]{ "If OUTPUT_PER_RG is true, then OUTPUT_DIR should be set. " + "If "}; } return null; } /** * A collection of {@link htsjdk.samtools.fastq.FastqWriter}s for particular types of reads. * <p/> * Allows for lazy construction of the second-of-pair writer, since when we are in the "output per read group mode", we only wish to * generate a second-of-pair fastq if we encounter a second-of-pair read. */ static class FastqWriters { private final FastqWriter firstOfPair, unpaired; private final Lazy<FastqWriter> secondOfPair; /** Constructor if the consumer wishes for the second-of-pair writer to be built on-the-fly. */ private FastqWriters(final FastqWriter firstOfPair, final Lazy<FastqWriter> secondOfPair, final FastqWriter unpaired) { this.firstOfPair = firstOfPair; this.unpaired = unpaired; this.secondOfPair = secondOfPair; } /** Simple constructor; all writers are pre-initialized.. */ private FastqWriters(final FastqWriter firstOfPair, final FastqWriter secondOfPair, final FastqWriter unpaired) { this(firstOfPair, new Lazy<FastqWriter>(new Lazy.LazyInitializer<FastqWriter>() { @Override public FastqWriter make() { return secondOfPair; } }), unpaired); } public FastqWriter getFirstOfPair() { return firstOfPair; } public FastqWriter getSecondOfPair() { return secondOfPair.get(); } public FastqWriter getUnpaired() { return unpaired; } public void closeAll() { final Set<FastqWriter> fastqWriters = new HashSet<FastqWriter>(); fastqWriters.add(firstOfPair); fastqWriters.add(unpaired); // Make sure this is a no-op if the second writer was never fetched. if (secondOfPair.isInitialized()) fastqWriters.add(secondOfPair.get()); for (final FastqWriter fastqWriter : fastqWriters) { fastqWriter.close(); } } } }