/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam; import htsjdk.samtools.ReservedTagConstants; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMUtils; import htsjdk.samtools.fastq.FastqConstants.FastqExtensions; import htsjdk.samtools.fastq.FastqReader; import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.util.FastqQualityFormat; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Iso8601Date; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.QualityEncodingDetector; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.SolexaQualityConverter; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.SamOrBam; import java.io.File; import java.util.ArrayList; import java.util.List; /** * Converts a fastq file to an unaligned BAM/SAM format. * See <a href="http://maq.sourceforge.net/fastq.shtml">MAQ FastQ specification</a> for details. * Three fastq versions are supported: FastqSanger, FastqSolexa and FastqIllumina. * Input files can be in GZip format (end in .gz). */ @CommandLineProgramProperties( usage = FastqToSam.USAGE_SUMMARY + FastqToSam.USAGE_DETAILS, usageShort = FastqToSam.USAGE_SUMMARY, programGroup = SamOrBam.class ) public class FastqToSam extends CommandLineProgram { static final String USAGE_SUMMARY = "Converts a FASTQ file to an unaligned BAM or SAM file. "; static final String USAGE_DETAILS = "This tool extracts read sequences and base qualities from the input FASTQ file and writes them" + " out to a new file in unaligned BAM (uBAM) format. Read group information can be provided on the command line. <br /><br /> " + "Three versions of FASTQ quality scales are supported: FastqSanger, FastqSolexa and FastqIllumina " + "(see http://maq.sourceforge.net/fastq.shtml for details). Input FASTQ files can be in GZip format " + "(with .gz extension)." + "<h4>Usage example:</h4>" + "<pre>" + "java -jar picard.jar FastqToSam \\<br />" + " F1=file_1.fastq \\<br />" + " O=fastq_to_bam.bam \\<br />" + " SM=for_tool_testing " + "</pre>" + "<hr />"; private static final Log LOG = Log.getInstance(FastqToSam.class); @Option(shortName="F1", doc="Input fastq file (optionally gzipped) for single end data, or first read in paired end data.") public File FASTQ; @Option(shortName="F2", doc="Input fastq file (optionally gzipped) for the second read of paired end data.", optional=true) public File FASTQ2; @Option(doc="Use sequential fastq files with the suffix <prefix>_###.fastq or <prefix>_###.fastq.gz", optional=true) public boolean USE_SEQUENTIAL_FASTQS = false; @Option(shortName="V", doc="A value describing how the quality values are encoded in the input FASTQ file. " + "Either Solexa (phred scaling + 66), Illumina (phred scaling + 64) or Standard (phred scaling + 33). " + "If this value is not specified, the quality format will be detected automatically.", optional = true) public FastqQualityFormat QUALITY_FORMAT; @Option(doc="Output SAM/BAM file. ", shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME) public File OUTPUT ; @Option(shortName="RG", doc="Read group name") public String READ_GROUP_NAME = "A"; @Option(shortName="SM", doc="Sample name to insert into the read group header") public String SAMPLE_NAME; @Option(shortName="LB", doc="The library name to place into the LB attribute in the read group header", optional=true) public String LIBRARY_NAME; @Option(shortName="PU", doc="The platform unit (often run_barcode.lane) to insert into the read group header", optional=true) public String PLATFORM_UNIT; @Option(shortName="PL", doc="The platform type (e.g. illumina, solid) to insert into the read group header", optional=true) public String PLATFORM; @Option(shortName="CN", doc="The sequencing center from which the data originated", optional=true) public String SEQUENCING_CENTER; @Option(shortName = "PI", doc = "Predicted median insert size, to insert into the read group header", optional = true) public Integer PREDICTED_INSERT_SIZE; @Option(shortName = "PG", doc = "Program group to insert into the read group header.", optional=true) public String PROGRAM_GROUP; @Option(shortName = "PM", doc = "Platform model to insert into the group header (free-form text providing further details of the platform/technology used)", optional=true) public String PLATFORM_MODEL; @Option(doc="Comment(s) to include in the merged output file's header.", optional=true, shortName="CO") public List<String> COMMENT = new ArrayList<String>(); @Option(shortName = "DS", doc = "Inserted into the read group header", optional = true) public String DESCRIPTION; @Option(shortName = "DT", doc = "Date the run was produced, to insert into the read group header", optional = true) public Iso8601Date RUN_DATE; @Option(shortName="SO", doc="The sort order for the output sam/bam file.") public SortOrder SORT_ORDER = SortOrder.queryname; @Option(doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.") public int MIN_Q = 0; @Option(doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.") public int MAX_Q = SAMUtils.MAX_PHRED_SCORE; @Deprecated @Option(doc="Deprecated (No longer used). If true and this is an unpaired fastq any occurrence of '/1' or '/2' will be removed from the end of a read name.") public Boolean STRIP_UNPAIRED_MATE_NUMBER = false; @Option(doc="Allow (and ignore) empty lines") public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton(); /** * Looks at fastq input(s) and attempts to determine the proper quality format * * Closes the reader(s) by side effect * * @param reader1 The first fastq input * @param reader2 The second fastq input, if necessary. To not use this input, set it to null * @param expectedQuality If provided, will be used for sanity checking. If left null, autodetection will occur */ public static FastqQualityFormat determineQualityFormat(final FastqReader reader1, final FastqReader reader2, final FastqQualityFormat expectedQuality) { final QualityEncodingDetector detector = new QualityEncodingDetector(); if (reader2 == null) { detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1); } else { detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader1, reader2); reader2.close(); } reader1.close(); final FastqQualityFormat qualityFormat = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, expectedQuality); if (detector.isDeterminationAmbiguous()) { LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities."); } LOG.info(String.format("Auto-detected quality format as: %s.", qualityFormat)); return qualityFormat; } /** Stock main method. */ public static void main(final String[] argv) { System.exit(new FastqToSam().instanceMain(argv)); } /** * Get a list of FASTQs that are sequentially numbered based on the first (base) fastq. * The files should be named: * <prefix>_001.<extension>, <prefix>_002.<extension>, ..., <prefix>_XYZ.<extension> * The base files should be: * <prefix>_001.<extension> * An example would be: * RUNNAME_S8_L005_R1_001.fastq * RUNNAME_S8_L005_R1_002.fastq * RUNNAME_S8_L005_R1_003.fastq * RUNNAME_S8_L005_R1_004.fastq * where `baseFastq` is the first in that list. */ protected static List<File> getSequentialFileList(final File baseFastq) { final List<File> files = new ArrayList<File>(); files.add(baseFastq); // Find the correct extension used in the base FASTQ FastqExtensions fastqExtensions = null; String suffix = null; // store the suffix including the extension for (final FastqExtensions ext : FastqExtensions.values()) { suffix = "_001" + ext.getExtension(); if (baseFastq.getAbsolutePath().endsWith(suffix)) { fastqExtensions = ext; break; } } if (null == fastqExtensions) { throw new PicardException(String.format("Could not parse the FASTQ extension (expected '_001' + '%s'): %s", FastqExtensions.values().toString(), baseFastq)); } // Find all the files for (int idx = 2; true; idx++) { String fastq = baseFastq.getAbsolutePath(); fastq = String.format("%s_%03d%s", fastq.substring(0, fastq.length() - suffix.length()), idx, fastqExtensions.getExtension()); try { IOUtil.assertFileIsReadable(new File(fastq)); } catch (final SAMException e) { // the file is not readable, so do not continue break; } files.add(new File(fastq)); } return files; } /* Simply invokes the right method for unpaired or paired data. */ protected int doWork() { IOUtil.assertFileIsReadable(FASTQ); if (FASTQ2 != null) { IOUtil.assertFileIsReadable(FASTQ2); } IOUtil.assertFileIsWritable(OUTPUT); final SAMFileHeader header = createSamFileHeader(); final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); // Set the quality format QUALITY_FORMAT = FastqToSam.determineQualityFormat(fileToFastqReader(FASTQ), (FASTQ2 == null) ? null : fileToFastqReader(FASTQ2), QUALITY_FORMAT); // Lists for sequential files, but also used when not sequential final List<FastqReader> readers1 = new ArrayList<FastqReader>(); final List<FastqReader> readers2 = new ArrayList<FastqReader>(); if (USE_SEQUENTIAL_FASTQS) { // Get all the files for (final File fastq : getSequentialFileList(FASTQ)) { readers1.add(fileToFastqReader(fastq)); } if (null != FASTQ2) { for (final File fastq : getSequentialFileList(FASTQ2)) { readers2.add(fileToFastqReader(fastq)); } if (readers1.size() != readers2.size()) { throw new PicardException(String.format("Found %d files for FASTQ and %d files for FASTQ2.", readers1.size(), readers2.size())); } } } else { readers1.add(fileToFastqReader(FASTQ)); if (FASTQ2 != null) { readers2.add(fileToFastqReader(FASTQ2)); } } // Loop through the FASTQs for (int idx = 0; idx < readers1.size(); idx++) { makeItSo(readers1.get(idx), (readers2.isEmpty()) ? null : readers2.get(idx), writer); } // Close all the things for (final FastqReader reader : readers1) reader.close(); for (final FastqReader reader : readers2) reader.close(); writer.close(); return 0; } /** * Handles the FastqToSam execution on the FastqReader(s). * * In some circumstances it might be useful to circumvent the command line based instantiation of this * class, however note that there is no handholding or guardrails to running in this manner. * * It is the caller's responsibility to close the reader(s) * * @param reader1 The FastqReader for the first fastq file * @param reader2 The second FastqReader if applicable. Pass in null if only using a single reader * @param writer The SAMFileWriter where the new SAM file is written * */ public void makeItSo(final FastqReader reader1, final FastqReader reader2, final SAMFileWriter writer) { final int readCount = (reader2 == null) ? doUnpaired(reader1, writer) : doPaired(reader1, reader2, writer); LOG.info("Processed " + readCount + " fastq reads"); } /** Creates a simple SAM file from a single fastq file. */ protected int doUnpaired(final FastqReader freader, final SAMFileWriter writer) { int readCount = 0; final ProgressLogger progress = new ProgressLogger(LOG); for ( ; freader.hasNext() ; readCount++) { final FastqRecord frec = freader.next(); final SAMRecord srec = createSamRecord(writer.getFileHeader(), SequenceUtil.getSamReadNameFromFastqHeader(frec.getReadHeader()) , frec, false) ; srec.setReadPairedFlag(false); writer.addAlignment(srec); progress.record(srec); } return readCount; } /** More complicated method that takes two fastq files and builds pairing information in the SAM. */ protected int doPaired(final FastqReader freader1, final FastqReader freader2, final SAMFileWriter writer) { int readCount = 0; final ProgressLogger progress = new ProgressLogger(LOG); for ( ; freader1.hasNext() && freader2.hasNext() ; readCount++) { final FastqRecord frec1 = freader1.next(); final FastqRecord frec2 = freader2.next(); final String frec1Name = SequenceUtil.getSamReadNameFromFastqHeader(frec1.getReadHeader()); final String frec2Name = SequenceUtil.getSamReadNameFromFastqHeader(frec2.getReadHeader()); final String baseName = getBaseName(frec1Name, frec2Name, freader1, freader2); final SAMRecord srec1 = createSamRecord(writer.getFileHeader(), baseName, frec1, true) ; srec1.setFirstOfPairFlag(true); srec1.setSecondOfPairFlag(false); writer.addAlignment(srec1); progress.record(srec1); final SAMRecord srec2 = createSamRecord(writer.getFileHeader(), baseName, frec2, true) ; srec2.setFirstOfPairFlag(false); srec2.setSecondOfPairFlag(true); writer.addAlignment(srec2); progress.record(srec2); } if (freader1.hasNext() || freader2.hasNext()) { throw new PicardException("Input paired fastq files must be the same length"); } return readCount; } private FastqReader fileToFastqReader(final File file) { return new FastqReader(file, ALLOW_AND_IGNORE_EMPTY_LINES); } private SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired) { final SAMRecord srec = new SAMRecord(header); srec.setReadName(baseName); srec.setReadString(frec.getReadString()); srec.setReadUnmappedFlag(true); srec.setAttribute(ReservedTagConstants.READ_GROUP_ID, READ_GROUP_NAME); final byte[] quals = StringUtil.stringToBytes(frec.getBaseQualityString()); convertQuality(quals, QUALITY_FORMAT); for (final byte qual : quals) { final int uQual = qual & 0xff; if (uQual < MIN_Q || uQual > MAX_Q) { throw new PicardException("Base quality " + uQual + " is not in the range " + MIN_Q + ".." + MAX_Q + " for read " + frec.getReadHeader()); } } srec.setBaseQualities(quals); if (paired) { srec.setReadPairedFlag(true); srec.setMateUnmappedFlag(true); } return srec ; } /** Creates a simple header with the values provided on the command line. */ public SAMFileHeader createSamFileHeader() { final SAMReadGroupRecord rgroup = new SAMReadGroupRecord(this.READ_GROUP_NAME); rgroup.setSample(this.SAMPLE_NAME); if (this.LIBRARY_NAME != null) rgroup.setLibrary(this.LIBRARY_NAME); if (this.PLATFORM != null) rgroup.setPlatform(this.PLATFORM); if (this.PLATFORM_UNIT != null) rgroup.setPlatformUnit(this.PLATFORM_UNIT); if (this.SEQUENCING_CENTER != null) rgroup.setSequencingCenter(SEQUENCING_CENTER); if (this.PREDICTED_INSERT_SIZE != null) rgroup.setPredictedMedianInsertSize(PREDICTED_INSERT_SIZE); if (this.DESCRIPTION != null) rgroup.setDescription(this.DESCRIPTION); if (this.RUN_DATE != null) rgroup.setRunDate(this.RUN_DATE); if (this.PLATFORM_MODEL != null) rgroup.setPlatformModel(this.PLATFORM_MODEL); if (this.PROGRAM_GROUP != null) rgroup.setProgramGroup(this.PROGRAM_GROUP); final SAMFileHeader header = new SAMFileHeader(); header.addReadGroup(rgroup); for (final String comment : COMMENT) { header.addComment(comment); } header.setSortOrder(this.SORT_ORDER); return header ; } /** Based on the type of quality scores coming in, converts them to a numeric byte[] in phred scale. */ void convertQuality(final byte[] quals, final FastqQualityFormat version) { switch (version) { case Standard: SAMUtils.fastqToPhred(quals); break ; case Solexa: solexaQualityConverter.convertSolexaQualityCharsToPhredBinary(quals); break ; case Illumina: solexaQualityConverter.convertSolexa_1_3_QualityCharsToPhredBinary(quals); break ; } } /** Returns read baseName and asserts correct pair read name format: * <ul> * <li> Paired reads must either have the exact same read names or they must contain at least one "/" * <li> and the First pair read name must end with "/1" and second pair read name ends with "/2" * <li> The baseName (read name part before the /) must be the same for both read names * <li> If the read names are exactly the same but end in "/2" or "/1" then an exception will be thrown * </ul> */ String getBaseName(final String readName1, final String readName2, final FastqReader freader1, final FastqReader freader2) { String [] toks = getReadNameTokens(readName1, 1, freader1); final String baseName1 = toks[0] ; final String num1 = toks[1] ; toks = getReadNameTokens(readName2, 2, freader2); final String baseName2 = toks[0] ; final String num2 = toks[1]; if (!baseName1.equals(baseName2)) { throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", baseName1,baseName2)); } final boolean num1Blank = StringUtil.isBlank(num1); final boolean num2Blank = StringUtil.isBlank(num2); if (num1Blank || num2Blank) { if(!num1Blank) throw new PicardException(error(freader1,"Pair 1 number is missing (" +readName1+ "). Both pair numbers must be present or neither.")); //num1 != blank and num2 == blank else if(!num2Blank) throw new PicardException(error(freader2, "Pair 2 number is missing (" +readName2+ "). Both pair numbers must be present or neither.")); //num1 == blank and num =2 != blank } else { if (!num1.equals("1")) throw new PicardException(error(freader1,"Pair 1 number must be 1 ("+readName1+")")); if (!num2.equals("2")) throw new PicardException(error(freader2,"Pair 2 number must be 2 ("+readName2+")")); } return baseName1 ; } /** Breaks up read name into baseName and number separated by the last / */ private String [] getReadNameTokens(final String readName, final int pairNum, final FastqReader freader) { if(readName.equals("")) throw new PicardException(error(freader,"Pair read name "+pairNum+" cannot be empty: "+readName)); final int idx = readName.lastIndexOf('/'); final String[] result = new String[2]; if (idx == -1) { result[0] = readName; result[1] = null; } else { result[1] = readName.substring(idx+1, readName.length()); // should be a 1 or 2 if(!result[1].equals("1") && !result[1].equals("2")) { //if not a 1 or 2 then names must be identical result[0] = readName; result[1] = null; } else { result[0] = readName.substring(0,idx); // baseName } } return result ; } /** Little utility to give error messages corresponding to line numbers in the input files. */ private String error(final FastqReader freader, final String str) { return str +" at line "+freader.getLineNumber() +" in file "+freader.getFile().getAbsolutePath(); } @Override protected String[] customCommandLineValidation() { if (MIN_Q < 0) return new String[]{"MIN_Q must be >= 0"}; if (MAX_Q > SAMUtils.MAX_PHRED_SCORE) return new String[]{"MAX_Q must be <= " + SAMUtils.MAX_PHRED_SCORE}; return null; } }