/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam; import htsjdk.samtools.ReservedTagConstants; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMUtils; import htsjdk.samtools.fastq.FastqReader; import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.util.FastqQualityFormat; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Iso8601Date; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.QualityEncodingDetector; import htsjdk.samtools.util.SolexaQualityConverter; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.SamOrBam; import java.io.File; import java.util.ArrayList; import java.util.List; /** * Converts a fastq file to an unaligned BAM/SAM format. * See <a href="http://maq.sourceforge.net/fastq.shtml">MAQ FastQ specification</a> for details. * Three fastq versions are supported: FastqSanger, FastqSolexa and FastqIllumina. * Input files can be in GZip format (end in .gz). */ @CommandLineProgramProperties( usage = "Extracts read sequences and qualities from the input fastq file and writes them into the output file in unaligned BAM format." + " Input files can be in GZip format (end in .gz).\n", usageShort = "Converts a fastq file to an unaligned BAM or SAM file", programGroup = SamOrBam.class ) public class FastqToSam extends CommandLineProgram { private static final Log LOG = Log.getInstance(FastqToSam.class); @Option(shortName="F1", doc="Input fastq file (optionally gzipped) for single end data, or first read in paired end data.") public File FASTQ; @Option(shortName="F2", doc="Input fastq file (optionally gzipped) for the second read of paired end data.", optional=true) public File FASTQ2; @Option(shortName="V", doc="A value describing how the quality values are encoded in the fastq. Either Solexa for pre-pipeline 1.3 " + "style scores (solexa scaling + 66), Illumina for pipeline 1.3 and above (phred scaling + 64) or Standard for phred scaled " + "scores with a character shift of 33. If this value is not specified, the quality format will be detected automatically.", optional = true) public FastqQualityFormat QUALITY_FORMAT; @Option(doc="Output SAM/BAM file. ", shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME) public File OUTPUT ; @Option(shortName="RG", doc="Read group name") public String READ_GROUP_NAME = "A"; @Option(shortName="SM", doc="Sample name to insert into the read group header") public String SAMPLE_NAME; @Option(shortName="LB", doc="The library name to place into the LB attribute in the read group header", optional=true) public String LIBRARY_NAME; @Option(shortName="PU", doc="The platform unit (often run_barcode.lane) to insert into the read group header", optional=true) public String PLATFORM_UNIT; @Option(shortName="PL", doc="The platform type (e.g. illumina, solid) to insert into the read group header", optional=true) public String PLATFORM; @Option(shortName="CN", doc="The sequencing center from which the data originated", optional=true) public String SEQUENCING_CENTER; @Option(shortName = "PI", doc = "Predicted median insert size, to insert into the read group header", optional = true) public Integer PREDICTED_INSERT_SIZE; @Option(doc="Comment(s) to include in the merged output file's header.", optional=true, shortName="CO") public List<String> COMMENT = new ArrayList<String>(); @Option(shortName = "DS", doc = "Inserted into the read group header", optional = true) public String DESCRIPTION; @Option(shortName = "DT", doc = "Date the run was produced, to insert into the read group header", optional = true) public Iso8601Date RUN_DATE; @Option(shortName="SO", doc="The sort order for the output sam/bam file.") public SortOrder SORT_ORDER = SortOrder.queryname; @Option(doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.") public int MIN_Q = 0; @Option(doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.") public int MAX_Q = SAMUtils.MAX_PHRED_SCORE; @Option(doc="If true and this is an unpaired fastq any occurance of '/1' will be removed from the end of a read name.") public Boolean STRIP_UNPAIRED_MATE_NUMBER = false; @Option(doc="Allow (and ignore) empty lines") public Boolean ALLOW_AND_IGNORE_EMPTY_LINES = false; private static final SolexaQualityConverter solexaQualityConverter = SolexaQualityConverter.getSingleton(); /** Stock main method. */ public static void main(final String[] argv) { System.exit(new FastqToSam().instanceMain(argv)); } /* Simply invokes the right method for unpaired or paired data. */ protected int doWork() { final QualityEncodingDetector detector = new QualityEncodingDetector(); final FastqReader reader = new FastqReader(FASTQ,ALLOW_AND_IGNORE_EMPTY_LINES); if (FASTQ2 == null) { detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader); } else { final FastqReader reader2 = new FastqReader(FASTQ2,ALLOW_AND_IGNORE_EMPTY_LINES); detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, reader, reader2); reader2.close(); } reader.close(); QUALITY_FORMAT = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, QUALITY_FORMAT); if (detector.isDeterminationAmbiguous()) LOG.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities."); LOG.info(String.format("Auto-detected quality format as: %s.", QUALITY_FORMAT)); final int readCount = (FASTQ2 == null) ? doUnpaired() : doPaired(); LOG.info("Processed " + readCount + " fastq reads"); return 0; } /** Creates a simple SAM file from a single fastq file. */ protected int doUnpaired() { IOUtil.assertFileIsReadable(FASTQ); IOUtil.assertFileIsWritable(OUTPUT); final FastqReader freader = new FastqReader(FASTQ,ALLOW_AND_IGNORE_EMPTY_LINES); final SAMFileHeader header = createFileHeader(); final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); int readCount = 0; final ProgressLogger progress = new ProgressLogger(LOG); for ( ; freader.hasNext() ; readCount++) { final FastqRecord frec = freader.next(); final SAMRecord srec = createSamRecord(header, getReadName(frec.getReadHeader(), false) , frec, false) ; srec.setReadPairedFlag(false); writer.addAlignment(srec); progress.record(srec); } writer.close(); return readCount; } /** More complicated method that takes two fastq files and builds pairing information in the SAM. */ protected int doPaired() { IOUtil.assertFileIsReadable(FASTQ); IOUtil.assertFileIsReadable(FASTQ2); IOUtil.assertFileIsWritable(OUTPUT); final FastqReader freader1 = new FastqReader(FASTQ,ALLOW_AND_IGNORE_EMPTY_LINES); final FastqReader freader2 = new FastqReader(FASTQ2,ALLOW_AND_IGNORE_EMPTY_LINES); final SAMFileHeader header = createFileHeader() ; final SAMFileWriter writer = (new SAMFileWriterFactory()).makeSAMOrBAMWriter(header, false, OUTPUT); int readCount = 0; final ProgressLogger progress = new ProgressLogger(LOG); for ( ; freader1.hasNext() && freader2.hasNext() ; readCount++) { final FastqRecord frec1 = freader1.next(); final FastqRecord frec2 = freader2.next(); final String frec1Name = getReadName(frec1.getReadHeader(), true); final String frec2Name = getReadName(frec2.getReadHeader(), true); final String baseName = getBaseName(frec1Name, frec2Name, freader1, freader2); final SAMRecord srec1 = createSamRecord(header, baseName, frec1, true) ; srec1.setFirstOfPairFlag(true); srec1.setSecondOfPairFlag(false); writer.addAlignment(srec1); progress.record(srec1); final SAMRecord srec2 = createSamRecord(header, baseName, frec2, true) ; srec2.setFirstOfPairFlag(false); srec2.setSecondOfPairFlag(true); writer.addAlignment(srec2); progress.record(srec2); } writer.close(); if (freader1.hasNext() || freader2.hasNext()) { throw new PicardException("Input paired fastq files must be the same length"); } return readCount; } private SAMRecord createSamRecord(final SAMFileHeader header, final String baseName, final FastqRecord frec, final boolean paired) { final SAMRecord srec = new SAMRecord(header); srec.setReadName(baseName); srec.setReadString(frec.getReadString()); srec.setReadUnmappedFlag(true); srec.setAttribute(ReservedTagConstants.READ_GROUP_ID, READ_GROUP_NAME); final byte[] quals = StringUtil.stringToBytes(frec.getBaseQualityString()); convertQuality(quals, QUALITY_FORMAT); for (final byte qual : quals) { final int uQual = qual & 0xff; if (uQual < MIN_Q || uQual > MAX_Q) { throw new PicardException("Base quality " + uQual + " is not in the range " + MIN_Q + ".." + MAX_Q + " for read " + frec.getReadHeader()); } } srec.setBaseQualities(quals); if (paired) { srec.setReadPairedFlag(true); srec.setMateUnmappedFlag(true); } return srec ; } /** Creates a simple header with the values provided on the command line. */ private SAMFileHeader createFileHeader() { final SAMReadGroupRecord rgroup = new SAMReadGroupRecord(this.READ_GROUP_NAME); rgroup.setSample(this.SAMPLE_NAME); if (this.LIBRARY_NAME != null) rgroup.setLibrary(this.LIBRARY_NAME); if (this.PLATFORM != null) rgroup.setPlatform(this.PLATFORM); if (this.PLATFORM_UNIT != null) rgroup.setPlatformUnit(this.PLATFORM_UNIT); if (this.SEQUENCING_CENTER != null) rgroup.setSequencingCenter(SEQUENCING_CENTER); if (this.PREDICTED_INSERT_SIZE != null) rgroup.setPredictedMedianInsertSize(PREDICTED_INSERT_SIZE); if (this.DESCRIPTION != null) rgroup.setDescription(this.DESCRIPTION); if (this.RUN_DATE != null) rgroup.setRunDate(this.RUN_DATE); final SAMFileHeader header = new SAMFileHeader(); header.addReadGroup(rgroup); for (final String comment : COMMENT) { header.addComment(comment); } header.setSortOrder(this.SORT_ORDER); return header ; } /** Based on the type of quality scores coming in, converts them to a numeric byte[] in phred scale. */ void convertQuality(final byte[] quals, final FastqQualityFormat version) { switch (version) { case Standard: SAMUtils.fastqToPhred(quals); break ; case Solexa: solexaQualityConverter.convertSolexaQualityCharsToPhredBinary(quals); break ; case Illumina: solexaQualityConverter.convertSolexa_1_3_QualityCharsToPhredBinary(quals); break ; } } /** Returns read baseName and asserts correct pair read name format: * <ul> * <li> Paired reads must either have the exact same read names or they must contain at least one "/" * <li> and the First pair read name must end with "/1" and second pair read name ends with "/2" * <li> The baseName (read name part before the /) must be the same for both read names * <li> If the read names are exactly the same but end in "/2" or "/1" then an exception will be thrown * </ul> */ String getBaseName(final String readName1, final String readName2, final FastqReader freader1, final FastqReader freader2) { String [] toks = getReadNameTokens(readName1, 1, freader1); final String baseName1 = toks[0] ; final String num1 = toks[1] ; toks = getReadNameTokens(readName2, 2, freader2); final String baseName2 = toks[0] ; final String num2 = toks[1]; if (!baseName1.equals(baseName2)) { throw new PicardException(String.format("In paired mode, read name 1 (%s) does not match read name 2 (%s)", baseName1,baseName2)); } final boolean num1Blank = StringUtil.isBlank(num1); final boolean num2Blank = StringUtil.isBlank(num2); if (num1Blank || num2Blank) { if(!num1Blank) throw new PicardException(error(freader1,"Pair 1 number is missing (" +readName1+ "). Both pair numbers must be present or neither.")); //num1 != blank and num2 == blank else if(!num2Blank) throw new PicardException(error(freader2, "Pair 2 number is missing (" +readName2+ "). Both pair numbers must be present or neither.")); //num1 == blank and num =2 != blank } else { if (!num1.equals("1")) throw new PicardException(error(freader1,"Pair 1 number must be 1 ("+readName1+")")); if (!num2.equals("2")) throw new PicardException(error(freader2,"Pair 2 number must be 2 ("+readName2+")")); } return baseName1 ; } /** Breaks up read name into baseName and number separated by the last / */ private String [] getReadNameTokens(final String readName, final int pairNum, final FastqReader freader) { if(readName.equals("")) throw new PicardException(error(freader,"Pair read name "+pairNum+" cannot be empty: "+readName)); final int idx = readName.lastIndexOf("/"); final String result[] = new String[2]; if (idx == -1) { result[0] = readName; result[1] = null; } else { result[1] = readName.substring(idx+1, readName.length()); // should be a 1 or 2 if(!result[1].equals("1") && !result[1].equals("2")) { //if not a 1 or 2 then names must be identical result[0] = readName; result[1] = null; } else { result[0] = readName.substring(0,idx); // baseName } } return result ; } /** Little utility to give error messages corresponding to line numbers in the input files. */ private String error(final FastqReader freader, final String str) { return str +" at line "+freader.getLineNumber() +" in file "+freader.getFile().getAbsolutePath(); } // Read names cannot contain blanks private String getReadName(final String fastqHeader, final boolean paired) { final int idx = fastqHeader.indexOf(" "); String readName = (idx == -1) ? fastqHeader : fastqHeader.substring(0,idx); // NOTE: the while loop isn't necessarily the most efficient way to handle this but we don't // expect this to ever happen more than once, just trapping pathological cases while (STRIP_UNPAIRED_MATE_NUMBER && !paired && readName.endsWith("/1")) { // If this is an unpaired run we want to make sure that "/1" isn't tacked on the end of the read name, // as this can cause problems down the road in MergeBamAlignment readName = readName.substring(0, readName.length() - 2); } return readName; } @Override protected String[] customCommandLineValidation() { if (MIN_Q < 0) return new String[]{"MIN_Q must be >= 0"}; if (MAX_Q > SAMUtils.MAX_PHRED_SCORE) return new String[]{"MAX_Q must be <= " + SAMUtils.MAX_PHRED_SCORE}; return null; } }