/* * The MIT License * * Copyright (c) 2013 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.illumina; import htsjdk.samtools.SAMRecordQueryNameComparator; import htsjdk.samtools.SAMUtils; import htsjdk.samtools.fastq.BasicFastqWriter; import htsjdk.samtools.fastq.FastqReader; import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.fastq.FastqWriter; import htsjdk.samtools.fastq.FastqWriterFactory; import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.SortingCollection; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Illumina; import picard.cmdline.StandardOptionDefinitions; import picard.fastq.Casava18ReadNameEncoder; import picard.fastq.IlluminaReadNameEncoder; import picard.fastq.ReadNameEncoder; import picard.illumina.parser.ClusterData; import picard.illumina.parser.ReadData; import picard.illumina.parser.ReadStructure; import picard.illumina.parser.readers.BclQualityEvaluationStrategy; import picard.util.IlluminaUtil; import picard.util.TabbedTextFileWithHeaderParser; import java.io.BufferedReader; import java.io.File; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; @CommandLineProgramProperties( usage = "Generate fastq file(s) from data in an Illumina basecalls output directory.\n" + "Separate fastq file(s) are created for each template read, and for each barcode read, in the basecalls.\n" + "Template fastqs have extensions like .<number>.fastq, where <number> is the number of the template read,\n" + "starting with 1. Barcode fastqs have extensions like .barcode_<number>.fastq, where <number> is the number\n" + "of the barcode read, starting with 1.", usageShort = "Generate fastq file(s) from data in an Illumina basecalls output directory", programGroup = Illumina.class ) public class IlluminaBasecallsToFastq extends CommandLineProgram { // The following attributes define the command-line arguments @Option(doc = "The basecalls directory. ", shortName = "B") public File BASECALLS_DIR; @Option(doc = "The barcodes directory with _barcode.txt files (generated by ExtractIlluminaBarcodes). If not set, use BASECALLS_DIR. ", shortName = "BCD", optional = true) public File BARCODES_DIR; @Option(doc = "Lane number. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME) public Integer LANE; @Option(doc = "The prefix for output fastqs. Extensions as described above are appended. Use this option for " + "a non-barcoded run, or for a barcoded run in which it is not desired to demultiplex reads into separate " + "files by barcode.", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, mutex = {"MULTIPLEX_PARAMS"}) public File OUTPUT_PREFIX; @Option(doc = "The barcode of the run. Prefixed to read names.", optional = false) public String RUN_BARCODE; @Option(doc = "The name of the machine on which the run was sequenced; required if emitting Casava1.8-style read name headers", optional = true) public String MACHINE_NAME; @Option(doc = "The barcode of the flowcell that was sequenced; required if emitting Casava1.8-style read name headers", optional = true) public String FLOWCELL_BARCODE; @Option(doc = ReadStructure.PARAMETER_DOC, shortName = "RS") public String READ_STRUCTURE; @Option(doc = "Tab-separated file for creating all output fastqs demultiplexed by barcode for a lane with single " + "IlluminaBasecallsToFastq invocation. The columns are OUTPUT_PREFIX, and BARCODE_1, BARCODE_2 ... BARCODE_X " + "where X = number of barcodes per cluster (optional). Row with BARCODE_1 set to 'N' is used to specify " + "an output_prefix for no barcode match.", mutex = {"OUTPUT_PREFIX"}) public File MULTIPLEX_PARAMS; @Option(doc = "Which adapters to look for in the read.") public List<IlluminaUtil.IlluminaAdapterPair> ADAPTERS_TO_CHECK = new ArrayList<IlluminaUtil.IlluminaAdapterPair>( Arrays.asList(IlluminaUtil.IlluminaAdapterPair.INDEXED, IlluminaUtil.IlluminaAdapterPair.DUAL_INDEXED, IlluminaUtil.IlluminaAdapterPair.NEXTERA_V2, IlluminaUtil.IlluminaAdapterPair.FLUIDIGM)); @Option(doc = "The number of threads to run in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " + "the number of cores available on the machine. If NUM_PROCESSORS < 0, then the number of cores used will" + " be the number available on the machine less NUM_PROCESSORS.") public Integer NUM_PROCESSORS = 0; @Option(doc = "If set, this is the first tile to be processed (used for debugging). Note that tiles are not processed" + " in numerical order.", optional = true) public Integer FIRST_TILE; @Option(doc = "If set, process no more than this many tiles (used for debugging).", optional = true) public Integer TILE_LIMIT; @Option(doc="Apply EAMSS filtering to identify inappropriately quality scored bases towards the ends of reads" + " and convert their quality scores to Q2.") public boolean APPLY_EAMSS_FILTER = true; @Option(doc = "If true, call System.gc() periodically. This is useful in cases in which the -Xmx value passed " + "is larger than the available memory.") public Boolean FORCE_GC = true; @Option(doc = "Configure SortingCollections to store this many records before spilling to disk. For an indexed" + " run, each SortingCollection gets this value/number of indices.") public int MAX_READS_IN_RAM_PER_TILE = 1200000; @Option(doc="The minimum quality (after transforming 0s to 1s) expected from reads. If qualities are lower than this value, an error is thrown." + "The default of 2 is what the Illumina's spec describes as the minimum, but in practice the value has been observed lower.") public int MINIMUM_QUALITY = BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY; @Option(doc="Whether to include non-PF reads", shortName="NONPF", optional=true) public boolean INCLUDE_NON_PF_READS = true; @Option(doc="The read name header formatting to emit. Casava1.8 formatting has additional information beyond Illumina, including: " + "the passing-filter flag value for the read, the flowcell name, and the sequencer name.", optional = false) public ReadNameFormat READ_NAME_FORMAT = ReadNameFormat.CASAVA_1_8; @Option(shortName = "GZIP", doc = "Compress output FASTQ files using gzip and append a .gz extension to the file names.") public boolean COMPRESS_OUTPUTS = false; /** Simple switch to control the read name format to emit. */ public enum ReadNameFormat { CASAVA_1_8, ILLUMINA } private final Map<String, FastqRecordsWriter> barcodeFastqWriterMap = new HashMap<String, FastqRecordsWriter>(); private ReadStructure readStructure; IlluminaBasecallsConverter<FastqRecordsForCluster> basecallsConverter; private static final Log log = Log.getInstance(IlluminaBasecallsToFastq.class); private final FastqWriterFactory fastqWriterFactory = new FastqWriterFactory(); private ReadNameEncoder readNameEncoder; private static final Comparator<FastqRecordsForCluster> queryNameComparator = new Comparator<FastqRecordsForCluster>() { @Override public int compare(final FastqRecordsForCluster r1, final FastqRecordsForCluster r2) { return SAMRecordQueryNameComparator.compareReadNames(r1.templateRecords[0].getReadHeader(), r2.templateRecords[0].getReadHeader()); } }; @Override protected int doWork() { initialize(); basecallsConverter.doTileProcessing(); return 0; } @Override protected String[] customCommandLineValidation() { final LinkedList<String> errors = new LinkedList<String>(); if (READ_NAME_FORMAT == ReadNameFormat.CASAVA_1_8 && MACHINE_NAME == null) { errors.add("MACHINE_NAME is required when using Casava1.8-style read name headers."); } if (READ_NAME_FORMAT == ReadNameFormat.CASAVA_1_8 && FLOWCELL_BARCODE == null) { errors.add("FLOWCELL_BARCODE is required when using Casava1.8-style read name headers."); } if (errors.isEmpty()) { return null; } else { return errors.toArray(new String[errors.size()]); } } /** * Prepares loggers, initiates garbage collection thread, parses arguments and initialized variables appropriately/ */ private void initialize() { fastqWriterFactory.setCreateMd5(CREATE_MD5_FILE); switch (READ_NAME_FORMAT) { case CASAVA_1_8: readNameEncoder = new Casava18ReadNameEncoder(MACHINE_NAME, RUN_BARCODE, FLOWCELL_BARCODE); break; case ILLUMINA: readNameEncoder = new IlluminaReadNameEncoder(RUN_BARCODE); break; } final BclQualityEvaluationStrategy bclQualityEvaluationStrategy = new BclQualityEvaluationStrategy(MINIMUM_QUALITY); readStructure = new ReadStructure(READ_STRUCTURE); if (MULTIPLEX_PARAMS != null) { IOUtil.assertFileIsReadable(MULTIPLEX_PARAMS); } final boolean demultiplex; if (OUTPUT_PREFIX != null) { barcodeFastqWriterMap.put(null, buildWriter(OUTPUT_PREFIX)); demultiplex = false; } else { populateWritersFromMultiplexParams(); demultiplex = true; } final int readsPerCluster = readStructure.templates.length() + readStructure.barcodes.length(); basecallsConverter = new IlluminaBasecallsConverter<FastqRecordsForCluster>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure, barcodeFastqWriterMap, demultiplex, MAX_READS_IN_RAM_PER_TILE/readsPerCluster, TMP_DIR, NUM_PROCESSORS, FORCE_GC, FIRST_TILE, TILE_LIMIT, queryNameComparator, new FastqRecordsForClusterCodec(readStructure.templates.length(), readStructure.barcodes.length()), FastqRecordsForCluster.class, bclQualityEvaluationStrategy, this.APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS); log.info("READ STRUCTURE IS " + readStructure.toString()); basecallsConverter.setConverter( new ClusterToFastqRecordsForClusterConverter( basecallsConverter.getFactory().getOutputReadStructure())); } /** * Assert that expectedCols are present * * @param actualCols The columns present in the MULTIPLEX_PARAMS file * @param expectedCols The columns that are REQUIRED */ private void assertExpectedColumns(final Set<String> actualCols, final Set<String> expectedCols) { final Set<String> missingColumns = new HashSet<String>(expectedCols); missingColumns.removeAll(actualCols); if (missingColumns.size() > 0) { throw new PicardException(String.format( "MULTIPLEX_PARAMS file %s is missing the following columns: %s.", MULTIPLEX_PARAMS.getAbsolutePath(), StringUtil.join(", ", missingColumns ))); } } /** * For each line in the MULTIPLEX_PARAMS file create a FastqRecordsWriter and put it in the barcodeFastqWriterMap map, * where the key to the map is the concatenation of all barcodes in order for the given line. */ private void populateWritersFromMultiplexParams() { final TabbedTextFileWithHeaderParser libraryParamsParser = new TabbedTextFileWithHeaderParser(MULTIPLEX_PARAMS); final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT_PREFIX"); final List<String> barcodeColumnLabels = new ArrayList<String>(); for (int i = 1; i <= readStructure.barcodes.length(); i++) { barcodeColumnLabels.add("BARCODE_" + i); } expectedColumnLabels.addAll(barcodeColumnLabels); assertExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels); for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) { List<String> barcodeValues = null; if (barcodeColumnLabels.size() > 0) { barcodeValues = new ArrayList<String>(); for (final String barcodeLabel : barcodeColumnLabels) { barcodeValues.add(row.getField(barcodeLabel)); } } final String key = (barcodeValues == null || barcodeValues.contains("N")) ? null : StringUtil.join("", barcodeValues); if (barcodeFastqWriterMap.containsKey(key)) { //This will catch the case of having more than 1 line in a non-barcoded MULTIPLEX_PARAMS file throw new PicardException("Row for barcode " + key + " appears more than once in MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS); } final FastqRecordsWriter writer = buildWriter(new File(row.getField("OUTPUT_PREFIX"))); barcodeFastqWriterMap.put(key, writer); } if (barcodeFastqWriterMap.isEmpty()) { throw new PicardException("MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS + " does have any data rows."); } libraryParamsParser.close(); } /** * @return FastqRecordsWriter that contains one or more FastqWriters (amount depends on read structure), all using * outputPrefix to determine the filename(s). */ private FastqRecordsWriter buildWriter(final File outputPrefix) { final File outputDir = outputPrefix.getAbsoluteFile().getParentFile(); IOUtil.assertDirectoryIsWritable(outputDir); final String prefixString = outputPrefix.getName(); final String suffixString = COMPRESS_OUTPUTS ? "fastq.gz" : "fastq"; final FastqWriter[] templateWriters = new FastqWriter[readStructure.templates.length()]; final FastqWriter[] barcodeWriters = new FastqWriter[readStructure.barcodes.length()]; for (int i = 0; i < templateWriters.length; ++i) { final String filename = String.format("%s.%d.%s", prefixString, i+1, suffixString); templateWriters[i] = fastqWriterFactory.newWriter(new File(outputDir, filename)); } for (int i = 0; i < barcodeWriters.length; ++i) { final String filename = String.format("%s.barcode_%d.%s", prefixString, i+1, suffixString); barcodeWriters[i] = fastqWriterFactory.newWriter(new File(outputDir, filename)); } return new FastqRecordsWriter(templateWriters, barcodeWriters); } public static void main(final String[] args) { new IlluminaBasecallsToFastq().instanceMainWithExit(args); } /** * Container for various FastqWriters, one for each template read and one for each barcode read. */ private static class FastqRecordsWriter implements IlluminaBasecallsConverter.ConvertedClusterDataWriter<FastqRecordsForCluster> { final FastqWriter[] templateWriters; final FastqWriter[] barcodeWriters; /** * @param templateWriters Writers for template reads in order, e,g. 0th element is for template read 1. * @param barcodeWriters Writers for barcode reads in order, e,g. 0th element is for barcode read 1. */ private FastqRecordsWriter(final FastqWriter[] templateWriters, final FastqWriter[] barcodeWriters) { this.templateWriters = templateWriters; this.barcodeWriters = barcodeWriters; } @Override public void write(final FastqRecordsForCluster records) { write(templateWriters, records.templateRecords); write(barcodeWriters, records.barcodeRecords); } private void write(final FastqWriter[] writers, final FastqRecord[] records) { for (int i = 0; i < writers.length; ++i) { writers[i].write(records[i]); } } @Override public void close() { for (final FastqWriter writer : templateWriters) { writer.close(); } for (final FastqWriter writer : barcodeWriters) { writer.close(); } } } /** * Contains the results of transforming one cluster into the record(s) to be written to output file(s). */ static class FastqRecordsForCluster { // These are accessed directly by converter and writer rather than through getters and setters. final FastqRecord[] templateRecords; final FastqRecord[] barcodeRecords; FastqRecordsForCluster(final int numTemplates, final int numBarcodes) { templateRecords = new FastqRecord[numTemplates]; barcodeRecords = new FastqRecord[numBarcodes]; } } /** * Passed to IlluminaBaseCallsConverter to do the conversion from input format to output format. */ class ClusterToFastqRecordsForClusterConverter implements IlluminaBasecallsConverter.ClusterDataConverter<FastqRecordsForCluster> { private final int [] templateIndices; private final int [] barcodeIndices; ClusterToFastqRecordsForClusterConverter(final ReadStructure outputReadStructure) { this.templateIndices = outputReadStructure.templates.getIndices(); this.barcodeIndices = outputReadStructure.barcodes.getIndices(); } @Override public FastqRecordsForCluster convertClusterToOutputRecord(final ClusterData cluster) { final FastqRecordsForCluster ret = new FastqRecordsForCluster(readStructure.templates.length(), readStructure.barcodes.length()); final boolean appendReadNumberSuffix = ret.templateRecords.length > 1; makeFastqRecords(ret.templateRecords, templateIndices, cluster, appendReadNumberSuffix); makeFastqRecords(ret.barcodeRecords, barcodeIndices, cluster, false); return ret; } private void makeFastqRecords(final FastqRecord[] recs, final int[] indices, final ClusterData cluster, final boolean appendReadNumberSuffix) { for (short i = 0; i < indices.length; ++i) { final ReadData readData = cluster.getRead(indices[i]); final String readBases = StringUtil.bytesToString(readData.getBases()).replace('.', 'N'); final String readName = readNameEncoder.generateReadName(cluster, appendReadNumberSuffix ? i + 1 : null); recs[i] = new FastqRecord( readName, readBases, null, SAMUtils.phredToFastq(readData.getQualities()) ); } } } /** * Coded passed to IlluminaBasecallsConverter for use in SortingCollections of output records. */ static class FastqRecordsForClusterCodec implements SortingCollection.Codec<FastqRecordsForCluster> { private final int numTemplates; private final int numBarcodes; private BasicFastqWriter writer = null; private FastqReader reader = null; FastqRecordsForClusterCodec(final int numTemplates, final int numBarcodes) { this.numTemplates = numTemplates; this.numBarcodes = numBarcodes; } @Override public void setOutputStream(final OutputStream os) { writer = new BasicFastqWriter(new PrintStream(os)); } @Override public void setInputStream(final InputStream is) { reader = new FastqReader(new BufferedReader(new InputStreamReader(is))); } @Override public void encode(final FastqRecordsForCluster val) { if (numTemplates != val.templateRecords.length) throw new IllegalStateException(); if (numBarcodes != val.barcodeRecords.length) throw new IllegalStateException(); encodeArray(val.templateRecords); encodeArray(val.barcodeRecords); writer.flush(); } private void encodeArray(final FastqRecord[] recs) { for (final FastqRecord rec: recs) { writer.write(rec); } } @Override public FastqRecordsForCluster decode() { if (!reader.hasNext()) return null; final FastqRecordsForCluster ret = new FastqRecordsForCluster(numTemplates, numBarcodes); decodeArray(ret.templateRecords); decodeArray(ret.barcodeRecords); return ret; } private void decodeArray(final FastqRecord[] recs) { for (int i = 0; i < recs.length; ++i) { recs[i] = reader.next(); } } @Override public SortingCollection.Codec<FastqRecordsForCluster> clone() { return new FastqRecordsForClusterCodec(numTemplates, numBarcodes); } } }