package htsjdk.samtools.cram.encoding.reader; import java.io.File; import java.util.zip.Deflater; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.util.BlockCompressedOutputStream; import htsjdk.samtools.util.Log; import net.sf.cram.Cram2Fastq; import net.sf.cram.CramTools.LevelConverter; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import com.beust.jcommander.converters.FileConverter; public class BAMNameCollate extends NameCollate<BAMRead> { private SAMFileWriter[] writers; private SAMFileWriter overspillWriter; private long kicked = 0, ready = 0, total = 0; public BAMNameCollate() { super(); } public BAMNameCollate(SAMFileWriter[] writers, SAMFileWriter overspillWriter) { super(); this.writers = writers; this.overspillWriter = overspillWriter; } @Override protected boolean needsCollating(BAMRead read) { return read.getRecord().getReadPairedFlag(); } public int getStreamIndex(SAMRecord record) { if (!record.getReadPairedFlag()) return 0; if (record.getFirstOfPairFlag()) return 1; if (record.getSecondOfPairFlag()) return 2; return 0; } @Override protected void ready(BAMRead read) { int streamIndex = getStreamIndex(read.getRecord()); writers[streamIndex].addAlignment(read.getRecord()); ready++; } @Override protected void kickedFromCache(BAMRead read) { overspillWriter.addAlignment(read.getRecord()); kicked++; } @Override public void add(BAMRead read) { super.add(read); total++; } public String report() { return String.format("ready: %d, kicked %d, total: %d.", ready, kicked, total); } private static Log log = Log.getInstance(BAMNameCollate.class); public static final String COMMAND = "fastq"; private static void printUsage(JCommander jc) { StringBuilder sb = new StringBuilder(); sb.append("\n"); jc.usage(sb); System.out.println("Version " + Cram2Fastq.class.getPackage().getImplementationVersion()); System.out.println(sb.toString()); } public static void main(String[] args) throws Exception { Params params = new Params(); JCommander jc = new JCommander(params); try { jc.parse(args); } catch (Exception e) { System.out.println("Failed to parse parameteres, detailed message below: "); System.out.println(e.getMessage()); System.out.println(); System.out.println("See usage: -h"); System.exit(1); } if (args.length == 0 || params.help) { printUsage(jc); System.exit(1); } Log.setGlobalLogLevel(params.logLevel); long maxRecords = params.maxRecords; File overspillFile = new File(params.file.getAbsolutePath().replaceAll("\\.bam$", ".tmp.bam")); File file0 = new File(params.file.getAbsolutePath().replaceAll("\\.bam$", ".read0.bam")); File file1 = new File(params.file.getAbsolutePath().replaceAll("\\.bam$", ".read1.bam")); File file2 = new File(params.file.getAbsolutePath().replaceAll("\\.bam$", ".read2.bam")); SAMFileReader reader = new SAMFileReader(params.file); reader.getFileHeader().setSortOrder(SAMFileHeader.SortOrder.unsorted); BAMNameCollate collate = new BAMNameCollate(); collate.writers = new SAMFileWriter[3]; if (params.compression < Deflater.NO_COMPRESSION || params.compression > Deflater.BEST_COMPRESSION) { System.err.println("Invalid compression level, expecting an integer from 0 to 9. "); System.exit(1); } BlockCompressedOutputStream.setDefaultCompressionLevel(params.compression); SAMFileWriterFactory writerFactory = new SAMFileWriterFactory(); writerFactory.setUseAsyncIo(true); writerFactory.setAsyncOutputBufferSize(1024 * 100); collate.writers[0] = writerFactory.makeBAMWriter(reader.getFileHeader(), false, file0); collate.writers[1] = writerFactory.makeBAMWriter(reader.getFileHeader(), false, file1); collate.writers[2] = writerFactory.makeBAMWriter(reader.getFileHeader(), false, file2); collate.overspillWriter = writerFactory.makeBAMWriter(reader.getFileHeader(), false, overspillFile); long time = System.currentTimeMillis(); long total = 0; for (SAMRecord record : reader) { if (maxRecords > -1) { if (maxRecords == 0) break; maxRecords--; } collate.add(new BAMRead(record)); total++; if (System.currentTimeMillis() - time > 10 * 1000) { time = System.currentTimeMillis(); System.out.println(collate.report()); } } System.out.println("Read total records: " + total); reader.close(); collate.close(); collate.overspillWriter.close(); File sorted = new File("overspill_sorted.bam"); { // sorting overfill: reader = new SAMFileReader(overspillFile); SAMFileHeader header = reader.getFileHeader().clone(); header.setSortOrder(SAMFileHeader.SortOrder.queryname); SAMFileWriter writer = writerFactory.makeBAMWriter(header, false, sorted); long overfill = 0; System.out.println("Sorting overfill..."); for (SAMRecord record : reader) { writer.addAlignment(record); overfill++; } writer.close(); reader.close(); System.out.println("Sorted records: " + overfill); } { reader = new SAMFileReader(sorted); SAMRecordIterator iterator = reader.iterator(); long aftersort = 0; long aftersortMatch = 0; System.out.println("Sorting overfill..."); SAMRecord r1 = iterator.next(); SAMRecord r2 = null; while (iterator.hasNext()) { r2 = iterator.next(); aftersort++; if (r1.getReadName().equals(r2.getReadName())) { aftersortMatch++; collate.writers[collate.getStreamIndex(r1)].addAlignment(r1); collate.writers[collate.getStreamIndex(r2)].addAlignment(r2); if (!iterator.hasNext()) break; r1 = iterator.next(); r2 = null; } else { collate.writers[0].addAlignment(r1); r1 = r2; r2 = null; } } if (r1 != null) { aftersort++; collate.writers[0].addAlignment(r1); } reader.close(); System.out.println("Aftersort: " + aftersort); System.out.println("Aftersort matches: " + aftersortMatch); for (SAMFileWriter writer : collate.writers) writer.close(); } if (params.cleanup) { overspillFile.delete(); sorted.delete(); } } @Parameters(commandDescription = "Split reads.") static class Params { @Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = LevelConverter.class) Log.LogLevel logLevel = Log.LogLevel.ERROR; @Parameter(names = { "-h", "--help" }, description = "Print help and quit") boolean help = false; @Parameter(names = { "--input-file", "-I" }, converter = FileConverter.class) File file; @Parameter(names = { "--max-records" }, description = "Stop after reading this many records.") long maxRecords = -1; @Parameter(names = { "--compression" }, description = "BAM compression level. ") int compression = 5; @Parameter(names = { "--cleanup" }, description = "Remove tmp files.") boolean cleanup = false; } }