/******************************************************************************* * Copyright 2013 EMBL-EBI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package net.sf.cram; import htsjdk.samtools.Defaults; import htsjdk.samtools.IndexAggregate; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.cram.build.ContainerParser; import htsjdk.samtools.cram.build.Cram2SamRecordFactory; import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.cram.build.CramNormalizer; import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BlockCompressedOutputStream; import htsjdk.samtools.util.Log; import java.io.BufferedOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; import java.math.BigInteger; import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPOutputStream; import net.sf.cram.CramTools.LevelConverter; import net.sf.cram.FixBAMFileHeader.MD5MismatchError; import net.sf.cram.common.Utils; import net.sf.cram.ref.ReferenceRegion; import net.sf.cram.ref.ReferenceSource; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import com.beust.jcommander.converters.FileConverter; public class Cram2Bam { private static Log log = Log.getInstance(Cram2Bam.class); public static final String COMMAND = "bam"; private static void printUsage(JCommander jc) { StringBuilder sb = new StringBuilder(); sb.append("\n"); jc.usage(sb); System.out.println("Version " + Cram2Bam.class.getPackage().getImplementationVersion()); System.out.println(sb.toString()); } public static void main(String[] args) throws IOException, IllegalArgumentException, IllegalAccessException { Params params = new Params(); JCommander jc = new JCommander(params); try { jc.parse(args); } catch (Exception e) { System.out.println("Failed to parse parameteres, detailed message below: "); System.out.println(e.getMessage()); System.out.println(); System.out.println("See usage: -h"); System.exit(1); } if (args.length == 0 || params.help) { printUsage(jc); System.exit(1); } Log.setGlobalLogLevel(params.logLevel); if (params.reference == null) log.warn("No reference file specified, remote access over internet may be used to download public sequences. "); if (params.locations == null) params.locations = new ArrayList<String>(); InputStream is = null; try { is = Utils.openCramInputStream(params.cramURL, params.decrypt, params.password); // is = new BufferedInputStream(new // FileInputStream(params.cramURL)); } catch (Exception e2) { log.error("Failed to open CRAM from: " + params.cramURL, e2); System.exit(1); } CramHeader cramHeader = CramIO.readCramHeader(is); if (params.printSAMHeaderOnly) { System.out.println(cramHeader.getSamFileHeader().getTextHeader()); return; } ReferenceSource referenceSource = new ReferenceSource(params.reference); referenceSource.setDownloadTriesBeforeFailing(params.downloadTriesBeforeFailing); if (!params.countOnly) { FixBAMFileHeader fix = new FixBAMFileHeader(referenceSource); fix.setConfirmMD5(!params.skipMD5Checks); fix.setInjectURI(params.injectURI); fix.setIgnoreMD5Mismatch(params.ignoreMD5Mismatch); try { log.info("Preparing the header..."); fix.fixSequences(cramHeader.getSamFileHeader().getSequenceDictionary().getSequences()); } catch (MD5MismatchError e) { log.error(e.getMessage()); System.exit(1); } fix.addCramtoolsPG(cramHeader.getSamFileHeader()); } BlockCompressedOutputStream.setDefaultCompressionLevel(Defaults.COMPRESSION_LEVEL); SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory(); samFileWriterFactory.setAsyncOutputBufferSize(params.asyncBamBuffer); samFileWriterFactory.setCreateIndex(false); samFileWriterFactory.setCreateMd5File(false); samFileWriterFactory.setUseAsyncIo(params.syncBamOutput); SAMFileWriter writer = createSAMFileWriter(params, cramHeader, samFileWriterFactory); htsjdk.samtools.cram.structure.Container c = null; AlignmentSliceQuery location = null; if (!params.locations.isEmpty()) { if (params.locations.size() > 1) throw new RuntimeException("Only one location is supported."); if (!(is instanceof SeekableStream)) throw new RuntimeException("Cannot use random access on a stream."); location = new AlignmentSliceQuery(params.locations.get(0)); location.sequenceId = cramHeader.getSamFileHeader().getSequenceIndex(location.sequence); if (location.sequenceId < 0) { log.error("Reference sequence not found for name: " + location.sequence); return; } try { log.info("Seeking for the query " + location.toString()); SeekableStream cramStream = (SeekableStream) is; IndexAggregate ia = IndexAggregate.forDataFile(cramStream, cramHeader.getSamFileHeader() .getSequenceDictionary()); long offset = ia.seek(location.sequenceId, location.start, location.end, cramStream); if (offset < 0) throw new ReadNotFoundException(); } catch (ReadNotFoundException e) { log.warn("Nothing found for query " + location); return; } } long recordCount = 0; long baseCount = 0; long readTime = 0; long parseTime = 0; long normTime = 0; long samTime = 0; long writeTime = 0; long time = 0; ArrayList<CramCompressionRecord> cramRecords = new ArrayList<CramCompressionRecord>(10000); CramNormalizer n = new CramNormalizer(cramHeader.getSamFileHeader(), referenceSource); ContainerParser parser = new ContainerParser(cramHeader.getSamFileHeader()); while (true) { if (params.maxContainers-- <= 0) break; time = System.nanoTime(); c = ContainerIO.readContainer(cramHeader.getVersion(), is); if (c.isEOF()) break; readTime += System.nanoTime() - time; // for random access check if the sequence is the one we are looking // for: if (location != null && location.sequenceId != c.sequenceId) break; if (params.countOnly && location == null && params.requiredFlags == 0 && params.filteringFlags == 0) { recordCount += c.nofRecords; baseCount += c.bases; continue; } time = System.nanoTime(); cramRecords.clear(); parser.getRecords(c, cramRecords, ValidationStringency.SILENT); parseTime += System.nanoTime() - time; for (int i = 0; i < c.slices.length; i++) { Slice s = c.slices[i]; if (s.sequenceId < 0) continue; SAMSequenceRecord sequence = cramHeader.getSamFileHeader().getSequence(s.sequenceId); ReferenceRegion ref = referenceSource.getRegion(sequence, s.alignmentStart, s.alignmentStart + s.alignmentSpan - 1); if (ref == null) log.error("Can't find reference to validate slice md5: " + sequence.getSequenceIndex() + " " + sequence.getSequenceName()); else { String sliceMD5 = String.format("%032x", new BigInteger(1, s.refMD5)); String md5 = ref.md5(s.alignmentStart, s.alignmentSpan); if (!md5.equals(sliceMD5)) { log.error(String .format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", s.sequenceId, s.alignmentStart, s.alignmentSpan, sliceMD5)); if (!params.resilient) System.exit(1); } } } long time1 = System.nanoTime(); n.normalizeRecordsForReferenceSource(cramRecords, referenceSource, c.header.substitutionMatrix); long time2 = System.nanoTime(); normTime += time2 - time1; Cram2SamRecordFactory c2sFactory = new Cram2SamRecordFactory(cramHeader.getSamFileHeader()); long c2sTime = 0; long sWriteTime = 0; boolean enough = false; for (CramCompressionRecord r : cramRecords) { // enforcing a special way to calculate template size: restoreMateInfo(r); // check if the record ends before the query start: if (location != null && r.sequenceId == location.sequenceId && r.getAlignmentEnd() < location.start) continue; // we got all the reads for random access: if (location != null && location.sequenceId == r.sequenceId && location.end < r.alignmentStart) { enough = true; break; } time = System.nanoTime(); SAMRecord s = c2sFactory.create(r); if (params.requiredFlags != 0 && ((params.requiredFlags & s.getFlags()) == 0)) continue; if (params.filteringFlags != 0 && ((params.filteringFlags & s.getFlags()) != 0)) continue; if (params.countOnly) { recordCount++; baseCount += r.readLength; continue; } if (params.calculateMdTag || params.calculateNmTag) { AlignmentsTags.calculateMdAndNmTags(s, referenceSource, cramHeader.getSamFileHeader() .getSequenceDictionary(), params.calculateMdTag, params.calculateNmTag); } c2sTime += System.nanoTime() - time; samTime += System.nanoTime() - time; time = System.nanoTime(); writer.addAlignment(s); sWriteTime += System.nanoTime() - time; writeTime += System.nanoTime() - time; if (params.outputFile == null && System.out.checkError()) break; } log.info(String .format("CONTAINER READ: io %dms, parse %dms, norm %dms, convert %dms, BAM write %dms, %d bases in %d records", c.readTime / 1000000, c.parseTime / 1000000, (time2 - time1) / 1000000, c2sTime / 1000000, sWriteTime / 1000000, c.bases, c.nofRecords)); if (enough || (params.outputFile == null && System.out.checkError())) break; } if (params.countOnly) { System.out.printf("READS: %d; BASES: %d\n", recordCount, baseCount); } writer.close(); log.warn(String.format("TIMES: io %ds, parse %ds, norm %ds, convert %ds, BAM write %ds", readTime / 1000000000, parseTime / 1000000000, normTime / 1000000000, samTime / 1000000000, writeTime / 1000000000)); } private static void restoreMateInfo(CramCompressionRecord r) { if (r.next == null) { return; } CramCompressionRecord cur; cur = r; while (cur.next != null) { setNextMate(cur, cur.next); cur = cur.next; } // cur points to the last segment now: CramCompressionRecord last = cur; setNextMate(last, r); final int templateLength = CramNormalizer.computeInsertSize(r, last); r.templateSize = templateLength; last.templateSize = -templateLength; } private static void setNextMate(CramCompressionRecord r, CramCompressionRecord next) { r.mateAlignmentStart = next.alignmentStart; r.setMateUnmapped(next.isSegmentUnmapped()); r.setMateNegativeStrand(next.isNegativeStrand()); r.mateSequenceID = next.sequenceId; if (r.mateSequenceID == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) r.mateAlignmentStart = SAMRecord.NO_ALIGNMENT_START; } private static SAMFileWriter createSAMFileWriter(Params params, CramHeader cramHeader, SAMFileWriterFactory samFileWriterFactory) throws IOException { /* * building sam writer, sometimes we have to go deeper to get to the * required functionality: */ SAMFileWriter writer = null; if (params.outputFastq) { if (params.cramURL == null) { writer = new FastqSAMFileWriter(System.out, null, cramHeader.getSamFileHeader()); } else { writer = new FastqSAMFileWriter(Utils.getFileName(params.cramURL), false, cramHeader.getSamFileHeader()); } } else if (params.outputFastqGz) { if (params.cramURL == null) { GZIPOutputStream gos = new GZIPOutputStream(System.out); PrintStream ps = new PrintStream(gos); writer = new FastqSAMFileWriter(ps, null, cramHeader.getSamFileHeader()); } else { writer = new FastqSAMFileWriter(Utils.getFileName(params.cramURL), true, cramHeader.getSamFileHeader()); } } else if (params.outputFile == null) { OutputStream os = new BufferedOutputStream(System.out); if (params.outputBAM) { writer = new SAMFileWriterFactory().makeBAMWriter(cramHeader.getSamFileHeader(), true, os); } else { writer = Utils.createSAMTextWriter(samFileWriterFactory, os, cramHeader.getSamFileHeader(), params.printSAMHeader); } } else { writer = samFileWriterFactory.makeSAMOrBAMWriter(cramHeader.getSamFileHeader(), true, params.outputFile); } return writer; } @Parameters(commandDescription = "CRAM to BAM conversion. ") static class Params { @Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = LevelConverter.class) Log.LogLevel logLevel = Log.LogLevel.ERROR; @Parameter(names = { "--input-cram-file", "-I" }, description = "The path or FTP URL to the CRAM file to uncompress. Omit if standard input (pipe).") String cramURL; @Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example). ") File reference; @Parameter(names = { "--output-bam-file", "-O" }, converter = FileConverter.class, description = "The path to the output BAM file.") File outputFile; @Parameter(names = { "-b", "--output-bam-format" }, description = "Output in BAM format.") boolean outputBAM = false; @Parameter(names = { "-q", "--output-fastq-format" }, hidden = true, description = "Output in fastq format.") boolean outputFastq = false; @Parameter(names = { "-z", "--output-fastq-gz-format" }, hidden = true, description = "Output in gzipped fastq format.") boolean outputFastqGz = false; @Parameter(names = { "--print-sam-header" }, description = "Print SAM header when writing SAM format.") boolean printSAMHeader = false; @Parameter(names = { "-H" }, description = "Print SAM header and quit.") boolean printSAMHeaderOnly = false; @Parameter(names = { "-h", "--help" }, description = "Print help and quit") boolean help = false; @Parameter(names = { "--default-quality-score" }, description = "Use this quality score (decimal representation of ASCII symbol) as a default value when the original quality score was lost due to compression. Minimum is 33.") int defaultQS = '?'; @Parameter(names = { "--calculate-md-tag" }, description = "Calculate MD tag.") boolean calculateMdTag = false; @Parameter(names = { "--calculate-nm-tag" }, description = "Calculate NM tag.") boolean calculateNmTag = false; @Parameter(description = "A region to access specified as <sequence name>[:<start inclusive>[-[<stop inclusive>]]") List<String> locations; @Parameter(names = { "--decrypt" }, description = "Decrypt the file.") boolean decrypt = false; @Parameter(names = { "--count-only", "-c" }, description = "Count number of records.") boolean countOnly = false; @Parameter(names = { "--required-flags", "-f" }, description = "Required flags. ") int requiredFlags = 0; @Parameter(names = { "--filter-flags", "-F" }, description = "Filtering flags. ") int filteringFlags = 0; @Parameter(names = { "--inject-sq-uri" }, description = "Inject or change the @SQ:UR header fields to point to ENA reference service. ") public boolean injectURI = false; @Parameter(names = { "--sync-bam-output" }, description = "Write BAM output in the same thread.") public boolean syncBamOutput = false; @Parameter(names = { "--async-bam-buffer" }, description = "The buffer size (number of records) for the asynchronious BAM output.", hidden = true) int asyncBamBuffer = 10000; @Parameter(names = { "--ignore-md5-mismatch" }, description = "Issue a warning on sequence MD5 mismatch and continue. This does not garantee the data will be read succesfully. ") public boolean ignoreMD5Mismatch = false; @Parameter(names = { "--skip-md5-check" }, description = "Skip MD5 checks when reading the header.") public boolean skipMD5Checks = false; @Parameter(names = { "--ref-seq-download-tries" }, description = "Try to download sequences this many times if their md5 mismatches.", hidden = true) int downloadTriesBeforeFailing = 2; @Parameter(names = { "--resilient" }, description = "Report reference sequence md5 mismatch and keep going.", hidden = true) public boolean resilient = false; @Parameter(names = { "--password", "-p" }, description = "Password to decrypt the file.") public String password; @Parameter(names = { "--max-containers" }, description = "Read only specified number of containers.", hidden = true) long maxContainers = Long.MAX_VALUE; @Parameter(names = { "--load-whole-reference-sequence" }, description = "Load all bases for each reference sequence required. ", hidden = true) public boolean loadWholeReferenceSequence = false; } }