package htsjdk.samtools.cram; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.cram.build.Cram2SamRecordFactory; import htsjdk.samtools.cram.build.CramNormalizer; import htsjdk.samtools.cram.build.Sam2CramRecordFactory; import htsjdk.samtools.cram.ref.ReferenceTracks; import htsjdk.samtools.cram.structure.AlignmentSpan; import htsjdk.samtools.cram.structure.Container; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.util.Log; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; import net.sf.cram.ref.ReferenceSource; /** * A static collections of CRAM conversion methods. * * @author vadim * */ public class CramSerilization { private static Log log = Log.getInstance(CramSerilization.class); public static Container convertLossless(List<SAMRecord> samRecords, SAMFileHeader samFileHeader, ReferenceSource source) throws IllegalArgumentException, IllegalAccessException, IOException { CramContext context = new CramContext(samFileHeader, source, CramLossyOptions.lossless()); Map<Integer, AlignmentSpan> spans = getSpans(samRecords); return convert(samRecords, context, spans); } public static Container convert(List<SAMRecord> samRecords, SAMFileHeader samFileHeader, ReferenceSource source, CramLossyOptions lossyOptions) throws IllegalArgumentException, IllegalAccessException, IOException { CramContext context = new CramContext(samFileHeader, source, lossyOptions); Map<Integer, AlignmentSpan> spans = getSpans(samRecords); return convert(samRecords, context, spans); } public static Map<Integer, AlignmentSpan> getSpans(List<SAMRecord> samRecords) { Map<Integer, AlignmentSpan> spans = new HashMap<Integer, AlignmentSpan>(); int unmapped = 0; for (final SAMRecord r : samRecords) { int refId = r.getReferenceIndex(); if (refId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { unmapped++; continue; } int start = r.getAlignmentStart(); int end = r.getAlignmentEnd(); if (spans.containsKey(refId)) { spans.get(refId).add(start, end - start, 1); } else { spans.put(refId, new AlignmentSpan(start, end - start)); } } if (unmapped > 0) { AlignmentSpan span = new AlignmentSpan(AlignmentSpan.UNMAPPED_SPAN.getStart(), AlignmentSpan.UNMAPPED_SPAN.getSpan(), unmapped); spans.put(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, span); } return spans; } public static byte[] getRefBasesOfFail(CramContext cramContext, int refId) { if (refId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) return new byte[0]; SAMSequenceRecord sequenceRecord = cramContext.samFileHeader.getSequence(refId); if (sequenceRecord == null) throw new RuntimeException("Reference sequence not found in the header: " + refId); byte[] ref = cramContext.referenceSource.getReferenceBases(sequenceRecord, true); if (ref == null) throw new RuntimeException("Failed to fetch reference bases for sequence " + refId + ", name: " + sequenceRecord.getSequenceName()); return ref; } public static Container convert(List<SAMRecord> samRecords, CramContext cramContext) throws IllegalArgumentException, IllegalAccessException, IOException { return convert(samRecords, cramContext, getSpans(samRecords)); } public static Container convert(List<SAMRecord> samRecords, CramContext cramContext, Map<Integer, AlignmentSpan> spans) throws IllegalArgumentException, IllegalAccessException, IOException { if (spans.isEmpty()) throw new IllegalArgumentException("Expecting a valid alignment span or unmapped."); if (cramContext.lossyOptions.areReferenceTracksRequired()) { for (int refId : spans.keySet()) { if (refId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) continue; ReferenceTracks tracks = cramContext.tracks.get(refId); if (tracks == null) { throw new IllegalArgumentException("The quality score lossy model requires reference tracks."); } AlignmentSpan span = spans.get(refId); tracks.ensureRange(span.getStart(), span.getSpan()); updateTracks(samRecords, tracks); } } StringBuffer sequences = new StringBuffer(); for (int refId : spans.keySet()) { sequences.append(" ").append(refId); } log.info(String.format("converting %d records, sequences: %s", samRecords.size(), sequences)); final List<CramCompressionRecord> cramRecords = new ArrayList<CramCompressionRecord>(samRecords.size()); final Sam2CramRecordFactory sam2CramRecordFactory = cramContext.sam2cramFactory; int index = 0; int refId = Integer.MIN_VALUE; byte[] ref = null; for (SAMRecord samRecord : samRecords) { if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { // update ref in the factory: if (refId != samRecord.getReferenceIndex()) { refId = samRecord.getReferenceIndex(); ref = getRefBasesOfFail(cramContext, refId); sam2CramRecordFactory.setRefBases(ref); } } final CramCompressionRecord cramRecord = sam2CramRecordFactory.createCramRecord(samRecord); cramRecord.index = ++index; cramRecord.alignmentStart = samRecord.getAlignmentStart(); cramRecords.add(cramRecord); if (cramContext.lossyOptions.isLosslessQualityScore()) cramRecord.setForcePreserveQualityScores(cramRecord.qualityScores != SAMRecord.NULL_QUALS); else { ReferenceTracks tracks = cramContext.tracks.get(samRecord.getReferenceIndex()); cramContext.lossyOptions.getPreservation().addQualityScores(samRecord, cramRecord, tracks); } } if (sam2CramRecordFactory.getBaseCount() < 3 * sam2CramRecordFactory.getFeatureCount()) { log.warn("Abnormally high number of mismatches, possibly wrong reference."); } if (cramContext.samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate) { setMateInfo(cramRecords); } else { for (final CramCompressionRecord cramRecord : cramRecords) { cramRecord.setDetached(true); } } assertSame(cramContext.samFileHeader, samRecords, cramRecords); final Container container = cramContext.containerFactory.buildContainer(cramRecords); for (final Slice slice : container.slices) { if (slice.alignmentSpan < 0) slice.alignmentSpan = 0; switch (slice.sequenceId) { case Slice.MULTI_REFERENCE: continue; case SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX: slice.setRefMD5(new byte[0]); continue; default: slice.setRefMD5(getRefBasesOfFail(cramContext, slice.sequenceId)); break; } } return container; } /** * The following passage is for paranoid mode only. When java is run with * asserts on it will throw an {@link AssertionError} if read bases or * quality scores of a restored SAM record mismatch the original. This is * effectively a runtime round trip test. */ private static void assertSame(SAMFileHeader samFileHeader, List<SAMRecord> samRecords, List<CramCompressionRecord> cramRecords) { @SuppressWarnings("UnusedAssignment") boolean assertsEnabled = false; assert assertsEnabled = true; if (assertsEnabled) { final Cram2SamRecordFactory f = new Cram2SamRecordFactory(samFileHeader); for (int i = 0; i < samRecords.size(); i++) { final SAMRecord restoredSamRecord = f.create(cramRecords.get(i)); assert (restoredSamRecord.getAlignmentStart() == samRecords.get(i).getAlignmentStart()); assert (restoredSamRecord.getReferenceName().equals(samRecords.get(i).getReferenceName())); assert (restoredSamRecord.getReadString().equals(samRecords.get(i).getReadString())); assert (restoredSamRecord.getBaseQualityString().equals(samRecords.get(i).getBaseQualityString())); } } } private static void updateTracks(final List<SAMRecord> samRecords, final ReferenceTracks tracks) { for (final SAMRecord samRecord : samRecords) { if (samRecord.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START && samRecord.getReferenceIndex() == tracks.getSequenceId()) { int refPos = samRecord.getAlignmentStart(); int readPos = 0; for (final CigarElement cigarElement : samRecord.getCigar().getCigarElements()) { if (cigarElement.getOperator().consumesReferenceBases()) { for (int elementIndex = 0; elementIndex < cigarElement.getLength(); elementIndex++) tracks.addCoverage(refPos + elementIndex, 1); } switch (cigarElement.getOperator()) { case M: case X: case EQ: for (int pos = readPos; pos < cigarElement.getLength(); pos++) { final byte readBase = samRecord.getReadBases()[readPos + pos]; final byte refBase = tracks.baseAt(refPos + pos); if (readBase != refBase) tracks.addMismatches(refPos + pos, 1); } break; default: break; } readPos += cigarElement.getOperator().consumesReadBases() ? cigarElement.getLength() : 0; refPos += cigarElement.getOperator().consumesReferenceBases() ? cigarElement.getLength() : 0; } } } } /** * Traverse the graph and mark all segments as detached. * * @param cramRecord * the starting point of the graph */ private static void detach(CramCompressionRecord cramRecord) { do { cramRecord.setDetached(true); cramRecord.setHasMateDownStream(false); cramRecord.recordsToNextFragment = -1; } while ((cramRecord = cramRecord.next) != null); } private static List<CramCompressionRecord> setMateInfo(List<CramCompressionRecord> cramRecords) { Map<String, CramCompressionRecord> primaryMateMap = new TreeMap<String, CramCompressionRecord>(); Map<String, CramCompressionRecord> secondaryMateMap = new TreeMap<String, CramCompressionRecord>(); for (CramCompressionRecord r : cramRecords) { if (!r.isMultiFragment()) { r.setDetached(true); r.setHasMateDownStream(false); r.recordsToNextFragment = -1; r.next = null; r.previous = null; } else { String name = r.readName; Map<String, CramCompressionRecord> mateMap = r.isSecondaryAlignment() ? secondaryMateMap : primaryMateMap; CramCompressionRecord mate = mateMap.get(name); if (mate == null) { mateMap.put(name, r); } else { CramCompressionRecord prev = mate; if (prev.next != null || prev.previous != null) { detach(r); detach(prev); continue; } while (prev.next != null) prev = prev.next; prev.recordsToNextFragment = r.index - prev.index - 1; prev.next = r; r.previous = prev; r.previous.setHasMateDownStream(true); r.setHasMateDownStream(false); r.setDetached(false); r.previous.setDetached(false); } } } // mark unpredictable reads as detached: for (CramCompressionRecord r : cramRecords) { if (r.next == null || r.previous != null) continue; CramCompressionRecord last = r; while (last.next != null) last = last.next; if ((r.isFirstSegment() && last.isLastSegment()) || (last.isFirstSegment() && r.isLastSegment())) { final int templateLength = CramNormalizer.computeInsertSize(r, last); if (r.templateSize == templateLength) { last = r.next; while (last.next != null) { if (last.templateSize != -templateLength) break; last = last.next; } if (last.templateSize != -templateLength) { detach(r); } } else { detach(r); } } else { detach(r); } if (r.mateSequenceID != last.sequenceId || r.sequenceId != last.mateSequenceID || (r.isMateNegativeStrand() != last.isNegativeStrand()) || (last.isMateNegativeStrand() != r.isNegativeStrand()) || (r.mateAlignmentStart != last.alignmentStart) || (last.mateAlignmentStart != r.alignmentStart)) { detach(r); } } for (CramCompressionRecord r : primaryMateMap.values()) { if (r.next != null) continue; r.setDetached(true); r.setHasMateDownStream(false); r.recordsToNextFragment = -1; r.next = null; r.previous = null; } for (CramCompressionRecord r : secondaryMateMap.values()) { if (r.next != null) continue; r.setDetached(true); r.setHasMateDownStream(false); r.recordsToNextFragment = -1; r.next = null; r.previous = null; } return cramRecords; } }