/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.RuntimeEOFException; import htsjdk.samtools.util.StringUtil; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.TreeMap; /** * Utilty methods. */ public final class SAMUtils { // Representation of bases, one for when in low-order nybble, one for when in high-order nybble. private static final byte COMPRESSED_EQUAL_LOW = 0; private static final byte COMPRESSED_A_LOW = 1; private static final byte COMPRESSED_C_LOW = 2; private static final byte COMPRESSED_M_LOW = 3; private static final byte COMPRESSED_G_LOW = 4; private static final byte COMPRESSED_R_LOW = 5; private static final byte COMPRESSED_S_LOW = 6; private static final byte COMPRESSED_V_LOW = 7; private static final byte COMPRESSED_T_LOW = 8; private static final byte COMPRESSED_W_LOW = 9; private static final byte COMPRESSED_Y_LOW = 10; private static final byte COMPRESSED_H_LOW = 11; private static final byte COMPRESSED_K_LOW = 12; private static final byte COMPRESSED_D_LOW = 13; private static final byte COMPRESSED_B_LOW = 14; private static final byte COMPRESSED_N_LOW = 15; private static final byte COMPRESSED_EQUAL_HIGH = COMPRESSED_EQUAL_LOW << 4; private static final byte COMPRESSED_A_HIGH = COMPRESSED_A_LOW << 4; private static final byte COMPRESSED_C_HIGH = COMPRESSED_C_LOW << 4; private static final byte COMPRESSED_G_HIGH = COMPRESSED_G_LOW << 4; private static final byte COMPRESSED_T_HIGH = (byte)(COMPRESSED_T_LOW << 4); private static final byte COMPRESSED_N_HIGH = (byte)(COMPRESSED_N_LOW << 4); private static final byte COMPRESSED_M_HIGH = (byte)(COMPRESSED_M_LOW << 4); private static final byte COMPRESSED_R_HIGH = (byte)(COMPRESSED_R_LOW << 4); private static final byte COMPRESSED_S_HIGH = (byte)(COMPRESSED_S_LOW << 4); private static final byte COMPRESSED_V_HIGH = (byte)(COMPRESSED_V_LOW << 4); private static final byte COMPRESSED_W_HIGH = (byte)(COMPRESSED_W_LOW << 4); private static final byte COMPRESSED_Y_HIGH = (byte)(COMPRESSED_Y_LOW << 4); private static final byte COMPRESSED_H_HIGH = (byte)(COMPRESSED_H_LOW << 4); private static final byte COMPRESSED_K_HIGH = (byte)(COMPRESSED_K_LOW << 4); private static final byte COMPRESSED_D_HIGH = (byte)(COMPRESSED_D_LOW << 4); private static final byte COMPRESSED_B_HIGH = (byte)(COMPRESSED_B_LOW << 4); public static final int MAX_PHRED_SCORE = 93; /** * Convert from a byte array containing =AaCcGgTtNn represented as ASCII, to a byte array half as long, * with =, A, C, G, T converted to 0, 1, 2, 4, 8, 15. * @param readBases Bases as ASCII bytes. * @return New byte array with bases represented as nybbles, in BAM binary format. */ static byte[] bytesToCompressedBases(final byte[] readBases) { final byte[] compressedBases = new byte[(readBases.length + 1)/2]; int i; for (i = 1; i < readBases.length; i+=2) { compressedBases[i/2] = (byte)(charToCompressedBaseHigh(readBases[i-1]) | charToCompressedBaseLow(readBases[i])); } // Last nybble if (i == readBases.length) { compressedBases[i/2] = charToCompressedBaseHigh((char)readBases[i-1]); } return compressedBases; } /** * Convert from a byte array with basese stored in nybbles, with =, A, C, G, T represented as 0, 1, 2, 4, 8, 15, * to a a byte array containing =AaCcGgTtNn represented as ASCII. * @param length Number of bases (not bytes) to convert. * @param compressedBases Bases represented as nybbles, in BAM binary format. * @param compressedOffset Byte offset in compressedBases to start. * @return New byte array with bases as ASCII bytes. */ public static byte[] compressedBasesToBytes(final int length, final byte[] compressedBases, final int compressedOffset) { final byte[] ret = new byte[length]; int i; for (i = 1; i < length; i+=2) { final int compressedIndex = i / 2 + compressedOffset; ret[i-1] = compressedBaseToByteHigh(compressedBases[compressedIndex]); ret[i] = compressedBaseToByteLow(compressedBases[compressedIndex]); } // Last nybble if (i == length) { ret[i-1] = compressedBaseToByteHigh(compressedBases[i/2 + compressedOffset]); } return ret; } /** * Convert from ASCII byte to BAM nybble representation of a base in low-order nybble. * @param base One of =AaCcGgTtNn. * @return Low-order nybble-encoded equivalent. */ private static byte charToCompressedBaseLow(final int base) { switch (base) { case '=': return COMPRESSED_EQUAL_LOW; case 'a': case 'A': return COMPRESSED_A_LOW; case 'c': case 'C': return COMPRESSED_C_LOW; case 'g': case 'G': return COMPRESSED_G_LOW; case 't': case 'T': return COMPRESSED_T_LOW; case 'n': case 'N': case '.': return COMPRESSED_N_LOW; // IUPAC ambiguity codes case 'M': case 'm': return COMPRESSED_M_LOW; case 'R': case 'r': return COMPRESSED_R_LOW; case 'S': case 's': return COMPRESSED_S_LOW; case 'V': case 'v': return COMPRESSED_V_LOW; case 'W': case 'w': return COMPRESSED_W_LOW; case 'Y': case 'y': return COMPRESSED_Y_LOW; case 'H': case 'h': return COMPRESSED_H_LOW; case 'K': case 'k': return COMPRESSED_K_LOW; case 'D': case 'd': return COMPRESSED_D_LOW; case 'B': case 'b': return COMPRESSED_B_LOW; default: throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Convert from ASCII byte to BAM nybble representation of a base in high-order nybble. * @param base One of =AaCcGgTtNn. * @return High-order nybble-encoded equivalent. */ private static byte charToCompressedBaseHigh(final int base) { switch (base) { case '=': return COMPRESSED_EQUAL_HIGH; case 'a': case 'A': return COMPRESSED_A_HIGH; case 'c': case 'C': return COMPRESSED_C_HIGH; case 'g': case 'G': return COMPRESSED_G_HIGH; case 't': case 'T': return COMPRESSED_T_HIGH; case 'n': case 'N': case '.': return COMPRESSED_N_HIGH; // IUPAC ambiguity codes case 'M': case 'm': return COMPRESSED_M_HIGH; case 'R': case 'r': return COMPRESSED_R_HIGH; case 'S': case 's': return COMPRESSED_S_HIGH; case 'V': case 'v': return COMPRESSED_V_HIGH; case 'W': case 'w': return COMPRESSED_W_HIGH; case 'Y': case 'y': return COMPRESSED_Y_HIGH; case 'H': case 'h': return COMPRESSED_H_HIGH; case 'K': case 'k': return COMPRESSED_K_HIGH; case 'D': case 'd': return COMPRESSED_D_HIGH; case 'B': case 'b': return COMPRESSED_B_HIGH; default: throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Convert from BAM nybble representation of a base in low-order nybble to ASCII byte. * @param base One of COMPRESSED_*_LOW, a low-order nybble encoded base. * @return ASCII base, one of ACGTN=. */ private static byte compressedBaseToByteLow(final int base) { switch (base & 0xf) { case COMPRESSED_EQUAL_LOW: return '='; case COMPRESSED_A_LOW: return 'A'; case COMPRESSED_C_LOW: return 'C'; case COMPRESSED_G_LOW: return 'G'; case COMPRESSED_T_LOW: return 'T'; case COMPRESSED_N_LOW: return 'N'; // IUPAC ambiguity codes case COMPRESSED_M_LOW: return 'M'; case COMPRESSED_R_LOW: return 'R'; case COMPRESSED_S_LOW: return 'S'; case COMPRESSED_V_LOW: return 'V'; case COMPRESSED_W_LOW: return 'W'; case COMPRESSED_Y_LOW: return 'Y'; case COMPRESSED_H_LOW: return 'H'; case COMPRESSED_K_LOW: return 'K'; case COMPRESSED_D_LOW: return 'D'; case COMPRESSED_B_LOW: return 'B'; default: throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Convert from BAM nybble representation of a base in high-order nybble to ASCII byte. * @param base One of COMPRESSED_*_HIGH, a high-order nybble encoded base. * @return ASCII base, one of ACGTN=. */ private static byte compressedBaseToByteHigh(final int base) { switch ((byte)(base & 0xf0)) { case COMPRESSED_EQUAL_HIGH: return '='; case COMPRESSED_A_HIGH: return 'A'; case COMPRESSED_C_HIGH: return 'C'; case COMPRESSED_G_HIGH: return 'G'; case COMPRESSED_T_HIGH: return 'T'; case COMPRESSED_N_HIGH: return 'N'; // IUPAC ambiguity codes case COMPRESSED_M_HIGH: return 'M'; case COMPRESSED_R_HIGH: return 'R'; case COMPRESSED_S_HIGH: return 'S'; case COMPRESSED_V_HIGH: return 'V'; case COMPRESSED_W_HIGH: return 'W'; case COMPRESSED_Y_HIGH: return 'Y'; case COMPRESSED_H_HIGH: return 'H'; case COMPRESSED_K_HIGH: return 'K'; case COMPRESSED_D_HIGH: return 'D'; case COMPRESSED_B_HIGH: return 'B'; default: throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); } } /** * Convert bases in place into canonical form, upper case, and with no-call represented as N. * @param bases */ static void normalizeBases(final byte[] bases) { for (int i = 0; i < bases.length; ++i) { bases[i] = StringUtil.toUpperCase(bases[i]); if (bases[i] == '.') { bases[i] = 'N'; } } } /** * Convert an array of bytes, in which each byte is a binary phred quality score, to * printable ASCII representation of the quality scores, ala FASTQ format. * * Equivalent to phredToFastq(data, 0, data.length) * * @param data Array of bytes in which each byte is a binar phred score. * @return String with ASCII representation of those quality scores. */ public static String phredToFastq(final byte[] data) { if (data == null) { return null; } return phredToFastq(data, 0, data.length); } /** * Convert an array of bytes, in which each byte is a binary phred quality score, to * printable ASCII representation of the quality scores, ala FASTQ format. * @param buffer Array of bytes in which each byte is a binar phred score. * @param offset Where in buffer to start conversion. * @param length How many bytes of buffer to convert. * @return String with ASCII representation of those quality scores. */ public static String phredToFastq(final byte[] buffer, final int offset, final int length) { final char[] chars = new char[length]; for (int i = 0; i < length; i++) { chars[i] = phredToFastq(buffer[offset+i] & 0xFF); } return new String(chars); } /** * Convert a single binary phred score to printable ASCII representation, ala FASTQ format. * @param phredScore binary phred score. * @return Printable ASCII representation of phred score. */ public static char phredToFastq(final int phredScore) { if (phredScore < 0 || phredScore > MAX_PHRED_SCORE) { throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); } return (char) (33 + phredScore); } /** * Convert a string with phred scores in printable ASCII FASTQ format to an array * of binary phred scores. * @param fastq Phred scores in FASTQ printable ASCII format. * @return byte array of binary phred scores in which each byte corresponds to a character in the input string. */ public static byte[] fastqToPhred(final String fastq) { if (fastq == null) { return null; } final int length = fastq.length(); final byte[] scores = new byte[length]; for (int i = 0; i < length; i++) { scores[i] = (byte) fastqToPhred(fastq.charAt(i)); } return scores; } /** * Converts printable qualities in Sanger fastq format to binary phred scores. */ public static void fastqToPhred(final byte[] fastq) { for (int i = 0; i < fastq.length; ++i) { fastq[i] = (byte)fastqToPhred((char)(fastq[i] & 0xff)); } } /** * Convert a single printable ASCII FASTQ format phred score to binary phred score. * @param ch Printable ASCII FASTQ format phred score. * @return Binary phred score. */ public static int fastqToPhred(final char ch) { if (ch < 33 || ch > 126) { throw new IllegalArgumentException("Invalid fastq character: " + ch); } return (ch - 33); } /** * calculate the bin given an alignment in [beg,end) * Copied from SAM spec. * @param beg 0-based start of read (inclusive) * @param end 0-based end of read (exclusive) * @deprecated Use GenomicIndexUtil.reg2bin */ static int reg2bin(final int beg, final int end) { return GenomicIndexUtil.reg2bin(beg, end); } /** * Handle a list of validation errors according to the validation stringency. * @param validationErrors List of errors to report, or null if there are no errors. * @param samRecordIndex Record number of the SAMRecord corresponding to the validation errors, or -1 if * the record number is not known. * @param validationStringency If STRICT, throw a SAMFormatException. If LENIENT, print the validation * errors to stderr. If SILENT, do nothing. */ static void processValidationErrors(final List<SAMValidationError> validationErrors, final long samRecordIndex, final ValidationStringency validationStringency) { if (validationErrors != null && validationErrors.size() > 0) { for (final SAMValidationError validationError : validationErrors) { validationError.setRecordNumber(samRecordIndex); } if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException("SAM validation error: " + validationErrors.get(0)); } else if (validationStringency == ValidationStringency.LENIENT) { for (final SAMValidationError error : validationErrors) { System.err.println("Ignoring SAM validation error: " + error); } } } } public static void processValidationError(final SAMValidationError validationError, final ValidationStringency validationStringency) { if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException("SAM validation error: " + validationError); } else if (validationStringency == ValidationStringency.LENIENT) { System.err.println("Ignoring SAM validation error: " + validationError); } } private static final SAMHeaderRecordComparator<SAMReadGroupRecord> HEADER_RECORD_COMPARATOR = new SAMHeaderRecordComparator<SAMReadGroupRecord>( SAMReadGroupRecord.PLATFORM_UNIT_TAG, SAMReadGroupRecord.LIBRARY_TAG, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG, SAMReadGroupRecord.SEQUENCING_CENTER_TAG, SAMReadGroupRecord.PLATFORM_TAG, SAMReadGroupRecord.DESCRIPTION_TAG, SAMReadGroupRecord.READ_GROUP_ID_TAG // We don't actually want to compare with ID but it's suitable // "just in case" since it's the only one that's actually required ); /** * Calculate a hash code from identifying information in the RG (read group) records in a SAM file's * header. This hash code changes any time read groups are added or removed. Comparing one file's * hash code to another's tells you if the read groups in the BAM files are different. */ public static String calculateReadGroupRecordChecksum(final File input) { final String ENCODING = "UTF-8"; final MessageDigest digest; try { digest = MessageDigest.getInstance("MD5"); } catch (final NoSuchAlgorithmException nsae) { throw new Error("No MD5 algorithm was available in a Java JDK? Unheard-of!"); } // Sort the read group records by their first final SAMFileReader reader = new SAMFileReader(input); final List<SAMReadGroupRecord> sortedRecords = new ArrayList<SAMReadGroupRecord>(reader.getFileHeader().getReadGroups()); Collections.sort(sortedRecords, HEADER_RECORD_COMPARATOR); for (final SAMReadGroupRecord rgRecord : sortedRecords) { final TreeMap<String, String> sortedAttributes = new TreeMap<String, String>(); for (final Map.Entry<String, String> attributeEntry : rgRecord.getAttributes()) { sortedAttributes.put(attributeEntry.getKey(), attributeEntry.getValue()); } try { for (final Map.Entry<String, String> sortedEntry : sortedAttributes.entrySet()) { if ( ! sortedEntry.getKey().equals(SAMReadGroupRecord.READ_GROUP_ID_TAG)) { // Redundant check, safety first digest.update(sortedEntry.getKey().getBytes(ENCODING)); digest.update(sortedEntry.getValue().getBytes(ENCODING)); } } } catch (final UnsupportedEncodingException uee) { throw new Error("No " + ENCODING + "!? WTH?"); } } // Convert to a String and pad to get the full 32 chars. final StringBuilder hashText = new StringBuilder((new BigInteger(1, digest.digest())).toString(16)); while (hashText.length() < 32 ) hashText.insert(0, "0"); return hashText.toString(); } /** * Chains <code>program</code> in front of the first "head" item in the list of * SAMProgramRecords in <code>header</code>. This method should not be used * when there are multiple chains of program groups in a header, only when * it can safely be assumed that there is only one chain. It correctly handles * the case where <code>program</code> has already been added to the header, so * it can be used whether creating a SAMProgramRecord with a constructor or when * calling SAMFileHeader.createProgramRecord(). */ public static void chainSAMProgramRecord(final SAMFileHeader header, final SAMProgramRecord program) { final List<SAMProgramRecord> pgs = header.getProgramRecords(); if (pgs.size() > 0) { final List<String> referencedIds = new ArrayList<String>(); for (final SAMProgramRecord pg : pgs) { if (pg.getPreviousProgramGroupId() != null) { referencedIds.add(pg.getPreviousProgramGroupId()); } } for (final SAMProgramRecord pg : pgs) { // if record being chained has already been added, ignore it if (pg.getProgramGroupId().equals(program.getProgramGroupId())) { continue; } if (!referencedIds.contains(pg.getProgramGroupId())) { program.setPreviousProgramGroupId(pg.getProgramGroupId()); break; } } } } public static void makeReadUnmapped(final SAMRecord rec) { if (rec.getReadNegativeStrandFlag()) { SAMRecordUtil.reverseComplement(rec); rec.setReadNegativeStrandFlag(false); } rec.setDuplicateReadFlag(false); rec.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); rec.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); rec.setInferredInsertSize(0); rec.setNotPrimaryAlignmentFlag(false); rec.setProperPairFlag(false); rec.setReadUnmappedFlag(true); } /** * Determines if a cigar has any element that both consumes read bases and consumes reference bases * (e.g. is not all soft-clipped) */ public static boolean cigarMapsNoBasesToRef(final Cigar cigar) { for (final CigarElement el : cigar.getCigarElements()) { if (el.getOperator().consumesReadBases() && el.getOperator().consumesReferenceBases()) { return false; } } return true; } /** * Tests if the provided record is mapped entirely beyond the end of the reference (i.e., the alignment start is greater than the * length of the sequence to which the record is mapped). */ public static boolean recordMapsEntirelyBeyondEndOfReference(final SAMRecord record) { return record.getHeader().getSequence(record.getReferenceIndex()).getSequenceLength() < record.getAlignmentStart(); } /** * * @return negative if mapq1 < mapq2, etc. * Note that MAPQ(0) < MAPQ(255) < MAPQ(1) */ public static int compareMapqs(final int mapq1, final int mapq2) { if (mapq1 == mapq2) return 0; if (mapq1 == 0) return -1; else if (mapq2 == 0) return 1; else if (mapq1 == 255) return -1; else if (mapq2 == 255) return 1; else return mapq1 - mapq2; } /** * Hokey algorithm for combining two MAPQs into values that are comparable, being cognizant of the fact * that in MAPQ world, 1 > 255 > 0. In this algorithm, 255 is treated as if it were 0.01, so that * CombinedMapq(1,0) > CombinedMapq(255, 255) > CombinedMapq(0, 0). * The return value should not be used for anything other than comparing to the return value of other * invocations of this method. */ public static int combineMapqs(int m1, int m2) { if (m1 == 255) m1 = 1; else m1 *= 100; if (m2 == 255) m2 = 1; else m2 *= 100; return m1 + m2; } /** * Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file * offset after skipping over the text header and the sequence records. */ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) { try { return BAMFileReader.findVirtualOffsetOfFirstRecord(bamFile); } catch (final IOException ioe) { throw new RuntimeEOFException(ioe); } } /** * Given a Cigar, Returns blocks of the sequence that have been aligned directly to the * reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference) * are not represented in the alignment blocks. * * @param cigar The cigar containing the alignment information * @param alignmentStart The start (1-based) of the alignment * @param cigarTypeName The type of cigar passed - for error logging. * @return List of alignment blocks */ public static List<AlignmentBlock> getAlignmentBlocks(final Cigar cigar, final int alignmentStart, final String cigarTypeName) { if (cigar == null) return Collections.emptyList(); final List<AlignmentBlock> alignmentBlocks = new ArrayList<AlignmentBlock>(); int readBase = 1; int refBase = alignmentStart; for (final CigarElement e : cigar.getCigarElements()) { switch (e.getOperator()) { case H : break; // ignore hard clips case P : break; // ignore pads case S : readBase += e.getLength(); break; // soft clip read bases case N : refBase += e.getLength(); break; // reference skip case D : refBase += e.getLength(); break; case I : readBase += e.getLength(); break; case M : case EQ : case X : final int length = e.getLength(); alignmentBlocks.add(new AlignmentBlock(readBase, refBase, length)); readBase += length; refBase += length; break; default : throw new IllegalStateException("Case statement didn't deal with " + cigarTypeName + " op: " + e.getOperator()); } } return Collections.unmodifiableList(alignmentBlocks); } /** * @param alignmentStart The start (1-based) of the alignment * @param cigar The cigar containing the alignment information * @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. * * Invalid to call on an unmapped read. * Invalid to call with cigar = null */ public static int getUnclippedStart(final int alignmentStart, final Cigar cigar) { int unClippedStart = alignmentStart; for (final CigarElement cig : cigar.getCigarElements()) { final CigarOperator op = cig.getOperator(); if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { unClippedStart -= cig.getLength(); } else { break; } } return unClippedStart; } /** * @param alignmentEnd The end (1-based) of the alignment * @param cigar The cigar containing the alignment information * @return the alignment end (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) * then this method will return 107. * * Invalid to call on an unmapped read. * Invalid to call with cigar = null */ public static int getUnclippedEnd(final int alignmentEnd, final Cigar cigar) { int unClippedEnd = alignmentEnd; final List<CigarElement> cigs = cigar.getCigarElements(); for (int i=cigs.size() - 1; i>=0; --i) { final CigarElement cig = cigs.get(i); final CigarOperator op = cig.getOperator(); if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { unClippedEnd += cig.getLength(); } else { break; } } return unClippedEnd; } /** * Returns the Mate Cigar String as stored in the attribute 'MC'. * @param rec the SAM record * @return Mate Cigar String, or null if there is none. */ public static String getMateCigarString(final SAMRecord rec) { return rec.getStringAttribute(SAMTag.MC.name()); } /** * Returns the Mate Cigar or null if there is none. * @param rec the SAM record * @param withValidation true if we are to validate the mate cigar before returning, false otherwise. * @return Cigar object for the read's mate, or null if there is none. */ public static Cigar getMateCigar(final SAMRecord rec, final boolean withValidation) { final String mateCigarString = getMateCigarString(rec); Cigar mateCigar = null; if (mateCigarString != null) { mateCigar = TextCigarCodec.getSingleton().decode(mateCigarString); if (withValidation && rec.getValidationStringency() != ValidationStringency.SILENT) { final List<AlignmentBlock> alignmentBlocks = getAlignmentBlocks(mateCigar, rec.getMateAlignmentStart(), "mate cigar"); SAMUtils.processValidationErrors(validateCigar(rec, mateCigar, rec.getMateReferenceIndex(), alignmentBlocks, -1, "Mate CIGAR"), -1L, rec.getValidationStringency()); } } return mateCigar; } /** * Returns the Mate Cigar or null if there is none. No validation is done on the returned cigar. * @param rec the SAM record * @return Cigar object for the read's mate, or null if there is none. */ public static Cigar getMateCigar(final SAMRecord rec) { return getMateCigar(rec, false); } /** * @param rec the SAM record * @return number of cigar elements (number + operator) in the mate cigar string. */ public static int getMateCigarLength(final SAMRecord rec) { final Cigar mateCigar = getMateCigar(rec); return (mateCigar != null) ? mateCigar.numCigarElements() : 0; } /** * This method uses the MateCigar value as determined from the attribute MC. It must be non-null. * @param rec the SAM record * @return 1-based inclusive rightmost position of the clipped mate sequence, or 0 read if unmapped. */ public static int getMateAlignmentEnd(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) { throw new RuntimeException("getMateAlignmentEnd called on an unmapped mate."); } final Cigar mateCigar = SAMUtils.getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found."); } return CoordMath.getEnd(rec.getMateAlignmentStart(), mateCigar.getReferenceLength()); } /** * @param rec the SAM record * @return the mate alignment start (1-based, inclusive) adjusted for clipped bases. For example if the mate * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. * * Invalid to call on an unmapped read. */ public static int getMateUnclippedStart(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) throw new RuntimeException("getMateUnclippedStart called on an unmapped mate."); final Cigar mateCigar = getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found."); } return SAMUtils.getUnclippedStart(rec.getMateAlignmentStart(), mateCigar); } /** * @param rec the SAM record * @return the mate alignment end (1-based, inclusive) adjusted for clipped bases. For example if the mate * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) * then this method will return 107. * * Invalid to call on an unmapped read. */ public static int getMateUnclippedEnd(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) { throw new RuntimeException("getMateUnclippedEnd called on an unmapped mate."); } final Cigar mateCigar = SAMUtils.getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found."); } return SAMUtils.getUnclippedEnd(getMateAlignmentEnd(rec), mateCigar); } /** * @param rec the SAM record * Returns blocks of the mate sequence that have been aligned directly to the * reference sequence. Note that clipped portions of the mate and inserted and * deleted bases (vs. the reference) are not represented in the alignment blocks. */ public static List<AlignmentBlock> getMateAlignmentBlocks(final SAMRecord rec) { return getAlignmentBlocks(getMateCigar(rec), rec.getMateAlignmentStart(), "mate cigar"); } /** * Run all validations of the mate's CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * @param rec the SAM record * @param cigar The cigar containing the alignment information * @param referenceIndex The reference index * @param alignmentBlocks The alignment blocks (parsed from the cigar) * @param recordNumber For error reporting. -1 if not known. * @param cigarTypeName For error reporting. "Read CIGAR" or "Mate Cigar" * @return List of errors, or null if no errors. */ public static List<SAMValidationError> validateCigar(final SAMRecord rec, final Cigar cigar, final Integer referenceIndex, final List<AlignmentBlock> alignmentBlocks, final long recordNumber, final String cigarTypeName) { // Don't know line number, and don't want to force read name to be decoded. List<SAMValidationError> ret = cigar.isValid(rec.getReadName(), recordNumber); if (referenceIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { final SAMSequenceRecord sequence = rec.getHeader().getSequence(referenceIndex); final int referenceSequenceLength = sequence.getSequenceLength(); for (final AlignmentBlock alignmentBlock : alignmentBlocks) { if (alignmentBlock.getReferenceStart() + alignmentBlock.getLength() - 1 > referenceSequenceLength) { if (ret == null) ret = new ArrayList<SAMValidationError>(); ret.add(new SAMValidationError(SAMValidationError.Type.CIGAR_MAPS_OFF_REFERENCE, cigarTypeName + " M operator maps off end of reference", rec.getReadName(), recordNumber)); break; } } } return ret; } /** * Run all validations of the mate's CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * @param rec the SAM record * @param recordNumber For error reporting. -1 if not known. * @return List of errors, or null if no errors. */ public static List<SAMValidationError> validateMateCigar(final SAMRecord rec, final long recordNumber) { List<SAMValidationError> ret = null; if (rec.getValidationStringency() != ValidationStringency.SILENT) { if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { // The mateCigar will be defined if the mate is mapped if (getMateCigarString(rec) != null) { ret = SAMUtils.validateCigar(rec, getMateCigar(rec), rec.getMateReferenceIndex(), getMateAlignmentBlocks(rec), recordNumber, "Mate CIGAR"); } } else { if (getMateCigarString(rec) != null) { ret = new ArrayList<SAMValidationError>(); if (rec.getMateUnmappedFlag()) { // If the Mate is unmapped, and the Mate Cigar String (MC Attribute) exists, that is a validation error. ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, "Mate CIGAR String (MC Attribute) present for a read whose mate is unmapped", rec.getReadName(), recordNumber)); } else { // If the Mate is not paired, and the Mate Cigar String (MC Attribute) exists, that is a validation error. ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, "Mate CIGAR String (MC Attribute) present for a read that is not paired", rec.getReadName(), recordNumber)); } } } } return ret; } /** * Checks to see if it is valid for this record to have a mate CIGAR (MC) and then if there is a mate CIGAR available. This is done by * checking that this record is paired, its mate is mapped, and that it returns a non-null mate CIGAR. * @param rec * @return */ public static boolean hasMateCigar(SAMRecord rec) { // NB: use getMateCigarString rather than getMateCigar to avoid validation. return (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && null != SAMUtils.getMateCigarString(rec)); } /** * Returns a string that is the the read group ID and read name separated by a colon. This is meant to cannonically * identify a given record within a set of records. * @param record * @return */ public static String getCanonicalRecordName(final SAMRecord record) { String name = record.getStringAttribute(ReservedTagConstants.READ_GROUP_ID); if (null == name) name = record.getReadName(); else name = name + ":" + record.getReadName(); return name; } }