/** * Copyright Copyright 2015 Simon Andrews * * This file is part of BamQC. * * BamQC is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * BamQC is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with BamQC; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* * Changelog: * - Piero Dalle Pezze: Class creation. */ package uk.ac.babraham.BamQC.Utilities.CigarMD; import java.util.ArrayList; import java.util.List; import java.util.Locale; import org.apache.log4j.Logger; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; /** * It calculates a string combining information from CIGAR and MD strings. The * format is: # (number), operator (m=match, u=mismatch, i=insertion, * d=deletion), bases if oper={u,i,d}. For instance: CIGAR: 32M2D5M1I52M MDtag: * 7G24^AA7C49 Combined CIGAR+MDTag: 7m1uGT24m2dAA5m1iG2m1uCA49m * * Note: G in 1iG is SAM read- dependent. * @author Piero Dalle Pezze */ public class CigarMDGenerator { private static Logger log = Logger.getLogger(CigarMDGenerator.class); // Data fields used for computing the CigarMD string. // The Cigar's elements private List<CigarElement> cigarList = null; // The current Cigar element private String mdString = null; // The current Cigar element private CigarElement currentCigarElement = null; // The length for the current Cigar element private int currentCigarElementLength = 0; // The operator for the current Cigar element private CigarOperator currentCigarElementOperator = null; // The temporary processed length of the processed MD tag element private int temporaryMDElementLength = 0; // The current processed position of the processed MD tag (for the parser) private int currentMDElementPosition = 0; // The current base call position of the read private int currentBaseCallPosition = 0; // Data fields storing the CigarMD object combining Cigar and MD strings. private CigarMD cigarMD = null; // If the read is a first or second segment. private boolean isFirst = true; // The read string. This can be quite long and read.getReadString() can be // quite time consuming as it copies the string every time. Here we copy it // one time only. private String readString = null; // local variables declared here for limiting variable declarations in the method computeCigarMDTag(). private Cigar cigar = null; private int cigarListSize = 0; // 0: no error, 1: unmapped read, 2: read without MD string, 3: read without Cigar, 4: Cigar/MD/read inconsistencies private int errorType = 0; // Public interface // Constructors /** * Default constructor. It generates an empty CigarMD string. */ public CigarMDGenerator() { } /** * Constructor. It generates a CigarMD string from the SAMRecord read. * @param read The SAMRecord to use */ public CigarMDGenerator(SAMRecord read) { computeCigarMDTag(read); } // getter methods /** * It returns the error type. 0: no error, 1: unmapped read, 2: read without MD string, * 3: read without Cigar, 4: Cigar/MD/read inconsistencies. * @return the error type. */ public int getErrorType() { return errorType; } /** * It returns a string combining information from CIGAR and MD tag. This * method only returns a CigarMD string computed previously. * * @return a string containing the computed CigarMD information, or the * empty string if no CigarMD object was computed. */ public String getCigarMDString() { return cigarMD.toString(); } /** * It returns an object CigarMD containing the combined information from * CIGAR and MD tag. This method only returns a CigarMD object computed * previously. * * @return a CigarMD object containing the computed Cigar + MD information. */ public CigarMD getCigarMD() { return cigarMD; } /** * Returns true if the read is a first segment, false if it is a second. * @return true if read is a first segment. */ public boolean isFirst() { return isFirst; } // computing methods /** * It generates a string combining information from CIGAR and MD tag. This CigarMD string is null if it could not be created. * In this last case, an error message can be retrieved using the message getErrorType(). * @param read the SAMRecord to parse. */ public void generateCigarMD(SAMRecord read) { reset(); if(!computeCigarMDTag(read)) { if(errorType == 0) { // if we are here, we detected one of a broad range of errors due to inconsistencies between Cigar/MD/read strings. errorType = 4; cigarMD = null; } } } // Private methods here /** * It generates a string combining information from CIGAR and MD tag. * @return true if the CigarMD string has been computed, false otherwise. */ private boolean computeCigarMDTag(SAMRecord read) { // Get the read string readString = read.getReadString(); /* * IMPORTANT NOTE: * The second column of a SAM file contains important hex FLAGS. From the SAM/BAM format specifications: * (a) "Bit 0x4 is the only reliable place to tell whether the read is unmapped. If 0x4 is set, no assumptions * can be made about RNAME, POS, CIGAR, MAPQ" * (b) "Bit 0x10 indicates whether SEQ has been reverse complemented and QUAL reversed. When bit 0x4 is unset, this * correspond to the strand to which the segment has been mapped. When 0x4 is set, this indicates whether the unmapped * read is stored in its original orientation as it came off the sequencing machine. * * Since this class strongly depend on the CIGAR string, the bit 0x4 must be unset. * The bit 0x10 must be evaluated in order to parse and collect statistics correctly. If so, * the correct positions for SNP/Indels must be calculated. */ // Let's do some preliminary tests // if Flag 0x4 is set, then the read is unmapped. Therefore, skip it for the reasons above. // Check the state of a flag bit 'READ_UNMAPPED_FLAG'. if(read.getReadUnmappedFlag()) { log.info("Read " + readString + " is unmapped and therefore skipped."); errorType = 1; return false; } // Get the MD tag string. It is more likely errors are in the MD rather than the Cigar. Let's put this first. mdString = read.getStringAttribute("MD"); if (mdString == null || mdString.length() == 0) { log.info("Read " + readString + " does not have MD string."); errorType = 2; mdString = null; // We continue processing as indels detection does not require the MD string. } else { // In some reads the bases in the MD string can be in lower case. The read and cigar strings are already set to upper case // for us by the samtools library. This doesn't happen with the mdString though. Let's set this to upper case once for all now, // so we don't have to worry about it later. mdString = mdString.toUpperCase(Locale.ENGLISH); } // Get the CIGAR list cigar = read.getCigar(); if (cigar == null || read.getCigarLength() == 0) { log.info("Read " + readString + " does not have Cigar string."); errorType = 3; return false; } cigarList = cigar.getCigarElements(); // setup a new CigarMD string cigarMD = new CigarMD(); // Use the old c-style for loop for memory (garbage collector) and CPU efficiency // Iterate the CigarList // Iterator<CigarElement> iterCigar = cigarList.iterator(); // // Loop over the CigarElement objects of the read Cigar // while (iterCigar.hasNext()) { // currentCigarElement = iterCigar.next(); cigarListSize = cigarList.size(); for(int i=0; i<cigarListSize; i++) { currentCigarElement = cigarList.get(i); currentCigarElementLength = currentCigarElement.getLength(); currentCigarElementOperator = currentCigarElement.getOperator(); //log.debug("Parsing CigarElement: " + currentCigarElementLength + currentCigarElementOperator.toString()); if (currentCigarElementOperator == CigarOperator.MATCH_OR_MISMATCH) { if(!processMDtagCigarOperatorM(read)){ return false; } } else if (currentCigarElementOperator == CigarOperator.INSERTION) { if(!processMDtagCigarOperatorI(read)) { return false; } } else if (currentCigarElementOperator == CigarOperator.DELETION) { if(!processMDtagCigarOperatorD(read)) { return false; } } else if (currentCigarElementOperator == CigarOperator.SKIPPED_REGION) { processMDtagCigarOperatorN(); } else if (currentCigarElementOperator == CigarOperator.SOFT_CLIP) { processMDtagCigarOperatorS(); } else if (currentCigarElementOperator == CigarOperator.HARD_CLIP) { processMDtagCigarOperatorH(); } else if (currentCigarElementOperator == CigarOperator.PADDING) { processMDtagCigarOperatorP(); } else if (currentCigarElementOperator == CigarOperator.EQ) { log.warn("Extended CIGAR element = is currently unsupported."); return false; } else if (currentCigarElementOperator == CigarOperator.X) { log.warn("Extended CIGAR element X is currently unsupported."); return false; } else { log.error("Unknown Cigar operator " +currentCigarElementOperator.toString()+ " in read " + readString + "\n"); return false; } } // Let's do some tests to see whether something is wrong.. if(currentBaseCallPosition < readString.length()) { log.warn("Cigar string " + read.getCigarString() + " length " + currentBaseCallPosition + " < read length " + readString.length() + ". mdString : " + mdString + ", CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } if(currentBaseCallPosition > readString.length()) { log.warn("Cigar string " + read.getCigarString() + " length " + currentBaseCallPosition + " > read length " + readString.length() + ". mdString : " + mdString + ", CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } if(mdString != null && temporaryMDElementLength > 0) { log.warn("MD string " + mdString + " > Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } // Check whether the read is paired in sequencing or not. // Flag: 0x1 'READ_PAIRED_FLAG' if(read.getReadPairedFlag()) { // A read can be first/second (0x40/0x80) and forward/backward (0x10). // If the read is first/backward(0x40+0x10) or second/forward(0x80), the CigarMD string must be reversed and complemented. // Check the state of a flag bits 'FIRST_OF_PAIR_FLAG' and 'READ_STRAND_FLAG'. if(read.getFirstOfPairFlag()) { isFirst = true; // it is a first segment. if(read.getSecondOfPairFlag()) { // .. but it is also a second segment log.debug("Read " + readString + " is part of a linear template, but it is neither the first nor the last read."); } else if(read.getReadNegativeStrandFlag()) { // it is reversed and complemented log.debug("Current SAM read is FIRST(0x40) and parsed BACKWARD(0x10)."); reverseComplementCigarMD(); } // else it is a FIRST FORWARD. We don't do anything. } else // it is NOT a first segment if(!read.getSecondOfPairFlag()) { // .. but it is NOT a second segment either isFirst = true; // let's leave it as first. log.debug("The index for the read " + readString + " in the template is unknown. Non-linear template or index lost in data processing."); } else { // it is a second segment. isFirst = false; if(!read.getReadNegativeStrandFlag()) { // it is reversed and complemented log.debug("Current SAM read is SECOND(0x80) and parsed FORWARD."); reverseComplementCigarMD(); } // else it is a SECOND BACKWARD. We don't do anything. } // otherwise: // the read is FIRST AND FORWARD or SECOND AND BACKWARD. In these two cases, no reverse complement is needed. } else { // the read is unpaired. Treat as a first segment because it is the only one. isFirst = true; if(read.getReadNegativeStrandFlag()) { // it is reversed and complemented log.debug("CigarMDGenerator: current SAM read is parsed BACKWARD(0x10)."); reverseComplementCigarMD(); } } log.debug("CigarMD string: " + cigarMD.toString()); return true; } /** * Complements a base (e.g. "ACGT" => "TGCA") * @param bases */ private void baseComplement(StringBuilder bases) { for(int i = 0; i < bases.length(); i++) { if(bases.charAt(i) == 'A') bases.replace(i, i+1, "T"); else if(bases.charAt(i) == 'C') bases.replace(i, i+1, "G"); else if(bases.charAt(i) == 'G') bases.replace(i, i+1, "C"); else if(bases.charAt(i) == 'T') bases.replace(i, i+1, "A"); } } /** * It reverse and complement the CigarMD string if the Flag 0x10 or 0x80 are set on. */ private void reverseComplementCigarMD() { List<CigarMDElement> oldCigarMDElements = cigarMD.getCigarMDElements(); int listLength = oldCigarMDElements.size(); List<CigarMDElement> newCigarMDElements = new ArrayList<CigarMDElement>(listLength); CigarMDElement oldElement = null; StringBuilder newBases = null; String oldBases = null; int mutations = 0; for(int i = listLength - 1; i >= 0; i--) { oldElement = oldCigarMDElements.get(i); if(oldElement.getOperator() == CigarMDOperator.MATCH) { // As we do not record the bases, don't do anything newCigarMDElements.add(oldElement); } else if(oldElement.getOperator() == CigarMDOperator.MISMATCH) { // reverse and complement the bases couples oldBases = oldElement.getBases(); mutations = oldElement.getLength(); newBases = new StringBuilder(oldBases.length()); for(int j = 0; j < mutations; j++) { newBases.insert(0, oldBases.substring(j*2, j*2+2)); } baseComplement(newBases); newCigarMDElements.add(new CigarMDElement(mutations, CigarMDOperator.MISMATCH, newBases.toString())); } else if(oldElement.getOperator() == CigarMDOperator.INSERTION) { // reverse and complement the bases newBases = new StringBuilder(oldElement.getBases()); baseComplement(newBases.reverse()); newCigarMDElements.add(new CigarMDElement(oldElement.getLength(), CigarMDOperator.INSERTION, newBases.toString())); } else if(oldElement.getOperator() == CigarMDOperator.DELETION) { // reverse and complement the bases newBases = new StringBuilder(oldElement.getBases()); baseComplement(newBases.reverse()); newCigarMDElements.add(new CigarMDElement(oldElement.getLength(), CigarMDOperator.DELETION, newBases.toString())); } else if(oldElement.getOperator() == CigarMDOperator.SKIPPED_REGION) { // As we do not record the bases, don't do anything newCigarMDElements.add(oldElement); } else if(oldElement.getOperator() == CigarMDOperator.SOFT_CLIP) { // As we do not record the bases, don't do anything newCigarMDElements.add(oldElement); } else if(oldElement.getOperator() == CigarMDOperator.HARD_CLIP) { // As we do not record the bases, don't do anything newCigarMDElements.add(oldElement); } else if(oldElement.getOperator() == CigarMDOperator.PADDING) { // As we do not record the bases, don't do anything newCigarMDElements.add(oldElement); } else if(oldElement.getOperator() == CigarMDOperator.eq) { // TODO: THIS CASE DOES NOT EVER HAPPEN AS IT IS NOT IMPLEMENTED. IN THE FUTURE IT NEEDS REVISION newCigarMDElements.add(oldElement); } else if(oldElement.getOperator() == CigarMDOperator.x) { // TODO: THIS CASE DOES NOT EVER HAPPEN AS IT IS NOT IMPLEMENTED. IN THE FUTURE IT NEEDS REVISION newCigarMDElements.add(oldElement); } } cigarMD = new CigarMD(newCigarMDElements); } /** * It resets the class data fields. */ private void reset() { cigarList = null; mdString = null; currentCigarElement = null; currentCigarElementLength = 0; currentCigarElementOperator = null; temporaryMDElementLength = 0; currentMDElementPosition = 0; currentBaseCallPosition = 0; readString = null; cigarMD = null; errorType = 0; } // These methods process the MD string for each CIGAR operator. /** * Add a new Match element to the CigarMD Object. * @param temporaryCigarElementLength The length of the current Cigar element (Match) being processed. * @return the length of the current Cigar element (Match) after being processed. */ private int addMatchToCigarMD(int temporaryCigarElementLength) { // update the position of the currentBaseCall and the parser. if(mdString != null && temporaryMDElementLength <= temporaryCigarElementLength) { cigarMD.add(new CigarMDElement(temporaryMDElementLength, CigarMDOperator.MATCH, "")); currentBaseCallPosition = currentBaseCallPosition + temporaryMDElementLength; temporaryCigarElementLength = temporaryCigarElementLength - temporaryMDElementLength; temporaryMDElementLength = 0; } else { cigarMD.add(new CigarMDElement(temporaryCigarElementLength, CigarMDOperator.MATCH, "")); currentBaseCallPosition = currentBaseCallPosition + temporaryCigarElementLength; temporaryMDElementLength = temporaryMDElementLength - temporaryCigarElementLength; temporaryCigarElementLength = 0; } return temporaryCigarElementLength; } /** * Add a new Mismatch element to the CigarMD Object. * @param temporaryCigarElementLength The length of the current Cigar element (Mismatch) being processed. * @param bases the bases representing the mismatch. * @return the length of the current Cigar element (Match) after being processed. */ private int addMismatchToCigarMD(int temporaryCigarElementLength, StringBuilder bases) { // update the position of the currentBaseCall and the parser. if(temporaryMDElementLength <= temporaryCigarElementLength) { cigarMD.add(new CigarMDElement(temporaryMDElementLength, CigarMDOperator.MISMATCH, bases.toString())); currentBaseCallPosition = currentBaseCallPosition + temporaryMDElementLength; temporaryCigarElementLength = temporaryCigarElementLength - temporaryMDElementLength; temporaryMDElementLength = 0; } else { cigarMD.add(new CigarMDElement(temporaryCigarElementLength, CigarMDOperator.MISMATCH, bases.toString())); currentBaseCallPosition = currentBaseCallPosition + temporaryCigarElementLength; temporaryMDElementLength = temporaryMDElementLength - temporaryCigarElementLength; temporaryCigarElementLength = 0; } return temporaryCigarElementLength; } /** * Process the MD string once found the CIGAR operator M. * @read the SAMRecord to process. * @return true if the Cigar operator M has been processed correctly. */ private boolean processMDtagCigarOperatorM(SAMRecord read) { // The temporary length of the current Cigar element int temporaryCigarElementLength = currentCigarElementLength; while(temporaryCigarElementLength > 0) { if(mdString != null && temporaryMDElementLength == 0) { // PARSE A NEW MD ELEMENT // It is either a number [=>MATCH] or a char (A,C,G,T) [=>MISMATCH] // Extract the first character for the MD element. // Only parse the next element of MD Tag string if this current has been completed. // This is required as MD string does not record insertions, whilst Cigar string does. if(mdString.length() <= currentMDElementPosition) { if(currentBaseCallPosition + temporaryMDElementLength > readString.length()) { log.warn("Cigar string " + read.getCigarString() + " length " + (currentBaseCallPosition + temporaryMDElementLength) + " > read length " + readString.length() + ". mdString : " + mdString + ", CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } log.warn("MD string " + mdString + " < Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } // extract new MD element char by char char currentMDChar = mdString.charAt(currentMDElementPosition); currentMDElementPosition++; // skip if the retrieved MD element char is zero. This is redundant information if the CIGAR string is read too.. if(currentMDChar == '0') { continue; } // the MD character is not 0. Therefore, we need to process it! // Either we have a number (MATCH: 1 or more digits) or a string of bases (MISMATCH: 1 or more chars) if(currentMDChar >= '1' && currentMDChar <= '9') { // CASE 1: The first character is a number. Therefore, we are parsing MATCHED bases. // Let's continue and see how many numbers we find. // This comprehensive number is the temporaryMDElementLength, which tells us // how many matched bases we have. // Capacity initially set to 3: 3 digits of matches (up to 999!) StringBuilder mdNumber = new StringBuilder(3); mdNumber.append(currentMDChar); while(currentMDElementPosition < mdString.length()) { currentMDChar = mdString.charAt(currentMDElementPosition); if(currentMDChar >= '0' && currentMDChar <= '9') { mdNumber.append(currentMDChar); currentMDElementPosition++; } else { // c is something else. The MD Element has been parsed. break; } } temporaryMDElementLength = Integer.parseInt(mdNumber.toString()); // add the new MATCH element to the CigarMD string and update the temporaryCigarElementLength temporaryCigarElementLength = addMatchToCigarMD(temporaryCigarElementLength); } else { // CASE 2: The first character is a char. Therefore, we are parsing MISMATCHED bases or mutations. // If there is a mutation on the read bases with respect to the reference, we indicate this as: // RefBaseMutBase (base on the reference -> mutated base on the read). // For instance: an element of CigarMD with operator 'u' is 1uCA IF the base C on the reference // is mutated into the base A on the aligned read base. In the case of two contiguous mutation, this // can be coded as 2uCAGT if the reference string CA are mutated into GT. if(currentBaseCallPosition >= readString.length()) { log.warn("MD string " + mdString + " length "+currentBaseCallPosition+" > read " + readString + " length "+readString.length()+". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } // The bases for this mismatch // Capacity initially set to 8: 8 contiguous SNPs! StringBuilder bases = new StringBuilder(8); // Retrieve the mutation and create the first couple of mutated bases. char currentBaseCall = readString.charAt(currentBaseCallPosition); //log.debug("currentBaseCallPosition: " + currentBaseCallPosition); if(currentMDChar == 'A' || currentMDChar == 'C' || currentMDChar == 'G' || currentMDChar == 'T' || currentMDChar == 'N') { if(currentMDChar == currentBaseCall) { //error case : FALSE POSITIVE log.warn("Expected mutation " + currentMDChar + " at position " + (currentMDElementPosition-1) + " in MD string " + mdString + " but found same base " + currentBaseCall + " in read position " + currentBaseCallPosition + ". Cigar : " + read.getCigarString() + ", CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } bases.append(currentMDChar).append(currentBaseCall); } else { log.warn("Expected mutation but found " + currentMDChar + " at position " + (currentMDElementPosition-1) + " in MD string " + mdString + ". Cigar : " + read.getCigarString() + ", CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); //bases.append('?').append('?'); return false; } temporaryMDElementLength++; // Let's continue and see how many mismatches we find. // The number of mismatches will be the temporaryMDElementLength, whereas the variable // bases will store the mismatched couples (ReferenceBase,ReadBaseMutation). while(currentMDElementPosition < mdString.length()) { currentMDChar = mdString.charAt(currentMDElementPosition); if(currentMDChar == 'A' || currentMDChar == 'C' || currentMDChar == 'G' || currentMDChar == 'T' || currentMDChar == 'N') { if(currentBaseCallPosition+temporaryMDElementLength >= readString.length()) { log.warn("MD string " + mdString + " length "+currentBaseCallPosition+temporaryMDElementLength+" > read " + readString + " length "+readString.length()+". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } currentBaseCall = readString.charAt(currentBaseCallPosition+temporaryMDElementLength); if(currentMDChar == currentBaseCall) { //error case : FALSE POSITIVE log.warn("Expected mutation " + currentMDChar + " at position " + currentMDElementPosition + " in MD string " + mdString + " but found base " + currentBaseCall + " in read position " + (currentBaseCallPosition+temporaryMDElementLength) + ". Cigar : " + read.getCigarString() + ", CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } bases.append(currentMDChar).append(currentBaseCall); temporaryMDElementLength++; currentMDElementPosition++; } else if(currentMDChar == '0' && temporaryMDElementLength < temporaryCigarElementLength) { // temporaryMDElementLength < temporaryCigarElementLength is needed to assess that we // are still parsing the Cigar operator M, and not something else. currentMDElementPosition++; } else { break; } } // update the position of the currentBaseCall and the parser. if(bases.length() == 0) { //error case log.warn("MD string " + mdString + " < Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } // add the new MISMATCH element to the CigarMD string and update the temporaryCigarElementLength temporaryCigarElementLength = addMismatchToCigarMD(temporaryCigarElementLength, bases); } } else { // DO NOT PARSE A NEW MD ELEMENT // IF WE HAVE the mdString, the MD Element is a MATCH. This because temporaryMDElementLength > 0 and the MD string only reports numbers for matches. // Therefore, no mismatched bases to compute for this case. // IF WE DO NOT HAVE the mdString, this case is appropriate anyway. // add the new MATCH element to the CigarMD string and update the temporaryCigarElementLength temporaryCigarElementLength = addMatchToCigarMD(temporaryCigarElementLength); } //log.debug(cigarMD.toString()); } return true; } /** * Process the MD string once found the CIGAR operator I. * @read the SAMRecord to process. * @return true if the Cigar operator I has been processed correctly. */ private boolean processMDtagCigarOperatorI(SAMRecord read) { // The MD string does not contain information regarding an insertion. // NOTE: readString is already in upper case by samtools library, even if in the file, the read was in lowercase. // therefore, we do not need to worry about this. String insertedBases = readString.substring( currentBaseCallPosition, currentBaseCallPosition + currentCigarElementLength); currentBaseCallPosition = currentBaseCallPosition + currentCigarElementLength; for(int i=0; i<insertedBases.length(); i++) { char c = insertedBases.charAt(i); if(c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N') { log.warn("Read " + readString + " contains unknown inserted bases ("+insertedBases+"). Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } } //log.debug("tempCigElem: " + currentCigarElementLength + "I ~ " + " ; length: " + temporaryMDElementLength); cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.INSERTION, insertedBases)); return true; } /** * Process the MD string once found the CIGAR operator D. * @read the SAMRecord to process. * @return true if the Cigar operator D has been processed correctly. */ private boolean processMDtagCigarOperatorD(SAMRecord read) { if(mdString != null) { if(temporaryMDElementLength != 0) { // There is an inconsistency between Cigar and MD strings. // If the currentCigarElement is D, temporaryMDElementLength should be 0. log.warn("MD string " + mdString + " contains more matches/mismatches than Cigar string " + read.getCigarString() + ". CigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString() + ", MD string position : " + currentMDElementPosition + ". Base call position : " + currentBaseCallPosition); return false; } // Parse and extract the current MD Element. It is a string starting with ^ // and followed by a string of (A,C,G,T) // Extract the first character for the MD element. char currentMDChar = mdString.charAt(currentMDElementPosition); currentMDElementPosition++; // skip if the current MD element is zero. This is redundant information // if the CIGAR string is read too.. while (currentMDChar == '0') { if(mdString.length() <= currentMDElementPosition) { log.warn("MD string " + mdString + " < Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } currentMDChar = mdString.charAt(currentMDElementPosition); currentMDElementPosition++; } if (currentMDChar != '^') { // this means an inconsistency between the CIGAR and MD string log.warn("^ not found in the MD string " + mdString + " when processing the CigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString() + " in the Cigar String " + read.getCigarString()); return false; } if(mdString.length() < currentMDElementPosition + currentCigarElementLength) { log.warn("MD string " + mdString + " < Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } // The first character is a ^. There are exactly // temporaryCigarElementLength chars (A,C,G,T) to parse. // Let's be nice with programs setting the mdString bases in lower case. String deletedBases = mdString.substring(currentMDElementPosition, currentMDElementPosition + currentCigarElementLength); for(int i=0; i<deletedBases.length(); i++) { char c = deletedBases.charAt(i); if(c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N') { log.warn("MD string " + mdString + " contains unknown deleted bases ("+deletedBases+"). Cigar string " + read.getCigarString() + ". CurrentCigarElement : " + currentCigarElement.getLength() + currentCigarElement.getOperator().toString()); return false; } } currentMDElementPosition = currentMDElementPosition + currentCigarElementLength; // log.debug("tempCigElem: " + currentCigarElementLength + "D ~ " + "DeletedBases: " + deletedBases + " ; length: " + // currentMDElement.length()); cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.DELETION, deletedBases)); return true; } // if we do not have the mdString cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.DELETION, "")); return true; } /** * Process the MD string once found the CIGAR operator N. */ private void processMDtagCigarOperatorN() { // As far as I see the MD string contains information about skipped regions in the read. // Skipped regions are not reported in the read, so don't update currentBaseCallPosition. // We do not record the bases. // log.debug("tempCigElem: " + // currentCigarElementLength + "N ~ " + " ; length: " + // temporaryMDElementLength); cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.SKIPPED_REGION, "")); } /** Process the MD string once found the CIGAR operator S. */ private void processMDtagCigarOperatorS() { // MD string does not report any information about soft clips. They are just skipped. // Soft clips are reported in the read though, so the currentBaseCallPosition must be updated // We do not record the bases. // log.debug("tempCigElem: " + // currentCigarElementLength + "S ~ " + " ; length: " + // temporaryMDElementLength); currentBaseCallPosition = currentBaseCallPosition + currentCigarElementLength; cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.SOFT_CLIP, "")); } /** Process the MD string once found the CIGAR operator H. */ private void processMDtagCigarOperatorH() { // As far as I see the MD string contains information about hard clips. // Hard clips are not reported in the read, so don't update currentBaseCallPosition. // We do not record the bases. // log.debug("tempCigElem: " + // currentCigarElementLength + "H ~ " + " ; length: " + // temporaryMDElementLength); cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.HARD_CLIP, "")); } /** Process the MD string once found the CIGAR operator P. */ private void processMDtagCigarOperatorP() { // As far as I see the MD string contains information about padding. // Paddings are not reported in the read, so don't update currentBaseCallPosition. // We do not record the bases. // log.debug("tempCigElem: " + // currentCigarElementLength + "P ~ " + " ; length: " + // temporaryMDElementLength); cigarMD.add(new CigarMDElement(currentCigarElementLength, CigarMDOperator.PADDING, "")); } /** Process the MD string once found the CIGAR operator =. */ private void processMDtagCigarOperatorEQ() { // is this operator used? } /** Process the MD string once found the CIGAR operator X. */ private void processMDtagCigarOperatorNEQ() { // is this operator used? } }