// Copyright (C) 2011-2012 CRS4. // // This file is part of Seal. // // Seal is free software: you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // Seal is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // // You should have received a copy of the GNU General Public License along // with Seal. If not, see <http://www.gnu.org/licenses/>. package it.crs4.seal.common; import java.util.HashMap; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.nio.ByteBuffer; import java.io.UnsupportedEncodingException; /** * Abstract readable mapping class with tags. */ public abstract class AbstractTaggedMapping { public enum TagDataType { Char, String, Int, Float, NumArray, Bytes; public static TagDataType fromSamType(char t) { switch (t) { case 'A': return TagDataType.Char; case 'i': return TagDataType.Int; case 'f': return TagDataType.Float; case 'Z': return TagDataType.String; case 'H': return TagDataType.Bytes; case 'B': return TagDataType.NumArray; default: throw new IllegalArgumentException("Unknown tag type " + t); } } }; protected class TagCacheItem { private TagDataType type; private String value; public TagCacheItem(TagDataType type, String value) { this.type = type; this.value = value; } public String getValue() { return value; } public TagDataType getType() { return type; } public String toString() { return "(" + type + "," + value + ")"; } } //////////////////////////////////////////////// // variables //////////////////////////////////////////////// protected HashMap<String, TagCacheItem> tagCache; public AbstractTaggedMapping() { tagCache = new HashMap<String, TagCacheItem>(); } //////////////////////////////////////////////// // methods //////////////////////////////////////////////// abstract public String getName(); abstract public int getFlag(); abstract public String getContig() throws IllegalStateException; abstract public int get5Position() throws IllegalStateException; abstract public int getMapQ(); abstract public List<AlignOp> getAlignment() throws IllegalStateException; /** * This mapping's DNA sequence. * The ASCII representation of the base sequence is contained in the ByteBuffer, * starting at buffer.position() and ending at buffer.limit() (exclusive). * The buffer is mark()ed at the start of the sequence. */ abstract public ByteBuffer getSequence(); /** * Return this mapping's sequence as a string. * * A new string is created from the internal ByteBuffer and returned. */ public String getSequenceString() { return byteBufferToString(getSequence()); } /** * This mapping's DNA sequence's base quality scores. * The ASCII Phred+33 (Sanger encoding) representation of the base quality * scores sequence is contained in the ByteBuffer, * starting at buffer.position() and ending at buffer.limit() (exclusive). * The buffer is mark()ed at the start of the sequence. */ abstract public ByteBuffer getBaseQualities(); /** * Return this mapping's base qualities as a string. * * A new string is created from the internal ByteBuffer and returned. */ public String getBaseQualitiesString() { return byteBufferToString(getBaseQualities()); } /** * Get the sequence's length. */ abstract public int getLength(); /** * Returns a string representation of this read mapping. * The string looks something like a partial SAM format. It contains the following * tab-separated fields: * name, flag, contig, 5' position, sequence. */ public String toString() { StringBuilder builder = new StringBuilder(1000); builder. append(getName()).append("\t"). append(getFlag()).append("\t"); if (isMapped()) { builder. append(getContig()).append("\t"). append(get5Position()).append("\t"); } else builder.append("*\t*\t"); ByteBuffer seq = getSequence(); try { builder.append( new String(seq.array(), seq.position(), seq.limit() - seq.position(), "US-ASCII") ); } catch (UnsupportedEncodingException e) { throw new RuntimeException("??? US-ASCII charset not supported! " + e); } return builder.toString(); } //////////////////////// tag methods //////////////////////// /** * Retrieve a TagCacheItem for the named tag. * Checks the cache and retrieves the item from the cache if it's there. If * it's not, it calls getTagText to retrieve it, scans it creating a * TagCacheItem which it caches and returns to the caller. * @return a TagCacheItem for the given tag name. The returned object is the one in this object's cache. * @exception NoSuchFieldException Tag not found in mapping record. * @exception FormatException Invalid SAM tag syntax. */ abstract protected TagCacheItem makeTagItem(String name) throws NoSuchFieldException; /** * Get a tag item, going through the cache. * * This method checks the cache to see if the relevant tag item is already present. * If it isn't, it calls makeTagItem(name) to fetch it, if possibile, and * then saves it new item in the cache before returning it to the caller. */ protected TagCacheItem getTagItem(String name) throws NoSuchFieldException { TagCacheItem item = tagCache.get(name); if (item == null) { item = makeTagItem(name); tagCache.put(name, item); } return item; } public String getTag(String name) throws NoSuchFieldException { return getTagItem(name).getValue(); } public boolean hasTag(String name) { try { getTagItem(name); return true; } catch (NoSuchFieldException e) { return false; } } public int getIntTag(String name) throws NoSuchFieldException { TagCacheItem item = getTagItem(name); if (item.getType() == TagDataType.Int) return Integer.parseInt(item.getValue()); else throw new NumberFormatException("item " + item + " is not of integer type"); } public double getDoubleTag(String name) throws NoSuchFieldException { TagCacheItem item = getTagItem(name); if (item.getType() == TagDataType.Float || item.getType() == TagDataType.Int) return Double.parseDouble(item.getValue()); else throw new NumberFormatException("item " + item + " is not of double type"); } //////////////////////// flag methods //////////////////////// public boolean isPaired() { return AlignFlags.Paired.is(getFlag()); } public boolean isProperlyPaired() { return AlignFlags.ProperlyPaired.is(getFlag()); } public boolean isMapped() { return AlignFlags.Unmapped.isNot(getFlag()); } public boolean isUnmapped() { return AlignFlags.Unmapped.is(getFlag()); } public boolean isMateMapped() { return AlignFlags.MateUnmapped.isNot(getFlag()); } public boolean isMateUnmapped() { return AlignFlags.MateUnmapped.is(getFlag()); } public boolean isOnReverse() { return AlignFlags.OnReverse.is(getFlag()); } public boolean isMateOnReverse() { return AlignFlags.MateOnReverse.is(getFlag()); } public boolean isRead1() { return AlignFlags.Read1.is(getFlag()); } public boolean isRead2() { return AlignFlags.Read2.is(getFlag()); } public boolean isSecondaryAlign() { return AlignFlags.SecondaryAlignment.is(getFlag()); } public boolean isFailedQC() { return AlignFlags.FailedQC.is(getFlag()); } public boolean isDuplicate() { return AlignFlags.Duplicate.is(getFlag()); } //////////////////////// template/insert size methods //////////////////////// /** * Check whether the template length is available. * If it returns true the method getTemplateLength() will not throw. */ abstract public boolean isTemplateLengthAvailable(); /** * Get the observed template length. * The value returned from this method is closely related to the "TLEN: signed * observed template length" field in the SAM spec, but <b>slighly different</b>: * the value is always > 0. When the template length is unavailable this method * throws an IllegalStateException. This can happen when: * <ul> * <li>the mapping is for an unpaired read;</li> * <li>either this read or the mate are unmapped;</li> * <li>read and mate are mapped to different contigs.</li> * </ul> * * @exception IllegalStateException Template length is unavailable. */ abstract public int getTemplateLength() throws IllegalStateException; //////////////////////// utility methods //////////////////////// protected static String byteBufferToString(ByteBuffer buffer) { try { return new String(buffer.array(), buffer.position(), buffer.limit() - buffer.position(), "US-ASCII"); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Can't get US-ASCII encoding!"); } } // These below should probably be in another class. /** * Calculate the reference coordinate for each matched position in this mapping. * This read must not be unmapped. * * Writes getLength() integers into dest. Non-matched positions (AlignOp != Match) * are set to -1, while the rest are set to the reference position to which * the base at the corresponding position was matched. * */ public void calculateReferenceCoordinates(ArrayList<Integer> dest) throws IllegalStateException { if (isUnmapped()) throw new IllegalStateException("Can't calculate reference coordinates for an unmapped read"); dest.clear(); List<AlignOp> alignment = getAlignment(); int refPos = get5Position(); for (AlignOp op: alignment) { int opLength = op.getLen(); if (op.getType() == AlignOp.Type.Match) { int endPos = refPos + opLength; for ( ; refPos < endPos; ++refPos) dest.add(refPos); } else if (op.getType() == AlignOp.Type.Insert || op.getType() == AlignOp.Type.SoftClip) { for (int i = op.getLen(); i > 0; --i) dest.add(-1); } else if (op.getType() == AlignOp.Type.Delete) { refPos += op.getLen(); } else throw new RuntimeException("Found " + op.getType() + " alignment operation. Sorry, but I don't know how to deal with this."); } if (dest.size() != getLength()) throw new RuntimeException("Inconsistency? Alignment in " + AlignOp.cigarStr(alignment) + " doesn't exactly cover the entire read (got " + dest.size() + " positions for " + getLength() + " bases)"); } /** * Calculate which read positions match or don't match their corresponding reference position. * Calculation is based on the read's alignment and MD tag. * This object must be mapped. The dest array will be cleared and then filled with one value * per base in the read. True indicates that the base matches the reference; false indicates * a mismatch; null indicates that the base doesn't have a matching reference base (it's an * insertion or it has been clipped). * * @exception IllegalStateException This mapping is unmapped. * @exception RuntimeException Unexpected errors such as bad CIGAR or MD tags. */ public void calculateReferenceMatches(ArrayList<Boolean> dest) throws IllegalStateException { if (isUnmapped()) throw new IllegalStateException("Can't calculate reference coordinates for an unmapped read"); dest.clear(); dest.ensureCapacity(getLength()); List<AlignOp> alignment = getAlignment(); if (alignment.isEmpty()) throw new RuntimeException("No alignment for read " + this); try { List<MdOp> md = MdOp.scanMdTag(getTag("MD")); if (md.isEmpty()) throw new RuntimeException("no MD operators extracted from tag! (tag: " + getTag("MD") + ")"); Iterator<MdOp> mdIt = md.iterator(); MdOp mdOp = mdIt.next(); int mdOpConsumed = 0; // the number of positions within the current mdOp that have been consumed. // we iterate through the alignment. We really only care about operations which // "consume" bases from our read: insertions, soft clippings and matches. // // In the case of inserts and soft clips we don't have a corresponding reference base, // so we leave the match value as null. // // In the case of a cigar Match, we consult the MD string. We advance along the MD matches // and mismatches inserting corresponding true or false values. for (AlignOp alignOp: alignment) { if (alignOp.getType() == AlignOp.Type.Match) { int positionsToCover = alignOp.getLen(); while (positionsToCover > 0 && mdOp != null) { if (mdOp.getType() == MdOp.Type.Delete) mdOp = mdIt.next(); // skip it else { // must be a match or a mismatch boolean match = mdOp.getType() == MdOp.Type.Match; int consumed = Math.min(mdOp.getLen() - mdOpConsumed, positionsToCover); for (int i = consumed; i > 0; --i) dest.add(match); positionsToCover -= consumed; mdOpConsumed += consumed; if (mdOpConsumed >= mdOp.getLen()) // operator consumed. Advance to next { mdOpConsumed = 0; if (mdIt.hasNext()) mdOp = mdIt.next(); else mdOp = null; } } } if (positionsToCover > 0 && mdOp == null) throw new RuntimeException("BUG or bad data?? Found more read positions than was covered by the MD tag. CIGAR: " + AlignOp.cigarStr(alignment) + "; MD: " + getTag("MD") + "; read: " + this.toString()); } else if (alignOp.getType() == AlignOp.Type.Insert || alignOp.getType() == AlignOp.Type.SoftClip) { for (int i = alignOp.getLen(); i > 0; --i) dest.add(null); } // else the op doesn't affect the read } } catch (NoSuchFieldException e) { throw new RuntimeException(e.getMessage()); } } }