/* * The MIT License * * Copyright (c) 2011 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.util; import htsjdk.samtools.ReservedTagConstants; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.SequenceUtil; /** * Utilities to clip the adapater sequence from a SAMRecord read * * @author Tim Fennell */ public class ClippingUtility { /** * The default value used for the minimum number of contiguous bases to match against. */ public static final int MIN_MATCH_BASES = 12; /** * The default value used for the minimum number of contiguous bases to match against in a paired end read */ public static final int MIN_MATCH_PE_BASES = 6; /** * The default value used for the maximum error rate when matching read bases to clippable sequence. */ public static final double MAX_ERROR_RATE = 0.10; /** * The default value used for the maximum error rate when matching paired end read bases to clippable sequence. */ public static final double MAX_PE_ERROR_RATE = 0.10; /** * The value returned by methods returning int when no match is found. */ public static final int NO_MATCH = -1; private static final Log log = Log.getInstance(ClippingUtility.class); /** * @deprecated Use the varargs version. This no longer returns a warning string.. */ public static void adapterTrimIlluminaSingleRead(final SAMRecord read, final AdapterPair adapter) { adapterTrimIlluminaSingleRead(read, MIN_MATCH_BASES, MAX_ERROR_RATE, adapter); } /** * @deprecated Use the varargs version. This no longer returns a warning string.. */ public static void adapterTrimIlluminaSingleRead(final SAMRecord read, final AdapterPair adapter, final int minMatchBases, final double maxErrorRate) { adapterTrimIlluminaSingleRead(read, minMatchBases, maxErrorRate, adapter); } /** * Invokes adapterTrimIlluminRead with default parameters for a single read. * If the read is a negative strand, its bases will be reverse complemented * Simpler, more common of two overloads. Accepts multiple adapters * and tries them all until it finds the first one that matches. * * @param read SAM/BAM read to trim * @param adapters which adapters to try to use (indexed, paired_end, or single_end) * @return AdapterPair the AdapterPair matched, or null */ public static AdapterPair adapterTrimIlluminaSingleRead(final SAMRecord read,final AdapterPair ... adapters) { return adapterTrimIlluminaSingleRead(read, MIN_MATCH_BASES, MAX_ERROR_RATE, adapters); } /** * Invokes adapterTrimIlluminRead with explicit matching thresholds for a single read. * If the read is a negative strand, a copy of its bases will be reverse complemented. * More general form of the two overloads. Accepts multiple adapters * and tries them all until it finds the first one that matches. * * @param read SAM/BAM read to trim * @param minMatchBases minimum number of contiguous bases to match against in a read * @param maxErrorRate maximum error rate when matching read bases * @param adapters which adapters to try (indexed, paired_end, or single_end) * @return AdapterPair the AdapterPair matched, or null */ public static AdapterPair adapterTrimIlluminaSingleRead(final SAMRecord read, final int minMatchBases, final double maxErrorRate, final AdapterPair ... adapters) { for (AdapterPair adapter : adapters) { final int indexOfAdapterSequence = findIndexOfClipSequence( getReadBases(read), adapter.get3PrimeAdapterBytes(), minMatchBases, maxErrorRate); if (indexOfAdapterSequence != NO_MATCH) { // Convert to a one-based index for storage on the record. read.setAttribute(ReservedTagConstants.XT, indexOfAdapterSequence + 1); return adapter; } } return null; } /** * @deprecated Use the varargs version. This no longer returns a warning string.. */ public static String adapterTrimIlluminaPairedReads(final SAMRecord read1, final SAMRecord read2, final AdapterPair adapters) { adapterTrimIlluminaPairedReads(read1, read2, MIN_MATCH_PE_BASES, MAX_PE_ERROR_RATE, adapters); return null; } /** * @deprecated Use the varargs version. This no longer returns a warning string.. */ public static String adapterTrimIlluminaPairedReads(final SAMRecord read1, final SAMRecord read2, final AdapterPair adapters, final int minMatchBases, final double maxErrorRate) { adapterTrimIlluminaPairedReads(read1, read2, minMatchBases, maxErrorRate, adapters); return null; } /** * Invokes adapterTrimIlluminaPairedReads with default less stringent parameters for a pair of reads. * If the read is a negative strand, its bases will be reverse complemented * Simpler, more common of two overloads. * * @param read1 first read of the pair * @param read2 second read of the pair * @param adapters which adapters to use (indexed, paired_end, or single_end, nextera), attempted in order * @return int number of bases trimmed */ public static AdapterPair adapterTrimIlluminaPairedReads(final SAMRecord read1, final SAMRecord read2, final AdapterPair ... adapters) { return adapterTrimIlluminaPairedReads(read1, read2, MIN_MATCH_PE_BASES, MAX_PE_ERROR_RATE, adapters); } /** * Invokes adapterTrimIlluminaRead with explicit parameters for a pair of reads. * More general form of two overloads. * Returns a warning string when the trim positions found differed for each read. * * @param read1 first read of the pair. * If read1 is a negative strand, a copy of its bases will be reverse complemented. * @param read2 second read of the pair. * If read2 is a negative strand, a copy of its bases will be reverse complemented * @param minMatchBases minimum number of contiguous bases to match against in a read * @param maxErrorRate maximum error rate when matching read bases * @param adapters which adapters to use (indexed, paired_end, or single_end, nextera), attempted in order * @return int number of bases trimmed */ public static AdapterPair adapterTrimIlluminaPairedReads(final SAMRecord read1, final SAMRecord read2, final int minMatchBases, final double maxErrorRate, final AdapterPair ... adapters) { AdapterPair matched = null; for (final AdapterPair adapterPair : adapters) { final int index1 = findIndexOfClipSequence( getReadBases(read1), adapterPair.get3PrimeAdapterBytes(), minMatchBases, maxErrorRate); final int index2 = findIndexOfClipSequence( getReadBases(read2), adapterPair.get5PrimeAdapterBytesInReadOrder(), minMatchBases, maxErrorRate); if (index1 == index2) { if (index1 != NO_MATCH) { // This is the best result: both match exactly, we're done read1.setAttribute(ReservedTagConstants.XT, index1 + 1); read2.setAttribute(ReservedTagConstants.XT, index2 + 1); return adapterPair; } else { // Otherwise they were both no match, we just keep trying } } else if (index1 == NO_MATCH || index2 == NO_MATCH) { // One of them matched, but the other didn't. // Try matching the one that did match again with a little tighter // stringency and, if that works, trim both reads at the matching point. // This is only the second-best possibility... keep looking for a perfect match. if(attemptOneSidedMatch(read1, read2, index1, index2, 2 * minMatchBases)) { matched = adapterPair; } } else { // Both matched at different positions. Do nothing } } return matched; } /** * When an adapter is matched in only one end of a pair, we check it again with * stricter thresholds. If it still matches, then we trim both ends of the read * at the same location. */ private static boolean attemptOneSidedMatch(final SAMRecord read1, final SAMRecord read2, final int index1, final int index2, final int stricterMinMatchBases) { // Save all the data about the read where we found the adapter match final int matchedIndex = index1 == NO_MATCH ? index2 : index1; final SAMRecord matchedRead = index1 == NO_MATCH ? read2 : read1; // If it still matches with a stricter minimum matched bases, then // clip both reads if (matchedRead.getReadLength() - matchedIndex >= stricterMinMatchBases) { if (read1.getReadBases().length > matchedIndex) { read1.setAttribute(ReservedTagConstants.XT, matchedIndex + 1); } if (read2.getReadBases().length > matchedIndex) { read2.setAttribute(ReservedTagConstants.XT, matchedIndex + 1); } return true; } return false; } /** * Returns an array of bytes representing the bases in the read, * reverse complementing them if the read is on the negative strand */ private static byte[] getReadBases(final SAMRecord read) { if (!read.getReadNegativeStrandFlag()) { return read.getReadBases(); } else { final byte[] reverseComplementedBases = new byte[read.getReadBases().length]; System.arraycopy(read.getReadBases(), 0, reverseComplementedBases, 0, reverseComplementedBases.length); SequenceUtil.reverseComplement(reverseComplementedBases); return reverseComplementedBases; } } /** * Finds the first index of the adapterSequence sequence in the read sequence requiring at least minMatch * bases of pairwise alignment with a maximum number of errors dictated by maxErrorRate. * * @param read */ public static int findIndexOfClipSequence(final byte[] read, final byte[] adapterSequence, final int minMatch, final double maxErrorRate) { // If the read's too short we can't possibly match it if (read == null || read.length < minMatch) return NO_MATCH; final int minClipPosition = 0; // Walk backwards down the read looking for the sequence READ_LOOP: for (int start = read.length - minMatch; start > minClipPosition -1; --start) { final int length = Math.min(read.length - start, adapterSequence.length); final int mismatchesAllowed = (int) (length * maxErrorRate); int mismatches = 0; for (int i = 0; i < length; ++i) { if (!SequenceUtil.isNoCall(adapterSequence[i]) && !SequenceUtil.basesEqual(adapterSequence[i], read[start + i])) { if (++mismatches > mismatchesAllowed) continue READ_LOOP; } } // If we got this far without breaking out, then it matches return start; } return NO_MATCH; } }