/* * The MIT License * * Copyright (c) 2011 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.util; import htsjdk.samtools.util.FormatUtil; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.SolexaQualityConverter; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import java.util.List; /** * Misc utilities for working with Illumina specific files and data * * @author jburke@broadinstitute.org */ public class IlluminaUtil { public static final String BARCODE_DELIMITER = "-"; /** * Parse the tile # from the read name. * If we find that there are other elements needed from the read name, it might be a good idea to put * makeReadName() and various get..() methods into a new class. * * @param readName As produced by IlluminaUtil.makeReadName() * @return tile number, or null if read name is not in correct format. */ public static Integer getTileFromReadName(final String readName) { final int first = readName.indexOf(':'); if (first > 0) { final int second = readName.indexOf(':', first+1); if (second > 0) { final int third = readName.indexOf(':', second+1); if (third > 0) { return Integer.parseInt(readName.substring(second+1, third)); } } } return null; } /** * Convert from Solexa-scaled ASCII qualities to Phred-scaled binary. The only difference is Solexa qualities have * 64 added to the phred binary to make them printable. * * @param solexaQualities Printable ASCII qualities. * @return binary Phred-scaled qualities. */ public static byte[] makePhredBinaryFromSolexaQualityAscii_1_3(final String solexaQualities) { return makePhredBinaryFromSolexaQualityAscii_1_3(solexaQualities, 0, solexaQualities.length()); } /** * Convert from Solexa-scaled ASCII qualities to Phred-scaled binary. The only difference is Solexa qualities have * 64 added to the phred binary to make them printable. * * @param solexaQualities Printable ASCII qualities. * @param offset Character at which to start conversion. * @param length Number of characters to convert. * @return binary Phred-scaled qualities. */ public static byte[] makePhredBinaryFromSolexaQualityAscii_1_3(final String solexaQualities, final int offset, final int length) { final byte[] quals = StringUtil.stringToBytes(solexaQualities, offset, length); SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(quals); return quals; } /** * Converts from Solexa ASCII to Phred binary in place. These are the older-style qualities * rather than Phred qualities with a different addend to make them printable. */ public static void convertSolexaQualityAscii_1_1_ToPhredBinary(final byte[] solexaQualities) { SolexaQualityConverter.getSingleton().convertSolexaQualityCharsToPhredBinary(solexaQualities); } /** * Get a Solexa ASCII quality value from an array of strings that are integer qualities in this order: * [cycle-1-A, cycle-1-C, cycle-1-G, cycle-1-T, cycle-2-A, ...]. The best quality from the 4 qualities for * the cycle is found, and then it is ASCII-ized by adding 64. * @param qualities Array of integer quality strings. * @param cycleNumber Which cycle to get quality for. * @param formatter For converting decimal strings to ints. * @return best quality for the given cycle. * @throws picard.PicardException if the best quality ASCII value is > 255. */ public static byte getSolexaQualityCharFromFourQualities(final String[] qualities, final int cycleNumber, final FormatUtil formatter) { // It apparently is the case that all 4 qualities might be negative, but this appears to correspond to // an no-called base. int bestQuality = Integer.MIN_VALUE; final int startOffset = (cycleNumber - 1) * 4; for (int i = startOffset; i < startOffset + 4; ++i) { final int quality = formatter.parseInt(qualities[i]); if (quality > bestQuality) { bestQuality = quality; } } final int qualityAsCharacter = bestQuality + SolexaQualityConverter.SOLEXA_ADDEND; if (qualityAsCharacter > 255) { throw new PicardException("Quality too large: " + bestQuality); } return (byte)(qualityAsCharacter & 0xff); } // Strings indented below to make these easier to compare visually. /** Describes adapters used on each pair of strands */ public static enum IlluminaAdapterPair implements AdapterPair { PAIRED_END( "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", //58 bases) "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"), // 61 bases INDEXED ( "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"), // note 8 N's // 67 bases SINGLE_END ( "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"), NEXTERA_V1( "AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAG", "CTGTCTCTTATACACATCTCTGAGCGGGCTGGCAAGGCAGACCGNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"), NEXTERA_V2( "AATGATACGGCGACCACCGAGATCTACACNNNNNNNNTCGTCGGCAGCGTCAGATGTGTATAAGAGACAG", "CTGTCTCTTATACACATCTCCGAGCCCACGAGACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"), DUAL_INDEXED( "AATGATACGGCGACCACCGAGATCTNNNNNNNNACACTCTTTCCCTACACGACGCTCTTCCGATCT", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"), FLUIDIGM( "AATGATACGGCGACCACCGAGATCTACACTGACGACATGGTTCTACA", "AGACCAAGTCTCTGCTACCGTANNNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"), TRUSEQ_SMALLRNA( "AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGACGATC", "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG"), // This one is at the end of the list because its 3' is a subset of several of the 3's above. // There are unit tests that try all AdapterPairs, and this one should go at the end os // it is checked last. ALTERNATIVE_SINGLE_END("AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGACGATC", "TCGTATGCCGTCTTCTGCTTG"), ; final String fivePrime, threePrime, fivePrimeReadOrder; final byte[] fivePrimeBytes, threePrimeBytes, fivePrimeReadOrderBytes; private IlluminaAdapterPair(final String fivePrime, final String threePrime) { this.threePrime = threePrime; this.threePrimeBytes = StringUtil.stringToBytes(threePrime); this.fivePrime = fivePrime; this.fivePrimeReadOrder = SequenceUtil.reverseComplement(fivePrime); this.fivePrimeBytes = StringUtil.stringToBytes(fivePrime); this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder); } public String get3PrimeAdapter(){ return threePrime; } public String get5PrimeAdapter(){ return fivePrime; } public String get3PrimeAdapterInReadOrder(){ return threePrime; } public String get5PrimeAdapterInReadOrder() { return fivePrimeReadOrder; } public byte[] get3PrimeAdapterBytes() { return threePrimeBytes; } public byte[] get5PrimeAdapterBytes() { return fivePrimeBytes; } public byte[] get3PrimeAdapterBytesInReadOrder() { return threePrimeBytes; } public byte[] get5PrimeAdapterBytesInReadOrder() { return fivePrimeReadOrderBytes; } public String getName() { return this.name(); } } /** * Concatenates all the barcode sequences with BARCODE_DELIMITER * @param barcodes * @return A single string representation of all the barcodes */ public static String barcodeSeqsToString(final List<String> barcodes) { return barcodeSeqsToString(barcodes.toArray(new String[barcodes.size()])); } /** * Concatenates all the barcode sequences with BARCODE_DELIMITER * @param barcodes * @return A single string representation of all the barcodes */ public static String barcodeSeqsToString(final String barcodes[]) { final StringBuilder sb = new StringBuilder(); for (final String bc : barcodes) { if (sb.length() > 0) sb.append(BARCODE_DELIMITER); sb.append(bc); } return sb.toString(); } /** * Concatenates all the barcode sequences with BARCODE_DELIMITER * @param barcodes * @return A single string representation of all the barcodes */ public static String barcodeSeqsToString(final byte barcodes[][]) { final String bcs[] = new String[barcodes.length]; for (int i = 0; i < barcodes.length; i++) { bcs[i] = StringUtil.bytesToString(barcodes[i]); } return barcodeSeqsToString(bcs); } }