package org.seqcode.genome.sequence; import java.util.Collection; import java.util.LinkedList; import java.util.Random; import org.seqcode.tools.sequence.UShuffle; /** * <code>SequenceUtils</code> provides a number of static methods for manipulating * DNA sequences stores as strings or char[]. * * @author <a href="mailto:arolfe@mit.edu">Alex Rolfe</a> */ public class SequenceUtils { /** * <code>complement</code> returns the complement of a nucleotide in the 2-bit representation. */ public static int complement(int i) { switch(i) { case 0: return 1; case 1: return 0; case 2: return 3; case 3: return 2; default: return -1; } } /** * <code>complement</code> returns the complement of a nucleotide in the character representation * (A,C,T,G,a,c,t,g). */ public static char complementChar(char c) { if (trans == null) { trans = new char['z']; trans['A'] = 'T'; trans['C'] = 'G'; trans['T'] = 'A'; trans['G'] = 'C'; trans['a'] = 'T'; trans['c'] = 'G'; trans['t'] = 'A'; trans['g'] = 'C'; trans['n'] = 'N'; trans['N'] = 'N'; } return trans[c]; } private static char[] trans; /** * <code>reverseComplement</code> mutates in the input array of characters * (A,C,T,G,a,c,t,g) to be the reverse complement. */ public static void reverseComplement(char[] array) { if (trans == null) { trans = new char['z']; trans['A'] = 'T'; trans['C'] = 'G'; trans['T'] = 'A'; trans['G'] = 'C'; trans['a'] = 't'; trans['c'] = 'g'; trans['t'] = 'a'; trans['g'] = 'c'; trans['N'] = 'N'; trans['n'] = 'n'; trans['X'] = 'X'; trans['x'] = 'x'; } int i; int end = array.length - 1; for (i = 0; i <= array.length / 2 && i < array.length; i++) { try { char first = array[i]; array[i] = trans[array[end - i]]; array[end-i] = trans[first]; } catch (ArrayIndexOutOfBoundsException ex) { ex.printStackTrace(); System.err.println("i=" + i); System.err.println("first = " + array[i]); System.err.println("other = " + array[end-i]); System.err.println("trans = " + trans[array[i]] + " and " + trans[array[end-i]]); } } } public static String reverseComplement(String str) { StringBuilder sb = new StringBuilder(); for(int i = str.length()-1; i>= 0; i--) { sb.append(complementChar(str.charAt(i))); } return sb.toString(); } public static byte[] reverseComplement(byte[] bases) { byte[] rc = new byte[bases.length]; int j=0; for(int i = bases.length-1; i>= 0; i--) { rc[j]=(byte)complementChar((char)bases[i]); j++; } return rc; } /** converts from 2-bit representation to character representation */ public static char int2char(int i) { switch(i) { case 0: return 'A'; case 1: return 'C'; case 2: return 'G'; case 3: return 'T'; } return 'n'; } /** converts from character representation to 2-bit representation */ public static int char2int(char c) { switch(c) { case 'a': case 'A': return 0; case 'c': case 'C': return 1; case 'g': case 'G': return 2; case 't': case 'T': return 3; } return -1; } /** maps a DNA sequence (ie [actgACTG]*) to * a Long representing that sequence in 2-bit per * base representation * * * we should time this and compare performance to the version * where you have * int charToInt[256] * charToInt['a'] = 0 * etc * and do lookups from that... */ public static long StringToLong(String a) { return StringToLong(a,0,a.length()); } /** convert k characters of String to a long, starting at position offset * * Throws a StringIndexOutOfBoundsException (or something) if there aren't * enough characters in the string * * Apparently Tim and Alex did this conversion in opposite order- Tim's had * string[0] as the lowest bits in the long and Alex had them as the highest. * I like my way better so that * 1) when you write out the long and the string, the letters are in the same order * 2) the strings and the longs sort the same order * */ public static long StringToLong(String a, int offset, int k) { long sum = (long)0; for(int i = 0; i < k; i++) { int val = char2int(a.charAt(i + offset)); sum = (sum << 2) + val; } return sum; } public static String LongToString(Long l, int length) { char[] output = new char[length]; while (length-- > 0) { output[length] = int2char((int)(l & 3)); l >>= 2; } return new String(output); } public static Collection<String> creatKMers(String original, int k) { LinkedList<String> kmers = new LinkedList<String>(); for(int i = 0; i <= original.length() - k; i++) { kmers.addLast(original.substring(i, i + k)); } return kmers; } private static cern.jet.random.engine.RandomEngine randomEngine; public static String generateRandomBases(int k){ if (randomEngine==null) randomEngine = new cern.jet.random.engine.MersenneTwister(); StringBuffer sb=new StringBuffer(); for (int i=0;i<k;i++){ int num = Math.abs(randomEngine.nextInt()) % 4; sb.append(int2char(num)); } return sb.toString(); } public static String generateRandomString(int k){ if (randomEngine==null) randomEngine = new cern.jet.random.engine.MersenneTwister(); char[] chars = new char[k]; for (int i=0;i<k;i++){ chars[i]=(char)(Math.abs(randomEngine.nextInt()) % 26 + 'A'); } return new String(chars); } /** * Single nucleotide shuffle * @param str * @param randObj * @return */ public static String shuffle(String str, Random randObj){ if (str.length()<=1) return str; int split=str.length()/2; String temp1=shuffle(str.substring(0,split), randObj); String temp2=shuffle(str.substring(split), randObj); if (randObj.nextDouble() > 0.5) return temp1 + temp2; else return temp2 + temp1; } /** * Di-nucleotide shuffle * @param str * @param randObj * @return */ public static String dinu_shuffle(String str, Random randObj){ if (str.length()<=1) return str; char[] result = new char[str.length()]; UShuffle sf = new UShuffle(); sf.set_randfunc(randObj); sf.shuffle(str.toCharArray(), result, str.length(), 2); return new String(result); } }