/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package org.twentyn.proteintodna;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* * Spacing between arms of the hairpin can be anywhere from 4 to 9, and are
* considered to be of equal weight
*
* * a match of 2bp is 100x, 3bp is 1000x, etc., so the length of the match
* of the arms is log-scale dependence
*
* * G:C is 3 HBonds; A:T is 2 HBonds; G:U is 2 HBonds; these are scored linearly
*/
public class HairpinCounter {
public static void main(String[] args) throws Exception {
HairpinCounter hc = new HairpinCounter();
double d = hc.score("AACCCCCAAAAAAAAAGGGGGAAAAAAAAAAAAA");
System.out.println("score: " + d);
}
public double score(String seq) throws Exception {
seq = seq.toUpperCase();
char[] seqRevC = SequenceUtils.reverseComplement(seq).toCharArray();
char[] s = seq.toCharArray();
int len = seq.length();
double out = 0.0;
//For each spacer length between 4 and 9
for(int spaces = 4; spaces <= 9; spaces++) {
//scan through the sequence and test each potential hairpin
for(int i=0; i<len-spaces-12; i++) {
// String hairpin = seq.substring(i, i+spaces+12);
int hbonds = countHbonds(s, i, i+spaces+12, seqRevC, len);
// int hbondsInefficient = countHbonds(hairpin);
// if (hbondsInefficient != hbonds)
// throw new Exception("not equal: " + hbonds + " <> " + hbondsInefficient);
out+= Math.pow(2, hbonds);
}
}
return out;
}
private int countHbonds(String hairpin) {
String prefix = hairpin.substring(0,6);
String suffix = hairpin.substring(hairpin.length()-6);
String prerev = SequenceUtils.reverseComplement(prefix);
//See how many out of the six being examined match
int matchlength = 0;
for(int i=0; i<6; i++) {
if(prerev.charAt(i)==suffix.charAt(i)) {
matchlength = i;
} else {
break;
}
}
if(matchlength <3) {
return 0;
}
//For the ones that match, score them for A, T, C, G
int hbonds = 0;
for(int i=0; i<matchlength; i++) {
char achar = prerev.charAt(i);
if(achar == 'C' || achar == 'G') {
hbonds+=3;
}
if(achar == 'A' || achar == 'T') {
hbonds+=2;
}
}
return hbonds;
}
private int countHbonds(char[] seq, int startInc, int endExcl, char[] revC, int len) {
// This code is an optimized version of the code that used to take in a String `hairpin`
// where the `hairpin` was a substring of an origin `seq`
// Instead now the caller just passing in the origin `seq` and `startInc` and `endExcl`
// indices into `seq`. That way we do not recreate small substrings over and over.
// What that means then is that we have to do some indexing to locate the right `prefix`
// and `suffix` that the original code needed. They are now translated as follows:
//
// *** prefix = hairpin.substring(0,6)
// prefix now goes from `startInc` 6 chars ahead, i.e., seq [startInc, startInc + 6]
// *** suffix = hairpin.substring(hairpin.length()-6)
// suffix goes from end index `endExcl` and 6 chars before, i.e., seq [endExcl - 6, endExcl]
// *** revComplement(prefix) = SequenceUtils.reverseComplement(prefix)
// rev complement is a little funky. we pass in the reverse complement of `seq` as `revC`
// but now to locate the reverse complement of `hairpin` we have to subtract its indices from
// `len-1`, and then since we will be reading it in reverse (and always 6 chars wide),
// the index of `i` will be at `len - 1 - (startInc + 5 - i)`! Apologies, but that is what it is!
// revComplement(prefix)[i] == revC[len-1 - (startInc+5-i)]
int suffixStart = endExcl - 6;
//See how many out of the six being examined match
int matchlength = 0;
for(int i=0; i<6; i++) {
// now we need to check `suffix.charAt(i) == revComplement(prefix).charAt(i) == suffix.charAt(i)`
// suffix.chatAt(i) is index `suffixStart + i`
// revComplement(prefix).charAt(i) is more complicated, but as explained above.
char revChar = revC[len - 1 - startInc - 5 + i];
if (seq[suffixStart + i] == revChar) {
matchlength = i;
} else {
break;
}
}
if(matchlength <3) {
return 0;
}
int hbonds = 0;
//For the ones that match, score them for A, T, C, G
for(int i=0; i<matchlength; i++) {
// compute metrics on revComplement(prefix).charAt(i); get that char using indexing as above
char achar = revC[len - 1 - startInc - 5 + i];
if(achar == 'C' || achar == 'G') {
hbonds+=3;
}
if(achar == 'A' || achar == 'T') {
hbonds+=2;
}
}
return hbonds;
}
}