HairpinCounter.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package org.twentyn.proteintodna;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * * Spacing between arms of the hairpin can be anywhere from 4 to 9, and are
 * considered to be of equal weight
 * 
 * * a match of 2bp is 100x, 3bp is 1000x, etc., so the length of the match
 * of the arms is log-scale dependence
 * 
 * * G:C is 3 HBonds; A:T is 2 HBonds; G:U is 2 HBonds; these are scored linearly
 */
public class HairpinCounter {
    public static void main(String[] args) throws Exception {
        HairpinCounter hc = new HairpinCounter();
        double d = hc.score("AACCCCCAAAAAAAAAGGGGGAAAAAAAAAAAAA");
        System.out.println("score: " + d);
    }

    public double score(String seq) throws Exception {
        seq = seq.toUpperCase();
        char[] seqRevC = SequenceUtils.reverseComplement(seq).toCharArray();
        char[] s = seq.toCharArray();
        int len = seq.length();
        double out = 0.0;
        //For each spacer length between 4 and 9
        for(int spaces = 4; spaces <= 9; spaces++) {
            //scan through the sequence and test each potential hairpin
            for(int i=0; i<len-spaces-12; i++) {
                // String hairpin = seq.substring(i, i+spaces+12);
                int hbonds = countHbonds(s, i, i+spaces+12, seqRevC, len);
                // int hbondsInefficient = countHbonds(hairpin);
                // if (hbondsInefficient != hbonds)
                //   throw new Exception("not equal: " + hbonds + " <> " + hbondsInefficient);
                out+= Math.pow(2, hbonds);
            }
            
        }
        
        return out;
    }
    
    private int countHbonds(String hairpin) {
        String prefix = hairpin.substring(0,6);
        String suffix = hairpin.substring(hairpin.length()-6);
        String prerev = SequenceUtils.reverseComplement(prefix);
        
        //See how many out of the six being examined match
        int matchlength = 0;
        for(int i=0; i<6; i++) {
            if(prerev.charAt(i)==suffix.charAt(i)) {
                matchlength = i;
            } else {
                break;
            }
        }
        
        if(matchlength <3) {
            return 0;
        }
        
        //For the ones that match, score them for A, T, C, G
        int hbonds = 0;
        for(int i=0; i<matchlength; i++) {
            char achar = prerev.charAt(i);
            if(achar == 'C' || achar == 'G') {
                hbonds+=3;
            }
            if(achar == 'A' || achar == 'T') {
                hbonds+=2;
            }
        }
        
        return hbonds;
    }

    private int countHbonds(char[] seq, int startInc, int endExcl, char[] revC, int len) {

        // This code is an optimized version of the code that used to take in a String `hairpin`
        //    where the `hairpin` was a substring of an origin `seq`
        // Instead now the caller just passing in the origin `seq` and `startInc` and `endExcl`
        //    indices into `seq`. That way we do not recreate small substrings over and over.
        // What that means then is that we have to do some indexing to locate the right `prefix`
        //    and `suffix` that the original code needed. They are now translated as follows:
        //
        // *** prefix = hairpin.substring(0,6)
        //      prefix now goes from `startInc` 6 chars ahead, i.e., seq [startInc, startInc + 6]
        // *** suffix = hairpin.substring(hairpin.length()-6)
        //      suffix goes from end index `endExcl` and 6 chars before, i.e., seq [endExcl - 6, endExcl]
        // *** revComplement(prefix) = SequenceUtils.reverseComplement(prefix)
        //      rev complement is a little funky. we pass in the reverse complement of `seq` as `revC`
        //      but now to locate the reverse complement of `hairpin` we have to subtract its indices from
        //      `len-1`, and then since we will be reading it in reverse (and always 6 chars wide),
        //      the index of `i` will be at `len - 1 - (startInc + 5 - i)`! Apologies, but that is what it is!
        //      revComplement(prefix)[i] == revC[len-1 - (startInc+5-i)]

        int suffixStart = endExcl - 6;
        //See how many out of the six being examined match
        int matchlength = 0;
        for(int i=0; i<6; i++) {
            // now we need to check `suffix.charAt(i) == revComplement(prefix).charAt(i) == suffix.charAt(i)`
            // suffix.chatAt(i) is index `suffixStart + i`
            // revComplement(prefix).charAt(i) is more complicated, but as explained above.
            char revChar = revC[len - 1 - startInc - 5 + i];
            if (seq[suffixStart + i] == revChar) {
                matchlength = i;
            } else {
                break;
            }
        }

        if(matchlength <3) {
            return 0;
        }

        int hbonds = 0;
        //For the ones that match, score them for A, T, C, G
        for(int i=0; i<matchlength; i++) {
            // compute metrics on revComplement(prefix).charAt(i); get that char using indexing as above
            char achar = revC[len - 1 - startInc - 5 + i];
            if(achar == 'C' || achar == 'G') {
                hbonds+=3;
            }
            if(achar == 'A' || achar == 'T') {
                hbonds+=2;
            }
        }

        return hbonds;
    }

}