package com.ppfold.algo.extradata; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import com.ppfold.algo.MatrixTools; /** * Static class to define the model for how to deal with hard constraints. * * Input as in Mfold: * * force bases i,i+1,...,i+k-1 to be double stranded by entering: * F i 0 k on 1 line in the constraint box. * * force consecutive base pairs i.j,i+1.j-1, ...,i+k-1.j-k+1 by entering: * F i j k on 1 line in the constraint box. * * force bases i,i+1,...,i+k-1 to be single stranded by entering: * P i 0 k on 1 line in the constraint box. * * prohibit the consecutive base pairs i.j,i+1.j-1, ...,i+k-1.j-k+1 by entering: * P i j k on 1 line in the constraint box. * * * Types of constraints in data matrix match those in GTfold. * * data[i][j] for i!=j can have one of the following values: * 0: Nothing is required about a pair i,j in constraints * 1: Force pairing between i and j. This implies also that * any non-nested pairings will be prohibited and any * other pairs i and j would be involved in are also prohibited. * 2: Prohibit pairing between i and j. (Nothing else is done) * * data[i][i] can have one of the following values: * 0: Nothing is known about position i at all * 3: Position i is forced to be single-stranded. This implies also that * any pairs with i will be prohibited. * 4: Position i is NOT single-stranded, it is forced to be in a pair. * 5: Position i is prohibited from pairing with at least one nucleotide. * 6: Position i is in a forced pair, but the pairing partner is not specified. * * Contact distance is implemented as prohibition. * * @author Z.Sukosd */ public class ForcedConstraints implements ExtraData { /*Types of constraints match those in GTfold, for consistent implementation. * * data[i][j] for i!=j can have one of the following values: * 0: Nothing is required about a pair i,j in constraints * 1: Force pairing between i and j. This implies also that * any non-nested pairings will be prohibited and any * other pairs i and j would be involved in are also prohibited. * 2: Prohibit pairing between i and j. (Nothing else is done) * * data[i][i] can have one of the following values: * 0: Nothing is known about position i at all * 3: Position i is forced to be single-stranded. This implies also that * any pairs with i will be prohibited. * 4: Position i is NOT single-stranded, it is forced to be in a pair. * 5: Position i is prohibited from pairing with at least one nucleotide. * 6: Position i is in a forced pair, but the pairing partner is not specified. */ /* */ int type = 2; int [][] data; //This contains the two-dimensional data, but only filled out in one instance. /* * These arrays store the raw data: each element is i,j,k, where i,j are the outermost * boundaries and k is the stem length. */ ArrayList<int[]> forceArray = new ArrayList<int[]>(); ArrayList<int[]> prohibitArray = new ArrayList<int[]>(); int contactDistance = -1; //Maximum contact distance; -1 if not set public ForcedConstraints(){} public int getType(){ return type; } public ForcedConstraints combinedForcedConstraints(int n, ArrayList<ForcedConstraints> constraints){ //This returns a specific instance of forced constraints with an empty matrix. //This is important because not every ForcedConstraints will have the data matrix //initialized, as it fills too much in memory. Only one (the "main") ForcedConstraints //will have the datamatrix initialized. ForcedConstraints fcons = new ForcedConstraints(); fcons.data = new int[n][n]; //Set all data to 0's for(int i = 0; i<n; i++){ for(int j = 0; j<n; j++){ fcons.data[i][j] =0; } } for(ForcedConstraints constraint:constraints){ //System.out.println(toString(forceArray)); //System.out.println(toString(prohibitArray)); fcons.forceArray.addAll(constraint.forceArray); fcons.prohibitArray.addAll(constraint.prohibitArray); //replace with minimum contact distance (to begin with it's -1) if(fcons.contactDistance==-1 || (fcons.contactDistance > -1 && constraint.contactDistance<fcons.contactDistance)){ fcons.contactDistance = constraint.contactDistance; } } if(fcons.contactDistance > -1){ System.out.println("Prevailing contact distance is: " + fcons.contactDistance); } //Now do the conversion to the table //Handle prohibiting and single-stranded for(int[] cons:fcons.prohibitArray){ //Smallest coordinate should come first if(cons[1] > -1 && cons[0] > cons[1]){ int tmp = cons[0]; cons[0] = cons[1]; cons[1] = tmp; } for(int k = 1; k <= cons[2]; k++){ int itarget = cons[0]+k-1; int jtarget = cons[1]-(k-1); if(cons[1] == -1){ //force single-stranded //(No check needed as 3 is stronger than 5) fcons.data[itarget][itarget] = 3; //prohibit all pairs with that base for(int i = 0; i<n; i++){ fcons.data[i][itarget] = 2; fcons.data[itarget][i] = 2; } } else{ //prohibit the pair fcons.data[itarget][jtarget] = 2; fcons.data[jtarget][itarget] = 2; //If it's not forced to pair or stay unpaired, mark that these two nucleotides are involved in a prohibited pair //Need to check as 5 is weaker than 4 or 3 if(fcons.data[itarget][jtarget]!=4&&fcons.data[itarget][jtarget]!=3){ fcons.data[itarget][itarget] = 5; fcons.data[jtarget][jtarget] = 5; } } } } //prohibit pairs too close if(fcons.contactDistance > -1){ for (int i = 0; i < n; i++) { for (int j = i+1; j < n; j++) { if (j - i > fcons.contactDistance) { fcons.data[i][j] = 2; fcons.data[j][i] = 2; //System.out.println(i + " and " + (i+j) + " prohibited (too close)"); } if(fcons.data[i][i]!=4&&fcons.data[i][i]!=3){ //If it's not forced to pair or stay unpaired, mark that it's prohibited from pairing. //Need to check as 5 is weaker than 4 or 3 fcons.data[i][i] = 5; } if(fcons.data[j][j]!=4&&fcons.data[j][j]!=3){ //If it's not forced to pair or stay unpaired, mark that it's prohibited from pairing. //Need to check as 5 is weaker than 4 or 3 fcons.data[j][j] = 5; } } } } //Handle forcing for(int[] cons:fcons.forceArray){ //Smallest coordinate should come first if(cons[1] > -1 && cons[0] > cons[1]){ int tmp = cons[0]; cons[0] = cons[1]; cons[1] = tmp; } for(int k = 1; k<=cons[2]; k++){ //force pairing int itarget = cons[0]+k-1; int jtarget = cons[1]-(k-1); if(cons[1] != -1){ //prohibit all pairs not-nested with respect to this one //(including the ones which include either of the bases) for(int i = 0; i<itarget; i++){ for(int j = itarget; j<=jtarget; j++){ fcons.data[i][j]=2; fcons.data[j][i]=2; } } for(int i = itarget+1; i<jtarget; i++){ for(int j = jtarget+1; j<n; j++){ fcons.data[i][j]=2; fcons.data[j][i]=2; } } //prohibit all remaining pairs with the pairing bases //(inside enclosed region) for(int i = itarget+1; i<jtarget; i++){ fcons.data[itarget][i] = 2; fcons.data[i][itarget] = 2; fcons.data[jtarget][i] = 2; fcons.data[i][jtarget] = 2; } //mark that these two nucleotides are involved in a constrained pair //to avoid searching in O(N^2) time //(No check needed as 4 is stronger than 5) fcons.data[itarget][itarget] = 4; fcons.data[jtarget][jtarget] = 4; //force pairing fcons.data[itarget][jtarget] = 1; fcons.data[jtarget][itarget] = 1; } else{ //force pairing, partner is not known //Check that it's not forced to pair (it shouldn't be forced to stay unpaired simultaneously) if(fcons.data[itarget][itarget]!=4){ fcons.data[itarget][itarget] = 6; } //prohibit all pairs with that base } } } //MatrixTools.print(fcons.data); return fcons; } public boolean isEmpty(int i) { //System.out.println(toString(forceArray)); //System.out.println(toString(prohibitArray)); for(int[] cons:prohibitArray){ if(cons[0]==i || cons[1]==i){ //System.out.println("data for col " + i + " not empty "); return false; } } for(int[] cons:forceArray){ if(cons[0]==i || cons[1]==i){ //System.out.println("data for col " + i + " not empty "); return false; } } //System.out.println("data for col " + i + " is empty "); return true; } public float getProbabilityGivenOuterPaired(int position1, int position2) { return canPair(position1, position2); } public float getProbabilityGivenInnerPaired(int position1, int position2) { return canPair(position1, position2); } public float getProbabilityGivenUnpaired(int n){ return canSs(n); } private float canPair(int i, int j){ //can't pair if i,j is prohibited, or i or j are forced to be single-stranded if(data[i][j]==2||data[i][i]==3||data[j][j]==3){return Float.MIN_VALUE;} //can't pair if i or j are forced to pair with something other than each other if((data[i][i]==4||data[j][j]==4)&&data[i][j]!=1){return Float.MIN_VALUE;} return 1f; } private float canSs(int i){ if(data[i][i]!=4&&data[i][i]!=6){return 1f;} else{return Float.MIN_VALUE;} } /** * Reads SHAPE distribution data from the file specified * @throws IOException */ public void importData(String filename, int sequencelength) throws Exception{ //read the data BufferedInputStream stream = null; try { stream = new BufferedInputStream(new FileInputStream(filename)); } catch (FileNotFoundException e) { System.err.println("Constraint sequence input file " + filename + " could not be read!"); throw new IOException(e); } readData_toStream(stream, sequencelength); } public void readData_toStream(BufferedInputStream stream, int sequencelength) throws Exception{ if(stream!=null){ String line = ""; try{ int l = stream.available(); byte[] bytes = new byte[l]; stream.read(bytes); stream.close(); String data_string = new String(bytes); String[] lines = data_string.split("\n"); int data_size = lines.length; boolean [] readdata_letter = new boolean[data_size]; // Create a pattern to match different kinds of separators Pattern p = Pattern.compile("[,\\s]+"); //DO NOT ignore first line... for(int i = 0; i<lines.length; i++){ line = lines[i]; String splitline[] = p.split(line.trim()); if(splitline.length == 4){ //True if prohibiting, False if forcing readdata_letter[i] = (Character.toLowerCase(splitline[0].charAt(0))=='p')?true:false; int [] readdata = new int[3]; readdata[0] = Integer.valueOf(splitline[1])-1; //start of pairing //note data files are numbered from 1; Java numbers from 0 readdata[1] = Integer.valueOf(splitline[2])-1; //end of pairing readdata[2] = Integer.valueOf(splitline[3]); //length of pairing if(!readdata_letter[i]){ forceArray.add(readdata); } else{ prohibitArray.add(readdata); } } } } catch(Exception e){ System.err.println("An exception occured while attempting to read or interpret the constraint data. "); throw new Exception(e); } } else{ System.err.println("BufferedStream was null, ignoring..."); //The stream is null if ONLY contact distance is given. (or maybe not even that) //In this case, do nothing, the object will just be empty. } } public void setContactDistance(int n){ contactDistance = n; } public int getContactDistance(){ return contactDistance; } public void transformToAlignment(String gappedseq) { //System.out.println("before transformtoalignment"); //System.out.println(toString(forceArray)); //System.out.println(toString(prohibitArray)); if(gappedseq == null){ //we just have a contact distance, no array data. return; } int n = gappedseq.length(); int[] nrofgaps = new int[n]; int gapcounter = 0; //counts gaps for(int i = 0; i<n; i++){ //step alignment positions if(MatrixTools.isGap(gappedseq.charAt(i))){ gapcounter++; } nrofgaps[i] = gapcounter; } //Transform with the gaps for(int[] cons:prohibitArray){ cons[0] = cons[0] + nrofgaps[cons[0]]; if(cons[1]>-1){ //this is to make sure P i 0 k doesn't get altered cons[1] = cons[1] + nrofgaps[cons[1]]; } } for(int[] cons:forceArray){ //this is to make sure F i 0 k doesn't get altered cons[0] = cons[0] + nrofgaps[cons[0]]; if(cons[1]>-1){ cons[1] = cons[1] + nrofgaps[cons[1]]; } } //System.out.println("after transformtoalignment"); //System.out.println(toString(forceArray)); //System.out.println(toString(prohibitArray)); } public void removeColumns(List<Integer> leftoutcolumns) throws Exception{ //System.out.println("before removing columns"); //System.out.println(toString(forceArray)); //System.out.println(toString(prohibitArray)); if(forceArray.size()==0&&prohibitArray.size()==0){ //In this case we don't actually have any data except maybe contact distance, so do nothing return; } @SuppressWarnings("unchecked") int maxpos = findMaxPos(forceArray,prohibitArray); int [] remColCnt = new int[maxpos+1]; Iterator<Integer> iter = leftoutcolumns.iterator(); int current = 0; int cnt = 0; while(iter.hasNext()){ int col = iter.next(); for(int i = current; i<=col; i++){ if(i == col){ cnt ++; //one more column removed remColCnt[i] = -1; //Remove this column current = i+1; break; } else{ remColCnt[i] = cnt; } } } //fill last bit for(int i = current; i<=maxpos; i++){ remColCnt[i] = cnt; } //Transform with the removed columns //In theory, there can't be any columns that are kept that have associated constraints. //If so, throw an exception... for(int[] cons:prohibitArray){ if(remColCnt[cons[0]]!=-1){ cons[0] = cons[0] - remColCnt[cons[0]]; } else{ throw new Exception("Column with data was trying to be removed!"); } if(cons[1]>-1){ if(remColCnt[cons[1]]!=-1){ cons[1] = cons[1] - remColCnt[cons[1]]; } else{ throw new Exception("Column with data was trying to be removed!"); } } } for(int[] cons:forceArray){ if(remColCnt[cons[0]]!=-1){ cons[0] = cons[0] - remColCnt[cons[0]]; } else{ throw new Exception("Column with data was trying to be removed!"); } if(cons[1]>-1){ if(remColCnt[cons[1]]!=-1){ cons[1] = cons[1] - remColCnt[cons[1]]; } else{ throw new Exception("Column with data was trying to be removed!"); } } } //System.out.println("after removing columns"); //System.out.println(toString(forceArray)); //System.out.println(toString(prohibitArray)); } private static String toString(ArrayList<int[]> inputlist){ String result = ""; for(int[] input:inputlist ){ result = result + "[ "; for(int i:input){ result = result +i + " "; } result = result + "] \n"; } return result; } private int findMaxPos(ArrayList<int[]> ... arrays){ int res = 0; for(ArrayList<int[]> array : arrays){ for(int[] cons : array){ if(res < cons[0]){ res = cons[0]; } if(res < cons[1]){ res = cons[1]; } } } return res; } }