package statalign.distance; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import statalign.postprocess.utils.RNAFoldingTools; /** * * A class that calculates the distance and similarity score between two alignments * * @author Ingolfur * */ public class Distance { /** * Given a set of sequences and an AMA score, returns the corresponding distance. * @param sequences sequences (possibly having gaps) * @param amaScore similarity score between 0 and 1 * @return the distance between the alignments */ public static double amaScoreToDistance(ArrayList<String> sequences, double amaScore) { double lengthSum = 0; for(int i = 0 ; i < sequences.size() ; i++) { lengthSum += sequences.get(i).replaceAll("-", "").length(); } double ksub1 = sequences.size() - 1; return (1-amaScore)*ksub1*lengthSum; } /** * So we can use Strings instead of list * * @see #distance(ArrayList, ArrayList) * */ public static int distance(String [] A, String [] B){ ArrayList<String> arListA = new ArrayList <String> (Arrays.asList(A)); ArrayList<String> arListB = new ArrayList <String> (Arrays.asList(B)); return Distance.distance(arListA, arListB); } /** * Calculates the distance between two alignments, if the function * returns 0, the alignments are the same. Higher number means the * alignments are more distant. * * * @param A The first alignment * @param B The second alignment * @return Integer between 0 and infinity */ public static int distance(ArrayList<String> A, ArrayList<String> B){ ArrayList<String> cloneA = new ArrayList<String>(); ArrayList<String> cloneB = new ArrayList<String>(); for(String i : A){ cloneA.add(i); } for(String i : B){ cloneB.add(i); } sortSeq(cloneA,cloneB); String [] tempA = new String[2]; String [] tempB = new String[2]; int d = 0; int k = A.size(); //This says how many sequences the alignments have for(int i=0; i<k-1; ++i){ for(int j=i+1; j<k; ++j){ tempA[0] = A.get(i); tempA[1] = A.get(j); tempB[0] = B.get(i); tempB[1] = B.get(j); d += Distance.distanceBetweenTwoAlignments(tempA, tempB); } } return d; } private static void sortSeq(ArrayList<String> A, ArrayList<String> B){ List< Pair < String , Integer> > seq1 = new ArrayList< Pair < String , Integer> >(); List< Pair < String , Integer> > seq2 = new ArrayList< Pair < String , Integer> >(); ArrayList<String> newA = new ArrayList<String>(); ArrayList<String> newB = new ArrayList<String>(); for(int i = 0; i<A.size(); ++i){ seq1.add( new Pair<String,Integer > (A.get(i).replaceAll("-",""),i)); seq2.add( new Pair<String,Integer > (B.get(i).replaceAll("-",""),i)); } Collections.sort(seq1, new PairCompare()); Collections.sort(seq2, new PairCompare()); for(int i = 0; i<A.size(); ++i){ int index1 =seq1.get(i).getRight(); int index2 =seq2.get(i).getRight(); newA.add(A.get(index1)); newB.add(B.get(index2)); } for(int i = 0; i<A.size(); ++i){ A.set(i, newA.get(i)); B.set(i, newB.get(i)); } } private static int distanceBetweenTwoAlignments(String[] A, String[] B){ String seqA1 = A[0].replaceAll("-", ""); String seqA2 = A[1].replaceAll("-", ""); int homoSize = Distance.HomoUnionSize(A, B); int delandInsertSize = deletionUnionAndInsertionUnionSize(A,B); int dist = seqA1.length() + seqA2.length() - 2 * homoSize - delandInsertSize; return dist; } private static int HomoUnionSize(String[] A, String[] B) { ArrayList<Pair<Integer,Integer>> HomoForA = new ArrayList<Pair<Integer,Integer>>(); int seqIndex1 = 0; int seqIndex2 = 0; char seq1Char; char seq2Char; for(int i =0; i<A[0].length(); ++i ){ seq1Char = A[0].charAt(i); seq2Char = A[1].charAt(i); if(seq2Char != '-' && seq1Char != '-') { HomoForA.add( new Pair<Integer,Integer>(seqIndex1,seqIndex2)); } if(seq1Char != '-'){ seqIndex1++; } if(seq2Char != '-'){ seqIndex2++; } } ArrayList<Pair<Integer,Integer>> HomoForB = new ArrayList<Pair<Integer,Integer>>(); seqIndex1 = 0; seqIndex2 = 0; for(int i =0; i<B[0].length(); ++i ){ seq1Char = B[0].charAt(i); seq2Char = B[1].charAt(i); if(seq2Char != '-' && seq1Char != '-') { HomoForB.add( new Pair<Integer,Integer>(seqIndex1,seqIndex2)); } if(seq1Char != '-'){ seqIndex1++; } if(seq2Char != '-'){ seqIndex2++; } } ArrayList<Pair<Integer,Integer>> HomoUnion = new ArrayList<Pair<Integer,Integer>>(); for(int j=0;j<HomoForA.size();++j) { Pair<Integer,Integer> common = HomoForA.get(j); if(HomoForB.contains(common)){ HomoUnion.add(common); } } return HomoUnion.size(); } private static int deletionUnionAndInsertionUnionSize(String[] A, String[] B) { ArrayList<Integer> deletionForA = new ArrayList<Integer>(); ArrayList<Integer> insertionForA = new ArrayList<Integer>(); char seq1Char; char seq2Char; int seqIndex1 = 0; int seqIndex2 = 0; for(int i =0; i<A[0].length(); ++i ){ seq1Char = A[0].charAt(i); seq2Char = A[1].charAt(i); if(seq1Char == '-' && seq2Char == '-'){ continue; } if(seq1Char == '-'){ insertionForA.add(seqIndex2); } else{ seqIndex1++; } if(seq2Char == '-'){ deletionForA.add(seqIndex1-1); } else{ seqIndex2++; } } seqIndex1=0; seqIndex2=0; ArrayList<Integer> deletionForB = new ArrayList<Integer>(); ArrayList<Integer> insertionForB = new ArrayList<Integer>(); for(int i =0; i<B[0].length(); ++i ){ seq1Char = B[0].charAt(i); seq2Char = B[1].charAt(i); if(seq1Char == '-' && seq2Char == '-'){ continue; } if(seq2Char == '-'){ deletionForB.add(seqIndex1); } else{ seqIndex2++; } if(seq1Char == '-'){ insertionForB.add(seqIndex2-1); } else{ seqIndex1++; } } ArrayList<Integer> deletionUnion = new ArrayList<Integer>(); for(int j=0;j<deletionForA.size();++j) { int common = deletionForA.get(j); if(deletionForB.contains(common)){ deletionUnion.add(common); } } ArrayList<Integer> insertionUnion = new ArrayList<Integer>(); for(int j=0;j<insertionForA.size();++j) { int common = insertionForA.get(j); if(insertionForB.contains(common)){ insertionUnion.add(common); } } return deletionUnion.size() + insertionUnion.size(); } /** * * The same as {@link #AMA(ArrayList, ArrayList)} but here you can use arrays instead of lists * */ public static double AMA(String[] A, String[] B){ ArrayList<String> arListA = new ArrayList <String> (Arrays.asList(A)); ArrayList<String> arListB = new ArrayList <String> (Arrays.asList(B)); return Distance.AMA(arListA, arListB); } /** * Does the same as the {@link Distance#distance(ArrayList, ArrayList)} but outputs a more * meaningful number since it is between 0 and 1. 1 means we have the same alignments while 0 means they are as distance as possible. * * @param A The first alignment * @param B The second alignment * @return Number between 0 and 1 */ public static double AMA(ArrayList<String> A, ArrayList<String> B){ int sum = 0; for(int i = 0; i<A.size(); ++i){ sum += A.get(i).replaceAll("-","").length(); } return 1 - Distance.distance(A, B)/ (double)( (A.size()-1) *sum ); } /** * Overloading the sequenceSimilarityScore method so one can use arrays instead of lists */ public static double sequenceSimilarityScore(String [] A){ ArrayList<String> temp = new ArrayList<String>(Arrays.asList(A)); return sequenceSimilarityScore(temp); } /** * Calculates the similarity of the sequences. * * @param A List of strings where each string is a sequence () * @return A double between 0 and 1, 1 means all sequences are the same. 0 they are as different as possible */ public static double sequenceSimilarityScore(List<String> A){ double score = 0; int count = 0; int tempScore; int length; for(int i =0; i<A.size()-1; ++i){ for(int j =i+1; j<A.size(); ++j){ String seq1 = A.get(i); String seq2 = A.get(j); tempScore = 0; length = 0; count++; for(int k = 0; k<seq1.length(); ++k){ if (seq1.charAt(k) == '.' || seq2.charAt(k) == '.'){ continue; } else if(seq1.charAt(k) == seq2.charAt(k)){ tempScore++; length++; }else{ length++; } } score += tempScore / (double)length; } } return score /count; } /** * Used when determining the step rate in the automation of the MCMC. Gives you * a much better idea how the MCMC progress behaves that is, how the space looks like * on average. * * * @param allAlignments List of alignments * @return A list of average similarity between all possible pairs of alignments */ public static ArrayList<Double> spaceAMA(ArrayList<String[]> allAlignments){ ArrayList<Double> sum = new ArrayList<Double>(allAlignments.size()+1); //Initialising sum for(int i =0; i<allAlignments.size(); ++i){ sum.add(0.0); } for(int i = 0; i<allAlignments.size()-1; ++i){ for(int j = i+1; j<allAlignments.size(); ++j){ String[] first = allAlignments.get(i); String[] second = allAlignments.get(j); sum.set(j-i, sum.get(j-i)+Distance.AMA(first, second)); } } for(int i = 0; i<allAlignments.size(); ++i){ sum.set(i, sum.get(i)/(allAlignments.size() - i ) ); } return sum; } }