package com.ppfold.algo.extradata; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import java.math.*; import com.ppfold.algo.MatrixTools; /** * Static class to define the model for how to deal with SHAPE data as well as container for the actual data * * @author Z.Sukosd */ public class SHAPEData implements ExtraData { private int type = 0; //the percent error assumed in each individual SHAPE measurement //(this is used to define the interval to calculate probability integrals) private float[] data; //contains the actual measurement numbers (data) //Probabilities must always be given as P(data|unpaired) or P(data|paired). private float[] dataProbGivenPaired; //P(data|model), paired case private float[] dataProbGivenUnpaired; //P(data|model), unpaired case //Unpaired parameter fit private final float lamb = 0.681211f; private final float k_mixedpair = 0.895341f; private final float sigma_mixedpair = 0.0712f; private final float mu_mixedpair = 0.00173144f; public SHAPEData(){ } public int getType(){ return type; } public boolean isEmpty(int i) { return data[i]==-999; } public float getProbabilityGivenOuterPaired(int position1, int position2) { return getProbabilityGivenOuterPair(position1)*getProbabilityGivenOuterPair(position2); } public float getProbabilityGivenInnerPaired(int position1, int position2) { return getProbabilityGivenInnerPair(position1)*getProbabilityGivenInnerPair(position2); } public float getProbabilityGivenInnerPair(int n){ return dataProbGivenPaired[n]; } public float getProbabilityGivenOuterPair(int n){ return dataProbGivenPaired[n]; } public float getProbabilityGivenUnpaired(int n){ return dataProbGivenUnpaired[n]; } private float expPDF(float x, float lamb){ return (1/lamb) * (float)Math.exp(-x/lamb); } private float gevPDF(float x, float xi, float sigma, float mu){ float oneoverxi = 1f/xi; float oneoversigma = 1f/sigma; float xminusmu = x-mu; float z = xminusmu*oneoversigma; return oneoversigma*(float)Math.exp(-1*Math.pow(1+xi*z,-1*oneoverxi))* (float)Math.pow((1+xi*z),-1-oneoverxi); } private float calcUnpaired(float f) { //Calculates the numbers on the basis of distribution fits return expPDF(f,lamb); } private float calcPaired(float f) { // Calculates the numbers on the basis of distribution fits return gevPDF(f,k_mixedpair, sigma_mixedpair, mu_mixedpair); } public void importData(String filename, int sequencelength) throws Exception{ //read the SHAPE data BufferedInputStream stream = null; try { stream = new BufferedInputStream(new FileInputStream(filename)); } catch (FileNotFoundException e) { System.err.println("SHAPE sequence input file " + filename + " could not be read!"); throw new IOException(e); } readData_toStream(stream, sequencelength); } public void readData_toStream(BufferedInputStream stream, int sequencelength) throws Exception{ //the CLC import plugin uses the following line: //public ClcObject[] doImport(BufferedInputStream stream, String name, Activity activity) throws IOException, ParseException, PersistenceException { //the same stream will be passed here to ease the import process later. //create the data containers this.data = new float[sequencelength]; this.dataProbGivenPaired = new float[sequencelength]; this.dataProbGivenUnpaired = new float[sequencelength]; if(stream!=null){ String line = ""; try{ int l = stream.available(); byte[] bytes = new byte[l]; stream.read(bytes); stream.close(); String data_string = new String(bytes); String[] lines = data_string.split("\n"); int data_size = lines.length; int [] readdata_index = new int[data_size]; float [] readdata_data = new float[data_size]; // Create a pattern to match different kinds of separators Pattern p = Pattern.compile("[,\\s]+"); //DO NOT ignore first line... for(int i = 0; i<lines.length; i++){ line = lines[i]; String splitline[] = p.split(line.trim()); if(splitline.length == 2){ readdata_index[i] = Integer.valueOf(splitline[0])-1; //SHAPE data files are numbered from 1; Java numbers from 0 readdata_data[i] = Float.valueOf(splitline[1]); } } //initialize all data values to -999 for(int i = 0; i<this.data.length; i++){ data[i] = -999; } //set the appropriate data values to the read ones for(int i = 0; i<data_size; i++){ // data[readdata_index[i]] = readdata_data[i]; } //calculate probabilities calcProbabilities(); //normalize probabilities //normalizeProbabilities(); } catch(Exception e){ System.err.println("An exception occured while attempting to read or interpret the SHAPE sequence data. "); throw new Exception(e); } } else{ System.err.println("Input stream was null, SHAPE sequence data could not be loaded."); } } private void calcProbabilities(){ //System.out.println("Calculating probs"); int n = data.length; for(int i = 0; i<n; i++){ //For each data point, calculate the probability of data given paired/unpaired. if(data[i] == -999){ //ignore dataProbGivenPaired[i] = 1; dataProbGivenUnpaired[i] = 1; continue; } dataProbGivenPaired[i] = calcPaired(data[i]); dataProbGivenUnpaired[i] = calcUnpaired(data[i]); } //For plotting the distributions: //for (int i=0; i<100; i++){ // System.out.println(((float)i/100) + " " + calcPaired((float)i/100) + " " + calcUnpaired((float)i/100)); //} //System.exit(0); } public void transformToAlignment(String gappedseq) { int n = gappedseq.length(); float[] data_a = new float[n]; float[] dataProbGivenPair_a = new float[n]; float[] dataProbGivenUnpaired_a = new float[n]; int cnt = 0; //counts sequence positions for(int i = 0; i<n; i++){ //step alignment positions if(MatrixTools.isGap(gappedseq.charAt(i))){ //if there's a gap, set probabilities to 1 data_a[i] = -999; //no data for that column dataProbGivenPair_a[i] = 1; dataProbGivenUnpaired_a[i] = 1; } else{ data_a[i] = data[cnt]; dataProbGivenPair_a[i] = dataProbGivenPaired[cnt]; dataProbGivenUnpaired_a[i] = dataProbGivenUnpaired[cnt]; cnt++; } //Prevent division by zero by setting 0's to a very small finite number instead if(dataProbGivenPair_a[i]==0 && dataProbGivenUnpaired_a[i]==0){ dataProbGivenPair_a[i]=Float.MIN_VALUE; dataProbGivenUnpaired_a[i]=Float.MIN_VALUE; } //System.out.println("i=" + i + ", data = " + data_a[i] + ": pairing="+dataProbGivenPair_a[i] + // ", unpaired = " + dataProbGivenUnpaired_a[i]); } this.data = data_a; this.dataProbGivenPaired = dataProbGivenPair_a; this.dataProbGivenUnpaired = dataProbGivenUnpaired_a; } public void removeColumns(List<Integer> leftoutcolumns){ Iterator<Integer> iter = leftoutcolumns.iterator(); int leaveout = 0; int from = 0; int cnt = 0; //counts position in new thing float[] data_a = new float[this.data.length - leftoutcolumns.size()]; float[] dataProbGivenPair_a = new float[this.data.length - leftoutcolumns.size()]; float[] dataProbGivenUnpaired_a = new float[this.data.length - leftoutcolumns.size()]; while(iter.hasNext()){ leaveout = iter.next(); for(int i = from; i<leaveout; i++){ data_a[cnt] = this.data[i]; dataProbGivenPair_a[cnt] = dataProbGivenPaired[i]; dataProbGivenUnpaired_a[cnt] = dataProbGivenUnpaired[i]; cnt++; } from = leaveout+1; } //do the last part part for(int i = from; i<data.length; i++){ data_a[cnt] = this.data[i]; dataProbGivenPair_a[cnt] = dataProbGivenPaired[i]; dataProbGivenUnpaired_a[cnt] = dataProbGivenUnpaired[i]; cnt++; } this.data = data_a; this.dataProbGivenPaired = dataProbGivenPair_a; this.dataProbGivenUnpaired = dataProbGivenUnpaired_a; //System.out.println("Size of data after column removal: " + data.length); } private void normalizeProbabilities(){ System.out.println("Normalizing probs"); int n = data.length; for(int i = 0; i<n; i++){ //For each data point, normalize the three probabilities //so they are all under 1. //This is so we don't get overflow errors //(it's the ratios that matter anyway) float sum = dataProbGivenPaired[i] + dataProbGivenUnpaired[i]; dataProbGivenPaired[i] = dataProbGivenPaired[i]/sum; dataProbGivenUnpaired[i] = dataProbGivenUnpaired[i]/sum; } } }