package com.ppfold.main; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.regex.Pattern; import javax.swing.JDialog; import com.ppfold.algo.MatrixTools; import com.ppfold.algo.Node; import com.ppfold.algo.Tree; import com.ppfold.algo.extradata.ExtraDataBars; /** * Executes SOME checks of the input data. It is not guaranteed to find all possible errors. * * @author Z.Sukosd */ public class CheckAllData { static boolean alignmentOK; static boolean distOK; static boolean dataOK; static boolean fileTooLong; static boolean nothingfailed; static String reason; public static final String SUCCESS_TEXT = "THERE WERE FAILED TESTS! Details below.\n\n"; public static final String FAILURE_TEXT = "EVERYTHING SEEMS TO BE OK! Details below.\n\n"; private static final String gapstring = "[-.]"; static String checkData() throws Exception { alignmentOK = true; distOK = true; dataOK = true; nothingfailed = true; String testresults = ""; reason = ""; //Check alignment testresults += "Alignment check result: \n" ; fileTooLong=false; Alignment ali = attemptToLoadAlignment(); boolean alignmentCanBeOpened = ali==null?false:true; testresults += "- File exists and can be opened... "; if(fileTooLong){ testresults += "FAILED (cause: file too large)\n"; } else{ testresults += alignmentCanBeOpened?"OK\n":"FAILED\n"; } if(alignmentCanBeOpened){ testresults += "- Sequences match names... "; testresults += namesMatchSequences(ali)?"OK\n":"FAILED\n"; testresults += "- No illegal symbols... "; String symbol = noIllegalSymbols(ali); testresults += symbol==null?"OK\n":"FAILED (cause: " + symbol + ")\n"; testresults += "- Gapped sequences have the same length... "; int nr = lengthOK(ali); if(nr>0){testresults += "FAILED (cause: sequence nr. " + (nr+1) + ")\n";} else if(nr==-1){testresults += "FAILED (cause: general format error)\n";} else{testresults += "OK\n";} testresults += "- Memory requirements met... "; long required = memRequired(ali); long available = Runtime.getRuntime().maxMemory()/(1024*1024); if(required<available){ testresults += "OK (required estimate: " + required + " MB, available: " + Runtime.getRuntime().maxMemory()/(1024*1024) + " MB)\n"; } else{ nothingfailed = false; testresults += "FAILED (required estimate: " + required + " MB, available: " + Runtime.getRuntime().maxMemory()/(1024*1024) + " MB)\n"; } } testresults += "\n"; testresults += "Tree check result: \n"; boolean treeCanBeOpened = false; Tree tree = null; if(PPfoldMain.treefilename==null){ testresults += "- Tree not provided \n"; } else{ tree = attemptToLoadTree(); treeCanBeOpened = tree==null?false:true; testresults += "- File exists and can be parsed... "; if(fileTooLong){ testresults += "FAILED (cause: file too large)\n"; } else{ testresults += treeCanBeOpened?"OK\n":"FAILED\n"; } } if(treeCanBeOpened&&alignmentOK){ testresults += "- Tree names match alignment names... "; String results = namesMatchTreeAlignment(ali,tree); testresults += results==null?"OK\n":"FAILED (cause: node named " + results +")\n"; } testresults += "\n"; testresults += "Output folder check result: \n"; boolean outputExists = attemptToAccessOutput(); testresults += "- Folder exists... "; testresults += outputExists?"OK\n":"FAILED\n"; if(outputExists){ boolean outputFolderWriteable = attemptToWriteOutput(); testresults += "- Output folder writeable... "; testresults += outputFolderWriteable?"OK\n":"FAILED\n"; } testresults += "\n"; testresults += "Data check result:\n"; if(PPfoldMain.datainfo.size()==0){ testresults += "- Data not provided. \n"; } else{ for(DataInfo data:PPfoldMain.datainfo){ dataOK = true; if(data.getType()==0){ boolean dataexists; ExtraDataBars dist; boolean distheader; boolean distprobs; int seqID; distOK = true; testresults += "- Data identifier: " + data.getiD() + "\n"; testresults += " - Distribution file can be opened: "; dist = attemptToLoadDataDist(data); testresults += dist!=null?"OK\n":"FAILED\n"; if(dist!=null){ testresults += " - Distribution file has the correct header: "; distheader = checkDistHeader(data); testresults += distheader?"OK\n":"FAILED\n"; testresults += " - Distribution probabilities add to 1 and limits are increasing: "; distprobs = checkDistProbs(dist); testresults += distprobs?"OK\n":("FAILED (cause: "+ reason + ")\n"); } dataexists = attemptToLoadData(data); testresults += " - Data file can be opened: "; testresults += dataexists?"OK\n":"FAILED\n"; int[] indices = null; if(dataexists){ indices = checkBarDataFormat(data); testresults += " - Data has right format: "; testresults += indices!=null?"OK\n":"FAILED\n"; } if(alignmentOK){ seqID = sequenceFound(ali, data); testresults += " - Sequence name found in alignment: "; testresults += seqID!=-1?"OK\n":"FAILED\n"; if(seqID!=-1&&dataOK&&dataexists){ testresults += " - Data not longer than sequence: "; String newseq = new String(ali.getSequences().get(seqID)); newseq.replaceAll(gapstring, ""); testresults += sequenceMatchesBarData(newseq.length(), indices)?"OK\n":"FAILED\n"; } } } else if(data.getType()==1){ int seqID; boolean dataexists = false; testresults += "- Data identifier: " + data.getiD() + "\n"; dataexists = attemptToLoadData(data); testresults += " - Data file can be opened: "; testresults += dataexists?"OK\n":"FAILED\n"; int[] indices = null; if(dataexists){ indices = checkProbDataFormat(data); testresults += " - Data has right format: "; testresults += indices!=null?"OK\n":"FAILED\n"; } if(alignmentOK){ seqID = sequenceFound(ali, data); testresults += " - Sequence name found in alignment: "; testresults += seqID!=-1?"OK\n":"FAILED\n"; if(seqID!=-1&&dataOK&&dataexists){ testresults += " - Data not longer than sequence: "; String newseq = new String(ali.getSequences().get(seqID)); newseq.replaceAll(gapstring, ""); testresults += sequenceMatchesProbData(newseq.length(), data)?"OK\n":"FAILED\n"; } } } else if(data.getType()==2){ int seqID = -1; boolean dataexists = false; testresults += "- Data identifier: " + data.getiD() + "\n"; if(data.getFileName()!=null){ dataexists = attemptToLoadData(data); testresults += " - Data file can be opened: "; testresults += dataexists?"OK\n":"FAILED\n"; } if(alignmentOK){ if(data.getSequenceName()!=null){ seqID = sequenceFound(ali, data); testresults += " - Sequence name found in alignment: "; testresults += seqID!=-1?"OK\n":"FAILED\n"; } if(seqID!=-1&&dataOK&&dataexists){ testresults += " - Data not longer than sequence and no forced pairs are too close: "; String newseq = new String(ali.getSequences().get(seqID)); newseq.replaceAll(gapstring, ""); testresults += sequenceMatchesConstraintData(newseq.length(), data)?"OK\n":"FAILED\n"; } if(data.getContactDistance()>0&&seqID!=-1){ testresults += " - Contact distance not longer than alignment: "; testresults += data.getContactDistance()<=ali.getSequences().get(seqID).length()?"OK\n":"FAILED\n"; } } testresults += " - NOTE: not checking for pseudoknotted constraints or conflicting data! (Please do that yourself)\n"; } else{ testresults += "- Data identifier: " + data.getiD() + "\n"; testresults += " - Unknown data type: " + data.getType() + ", check aborted."; } testresults += "\n"; } } if(PPfoldMain.seqexportname!=null){ testresults += "\n"; testresults += "Export check result:\n"; testresults += "- Sequence name found in alignment: " + (findSequence(ali.getNames(),PPfoldMain.seqexportname)?"OK\n":"FAILED\n"); } if(!nothingfailed){ testresults = SUCCESS_TEXT.concat(testresults); } else{ testresults = FAILURE_TEXT.concat(testresults); } return testresults; } private static long memRequired(Alignment ali) { //Calculating that 150 MB is required for 1542 nt long size = ali.getSequences().get(0).length(); long required = (long) ( Math.pow((double)(size)/9158d, 2) * 7270d); return required; } private static boolean findSequence(List<String> names, String seqexportname) { for(int i = 0; i<names.size(); i++){ if(seqexportname.trim().equals(names.get(i).trim())){ return true; } } nothingfailed=false; return false; } private static int[] checkProbDataFormat(DataInfo datain) { String filename = datain.getFileName(); BufferedInputStream stream = null; int [] readdata_index = null; try { stream = new BufferedInputStream(new FileInputStream(filename)); } catch (FileNotFoundException e) { dataOK=false; nothingfailed=false; return null; } if(stream!=null){ String line = ""; try{ int l = stream.available(); byte[] bytes = new byte[l]; stream.read(bytes); stream.close(); String data_string = new String(bytes); String[] lines = data_string.split("\n"); int data_size = lines.length; readdata_index = new int[data_size]; float [] readdata_data1 = new float[data_size]; float[] readdata_data2 = new float[data_size]; // Create a pattern to match different kinds of separators Pattern p = Pattern.compile("[,\\s]+"); //DO NOT ignore first line... for(int i = 0; i<lines.length; i++){ line = lines[i]; String splitline[] = p.split(line.trim()); if(splitline.length == 3){ readdata_index[i] = Integer.valueOf(splitline[0])-1; //SHAPE data files are numbered from 1; Java numbers from 0 readdata_data1[i] = Float.valueOf(splitline[1]); readdata_data2[i] = Float.valueOf(splitline[2]); } } } catch(Exception e){ dataOK=false; nothingfailed=false; return null; } } return readdata_index; } private static boolean attemptToLoadData(DataInfo data) { String filename = data.getFileName(); BufferedInputStream stream = null; try { stream = new BufferedInputStream(new FileInputStream(filename)); } catch (FileNotFoundException e) { dataOK=false; nothingfailed=false; return false; } return true; } private static boolean sequenceMatchesProbData(int length, DataInfo data) { // TODO Auto-generated method stub return false; } private static boolean sequenceMatchesConstraintData(int length, DataInfo data) { ArrayList<int[]> forceArray = new ArrayList<int[]>(); ArrayList<int[]> prohibitArray = new ArrayList<int[]>(); //read the data BufferedInputStream stream = null; try { stream = new BufferedInputStream(new FileInputStream(data.getFileName())); } catch (FileNotFoundException e) { dataOK=false; nothingfailed=false; return false; } if(stream!=null){ String line = ""; try{ int l = stream.available(); byte[] bytes = new byte[l]; stream.read(bytes); stream.close(); String data_string = new String(bytes); String[] lines = data_string.split("\n"); int data_size = lines.length; boolean [] readdata_letter = new boolean[data_size]; // Create a pattern to match different kinds of separators Pattern p = Pattern.compile("[,\\s]+"); //DO NOT ignore first line... for(int i = 0; i<lines.length; i++){ line = lines[i]; String splitline[] = p.split(line.trim()); if(splitline.length == 4){ //True if prohibiting, False if forcing readdata_letter[i] = (Character.toLowerCase(splitline[0].charAt(0))=='p')?true:false; int [] readdata = new int[3]; readdata[0] = Integer.valueOf(splitline[1])-1; //start of pairing //note data files are numbered from 1; Java numbers from 0 readdata[1] = Integer.valueOf(splitline[2])-1; //end of pairing readdata[2] = Integer.valueOf(splitline[3]); //length of pairing if(!readdata_letter[i]){ forceArray.add(readdata); } else{ prohibitArray.add(readdata); } } } for(int[] cns:forceArray){ if(cns[0]>length || cns[1]>length || Math.max(cns[1],cns[0]) - Math.min(cns[1], cns[0]) < 4){ dataOK=false; nothingfailed=false; return false; } } for(int[] cns:prohibitArray){ if(cns[0]>length || cns[1]>length){ dataOK=false; nothingfailed=false; return false; } } return true; } catch(Exception e){ dataOK=false; nothingfailed=false; return false; } } return false; } private static boolean sequenceMatchesBarData(int seqlength, int[] indices) { for(int i = 0; i<indices.length; i++){ if(indices[i]>seqlength){ dataOK=false; nothingfailed=false; return false; } } return true; } private static int[] checkBarDataFormat(DataInfo datain) { String filename = datain.getFileName(); int[] indices = null; float[] data = null; BufferedInputStream stream = null; try { stream = new BufferedInputStream(new FileInputStream(filename)); } catch (FileNotFoundException e) { dataOK=false; nothingfailed=false; return null; } if(stream!=null){ String line = ""; try{ int l = stream.available(); byte[] bytes = new byte[l]; stream.read(bytes); stream.close(); String data_string = new String(bytes); String[] lines = data_string.split("\n"); int data_size = lines.length; indices = new int[data_size]; data = new float[data_size]; // Create a pattern to match different kinds of separators Pattern p = Pattern.compile("[,\\s]+"); for(int i = 0; i<lines.length; i++){ line = lines[i]; String splitline[] = p.split(line.trim()); if(splitline.length == 2){ indices[i] = Integer.valueOf(splitline[0])-1; //SHAPE data files are numbered from 1; Java numbers from 0 data[i] = Float.valueOf(splitline[1]); } else{ dataOK=false; nothingfailed=false; return null; } } } catch(Exception e){ dataOK=false; nothingfailed=false; return null; } } return indices; } private static boolean attemptToLoadBarData(DataInfo data) { String filename = data.getFileName(); BufferedInputStream stream = null; try { stream = new BufferedInputStream(new FileInputStream(filename)); } catch (FileNotFoundException e) { dataOK=false; nothingfailed=false; return false; } return true; } private static boolean checkDistProbs(ExtraDataBars dist) { float [] distUnpaired = dist.getDistributionUnpaired(); float [] distPaired = dist.getDistributionPaired(); float [] limits = dist.getDistributionLimits(); if(!(limits.length==distPaired.length&&limits.length==distUnpaired.length)){ reason = "Format error"; System.out.println(limits.length); System.out.println(distPaired.length); System.out.println(distUnpaired.length); distOK=false; nothingfailed=false; return false; } for(int i = 1; i<limits.length; i++){ if(limits[i]<=limits[i-1]){ reason = "Limits are not increasing between " + limits[i-1] + " and " + limits[i]; distOK=false; nothingfailed=false; return false; } } float sum = 0; for(int i = 0; i<limits.length; i++){ if(distUnpaired[i]>=0){ sum += distUnpaired[i]; } else{ reason = "Negative probability " + distPaired[i]; distOK=false; nothingfailed=false; return false; } } if(Math.abs(1f-sum)>0.00001){ reason = "Unpaired probabilities sum to " + sum; distOK=false; nothingfailed=false; return false; } sum = 0; for(int i = 0; i<limits.length; i++){ if(distPaired[i]>=0){ sum += distPaired[i]; } else{ reason = "Negative probability " + distPaired[i]; distOK=false; nothingfailed=false; return false; } } if(Math.abs(1f-sum)>0.00001){ reason = "Paired probabilities sum to " + sum; distOK=false; nothingfailed=false; return false; } return true; } private static boolean checkDistHeader(DataInfo data) { //read the SHAPE data BufferedInputStream stream; if(data.getDistFileName().equals(PPfoldMain.defaultDataDistfile)){ stream = new BufferedInputStream(Thread.currentThread(). getContextClassLoader().getResourceAsStream(PPfoldMain.defaultDataDistfile)); } else{ try { stream = new BufferedInputStream(new FileInputStream(data.getDistFileName())); } catch (FileNotFoundException e) { nothingfailed = false; distOK = false; return false; } } try{ int l = stream.available(); byte[] bytes = new byte[l]; stream.read(bytes); stream.close(); String data_string = new String(bytes); String[] lines = data_string.split("\n"); String line = lines[0]; //First line String[] splitline = line.split("\\s+"); if(!( splitline[0].startsWith("lower_bound") && splitline[1].startsWith("P_density_paired") && splitline[2].startsWith("P_density_unpaired"))){ nothingfailed = false; distOK = false; return false; } else{return true;} } catch(Exception e){ nothingfailed = false; distOK = false; return false; } } private static int sequenceFound(Alignment ali, DataInfo data) { Integer seqID = -1; for(int i = 0; i<ali.getNames().size(); i++){ if(data.getSequenceName().trim().equals(ali.getNames().get(i).trim())){ seqID = i; break; } } if(seqID==-1){dataOK=false; nothingfailed=false;return seqID;} else{return seqID;} } private static boolean attemptToAccessOutput() { if(PPfoldMain.outputdir==null){ nothingfailed=false; return false; } File file=new File(PPfoldMain.outputdir); if(!file.exists()){nothingfailed=false;} return file.exists(); } private static boolean attemptToWriteOutput() { File sample = null; if(PPfoldMain.outputdir!=null){ sample = new File(PPfoldMain.outputdir,"tmp"); } else{nothingfailed=false;return false;} try{ sample.createNewFile(); sample.delete(); return true; } catch(IOException e) { nothingfailed=false; return false; } } private static String namesMatchTreeAlignment(Alignment ali, Tree tree) { for (int i = 0; i < ali.getNames().size(); i++) { // finds the node corresponding to the rownumber // rownumber = corresponds to sequences. Node node = tree.findSlowlyNodeWithName(ali.getNames().get(i)); if (node == null) { nothingfailed=false; return ali.getNames().get(i); } } nothingfailed=false; return null; } private static Tree attemptToLoadTree() { try{ File file=new File(PPfoldMain.treefilename); if(!file.exists()){nothingfailed=false;return null;} else if(file.length()>1048576){ nothingfailed=false; fileTooLong=true; return null; } Tree tree = NewickReader.readNewick(PPfoldMain.treefilename); return tree; } catch(Exception e){nothingfailed=false;return null;} } private static int lengthOK(Alignment ali) { try{ int size = ali.getSequences().get(0).length(); for(int i = 0; i<ali.getSequences().size();i++){ if(ali.getSequences().get(i).length()!=size){ alignmentOK=false;nothingfailed=false; return i; } } return 0; } catch(Exception e){ return -1; } } private static boolean namesMatchSequences(Alignment ali) { boolean val = (ali.getNames().size()==ali.getSequences().size())&&(ali.getNames().size()!=0); if(!val){alignmentOK=false;nothingfailed=false;} return val; } private static String noIllegalSymbols(Alignment ali) { for(int i = 0; i<ali.getSequences().size();i++){ for(int j=0; j<ali.getSequences().get(i).length(); j++){ char thischar = ali.getSequences().get(i).charAt(j); thischar = Character.toLowerCase(thischar); if (thischar != 'a' && thischar != 'u' && thischar != 't' && thischar != 'g' && thischar != 'c' && thischar != 'r' && thischar != 'y' && thischar != 's' && thischar != 'w' && thischar != 'k' && thischar != 'm' && thischar != 'b' && thischar != 'd' && thischar != 'h' && thischar != 'v' && thischar != 'n' && !MatrixTools.isGap(thischar)){ alignmentOK=false;nothingfailed=false; return String.valueOf(thischar); } } } return null; } private static Alignment attemptToLoadAlignment() { try{ File file=new File(PPfoldMain.alignmentfilename); if(!file.exists()){nothingfailed=false;alignmentOK=false;return null;} else if(file.length()>1048576){ alignmentOK=false; nothingfailed=false; fileTooLong=true; return null; } Alignment align = AlignmentReader.readAlignment(PPfoldMain.alignmentfilename); return align; } catch(Exception e){alignmentOK=false;nothingfailed=false;return null;} } private static ExtraDataBars attemptToLoadDataDist(DataInfo data) { try{ BufferedInputStream shapeDistReader; if(data.getDistFileName().equals(PPfoldMain.defaultDataDistfile)){ shapeDistReader = new BufferedInputStream(Thread.currentThread(). getContextClassLoader().getResourceAsStream(PPfoldMain.defaultDataDistfile)); } else{ shapeDistReader = new BufferedInputStream(new FileInputStream(data.getDistFileName())); } ExtraDataBars sequenceData = ExtraDataBars.readDistTable_toStream(shapeDistReader); return sequenceData; } catch(Exception e){distOK=false;nothingfailed=false;return null;} } }