/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.RNAPosToDNAPos; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.StringTokenizer; import edu.yu.einstein.genplay.dataStructure.enums.RNAToDNAResultType; import edu.yu.einstein.genplay.exception.ExceptionManager; /** * Creates a file with DNA coordinate from a RNA coordinate coverage file and an annotation file * @author Chirag Gorasia * @author Julien Lajugie * @version 0.1 */ public class GeneRelativeToGenomePosition { private static Map<String, List<Double>> startStopScore; // map to store the start stop and score private static Map<String, List<List<String>>> remainderLineFromCoverageFile; // map containing all lines of the coverage file private final RNAToDNAResultType outputFileType; // result type private final File coverageFile; // coverage file private final File annotationFile; // annotation file private final File outputFile; // output file /** * Creates an instance of {@link GeneRelativeToGenomePosition} * @param coverageFile coverage file * @param annotationFile annotation file * @param outputFile output file * @param outputFileType type of output */ public GeneRelativeToGenomePosition(File coverageFile, File annotationFile, File outputFile, RNAToDNAResultType outputFileType) { this.coverageFile = coverageFile; this.annotationFile = annotationFile; this.outputFile = outputFile; this.outputFileType = outputFileType; } /** * private method to merge the list of absolute lengths and list of start stop values from the coverage file * @param absolutebplengths * @param tempList * @param mergedList * @return mergedList */ private List<Integer> listsMerger (List<Integer> absolutebplengths, List<Double> tempList, List<Integer> mergedList) { mergedList.add(0); int i = 0; int j = 0; while ((i < absolutebplengths.size()) && (j < tempList.size())) { if (((j+1)%3) == 0) { j++; continue; } int relbp = absolutebplengths.get(i); int val = tempList.get(j).intValue(); if (relbp < val) { if (mergedList.contains(relbp) == false) { mergedList.add(relbp); } i++; } else { if (mergedList.contains(val) == false) { mergedList.add(val); } j++; } } return mergedList; } /** * private method to populate the startStopScore hash map * @throws IOException */ private void loadCoverageFileCompletelyOnHashMap() throws IOException { remainderLineFromCoverageFile = new HashMap<String, List<List<String>>>(); // coverage file BufferedReader buf = new BufferedReader(new FileReader(coverageFile)); String lineRead = buf.readLine(); StringTokenizer strtok; List<List<String>> tempList = new ArrayList<List<String>>(); while ((lineRead != null) && (lineRead.length() > 0)) { try { strtok = new StringTokenizer(lineRead, "\t\n"); String gene = strtok.nextToken(); // now skip the start stop and score and store the remainder of the line List<String> toBeAdded = new ArrayList<String>(); toBeAdded.add(strtok.nextToken()); toBeAdded.add(strtok.nextToken()); // UNCOMMENT THE NEXT LINE FOR JUNCTION FILES //strtok.nextToken(); toBeAdded.add(strtok.nextToken()); String remainingLine = ""; while (strtok.hasMoreTokens()) { remainingLine += strtok.nextToken()+"\t"; } toBeAdded.add(remainingLine); if (remainderLineFromCoverageFile.containsKey(gene) == true) { remainderLineFromCoverageFile.get(gene).add(toBeAdded); } else { tempList.add(toBeAdded); remainderLineFromCoverageFile.put(gene, tempList); } } catch (NoSuchElementException e) { lineRead = buf.readLine(); tempList = new ArrayList<List<String>>(); } lineRead = buf.readLine(); tempList = new ArrayList<List<String>>(); } buf.close(); } /** * private method to populate the startStopScore hashmap * @throws IOException */ private void loadCoverageFileOnHashMap() throws IOException { startStopScore = new HashMap<String, List<Double>>(); // coverage file BufferedReader buf = new BufferedReader(new FileReader(coverageFile)); String lineRead = buf.readLine(); StringTokenizer strtok; List<Double> tempList = new ArrayList<Double>(); while ((lineRead != null) && (lineRead.length() > 0)) { try { strtok = new StringTokenizer(lineRead, "\t\n"); String gene = strtok.nextToken(); double start = Double.parseDouble(strtok.nextToken()); double stop = Double.parseDouble(strtok.nextToken()); // uncomment the next line for junction files //strtok.nextToken(); double score = Double.parseDouble(strtok.nextToken()); if (startStopScore.containsKey(gene)) { startStopScore.get(gene).add(start); startStopScore.get(gene).add(stop); startStopScore.get(gene).add(score); } else { tempList.add(start); tempList.add(stop); tempList.add(score); startStopScore.put(gene, tempList); } } catch (NoSuchElementException e) { lineRead = buf.readLine(); tempList = new ArrayList<Double>(); } lineRead = buf.readLine(); tempList = new ArrayList<Double>(); } buf.close(); } /** * private method to populate and sort an intermediate array having the start stop and scores * @param value * @param intermediatestartstopscorearray * @return intermediatestartstopscorearray */ private double[][] populateAndSortIntermediateArray (List<Double> value, double[][] intermediatestartstopscorearray, List<List<String>> remainingString) { int j = 0; for (int i = 0; i < value.size(); i+=3) { intermediatestartstopscorearray[j][0] = value.get(i); intermediatestartstopscorearray[j][1] = value.get(i+1); intermediatestartstopscorearray[j][2] = value.get(i+2); j++; } // sorting intermediatestartstopscorearray for (int i = 0; i < intermediatestartstopscorearray.length; i++) { for (j = i+1; j < intermediatestartstopscorearray.length; j++) { if (intermediatestartstopscorearray[i][0] > intermediatestartstopscorearray[j][0]) { double tempstart = intermediatestartstopscorearray[i][0]; double tempstop = intermediatestartstopscorearray[i][1]; double tempscore = intermediatestartstopscorearray[i][2]; intermediatestartstopscorearray[i][0] = intermediatestartstopscorearray[j][0]; intermediatestartstopscorearray[i][1] = intermediatestartstopscorearray[j][1]; intermediatestartstopscorearray[i][2] = intermediatestartstopscorearray[j][2]; intermediatestartstopscorearray[j][0] = tempstart; intermediatestartstopscorearray[j][1] = tempstop; intermediatestartstopscorearray[j][2] = tempscore; if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { Collections.swap(remainingString, i, j); } } } } return intermediatestartstopscorearray; } /** * private method to populate an array out of the final list of start stop and score values * @param tempList * @param startstopscorearray * @return startstopscorearray */ private double[][] populateFinalArray (List<Double> tempList, double[][] startstopscorearray) { int j = 0; for (int i = 0; i < tempList.size(); i+=3) { startstopscorearray[j][0] = tempList.get(i); startstopscorearray[j][1] = tempList.get(i+1); startstopscorearray[j][2] = tempList.get(i+2); j++; } return startstopscorearray; } /** * private method to populate the missing range values in an arraylist * @param tempList * @param intermediatestartstopscorearray * @return tempList */ private List<Double> populateMissingValuesIntoList (List<Double> tempList, double[][] intermediatestartstopscorearray, List<List<String>> remainingString, List<List<String>> newRemainingString) { List<String> tempRemainingListElement = new ArrayList<String>(); for (int i = 0; i < intermediatestartstopscorearray.length; i++) { if ((i == 0) && (intermediatestartstopscorearray[i][0] > 0)) { tempList.add(0.0); tempList.add(intermediatestartstopscorearray[i][0]); tempList.add(0.0); if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { tempRemainingListElement.add("0.0"); tempRemainingListElement.add(new Double(intermediatestartstopscorearray[i][0]).toString()); tempRemainingListElement.add("0.0"); tempRemainingListElement.add(""); newRemainingString.add(tempRemainingListElement); tempRemainingListElement = new ArrayList<String>(); } } if ((i !=0) && (intermediatestartstopscorearray[i][0] > intermediatestartstopscorearray[i-1][1])) { tempList.add(intermediatestartstopscorearray[i-1][1]); tempList.add(intermediatestartstopscorearray[i][0]); tempList.add(0.0); if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { tempRemainingListElement.add(new Double(intermediatestartstopscorearray[i-1][1]).toString()); tempRemainingListElement.add(new Double(intermediatestartstopscorearray[i][0]).toString()); tempRemainingListElement.add("0.0"); tempRemainingListElement.add(""); newRemainingString.add(tempRemainingListElement); tempRemainingListElement = new ArrayList<String>(); } } tempList.add(intermediatestartstopscorearray[i][0]); tempList.add(intermediatestartstopscorearray[i][1]); tempList.add(intermediatestartstopscorearray[i][2]); if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { tempRemainingListElement.add(new Double(intermediatestartstopscorearray[i][0]).toString()); tempRemainingListElement.add(new Double(intermediatestartstopscorearray[i][1]).toString()); tempRemainingListElement.add(new Double(intermediatestartstopscorearray[i][2]).toString()); tempRemainingListElement.add(remainingString.get(i).get(3)); newRemainingString.add(tempRemainingListElement); tempRemainingListElement = new ArrayList<String>(); } } return tempList; } /** * private method to populate an array list with the repositioned values and weighted scores * @param startstopscorearray * @param mergedList * @param absolutebplengths * @param exonStarts * @param basePairs * @param finalStartStopScore * @return finalStartStopScore */ private List<Double> populateRepositionedArrayList(double[][] startstopscorearray, List<Integer> mergedList, List<Integer> absolutebplengths, List<Integer> exonStarts, List<Integer> basePairs, List<Double> finalStartStopScore, List<List<String>> newRemainingString, List<String> remainderStringForPrinting) { //long time_inside_function = System.currentTimeMillis(); int prevLength = 0; for (int i = 0; i < startstopscorearray.length; i++) { int startval = (int)startstopscorearray[i][0]; int stopval = (int)startstopscorearray[i][1]; double totalscore = startstopscorearray[i][2]; //int length = stopval - startval; int startindex = 0; int stopindex = 0; for (int j = 0; j < mergedList.size(); j++) { int mergedJ = mergedList.get(j); if (mergedJ == startval) { startindex = j; } if (mergedJ == stopval) { stopindex = j; break; } } for (int k = startindex; k < stopindex; k++) { // locate the index position in relativebplengths and then the corresponding actual position int index = 0; int mergedK = mergedList.get(k).intValue(); while (index < absolutebplengths.size()) { if (absolutebplengths.get(index) >= mergedK) { break; } index++; } int position = exonStarts.get(index); if ((absolutebplengths.get(index) == mergedK) && ((index+1) < exonStarts.size())) { position = exonStarts.get(index+1); prevLength += basePairs.get(index); } int mergedK1 = mergedList.get(k+1); finalStartStopScore.add(((double) position + mergedK) - prevLength); finalStartStopScore.add(((double) position + mergedK1) - prevLength); //finalStartStopScore.add(totalscore * (mergedK1 - mergedK) / length); finalStartStopScore.add(totalscore * (mergedK1 - mergedK)); // changed on 10/19/2010 as per Julien's logic if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { remainderStringForPrinting.add(newRemainingString.get(i).get(3)); } } } //System.out.println("time inside calculation method " + (System.currentTimeMillis() - time_inside_function)); return finalStartStopScore; } /** * private method to print the start stop and score values to the output file * @param chromosome * @param finalStartStopScore * @param bufWriter * @throws IOException */ private void printOutputWithExtraFieldsToFile (String key, String chromosome, int chrStart, List<Double> finalStartStopScore, List<String> remainderStringForPrinting, BufferedWriter bufWriter) throws IOException { for (int i = 0, j = 0; i < finalStartStopScore.size(); i+=3, j++) { int finalstart = finalStartStopScore.get(i).intValue(); int finalstop = finalStartStopScore.get(i+1).intValue(); double finalscore = finalStartStopScore.get(i+2); if (finalscore > 0.0) { bufWriter.write(chromosome + "\t" + finalstart + "\t" + finalstop + "\t" + finalscore + "\t" + key + "\t" + remainderStringForPrinting.get(j) + "\n"); } } } /** * private method to print the start stop and score values to the output file * @param chromosome * @param finalStartStopScore * @param bufWriter * @throws IOException */ private void printToOutputFile (String chromosome, List<Double> finalStartStopScore, BufferedWriter bufWriter) throws IOException { for (int i = 0; i < finalStartStopScore.size(); i+=3) { int finalstart = finalStartStopScore.get(i).intValue(); int finalstop = finalStartStopScore.get(i+1).intValue(); double finalscore = finalStartStopScore.get(i+2); if (finalscore > 0.0) { bufWriter.write(chromosome + "\t" + finalstart + "\t" + finalstop + "\t" + finalscore + "\n"); } } } /** * Method to get the Genome positions and scores */ public void rePosition() { try { loadCoverageFileOnHashMap(); if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { loadCoverageFileCompletelyOnHashMap(); } BufferedReader newbuf = new BufferedReader(new FileReader(annotationFile)); BufferedWriter bufWriter = new BufferedWriter(new FileWriter(outputFile)); String lineReadFromFile2 = newbuf.readLine(); StringTokenizer newstrtok = new StringTokenizer(lineReadFromFile2,"\t\n"); while (lineReadFromFile2 != null) { String chrmomosome = ""; int chrStart = 0; String geneFromFile2 = ""; try { chrmomosome = newstrtok.nextToken(); chrStart = Integer.parseInt(newstrtok.nextToken()); geneFromFile2 = newstrtok.nextToken(); } catch (NoSuchElementException e) { lineReadFromFile2 = newbuf.readLine(); if (lineReadFromFile2 != null) { newstrtok = new StringTokenizer(lineReadFromFile2,"\t\n"); } } Iterator<String> iter = startStopScore.keySet().iterator(); while (iter.hasNext()) { String key = iter.next(); if (key.compareTo(geneFromFile2) != 0) { continue; } if (key.compareTo(geneFromFile2) == 0) { List<Double> value = startStopScore.get(key); List<List<String>> remainingString = null; if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { remainingString = remainderLineFromCoverageFile.get(key); } double[][] intermediatestartstopscorearray = new double[value.size()/3][3]; intermediatestartstopscorearray = populateAndSortIntermediateArray(value, intermediatestartstopscorearray, remainingString); // now copy sorted array to a list and while adding the array also add the zeros List<Double> tempList = new ArrayList<Double>(); List<List<String>> newRemainingString = null; if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { newRemainingString = new ArrayList<List<String>>(); } tempList = populateMissingValuesIntoList(tempList,intermediatestartstopscorearray, remainingString, newRemainingString); // now copy this list to the finalarray double[][] startstopscorearray = new double[tempList.size()/3][3]; startstopscorearray = populateFinalArray(tempList, startstopscorearray) ; // get to the exon positions and lengths in the file int i = 0; while (newstrtok.hasMoreTokens()) { if (i == 6) { break; } if (i == 1) { i++; continue; } newstrtok.nextToken(); i++; } // add the exon start positions and lengths to lists String basePairsAsStrings = newstrtok.nextToken(); String exonStartsAsStrings = newstrtok.nextToken(); List<Integer> basePairs = new ArrayList<Integer>(); List<Integer> exonStarts = new ArrayList<Integer>(); StringTokenizer basePairTok = new StringTokenizer(basePairsAsStrings,","); while (basePairTok.hasMoreElements()) { basePairs.add(Integer.parseInt(basePairTok.nextToken())); } StringTokenizer exonStartTok = new StringTokenizer(exonStartsAsStrings,","); while (exonStartTok.hasMoreElements()) { exonStarts.add(Integer.parseInt(exonStartTok.nextToken())+chrStart); } // get the absolute lengths List<Integer> absolutebplengths = new ArrayList<Integer>(); int sum = 0; for (i = 0; i < basePairs.size(); i++) { sum += basePairs.get(i); absolutebplengths.add(sum); } // Merge the two sorted lists List<Integer> mergedList = new ArrayList<Integer>(); mergedList = listsMerger(absolutebplengths, tempList, mergedList); // populate a list which contains the final start stop and score values List<Double> finalStartStopScore = new ArrayList<Double>(); List<String> remainderStringForPrinting = new ArrayList<String>(); finalStartStopScore = populateRepositionedArrayList(startstopscorearray, mergedList, absolutebplengths, exonStarts, basePairs, finalStartStopScore, newRemainingString, remainderStringForPrinting); // write the list to the output file if (outputFileType == RNAToDNAResultType.BGR) { printToOutputFile(chrmomosome, finalStartStopScore, bufWriter); } // write to output file including extra fields from the input file if (outputFileType == RNAToDNAResultType.BGR_WITH_EXTRA_FIELDS) { printOutputWithExtraFieldsToFile(key, chrmomosome, chrStart, finalStartStopScore, remainderStringForPrinting, bufWriter); } } } lineReadFromFile2 = newbuf.readLine(); if (lineReadFromFile2 != null) { newstrtok = new StringTokenizer(lineReadFromFile2,"\t\n"); } } newbuf.close(); bufWriter.close(); // We sort the output file if (outputFileType == RNAToDNAResultType.BGR) { sortRepositionedFile(); } }catch (IOException e) { ExceptionManager.getInstance().caughtException(e); } } /** * private method to sort the repositioned file * @throws IOException */ private void sortRepositionedFile() throws IOException { BufferedReader buf = new BufferedReader(new FileReader(outputFile)); List<FileDataLineForSorting> file = new ArrayList<FileDataLineForSorting>(); String lineRead = buf.readLine(); StringTokenizer strtok = new StringTokenizer(lineRead,"\t\n"); while (lineRead != null) { String chrName = strtok.nextToken(); int start = Integer.parseInt(strtok.nextToken()); int stop = Integer.parseInt(strtok.nextToken()); double score = Double.parseDouble(strtok.nextToken()); FileDataLineForSorting fileSorter = new FileDataLineForSorting(chrName, start, stop, score); file.add(fileSorter); lineRead = buf.readLine(); if (lineRead != null) { strtok = new StringTokenizer(lineRead,"\t\n"); } } buf.close(); Collections.sort(file); // write the sorted data to the file BufferedWriter bufWriter = new BufferedWriter(new FileWriter(outputFile)); Iterator<FileDataLineForSorting> iter = file.iterator(); while (iter.hasNext()) { FileDataLineForSorting fs = iter.next(); bufWriter.write(fs.getChromosomeName() + "\t" + fs.getStart() + "\t" + fs.getStop() + "\t" + fs.getScore() + "\n"); } bufWriter.close(); } }