package org.seqcode.deepseq.experiments;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

import cern.jet.random.Poisson;
import cern.jet.random.engine.DRand;
import org.apache.commons.lang3.RandomStringUtils;
import org.seqcode.deepseq.ExtReadHit;
import org.seqcode.deepseq.HitPair;
import org.seqcode.deepseq.ReadHit;
import org.seqcode.deepseq.StrandedBaseCount;
import org.seqcode.deepseq.StrandedPair;
import org.seqcode.deepseq.hitloaders.HitLoader;
import org.seqcode.deepseq.stats.BackgroundCollection;
import org.seqcode.deepseq.stats.PoissonBackgroundModel;
import org.seqcode.genome.Genome;
import org.seqcode.genome.location.Region;
import org.seqcode.math.probability.NormalDistribution;
import org.seqcode.math.stats.StatUtil;

/**
 * HitCache acts as a cache for some or all alignment hits associated with a particular Sample.
 *
 * This class can cache either all alignment hits, or hits contained in a (rewritable) list of regions.
 * Which you choose to do in your application should be guided by the speed and memory trade-off.
 * If you choose the latter (cache each chromosome or sets of regions as you go), be careful to match your queries
 * with the cached regions. For example, if caching each chromosome as you go, you should group your queries
 * according to chromosome name.
 *
 * Note that even if you are not choosing to cache all alignment hits, the initialize method (called from the constructor)
 * will still load all hits to memory. This is unfortunately currently required in order to calculate accurate
 * total and unique hit counts given the application of the per-base limit scheme. Perhaps we can find
 * a work-around in the future.
 *
 * Hit alignments are loaded into two primitive-type 3D arrays -- fivePrimePos and fivePrimeCounts.
 * The fivePrimes entries for each chrom/strand will be distinct.
 * Multiple reads mapped to the same bp position will be stored as counts.
 * In each of the 3D arrays: <br>
 * - the first dimension corresponds to the chromosome that a hit belongs to (based on
 *   the mapping from a chromosome as a <tt>String</tt> to an integer via the
 *   <tt>chrom2ID</tt> map). <br>
 * - the second dimension corresponds to the strand: 0 for '+' (Watson), 1 for '-' (Crick). <br>
 * - the third dimension contains information for a hit (e.g. its five-prime position or count).
 *
 * This class also stores paired hits, if they exist for a given Sample.
 * All paired hits are indexed off the R1 reads. When you call getPairs(region) or similar methods, you
 * will only get back pairs that have an R1 read hit located in the region. Therefore, be careful with how you
 * interpret the numbers of paired hits in a given region.
 *
 * It may seem inefficient to store the pair information separately from the hits. However, note that the
 * single hits are stored as compressed hit location-weight pairs (i.e. multiple hits mapping to the same position
 * are treated as counts of the same hit). The single hit locations will also not match up completely with
 * the paired locations - only locations of hits in valid pairs are stored as pairs.
 * In summary, the single-end hit locations are related to, but distinct from, the hit pairs, and care should be
 * taken when comparing items from each type.
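 *
 * A minimal usage sketch (hedged: construction of the ExptConfig and HitLoaders is assumed to happen
 * elsewhere in the application, and the variable names below are illustrative only):
 * <pre>
 * ExptConfig econf = ...;                       // assumed to be configured elsewhere
 * Collection&lt;HitLoader&gt; loaders = ...;     // assumed to be populated elsewhere
 * HitCache cache = new HitCache(false, econf, loaders, -1, true, null); // cache everything in memory
 * Region query = new Region(cache.getGenome(), "1", 1000000, 1001000);  // illustrative coordinates
 * float hitWeight = cache.countHits(query);
 * List&lt;StrandedBaseCount&gt; bases = cache.getBases(query);
 * cache.close();                                // tidy up any local file caches
 * </pre>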
 *
 * This class combines functionality from DeepSeq and ReadCache in the old GSE setup.
 *
 * @author mahony
 */
public class HitCache {

	private Collection<HitLoader> loaders; //Source of reads
	private boolean cacheMemoryEntireGenome=false;
	private boolean cacheInLocalFiles=false; //This is set to !cacheMemoryEntireGenome for now, but there may be situations in the future where both are false
	private File localCacheDir = null;
	private String localCacheFileBase=null;
	private List<Region> cachedRegions = null;
	private Genome gen;
	private ExptConfig econfig;
	private int numChroms=0;
	protected double totalHits=0; //totalHits is the sum of alignment weights
	protected double totalHitsPos=0; //totalHitsPos is the sum of alignment weights on the plus strand
	protected double totalHitsNeg=0; //totalHitsNeg is the sum of alignment weights on the minus strand
	protected double uniqueHits=0; //count of unique mapped positions (just counts the number of bases with non-zero counts - does not treat non-uniquely mapped positions differently)
	protected double totalPairs=0; //count of the total number of paired hits
	protected double uniquePairs=0; //count of the total number of unique paired hits
	protected boolean loadPairs; //Load them if they exist
	protected boolean hasPairs; //Some pairs have been loaded
	protected BackgroundCollection perBaseBack=new BackgroundCollection();
	protected float maxReadsPerBP=-1;

	/**
	 * Five prime ends of the read hits. <br>
	 * First dimension represents the corresponding chromosome ID. <br>
	 * Second dimension represents the strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the coordinates of the hits
	 */
	private int[][][] fivePrimePos=null;
	/**
	 * Sum of read hit weights that corresponds to the 5' position. <br>
	 * First dimension represents the corresponding chromosome ID. <br>
	 * Second dimension represents the strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the number of hits at the corresponding start position. <br>
	 * Third dimension index is that of the corresponding hit location in fivePrimePos
	 */
	private float[][][] fivePrimeCounts=null;
	/**
	 * Five prime ends of the R1 read hits in paired hits. <br>
	 * First dimension represents the R1 read chromosome ID. <br>
	 * Second dimension represents the R1 read strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the coordinates of the R1 read hit <br>
	 */
	private int[][][] pairR1Pos=null;
	/**
	 * Five prime ends of the R2 read hits in paired hits. <br>
	 * First dimension represents the R1 read chromosome ID. <br>
	 * Second dimension represents the R1 read strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the coordinates of the R2 read hit <br>
	 * Third dimension index is that of the corresponding R1 read in pairR1Pos
	 */
	private int[][][] pairR2Pos=null;
	/**
	 * Chromosome IDs of the R2 read hits in paired hits. <br>
	 * First dimension represents the R1 read chromosome ID. <br>
	 * Second dimension represents the R1 read strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the chromosome IDs of the R2 read hits <br>
	 * Third dimension index is that of the corresponding R1 read in pairR1Pos
	 */
	private int[][][] pairR2Chrom=null;
	/**
	 * Strands of the R2 read hits in paired hits. <br>
	 * First dimension represents the R1 read chromosome ID. <br>
	 * Second dimension represents the R1 read strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the strands of the R2 read hit <br>
	 * Third dimension index is that of the corresponding R1 read in pairR1Pos
	 */
	private int[][][] pairR2Strand=null;
	/**
	 * Weights of the paired hits. <br>
	 * First dimension represents the R1 read chromosome ID. <br>
	 * Second dimension represents the R1 read strand. 0 for '+', 1 for '-' <br>
	 * Third dimension contains the weights <br>
	 * Third dimension index is that of the corresponding R1 read in pairR1Pos
	 */
	private float[][][] pairWeight=null;

	private HashMap<String, Integer> chrom2ID=new HashMap<String,Integer>();
	private HashMap<String, Integer> chrom2DBID=new HashMap<String,Integer>();
	private HashMap<Integer,String> id2Chrom=new HashMap<Integer,String>();
	private HashMap<Integer,Integer> id2DBID=new HashMap<Integer,Integer>();

	/**
	 * Constructor
	 * @param loadPairs : boolean flag to load the read pairs (if they exist)
	 * @param ec : ExptConfig
	 * @param hloaders : collection of HitLoaders that source the reads
	 * @param perBaseReadMax : per-base read limit (0 = Poisson/Gaussian window filter, -1 = global Poisson threshold, >0 = fixed cap)
	 * @param cacheEverything : boolean flag to cache all hits
	 * @param initialCacheRegions : list of regions to cache first (can be null)
	 */
	public HitCache(boolean loadPairs, ExptConfig ec, Collection<HitLoader> hloaders, float perBaseReadMax, boolean cacheEverything, List<Region> initialCacheRegions){
		econfig = ec;
		gen = econfig.getGenome();
		this.loaders = hloaders;
		maxReadsPerBP= perBaseReadMax;
		this.loadPairs = loadPairs;
		initialize(cacheEverything, initialCacheRegions);
	}

	//Accessors
	public double getHitCount(){return(totalHits);}
	public double getHitCountPos(){return(totalHitsPos);}
	public double getHitCountNeg(){return(totalHitsNeg);}
	public double getHitPositionCount(){return(uniqueHits);}
	public double getPairCount(){return(totalPairs);}
	public double getUniquePairCount(){return(uniquePairs);}
	public Genome getGenome(){return gen;}
	public void setGenome(Genome g){gen=g;}

	/**
	 * Initialize needs to be called by the constructor.
	 *
	 * If all hits are being cached, initialize sets up the primitive arrays.
	 * Unfortunately, even if all hits are not being cached we still need to
	 * set up the primitive arrays for now. This is so that the total and unique
	 * hit counts can be accurately calculated given the per-base limits.
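	 *
	 * Per-base filtering (if enabled in the ExptConfig) is applied here according to the maxReadsPerBP setting,
	 * as encoded further down in this method: 0 selects the Poisson/Gaussian-window filter, -1 selects a
	 * global Poisson-derived threshold, and any positive value is used as a fixed per-base cap.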
	 *
	 * @param cacheEverything : boolean flag to cache all hits (if false, local file caching is activated)
	 * @param initialCacheRegions : list of regions to cache first (can be null)
	 */
	private void initialize(boolean cacheEverything, List<Region> initialCacheRegions){
		cacheMemoryEntireGenome=cacheEverything;
		cacheInLocalFiles = !cacheMemoryEntireGenome;
		cachedRegions = initialCacheRegions;

		//These lists are temporary stores while collecting reads from all sources
		HashMap<String, ArrayList<Integer>[]> posList = new HashMap<String, ArrayList<Integer>[]>();
		HashMap<String, ArrayList<Float>[]> countsList = new HashMap<String, ArrayList<Float>[]>();
		HashMap<String, ArrayList<HitPair>[]> pairsList = new HashMap<String, ArrayList<HitPair>[]>();

		for(HitLoader currLoader : loaders){
			try{
				//Get all read hits (necessary here to correct per-base counts appropriately)
				currLoader.sourceAllHits();
				//Add the reads to the temporary stores
				for(String chr: currLoader.getFivePrimePositions().keySet()){
					if(!posList.containsKey(chr)){
						ArrayList<Integer>[] currIArrayList = new ArrayList[2];
						currIArrayList[0]=new ArrayList<Integer>();
						currIArrayList[1]=new ArrayList<Integer>();
						posList.put(chr, currIArrayList);
					}
					posList.get(chr)[0].addAll(currLoader.getFivePrimePositions().get(chr)[0]);
					posList.get(chr)[1].addAll(currLoader.getFivePrimePositions().get(chr)[1]);
				}
				for(String chr: currLoader.getFivePrimeCounts().keySet()){
					if(!countsList.containsKey(chr)){
						ArrayList<Float>[] currFArrayList = new ArrayList[2];
						currFArrayList[0]=new ArrayList<Float>();
						currFArrayList[1]=new ArrayList<Float>();
						countsList.put(chr, currFArrayList);
					}
					countsList.get(chr)[0].addAll(currLoader.getFivePrimeCounts().get(chr)[0]);
					countsList.get(chr)[1].addAll(currLoader.getFivePrimeCounts().get(chr)[1]);
				}
				//Add the pairs to the temporary stores (if requested & exist)
				//Also sort the pairs (required for telling uniques apart)
				if(loadPairs && currLoader.hasPairedReads()){
					hasPairs=true;
					for(String chr: currLoader.getPairs().keySet()){
						if(!pairsList.containsKey(chr)){
							ArrayList<HitPair>[] currLRPArrayList = new ArrayList[2];
							currLRPArrayList[0]=new ArrayList<HitPair>();
							currLRPArrayList[1]=new ArrayList<HitPair>();
							pairsList.put(chr, currLRPArrayList);
						}
						pairsList.get(chr)[0].addAll(currLoader.getPairs().get(chr)[0]);
						pairsList.get(chr)[1].addAll(currLoader.getPairs().get(chr)[1]);
						Collections.sort(pairsList.get(chr)[0]);
						Collections.sort(pairsList.get(chr)[1]);
					}
				}
				//Reset loader to free memory
				currLoader.resetLoader();
			}catch(OutOfMemoryError e){
				e.printStackTrace();
				System.err.println("Ran out of memory during hit loading; try re-running with increased -Xmx option.");
				System.exit(1);
			}
		}

		//null genome is estimated here if necessary
		if(gen==null)
			gen = estimateGenome(posList);

		//Make the primitive arrays
		populateArrays(posList, countsList, pairsList);
		updateTotalHits();
		//Initialize a per-base background model
		initializeBackground();

		//Enforce per-base read limits
		//maxReadsPerBP = 0 : poisson/gauss
		//maxReadsPerBP = -1 : global poisson
		//maxReadsPerBP > 0 : fixed
		if(econfig.doPerBaseFiltering()){
			if(econfig.doPoissonGaussWinPerBaseFiltering() || maxReadsPerBP==0){
				//global poisson/gauss model
				capPerBaseCountWithPoissonGaussianFilter(10e-3, 20);
			}else{
				if(maxReadsPerBP == -1)
					maxReadsPerBP = perBaseBack.getMaxThreshold('.');
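				//Cap per-base counts at either the fixed user-supplied threshold or the global Poisson-derived threshold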
capPerBaseCount(maxReadsPerBP); } updateTotalHits(); initializeBackground(); //Reinitialize given updated hit count (again - just the per-base background model) } //If you are not caching everything, reduce the assigned memory if(!cacheMemoryEntireGenome){ //Cache in local files if(cacheInLocalFiles) saveCacheLocally(); //Save a subset of regions from the current data structure if necessary if(cachedRegions==null) emptyArrays(); else subsetArrays(cachedRegions); } //Free memory for(String chr: posList.keySet()){ posList.get(chr)[0].clear(); posList.get(chr)[1].clear(); countsList.get(chr)[0].clear(); countsList.get(chr)[1].clear(); if(loadPairs && hasPairs && pairsList!=null){ pairsList.get(chr)[0].clear(); pairsList.get(chr)[1].clear(); } } posList.clear(); countsList.clear(); if(loadPairs && hasPairs && pairsList!=null) pairsList.clear(); System.gc(); } /** * Load all base counts in a region, regardless of strand. * If file caching is being used, it's more efficient to group calls to this method by chromosome. * @param r Region * @return List of StrandedBaseCounts */ public List<StrandedBaseCount> getBases(Region r) { List<StrandedBaseCount> bases = new ArrayList<StrandedBaseCount>(); bases.addAll(getStrandedBases(r,'+')); bases.addAll(getStrandedBases(r,'-')); return bases; } /** * Loads hits in the region. * If file caching is being used, it's more efficient to group calls to this method by chromosome. * Unfortunately, I think the easiest way to ensure thread-safety is to allow only one thread to call this at a time. * @param r Region * @return List of StrandedBaseCounts */ public synchronized List<StrandedBaseCount> getStrandedBases(Region r, char strand) { List<StrandedBaseCount> bases = new ArrayList<StrandedBaseCount>(); if(!regionIsCached(r)){ if(cacheInLocalFiles){ loadCachedChrom(r.getChrom()); }else{ System.err.println("HitCache: Queried region "+r.getLocationString()+" is not in cache and local file caching not available!"); System.exit(1); } } String chr = r.getChrom(); if(chrom2ID.containsKey(chr)){ int chrID = chrom2ID.get(chr); int j = (strand=='+') ? 0 : 1; if(fivePrimePos[chrID][j] != null){ int[] tempStarts = fivePrimePos[chrID][j]; if(tempStarts.length != 0) { int start_ind = Arrays.binarySearch(tempStarts, r.getStart()); int end_ind = Arrays.binarySearch(tempStarts, r.getEnd()); if( start_ind < 0 ) { start_ind = -start_ind - 1; } if( end_ind < 0 ) { end_ind = -end_ind - 1; } while (start_ind > 0 && tempStarts[start_ind - 1] >= r.getStart() ) { start_ind--; } while (end_ind < tempStarts.length && tempStarts[end_ind] <= r.getEnd()) { end_ind++; } for(int k = start_ind; k < end_ind; k++) { bases.add(new StrandedBaseCount(strand, tempStarts[k], fivePrimeCounts[chrID][j][k])); } } } } return bases; }//end of getStrandedBases method /** * Load all paired hits that have an R1 read in a region. * If file caching is being used, it's more efficient to group calls to this method by chromosome. * @param r Region * @return List of StrandedPair */ public List<StrandedPair> getPairs(Region r) { List<StrandedPair> pairs = new ArrayList<StrandedPair>(); pairs.addAll(getPairsOnStrand(r,'+')); pairs.addAll(getPairsOnStrand(r,'-')); return pairs; } /** * Loads paired hits that have an R1 read in the region and on the requested strand. * If file caching is being used, it's more efficient to group calls to this method by chromosome. * Unfortunately, I think the easiest way to ensure thread-safety is to allow only one thread to call this at a time. 
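 *
 * A hedged sketch of typical use (assumes the Sample was loaded with pairs; the variable names are illustrative):
 * <pre>
 * for (StrandedPair sp : cache.getPairsOnStrand(query, '+')) {
 *     // R1/R2 coordinates can be used, e.g., to estimate fragment sizes for same-chromosome, properly oriented pairs
 *     int approxFragSize = Math.abs(sp.getR2Coordinate() - sp.getR1Coordinate()) + 1;
 * }
 * </pre>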
* @param r Region * @return List of StrandedPair */ public synchronized List<StrandedPair> getPairsOnStrand(Region r, char strand) { List<StrandedPair> pairs = new ArrayList<StrandedPair>(); if(loadPairs && hasPairs && pairR1Pos!=null){ if(!regionIsCached(r)){ if(cacheInLocalFiles){ loadCachedChrom(r.getChrom()); }else{ System.err.println("HitCache: Queried region "+r.getLocationString()+" is not in cache and local file caching not available!"); System.exit(1); } } String chr = r.getChrom(); if(chrom2ID.containsKey(chr)){ int chrID = chrom2ID.get(chr); int chrDBID = chrom2DBID.get(chr); int j = (strand=='+') ? 0 : 1; if(pairR1Pos[chrID][j] != null){ int[] tempStarts = pairR1Pos[chrID][j]; if(tempStarts.length != 0) { int start_ind = Arrays.binarySearch(tempStarts, r.getStart()); int end_ind = Arrays.binarySearch(tempStarts, r.getEnd()); if( start_ind < 0 ) { start_ind = -start_ind - 1; } if( end_ind < 0 ) { end_ind = -end_ind - 1; } while (start_ind > 0 && tempStarts[start_ind - 1] >= r.getStart() ) { start_ind--; } while (end_ind < tempStarts.length && tempStarts[end_ind] <= r.getEnd()) { end_ind++; } for(int k = start_ind; k < end_ind; k++) { pairs.add(new StrandedPair(gen, chrDBID, tempStarts[k], strand, id2DBID.get(pairR2Chrom[chrID][j][k]), pairR2Pos[chrID][j][k], pairR2Strand[chrID][j][k]==0?'+':'-', pairWeight[chrID][j][k] )); } } } } } return pairs; } /** * Sum of all hit weights in a region * If file caching is being used, it's more efficient to group calls to this method by chromosome. * @param r Region * @return float */ public float countHits(Region r) { float count = 0; count += countStrandedBases(r, '+'); count += countStrandedBases(r, '-'); return count; } /** * Sum of hit weights in one strand of a region. * If file caching is being used, it's more efficient to group calls to this method by chromosome. * Unfortunately, I think the easiest way to ensure thread-safety is to allow only one thread to call this at a time. * @param r Region * @return float */ public synchronized float countStrandedBases(Region r, char strand) { float count = 0; if(!regionIsCached(r)){ if(cacheInLocalFiles){ loadCachedChrom(r.getChrom()); }else{ System.err.println("HitCache: Queried region "+r.getLocationString()+" is not in cache and local file caching not available!"); System.exit(1); } } String chr = r.getChrom(); if(chrom2ID.containsKey(chr)){ int chrID = chrom2ID.get(chr); int j = (strand=='+') ? 
0 : 1; if(fivePrimePos[chrID][j] != null){ int[] tempStarts = fivePrimePos[chrID][j]; if(tempStarts.length != 0) { int start_ind = Arrays.binarySearch(tempStarts, r.getStart()); int end_ind = Arrays.binarySearch(tempStarts, r.getEnd()); if( start_ind < 0 ) { start_ind = -start_ind - 1; } if( end_ind < 0 ) { end_ind = -end_ind - 1; } while (start_ind > 0 && tempStarts[start_ind - 1] >= r.getStart() ) { start_ind--; } while (end_ind < tempStarts.length && tempStarts[end_ind] <= r.getEnd()) { end_ind++; } for(int k = start_ind; k < end_ind; k++) { count += fivePrimeCounts[chrID][j][k]; } } } } return count; } public List<ExtReadHit> exportExtReadHits(Region r, int readLen, int startShift, int fivePrimeExt, int threePrimeExt){ List<ReadHit> readHits = exportReadHits(r,readLen); List<ExtReadHit> extReadHits = new ArrayList<ExtReadHit>(); for(ReadHit rh : readHits){ extReadHits.add(new ExtReadHit(gen, rh, startShift, fivePrimeExt, threePrimeExt)); } return extReadHits; } /** * Exports readhits in a given region * If file caching is being used, it's more efficient to group calls to this method by chromosome. */ public List<ReadHit> exportReadHits(Region r, int readLen){ List<ReadHit> reghits = new ArrayList<ReadHit>(); for(StrandedBaseCount sbc : getBases(r)){ int start = sbc.getStrand() == '+' ? sbc.getCoordinate() : sbc.getCoordinate()-readLen+1; int end = sbc.getStrand() == '+' ? sbc.getCoordinate()+readLen-1 : sbc.getCoordinate(); reghits.add(new ReadHit(r.getChrom(),start,end,sbc.getStrand(),sbc.getCount())); } return(reghits); } /** * Export all single-end hits to ReadHit objects * @param readLen * @return */ public List<ReadHit> exportReadHits(int readLen){ List<ReadHit> allhits = new ArrayList<ReadHit>(); for(String chr : chrom2ID.keySet()){ Region chrom = new Region(gen, chr, 1, gen.getChromLength(chr)); for(StrandedBaseCount sbc : getBases(chrom)){ int start = sbc.getStrand()=='+' ? sbc.getCoordinate() : sbc.getCoordinate()-readLen+1; int end = sbc.getStrand()=='+' ? sbc.getCoordinate()+readLen-1 : sbc.getCoordinate(); allhits.add(new ReadHit(chr, start, end, sbc.getStrand(), sbc.getCount())); } } return(allhits); } /** * Check that a given region is in the cache. * Should be called by all methods that query hits in this class * @param r * @return */ private boolean regionIsCached(Region r){ boolean regionCached = cacheMemoryEntireGenome ? true : false; if(!regionCached && cachedRegions!=null){ for(Region curr : cachedRegions){ if(curr.contains(r)){ regionCached=true; break; } } } return regionCached; } /** * Empty the position and counts arrays, but leave skeleton intact */ private void emptyArrays(){ //Position arrays for(int i = 0; i < fivePrimePos.length; i++) { // chr for(int j = 0; j < fivePrimePos[i].length; j++) { // strand fivePrimePos[i][j]=null; } } //Count arrays for(int i = 0; i < fivePrimeCounts.length; i++) { // chr for(int j = 0; j < fivePrimeCounts[i].length; j++) { // strand fivePrimeCounts[i][j]=null; } } //Pair arrays if(hasPairs && pairR1Pos!=null){ for(int i = 0; i < pairR1Pos.length; i++) { // chr for(int j = 0; j < pairR1Pos[i].length; j++) { // strand pairR1Pos[i][j]=null; pairR2Pos[i][j]=null; pairR2Chrom[i][j]=null; pairR2Strand[i][j]=null; pairWeight[i][j]=null; } } } System.gc(); } /** * Converts lists of Integers and matched Floats to arrays. * Sorts array elements by position. 
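 * Duplicate positions are then collapsed into single entries whose weights are summed.
 * For example (illustrative values): sorted positions {100, 100, 250} with weights {1.0, 1.0, 2.0}
 * become positions {100, 250} with counts {2.0, 2.0}.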
*/ private void populateArrays(HashMap<String, ArrayList<Integer>[]> posList, HashMap<String, ArrayList<Float>[]> countsList, HashMap<String, ArrayList<HitPair>[]> pairsList) { //Initialize chromosome name to id maps numChroms=0; for(String chr : gen.getChromList()){ chrom2ID.put(chr, numChroms); chrom2DBID.put(chr, gen.getChromID(chr)); id2DBID.put(numChroms, gen.getChromID(chr)); id2Chrom.put(numChroms, chr); numChroms++; } //Initialize the data structures fivePrimePos = new int[numChroms][2][]; fivePrimeCounts = new float[numChroms][2][]; if(hasPairs){ pairR1Pos = new int[numChroms][2][]; pairR2Pos = new int[numChroms][2][]; pairR2Chrom = new int[numChroms][2][]; pairR2Strand = new int[numChroms][2][]; pairWeight = new float[numChroms][2][]; } //Copy over the 5' position data for(String chr : gen.getChromList()){ if(posList.containsKey(chr)){ for(int j = 0; j < posList.get(chr).length; j++) fivePrimePos[chrom2ID.get(chr)][j] = list2int(posList.get(chr)[j]); }else{ fivePrimePos[chrom2ID.get(chr)][0]=null; fivePrimePos[chrom2ID.get(chr)][1]=null; } }//Copy over the count data for(String chr : gen.getChromList()){ if(countsList.containsKey(chr)){ for(int j = 0; j < countsList.get(chr).length; j++) fivePrimeCounts[chrom2ID.get(chr)][j] = list2float(countsList.get(chr)[j]); }else{ fivePrimeCounts[chrom2ID.get(chr)][0]=null; fivePrimeCounts[chrom2ID.get(chr)][1]=null; } } if(loadPairs && hasPairs){ //Copy over the paired data for(String chr : gen.getChromList()){ int c = chrom2ID.get(chr); if(pairsList.containsKey(chr)){ for(int j = 0; j < pairsList.get(chr).length; j++){ pairR1Pos[c][j] = new int[pairsList.get(chr)[j].size()]; pairR2Pos[c][j] = new int[pairsList.get(chr)[j].size()]; pairR2Chrom[c][j] = new int[pairsList.get(chr)[j].size()]; pairR2Strand[c][j] = new int[pairsList.get(chr)[j].size()]; pairWeight[c][j] = new float[pairsList.get(chr)[j].size()]; int p=0; for(HitPair lrp : pairsList.get(chr)[j]){ pairR1Pos[c][j][p] = lrp.r1Pos; pairR2Pos[c][j][p] = lrp.r2Pos; pairR2Chrom[c][j][p] = chrom2ID.get(lrp.r2Chr); pairR2Strand[c][j][p] = lrp.r2Strand; pairWeight[c][j][p] = lrp.pairWeight; p++; } } }else{ pairR1Pos[chrom2ID.get(chr)][0]=null; pairR1Pos[chrom2ID.get(chr)][1]=null; pairR2Pos[chrom2ID.get(chr)][0]=null; pairR2Pos[chrom2ID.get(chr)][1]=null; pairR2Chrom[chrom2ID.get(chr)][0]=null; pairR2Chrom[chrom2ID.get(chr)][1]=null; pairR2Strand[chrom2ID.get(chr)][0]=null; pairR2Strand[chrom2ID.get(chr)][1]=null; pairWeight[chrom2ID.get(chr)][0]=null; pairWeight[chrom2ID.get(chr)][1]=null; } } } //Sort the single-end arrays for(int i = 0; i < fivePrimePos.length; i++) { // chr for(int j = 0; j < fivePrimePos[i].length; j++) { // strand if(fivePrimePos[i][j]!=null && fivePrimeCounts[i][j]!=null){ int[] inds = StatUtil.findSort(fivePrimePos[i][j]); fivePrimeCounts[i][j] = StatUtil.permute(fivePrimeCounts[i][j], inds); } } } //Sort the paired-end arrays if(loadPairs && hasPairs){ for(int i = 0; i < pairR1Pos.length; i++) { // chr for(int j = 0; j < pairR1Pos[i].length; j++) { // strand if(pairR1Pos[i][j]!=null && pairR2Pos[i][j]!=null && pairR2Chrom[i][j]!=null && pairR2Strand[i][j]!=null){ int[] inds = StatUtil.findSort(pairR1Pos[i][j]); pairR2Pos[i][j] = StatUtil.permute(pairR2Pos[i][j], inds); pairR2Chrom[i][j] = StatUtil.permute(pairR2Chrom[i][j], inds); pairR2Strand[i][j] = StatUtil.permute(pairR2Strand[i][j], inds); pairWeight[i][j] = StatUtil.permute(pairWeight[i][j], inds); } } } } //Collapse duplicate positions (single-end arrays) for(int i = 0; i < fivePrimePos.length; i++){ 
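			//For each chromosome/strand: count the distinct positions, then merge runs of equal positions, summing their weights into a single entry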
for(int j = 0; j < fivePrimePos[i].length; j++){ if(fivePrimePos[i][j]!=null && fivePrimePos[i][j].length>0){ int uniquePos=1; for(int k = 0; k < fivePrimePos[i][j].length-1; k++) if(fivePrimePos[i][j][k+1]!=fivePrimePos[i][j][k]){uniquePos++;} int[] tmpPos = new int[uniquePos]; float[] tmpCnt = new float[uniquePos]; for(int x=0; x<uniquePos; x++){tmpCnt[x]=0;} int x=0; tmpPos[x] = fivePrimePos[i][j][0]; tmpCnt[x] += fivePrimeCounts[i][j][0]; for(int k = 1; k < fivePrimePos[i][j].length; k++){ if(fivePrimePos[i][j][k]!=fivePrimePos[i][j][k-1]){x++;} tmpPos[x] = fivePrimePos[i][j][k]; tmpCnt[x] += fivePrimeCounts[i][j][k]; } fivePrimeCounts[i][j] = tmpCnt; fivePrimePos[i][j] = tmpPos; } } } //Collapse duplicate positions (paired-end arrays) if(loadPairs && hasPairs){ for(int i = 0; i < pairR1Pos.length; i++){ for(int j = 0; j < pairR1Pos[i].length; j++){ if(pairR1Pos[i][j]!=null && pairR1Pos[i][j].length>0){ int uniquePos=1; for(int k = 0; k < pairR1Pos[i][j].length-1; k++) if(pairR1Pos[i][j][k+1]!=pairR1Pos[i][j][k] || pairR2Pos[i][j][k+1]!=pairR2Pos[i][j][k] || pairR2Chrom[i][j][k+1]!=pairR2Chrom[i][j][k] || pairR2Strand[i][j][k+1]!=pairR2Strand[i][j][k]){ uniquePos++; } int[] tmpR1Pos = new int[uniquePos]; int[] tmpR2Pos = new int[uniquePos]; int[] tmpR2Chrom = new int[uniquePos]; int[] tmpR2Str = new int[uniquePos]; float[] tmpW = new float[uniquePos]; for(int x=0; x<uniquePos; x++){tmpW[x]=0;} int x=0; tmpR1Pos[x] = pairR1Pos[i][j][0]; tmpR2Pos[x] = pairR2Pos[i][j][0]; tmpR2Chrom[x] = pairR2Chrom[i][j][0]; tmpR2Str[x] = pairR2Strand[i][j][0]; tmpW[x] += pairWeight[i][j][0]; for(int k = 1; k < pairR1Pos[i][j].length; k++){ if(pairR1Pos[i][j][k-1]!=pairR1Pos[i][j][k] || pairR2Pos[i][j][k-1]!=pairR2Pos[i][j][k] || pairR2Chrom[i][j][k-1]!=pairR2Chrom[i][j][k] || pairR2Strand[i][j][k-1]!=pairR2Strand[i][j][k]){ x++; } tmpR1Pos[x] = pairR1Pos[i][j][k]; tmpR2Pos[x] = pairR2Pos[i][j][k]; tmpR2Chrom[x] = pairR2Chrom[i][j][k]; tmpR2Str[x] = pairR2Strand[i][j][k]; tmpW[x] += pairWeight[i][j][k]; } pairR1Pos[i][j] = tmpR1Pos; pairR2Pos[i][j] = tmpR2Pos; pairR2Chrom[i][j] = tmpR2Chrom; pairR2Strand[i][j] = tmpR2Str; pairWeight[i][j] = tmpW; } } } } }//end of populateArrays method /** * Extract hits overlapping a subset of regions and redefine the cache using just these hits. * Should only be called during initialization when the full set of hits is loaded in the cache, * or from the loadCachedRegions method after a set of full chromosomes have been loaded from local files. 
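 * Overlapping query regions are merged (via Region.mergeRegions) before extraction; for example (illustrative
 * coordinates), two overlapping regions on the same chromosome are cached once as their union rather than
 * double-counting the shared hits.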
* @param regs */ private void subsetArrays(List<Region> regs){ //merge overlapping regions so that hits aren't loaded twice List<Region> mregs = Region.mergeRegions(regs); //TODO: There's probably a more efficient way to do the below without needing to use the HashMaps //Extract the relevant hits HashMap<String, ArrayList<Integer>[]> posList = new HashMap<String, ArrayList<Integer>[]>(); HashMap<String, ArrayList<Float>[]> countsList = new HashMap<String, ArrayList<Float>[]>(); HashMap<String, ArrayList<HitPair>[]> pairsList = new HashMap<String, ArrayList<HitPair>[]>(); for(Region r : mregs){ if(!posList.containsKey(r.getChrom())){ ArrayList<Integer>[] currIArrayList = new ArrayList[2]; currIArrayList[0]=new ArrayList<Integer>(); currIArrayList[1]=new ArrayList<Integer>(); posList.put(r.getChrom(), currIArrayList); ArrayList<Float>[] currFArrayList = new ArrayList[2]; currFArrayList[0]=new ArrayList<Float>(); currFArrayList[1]=new ArrayList<Float>(); countsList.put(r.getChrom(), currFArrayList); } ArrayList<Integer>[] chrIArrayList = posList.get(r.getChrom()); ArrayList<Float>[] chrFArrayList = countsList.get(r.getChrom()); for(StrandedBaseCount sbc : getStrandedBases(r, '+')){ chrIArrayList[0].add(sbc.getCoordinate()); chrFArrayList[0].add(sbc.getCount()); } for(StrandedBaseCount sbc : getStrandedBases(r, '-')){ chrIArrayList[1].add(sbc.getCoordinate()); chrFArrayList[1].add(sbc.getCount()); } //Add pairs in if they exist if(loadPairs && hasPairs){ if(!pairsList.containsKey(r.getChrom())){ ArrayList<HitPair>[] currLRPArrayList = new ArrayList[2]; currLRPArrayList[0]=new ArrayList<HitPair>(); currLRPArrayList[1]=new ArrayList<HitPair>(); pairsList.put(r.getChrom(), currLRPArrayList); } ArrayList<HitPair>[] currLRPArrayList = pairsList.get(r.getChrom()); for(StrandedPair sp : getPairsOnStrand(r, '+')){ currLRPArrayList[0].add(new HitPair(sp.getR1Coordinate(), sp.getR2Chrom(), sp.getR2Coordinate(), sp.getR2Strand()=='+'?0:1, (float)1.0)); } for(StrandedPair sp : getPairsOnStrand(r, '-')){ currLRPArrayList[1].add(new HitPair(sp.getR1Coordinate(), sp.getR2Chrom(), sp.getR2Coordinate(), sp.getR2Strand()=='+'?0:1, (float)1.0)); } } } //Repopulate cache populateArrays(posList, countsList, pairsList); //Free memory for(String chr: posList.keySet()){ posList.get(chr)[0].clear(); posList.get(chr)[1].clear(); countsList.get(chr)[0].clear(); countsList.get(chr)[1].clear(); if(loadPairs && hasPairs){ pairsList.get(chr)[0].clear(); pairsList.get(chr)[1].clear(); } } posList.clear(); countsList.clear(); if(loadPairs && hasPairs) pairsList.clear(); System.gc(); } /** * Save the contents of the hit arrays to local binary files */ private void saveCacheLocally(){ //Initialize a random String for the local cache name localCacheFileBase = RandomStringUtils.randomAlphanumeric(20); //Generate local cache directories localCacheDir = new File(econfig.getFileCacheDirName()+File.separator+localCacheFileBase); if(!localCacheDir.mkdirs()){ System.err.println("Unable to make local cache directories"); System.exit(1); } //Save arrays to binary files for(String chrom : chrom2ID.keySet()){ for(int strand=0; strand<=1; strand++){ int chrID = chrom2ID.get(chrom); //Write single-end files if(fivePrimePos[chrID][strand]!=null && fivePrimeCounts[chrID][strand]!=null){ ByteBuffer posByteBuffer = ByteBuffer.allocate(fivePrimePos[chrID][strand].length * 4); ByteBuffer countsByteBuffer = ByteBuffer.allocate(fivePrimeCounts[chrID][strand].length * 4); IntBuffer posIntBuffer = posByteBuffer.asIntBuffer(); FloatBuffer 
countsFloatBuffer = countsByteBuffer.asFloatBuffer(); posIntBuffer.put(fivePrimePos[chrID][strand]); countsFloatBuffer.put(fivePrimeCounts[chrID][strand]); byte[] parray = posByteBuffer.array(); byte[] carray = countsByteBuffer.array(); Path ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".pos.cache"); Path cpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".counts.cache"); try { Files.write( ppath, parray, StandardOpenOption.CREATE); Files.write( cpath, carray, StandardOpenOption.CREATE); } catch (IOException e) { e.printStackTrace(); } } //Write pair files if(loadPairs && hasPairs){ if(pairR1Pos[chrID][strand]!=null && pairR2Pos[chrID][strand]!=null && pairR2Chrom[chrID][strand]!=null && pairR2Strand[chrID][strand]!=null){ ByteBuffer r1PosByteBuffer = ByteBuffer.allocate(pairR1Pos[chrID][strand].length * 4); ByteBuffer r2PosByteBuffer = ByteBuffer.allocate(pairR2Pos[chrID][strand].length * 4); ByteBuffer r2ChromByteBuffer = ByteBuffer.allocate(pairR2Chrom[chrID][strand].length * 4); ByteBuffer r2StrandByteBuffer = ByteBuffer.allocate(pairR2Strand[chrID][strand].length * 4); ByteBuffer weightByteBuffer = ByteBuffer.allocate(pairWeight[chrID][strand].length * 4); IntBuffer r1PosIntBuffer = r1PosByteBuffer.asIntBuffer(); IntBuffer r2PosIntBuffer = r2PosByteBuffer.asIntBuffer(); IntBuffer r2ChromIntBuffer = r2ChromByteBuffer.asIntBuffer(); IntBuffer r2StrandIntBuffer = r2StrandByteBuffer.asIntBuffer(); FloatBuffer weightFloatBuffer = weightByteBuffer.asFloatBuffer(); r1PosIntBuffer.put(pairR1Pos[chrID][strand]); r2PosIntBuffer.put(pairR2Pos[chrID][strand]); r2ChromIntBuffer.put(pairR2Chrom[chrID][strand]); r2StrandIntBuffer.put(pairR2Strand[chrID][strand]); weightFloatBuffer.put(pairWeight[chrID][strand]); byte[] r1parray = r1PosByteBuffer.array(); byte[] r2parray = r2PosByteBuffer.array(); byte[] r2carray = r2ChromByteBuffer.array(); byte[] r2sarray = r2StrandByteBuffer.array(); byte[] warray = weightByteBuffer.array(); Path r1ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r1pos.cache"); Path r2ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2pos.cache"); Path r2cpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2chr.cache"); Path r2spath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2str.cache"); Path wpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".weight.cache"); try { Files.write( r1ppath, r1parray, StandardOpenOption.CREATE); Files.write( r2ppath, r2parray, StandardOpenOption.CREATE); Files.write( r2cpath, r2carray, StandardOpenOption.CREATE); Files.write( r2spath, r2sarray, StandardOpenOption.CREATE); Files.write( wpath, warray, StandardOpenOption.CREATE); } catch (IOException e) { e.printStackTrace(); } } } } } } /** * Load the data from one chromosome from the local cache into the array data structure. 
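 * The cache files read here are those written by saveCacheLocally(): one file per chromosome/strand/field,
 * named {cacheBase}_{chrom}-{strand}.pos.cache and .counts.cache for single-end hits, plus
 * .r1pos/.r2pos/.r2chr/.r2str/.weight.cache for pairs, each holding raw 4-byte values in ByteBuffer's
 * default (big-endian) byte order.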
* Be careful calling this outside of this class - ensure that operations are thread-safe * @param chrom */ public synchronized void loadCachedChrom(String chrom){ //Empty the current memory cache emptyArrays(); if(cachedRegions!=null) cachedRegions.clear(); //Get the data from files if(chrom2ID.containsKey(chrom)){ int chrID = chrom2ID.get(chrom); for(int strand=0; strand<=1; strand++){ //Read single-end files Path ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".pos.cache"); Path cpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".counts.cache"); if(Files.exists(ppath, LinkOption.NOFOLLOW_LINKS) && Files.exists(cpath, LinkOption.NOFOLLOW_LINKS)){ try { FileChannel posInChannel = FileChannel.open(ppath, StandardOpenOption.READ); FileChannel countsInChannel = FileChannel.open(cpath, StandardOpenOption.READ); int[] pResult = new int[((int)posInChannel.size())/4]; float[] cResult = new float[((int)countsInChannel.size())/4]; ByteBuffer pbuf = ByteBuffer.allocate((int)posInChannel.size()); ByteBuffer cbuf = ByteBuffer.allocate((int)countsInChannel.size()); // Fill in the buffers while(pbuf.hasRemaining( )) posInChannel.read(pbuf); while(cbuf.hasRemaining( )) countsInChannel.read(cbuf); pbuf.flip( ); cbuf.flip( ); // Create buffer views IntBuffer posIntBuffer = pbuf.asIntBuffer( ); FloatBuffer countsFloatBuffer = cbuf.asFloatBuffer( ); //Results will now contain all ints/floats read from file posIntBuffer.get(pResult); countsFloatBuffer.get(cResult); //Assign to arrays fivePrimePos[chrID][strand] = pResult; fivePrimeCounts[chrID][strand] = cResult; } catch (IOException e) { e.printStackTrace(); } } //Load pairs if(loadPairs && hasPairs){ Path r1ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r1pos.cache"); Path r2ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2pos.cache"); Path r2cpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2chr.cache"); Path r2spath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2str.cache"); Path wpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".weight.cache"); if(Files.exists(r1ppath, LinkOption.NOFOLLOW_LINKS) && Files.exists(r2ppath, LinkOption.NOFOLLOW_LINKS) && Files.exists(r2cpath, LinkOption.NOFOLLOW_LINKS) && Files.exists(r2spath, LinkOption.NOFOLLOW_LINKS)){ try { FileChannel r1posInChannel = FileChannel.open(r1ppath, StandardOpenOption.READ); FileChannel r2posInChannel = FileChannel.open(r2ppath, StandardOpenOption.READ); FileChannel r2chrInChannel = FileChannel.open(r2cpath, StandardOpenOption.READ); FileChannel r2strInChannel = FileChannel.open(r2spath, StandardOpenOption.READ); FileChannel wInChannel = FileChannel.open(wpath, StandardOpenOption.READ); int[] r1pResult = new int[((int)r1posInChannel.size())/4]; int[] r2pResult = new int[((int)r2posInChannel.size())/4]; int[] r2cResult = new int[((int)r2chrInChannel.size())/4]; int[] r2sResult = new int[((int)r2strInChannel.size())/4]; float[] wResult = new float[((int)wInChannel.size())/4]; ByteBuffer r1pbuf = 
ByteBuffer.allocate((int)r1posInChannel.size()); ByteBuffer r2pbuf = ByteBuffer.allocate((int)r2posInChannel.size()); ByteBuffer r2cbuf = ByteBuffer.allocate((int)r2chrInChannel.size()); ByteBuffer r2sbuf = ByteBuffer.allocate((int)r2strInChannel.size()); ByteBuffer wbuf = ByteBuffer.allocate((int)wInChannel.size()); // Fill in the buffers while(r1pbuf.hasRemaining( )) r1posInChannel.read(r1pbuf); while(r2pbuf.hasRemaining( )) r2posInChannel.read(r2pbuf); while(r2cbuf.hasRemaining( )) r2chrInChannel.read(r2cbuf); while(r2sbuf.hasRemaining( )) r2strInChannel.read(r2sbuf); while(wbuf.hasRemaining( )) wInChannel.read(wbuf); r1pbuf.flip( ); r2pbuf.flip( ); r2cbuf.flip( ); r2sbuf.flip( ); wbuf.flip( ); // Create buffer views IntBuffer r1posIntBuffer = r1pbuf.asIntBuffer( ); IntBuffer r2posIntBuffer = r2pbuf.asIntBuffer( ); IntBuffer r2chrIntBuffer = r2cbuf.asIntBuffer( ); IntBuffer r2strIntBuffer = r2sbuf.asIntBuffer( ); FloatBuffer wFloatBuffer = wbuf.asFloatBuffer( ); //Results will now contain all ints/floats read from file r1posIntBuffer.get(r1pResult); r2posIntBuffer.get(r2pResult); r2chrIntBuffer.get(r2cResult); r2strIntBuffer.get(r2sResult); wFloatBuffer.get(wResult); //Assign to arrays pairR1Pos[chrID][strand] = r1pResult; pairR2Pos[chrID][strand] = r2pResult; pairR2Chrom[chrID][strand] = r2cResult; pairR2Strand[chrID][strand] = r2sResult; pairWeight[chrID][strand] = wResult; } catch (IOException e) { e.printStackTrace(); } } } } } //Initialize cached regions if(gen.containsChromName(chrom)){ if(cachedRegions==null) cachedRegions=new ArrayList<Region>(); cachedRegions.add(new Region(gen, chrom, 1, gen.getChromLength(chrom))); } } /** * Load the data from a set of regions from the local cache into the array data structure. * To do this, each relevant chromosome is loaded one by one, and then the appropriate regions are extracted using subsetArrays() * This is an intensive method, so use very sparingly, and double check that you couldn't have * just provided appropriate regions to the constructor. 
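 *
 * A hedged usage sketch (illustrative variable names; assumes local file caching is active for this HitCache):
 * <pre>
 * List&lt;Region&gt; batch = ...; // regions grouped by chromosome where possible
 * cache.loadCachedRegions(batch);
 * for (Region r : batch) {
 *     float w = cache.countHits(r); // queries now hit the in-memory subset
 * }
 * </pre>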
* @param chrom */ public synchronized void loadCachedRegions(List<Region> regs){ if(cacheMemoryEntireGenome)//By definition, all regions are loaded return; //Empty the current memory cache emptyArrays(); if(cachedRegions!=null) cachedRegions.clear(); //Load the appropriate chromosomes from files List<String> loadChrs = new ArrayList<String>(); for(Region r : regs) if(!loadChrs.contains(r.getChrom())) loadChrs.add(r.getChrom()); for(String chrom : loadChrs){ //Get the data from files if(chrom2ID.containsKey(chrom)){ int chrID = chrom2ID.get(chrom); for(int strand=0; strand<=1; strand++){ //Read single-end files Path ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".pos.cache"); Path cpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".counts.cache"); if(Files.exists(ppath, LinkOption.NOFOLLOW_LINKS) && Files.exists(cpath, LinkOption.NOFOLLOW_LINKS)){ try { FileChannel posInChannel = FileChannel.open(ppath, StandardOpenOption.READ); FileChannel countsInChannel = FileChannel.open(cpath, StandardOpenOption.READ); int[] pResult = new int[((int)posInChannel.size())/4]; float[] cResult = new float[((int)countsInChannel.size())/4]; ByteBuffer pbuf = ByteBuffer.allocate((int)posInChannel.size()); ByteBuffer cbuf = ByteBuffer.allocate((int)countsInChannel.size()); // Fill in the buffers while(pbuf.hasRemaining( )) posInChannel.read(pbuf); while(cbuf.hasRemaining( )) countsInChannel.read(cbuf); pbuf.flip( ); cbuf.flip( ); // Create buffer views IntBuffer posIntBuffer = pbuf.asIntBuffer( ); FloatBuffer countsFloatBuffer = cbuf.asFloatBuffer( ); //Results will now contain all ints/floats read from file posIntBuffer.get(pResult); countsFloatBuffer.get(cResult); //Assign to arrays fivePrimePos[chrID][strand] = pResult; fivePrimeCounts[chrID][strand] = cResult; } catch (IOException e) { e.printStackTrace(); } } //Load pairs if(loadPairs && hasPairs){ Path r1ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r1pos.cache"); Path r2ppath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2pos.cache"); Path r2cpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2chr.cache"); Path r2spath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".r2str.cache"); Path wpath = FileSystems.getDefault().getPath(econfig.getFileCacheDirName(), localCacheFileBase, localCacheFileBase+"_"+chrom+"-"+strand+".weight.cache"); if(Files.exists(r1ppath, LinkOption.NOFOLLOW_LINKS) && Files.exists(r2ppath, LinkOption.NOFOLLOW_LINKS) && Files.exists(r2cpath, LinkOption.NOFOLLOW_LINKS) && Files.exists(r2spath, LinkOption.NOFOLLOW_LINKS)){ try { FileChannel r1posInChannel = FileChannel.open(r1ppath, StandardOpenOption.READ); FileChannel r2posInChannel = FileChannel.open(r2ppath, StandardOpenOption.READ); FileChannel r2chrInChannel = FileChannel.open(r2cpath, StandardOpenOption.READ); FileChannel r2strInChannel = FileChannel.open(r2spath, StandardOpenOption.READ); FileChannel wInChannel = FileChannel.open(wpath, StandardOpenOption.READ); int[] r1pResult = new int[((int)r1posInChannel.size())/4]; int[] r2pResult = new int[((int)r2posInChannel.size())/4]; int[] 
r2cResult = new int[((int)r2chrInChannel.size())/4]; int[] r2sResult = new int[((int)r2strInChannel.size())/4]; float[] wResult = new float[((int)wInChannel.size())/4]; ByteBuffer r1pbuf = ByteBuffer.allocate((int)r1posInChannel.size()); ByteBuffer r2pbuf = ByteBuffer.allocate((int)r2posInChannel.size()); ByteBuffer r2cbuf = ByteBuffer.allocate((int)r2chrInChannel.size()); ByteBuffer r2sbuf = ByteBuffer.allocate((int)r2strInChannel.size()); ByteBuffer wbuf = ByteBuffer.allocate((int)wInChannel.size()); // Fill in the buffers while(r1pbuf.hasRemaining( )) r1posInChannel.read(r1pbuf); while(r2pbuf.hasRemaining( )) r2posInChannel.read(r2pbuf); while(r2cbuf.hasRemaining( )) r2chrInChannel.read(r2cbuf); while(r2sbuf.hasRemaining( )) r2strInChannel.read(r2sbuf); while(wbuf.hasRemaining( )) wInChannel.read(wbuf); r1pbuf.flip( ); r2pbuf.flip( ); r2cbuf.flip( ); r2sbuf.flip( ); wbuf.flip( ); // Create buffer views IntBuffer r1posIntBuffer = r1pbuf.asIntBuffer( ); IntBuffer r2posIntBuffer = r2pbuf.asIntBuffer( ); IntBuffer r2chrIntBuffer = r2cbuf.asIntBuffer( ); IntBuffer r2strIntBuffer = r2sbuf.asIntBuffer( ); FloatBuffer wFloatBuffer = wbuf.asFloatBuffer( ); //Results will now contain all ints/floats read from file r1posIntBuffer.get(r1pResult); r2posIntBuffer.get(r2pResult); r2chrIntBuffer.get(r2cResult); r2strIntBuffer.get(r2sResult); wFloatBuffer.get(wResult); //Assign to arrays pairR1Pos[chrID][strand] = r1pResult; pairR2Pos[chrID][strand] = r2pResult; pairR2Chrom[chrID][strand] = r2cResult; pairR2Strand[chrID][strand] = r2sResult; pairWeight[chrID][strand] = wResult; } catch (IOException e) { e.printStackTrace(); } } } } } } subsetArrays(regs); if(cachedRegions==null) cachedRegions = new ArrayList<Region>(); cachedRegions.addAll(regs); } /** * Simple convertor * @param list * @return */ private int[] list2int(List<Integer> list) { int[] out = new int[list.size()]; for(int i = 0; i < out.length; i++) out[i] = list.get(i); return out; } /** * Simple convertor * @param list * @return */ private float[] list2float(List<Float> list) { float[] out = new float[list.size()]; for(int i = 0; i < out.length; i++) out[i] = list.get(i); return out; } /** * Enforces a per-base weight threshold * @param maxReadperBP float threshold */ private void capPerBaseCount(float maxReadperBP){ for(int i = 0; i < fivePrimeCounts.length; i++) for(int j = 0; j < fivePrimeCounts[i].length; j++) if(fivePrimeCounts[i][j]!=null) for(int k = 0; k < fivePrimeCounts[i][j].length; k++) if (fivePrimeCounts[i][j][k] > maxReadperBP){ //System.err.println("Capping "+fivePrimeCounts[i][j][k]+" to "+maxReadperBP); fivePrimeCounts[i][j][k] = maxReadperBP; } } /** * Reset duplicate reads that pass Poisson threshold. 
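 * The per-position cap is the smallest integer count whose Poisson tail probability (given the locally
 * estimated lambda) falls at or below the supplied threshold, with a floor of one read.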
* The Poisson lambda parameter is calculated by an Gaussian average * that puts more weight for nearby bases (same chrom, same strand) */ private void capPerBaseCountWithPoissonGaussianFilter(double threshold, int width){ double g[] = new double[width*4+1]; NormalDistribution gaussianDist = new NormalDistribution(0, width*width); for (int i=0;i<g.length;i++) g[i]=gaussianDist.calcProbability((double)i); DRand re = new DRand(); Poisson P = new Poisson(0, re); for(int i = 0; i < fivePrimeCounts.length; i++) for(int j = 0; j < fivePrimeCounts[i].length; j++){ float counts[] = fivePrimeCounts[i][j]; int pos[] = fivePrimePos[i][j]; if(counts!=null){ for(int k = 0; k < counts.length; k++){ int posK = pos[k]; double sum = 0; for (int x=1;x<=width*4;x++){ // at most extend out 250 idx if (k+x>=counts.length|| pos[k+x]-posK>width*4) break; sum += counts[k+x]*g[pos[k+x]-posK]; } for (int x=1;x<=width*4;x++){ // at most extend out 250 idx if (k-x<0 || posK-pos[k-x]>width*4) break; sum += counts[k-x]*g[posK-pos[k-x]]; } sum = sum/(1-g[0]); // exclude this position for evaluation double countThres=0; P.setMean(sum); double pvalue=1; for(int b=1; pvalue>threshold; b++){ pvalue=1-P.cdf(b); //p-value as the tail of Poisson countThres=b; } if (counts[k] > Math.max(1,countThres)) counts[k] = (float) Math.max(1,countThres); } } } } /** * Simple count correction with a scaling factor and a floor of one. * Beware: only works if all reads are loaded. * @param perBaseScaling float threshold */ protected void linearCountCorrection(float perBaseScaling){ if(perBaseScaling<1) System.err.println("linearCountCorrection: perBaseScaling is less than 1 - makes no sense to scale"); else{ for(int i = 0; i < fivePrimeCounts.length; i++) for(int j = 0; j < fivePrimeCounts[i].length; j++) if(fivePrimeCounts[i][j]!=null) for(int k = 0; k < fivePrimeCounts[i][j].length; k++) if (fivePrimeCounts[i][j][k] > 0){ fivePrimeCounts[i][j][k] = fivePrimeCounts[i][j][k]/perBaseScaling; if(fivePrimeCounts[i][j][k]<1) fivePrimeCounts[i][j][k]=1; } } } /** * Recount hit weights */ private void updateTotalHits(){ totalHits = 0.0; totalHitsPos=0.0; totalHitsNeg=0.0; uniqueHits = 0.0; for(int i = 0; i < fivePrimeCounts.length; i++) for(int j = 0; j < fivePrimeCounts[i].length; j++) if(fivePrimeCounts[i][j]!=null) for(int k = 0; k < fivePrimeCounts[i][j].length; k++){ totalHits += fivePrimeCounts[i][j][k]; if(j==0){totalHitsPos+= fivePrimeCounts[i][j][k];} else{totalHitsNeg+= fivePrimeCounts[i][j][k];} if(fivePrimeCounts[i][j][k]>0) uniqueHits++; } totalPairs =0; uniquePairs = 0; if(loadPairs && hasPairs){ for(int i = 0; i < pairWeight.length; i++) for(int j = 0; j < pairWeight[i].length; j++) if(pairWeight[i][j]!=null) for(int k = 0; k < pairWeight[i][j].length; k++){ totalPairs += pairWeight[i][j][k]; if(pairWeight[i][j][k]>0) uniquePairs++; } } } /** * Estimate a genome from the observed read positions that are collected into the list * @param posList HashMap indexed by chr containing ArrayLists of hit positions * @return Genome */ private Genome estimateGenome(HashMap<String, ArrayList<Integer>[]> posList){ HashMap<String, Integer> chrLenMap = new HashMap<String, Integer>(); for(String c : posList.keySet()){ int max = 0; for(int j=0; j<posList.get(c).length; j++){ for(Integer i : posList.get(c)[j]){ if(i>max) max=i; } } chrLenMap.put(c, max); } Genome g =new Genome("Genome", chrLenMap); return(g); } /** * Initialize the per-base background model * */ private void initializeBackground(){ perBaseBack=new BackgroundCollection(); 
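		//A single genome-wide Poisson model (binsize 1, '.' strand) provides the per-base count threshold used during initialization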
if(econfig.getGenome()==null){ System.err.println("Genome chromosome lengths not specified. Please define using --geninfo."); System.exit(1); } perBaseBack.addBackgroundModel( new PoissonBackgroundModel(-1, econfig.getPerBaseLogConf(), getHitCount(), econfig.getGenome().getGenomeLength(), econfig.getMappableGenomeProp(), 1, '.', 1, true)); } /** * Tidy up the local read caches */ public void close(){ //Delete the file cache if it exists if(cacheInLocalFiles){ if(localCacheDir.exists() ) { File[] files = localCacheDir.listFiles(); for(int i=0; i<files.length; i++) files[i].delete(); localCacheDir.delete(); } } } }