package org.seqcode.gsebricks.verbs; import java.util.*; import org.seqcode.genome.Genome; import org.seqcode.genome.location.Region; import org.seqcode.gseutils.Interval; /** * This is a data structure originally designed to analyze the overlapping * reads of a *seq experiment, but since adapted to several more uses. * * Its purpose is to track a series of intervals over a region (in this case, * a region identified with a particular chromosome and genome), and to then * be able to answer queries about the number of times that locations within * that region have been "covered" by one or more intervals. * * The method .collectRegions(int) is the most-used interface -- it collects * a series of (disjoint) regions that are each continuously covered by * *at least* 'int' intervals. * * @author Tim and Alex */ public class RunningOverlapSum { private Genome genome; private String chrom; private TreeMap<Integer,Float> changes; public RunningOverlapSum(Genome g, String c) { genome = g; chrom = c; changes = new TreeMap<Integer,Float>(); } public RunningOverlapSum(Collection<Region> rs) { changes = new TreeMap<Integer,Float>(); if(rs.isEmpty()) { throw new IllegalArgumentException(); } for(Region r : rs) { if(genome == null) { genome = r.getGenome(); chrom = r.getChrom(); } else if (r.getGenome().equals(genome) && r.getChrom().equals(chrom)) { addInterval(r.getStart(), r.getEnd()); } } } public void clear() { changes.clear(); } public void combineWith(RunningOverlapSum s) { if(!s.chrom.equals(chrom) || !genome.equals(s.genome)) { throw new IllegalArgumentException(); } for(Integer pt : s.changes.keySet()) { if(!changes.containsKey(pt)) { changes.put(pt, 0f); } changes.put(pt, changes.get(pt) + s.changes.get(pt)); } } public void addInterval(int start, int end) { if(changes.containsKey(start)) { changes.put(start, changes.get(start) + 1f); } else { changes.put(start, 1f); } /** * I'm assuming that intervals, like regions, are inclusive of their * end points... * This needs to put the changepoint at one past the position where the * interval ends to account for intervals being inclusive. -Bob */ if(changes.containsKey(end + 1)) { changes.put(end + 1, changes.get(end + 1) - 1f); } else { changes.put(end + 1, -1f); } } public void addInterval(Interval intv) { int start = intv.start; int end = intv.end; if(changes.containsKey(start)) { changes.put(start, changes.get(start) + 1f); } else { changes.put(start, 1f); } /** * I'm assuming that intervals, like regions, are inclusive of their * end points... * This needs to put the changepoint at one past the position where the * interval ends to account for intervals being inclusive. -Bob */ if(changes.containsKey(end + 1)) { changes.put(end + 1, changes.get(end + 1) - 1f); } else { changes.put(end + 1, -1f); } } public int[][] getChangePoints() { int[][] array = new int[changes.size()][]; int i = 0; for(int change : changes.keySet()) { int[] pair = new int[2]; pair[0] = change; pair[1] = changes.get(change).intValue(); array[i++] = pair; } return array; } public void addRegion(Region r) {addRegion(r, 1);} public void addRegion(Region r, float weight) { if(!genome.equals(r.getGenome())) { throw new IllegalArgumentException(r.getGenome().toString()); } if(!chrom.equals(r.getChrom())) { throw new IllegalArgumentException(r.getChrom()); } int start = r.getStart(); int end = r.getEnd(); if(changes.containsKey(start)) { changes.put(start, changes.get(start) + weight); } else { changes.put(start, weight); } /** * This needs to put the changepoint at one past the position where the * region ends to account for regions being inclusive. -Bob */ if(changes.containsKey(end + 1)) { changes.put(end + 1, changes.get(end + 1) - weight); } else { changes.put(end + 1, -weight); } } public int getMaxOverlap() { int max = 0; int running = 0; for(int pc : changes.keySet()) { int delta = changes.get(pc).intValue(); running += delta; max = Math.max(max, running); } return max; } public int getMaxOverlap(int start, int end) { int max = 0; int running = 0; Map<Integer,Float> map = changes.headMap(end); for(int pc : map.keySet()) { int delta = map.get(pc).intValue(); running += delta; if(pc >= start && pc <= end) { max = Math.max(max, running); } } return max; } public int countOverlapping(int start, int end) { int count = 0; Map<Integer,Float> map = changes.headMap(end); int running = 0; for(int pc : map.keySet()) { int delta = map.get(pc).intValue(); running += delta; if(pc >= start && delta == -1) { count += 1; } } count += running; return count; } public Collection<Region> collectRegions(int threshold) { if(threshold < 1) { throw new IllegalArgumentException(String.valueOf(threshold)); } LinkedList<Region> regions = new LinkedList<Region>(); int runningSum = 0; int rstart = -1, rend = -1; for(int pc : changes.keySet()) { int delta = changes.get(pc).intValue(); if(runningSum < threshold && runningSum + delta >= threshold) { rstart = pc; } if(runningSum >= threshold && runningSum + delta < threshold) { //subtract 1 from the endpoint because regions are inclusive rend = pc - 1; Region r = new Region(genome, chrom, rstart, rend); regions.addLast(r); rstart = rend = -1; } runningSum += delta; if(runningSum < 0) { throw new IllegalStateException(String.valueOf(runningSum)); } } if(runningSum > 0) { throw new IllegalStateException(String.valueOf(runningSum)); } return regions; } public TreeMap<Integer, Float> getChangeMap() { return changes; } }