package org.seqcode.gsebricks.verbs; import java.util.*; import org.seqcode.genome.Genome; import org.seqcode.genome.location.Region; import org.seqcode.gseutils.Interval; /** * This is a data structure originally designed to analyze the overlapping * reads of a ChIP-Seq experiment, but since adapted to several more uses. * * Its purpose is to track a series of intervals over a region (in this case, * a region identified with a particular chromosome and genome), and to then * be able to answer queries about the number of times that locations within * that region have been "covered" by one or more intervals. * * The method .collectRegions(int) is the most-used interface -- it collects * a series of (disjoint) regions that are each continuously covered by * *at least* 'int' intervals. * * @author Tim and Alex */ public class WeightedRunningOverlapSum { private Genome genome; private String chrom; private TreeMap<Integer, Double> changes; public WeightedRunningOverlapSum(Genome g, String c) { genome = g; chrom = c; changes = new TreeMap<Integer, Double>(); } public void clear() { changes.clear(); } public void addInterval(int start, int end) { this.addWeightedInterval(start, end, 1.0); } public void addWeightedInterval(int start, int end, double weight) { if (changes.containsKey(start)) { changes.put(start, changes.get(start) + weight); } else { changes.put(start, weight); } /** * I'm assuming that intervals, like regions, are inclusive of their end * points... This needs to put the changepoint at one past the position * where the interval ends to account for intervals being inclusive. -Bob */ if (changes.containsKey(end + 1)) { changes.put(end + 1, changes.get(end + 1) - weight); } else { changes.put(end + 1, -weight); } } public void addInterval(Interval intv) { this.addWeightedInterval(intv, 1.0); } public void addWeightedInterval(Interval intv, double weight) { int start = intv.start; int end = intv.end; if (changes.containsKey(start)) { changes.put(start, changes.get(start) + weight); } else { changes.put(start, weight); } /** * I'm assuming that intervals, like regions, are inclusive of their end * points... This needs to put the changepoint at one past the position * where the interval ends to account for intervals being inclusive. -Bob */ if (changes.containsKey(end + 1)) { changes.put(end + 1, changes.get(end + 1) - weight); } else { changes.put(end + 1, -weight); } } public double[][] getChangePoints() { double[][] array = new double[changes.size()][]; int i = 0; for (int change : changes.keySet()) { double[] pair = new double[2]; pair[0] = change; pair[1] = changes.get(change); array[i++] = pair; } return array; } public void addRegion(Region r) { this.addWeightedRegion(r, 1.0); } public void addWeightedRegion(Region r, double weight) { if (!genome.equals(r.getGenome())) { throw new IllegalArgumentException(r.getGenome().toString()); } if (!chrom.equals(r.getChrom())) { throw new IllegalArgumentException(r.getChrom()); } int start = r.getStart(); int end = r.getEnd(); if (changes.containsKey(start)) { changes.put(start, changes.get(start) + weight); } else { changes.put(start, weight); } /** * This needs to put the changepoint at one past the position where the * region ends to account for regions being inclusive. -Bob */ if (changes.containsKey(end + 1)) { changes.put(end + 1, changes.get(end + 1) - weight); } else { changes.put(end + 1, -weight); } } public double getMaxOverlap() { double max = 0; double running = 0; for (int pc : changes.keySet()) { double delta = changes.get(pc); running += delta; max = Math.max(max, running); } return max; } public double getMaxOverlap(int start, int end) { double max = 0; double running = 0; Map<Integer, Double> map = changes.headMap(end); for (int pc : map.keySet()) { double delta = map.get(pc); running += delta; if (pc >= start && pc <= end) { max = Math.max(max, running); } } return max; } public double countOverlapping(int start, int end) { double count = 0; Map<Integer, Double> map = changes.headMap(end); double running = 0; for (int pc : map.keySet()) { double delta = map.get(pc); running += delta; if (pc >= start && delta == -1) { count += 1; } } count += running; return count; } public Collection<Region> collectRegions(double threshold) { if (threshold <= 0) { throw new IllegalArgumentException(String.valueOf(threshold)); } LinkedList<Region> regions = new LinkedList<Region>(); double runningSum = 0; int rstart = -1, rend = -1; for (int pc : changes.keySet()) { double delta = changes.get(pc); if (runningSum < threshold && runningSum + delta >= threshold) { rstart = pc; } if (runningSum >= threshold && runningSum + delta < threshold) { // subtract 1 from the endpoint because regions are inclusive rend = pc - 1; Region r = new Region(genome, chrom, rstart, rend); regions.addLast(r); rstart = rend = -1; } runningSum += delta; if (runningSum < 0) { throw new IllegalStateException(String.valueOf(runningSum)); } } if (runningSum > 0) { throw new IllegalStateException(String.valueOf(runningSum)); } return regions; } public TreeMap<Integer, Double> getChangeMap() { return changes; } }