package org.seqcode.projects.galaxyexo;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.seqcode.data.io.RegionFileUtilities;
import org.seqcode.genome.Genome;
import org.seqcode.genome.GenomeConfig;
import org.seqcode.genome.location.NamedRegion;
import org.seqcode.genome.location.Point;
import org.seqcode.genome.location.Region;
import org.seqcode.gsebricks.verbs.location.ChromRegionIterator;
import org.seqcode.gseutils.ArgParser;
import org.seqcode.gseutils.Args;
import cern.jet.random.Poisson;
import cern.jet.random.engine.DRand;
/**
* Utility to access peak enrichment at a set of genomic regions. Statistical significance
* of peak enrichment over the set of sites is accessed using Poisson model. The null
* distribution comes from randomly placed peaks thought a genome.
*
* Input:
* - Genome
* - GFF file of peak locations
* - A set of genomic regions to test peak enrichment
* Output:
* - A text file indicating the number of overlap and Poisson p-value.
*
* @author naomi yamada
*/
public class PointEnrichmentTester {
protected GenomeConfig gconfig;
protected String outbase;
protected List<Point> gff;
protected List<Region> regions;
protected int ext; //distance to expand so that I don't double count points
protected int numItr = 1000;
protected Poisson poisson;
protected int pseudocounts; // noise added to prevent calling significance in telomere regions
protected boolean printRandOverlap = false; // flag to print number of random overlap
public PointEnrichmentTester(String base, GenomeConfig gcon,List<Point> g, List<Region> r){
outbase=base;
gconfig=gcon;
gff=g;
regions=r;
poisson = new Poisson(1, new DRand());
}
// set pseudo counts
public void setNoise(int c){pseudocounts=c;}
public void setExpansion(int e){ext=e;}
public void printRandOverlap(){printRandOverlap=true;}
public void execute() throws FileNotFoundException{
int totalRegionSize = 0;
for (Region reg : regions){totalRegionSize+=reg.getWidth();} // total size of regions
// expand regions for 20bp so that I don't double counts the peaks nearby
List<Region> expandedGff = new ArrayList<Region>();
for (Point point : gff){
expandedGff.add(point.expand(ext));
}
List <Region> mergedGff = Region.mergeRegions(expandedGff);
int totalOverlap = 0; // number of total overlap between two regions
for (Region gff : mergedGff){
for (Region reg : regions){
if (gff.overlaps(reg))
totalOverlap ++;
}
}
File outFile = new File(outbase+File.separator+"point_enrichment.txt");
outFile.getParentFile().mkdirs();
PrintWriter writer = new PrintWriter(outFile);
writer.println("total number of non-overlapping gff points : "+mergedGff.size());
writer.println("number of overlap between gff points and regions with size "+totalRegionSize+" : "+totalOverlap);
PrintWriter w = null;
if (printRandOverlap){
File randOverlapFile = new File(outbase+File.separator+"num_random_overlaps.txt");
w = new PrintWriter(randOverlapFile) ;
}
double maxPval = 0;
// Determined p-val based on Poisson distributions for numItr times and return the max p-val
for (int i=0 ; i < numItr ; i++){
double pValuePoisson =1;
// produce random hits through genome
List<Region> randomRegions = randomRegionPick(gconfig.getGenome(), null, mergedGff.size(),1);
int numRandOverlaps=0;
for (Region randRegion : randomRegions){
for (Region reg : regions){
if (randRegion.overlaps(reg)){
numRandOverlaps++;
break;
}
}
}
if (i ==1){writer.println("number of overlap with random regions for iteration one : "+numRandOverlaps);}
if (w != null){
w.print(numRandOverlaps+"\t");
w.close();
}
if (numRandOverlaps >totalOverlap){
pValuePoisson=1;
}else{
poisson.setMean(numRandOverlaps+pseudocounts);
int cA = (int)Math.ceil(totalOverlap);
pValuePoisson = 1 - poisson.cdf(cA) + poisson.pdf(cA);
}
if (pValuePoisson >maxPval) {maxPval = pValuePoisson;}
}
writer.println("Poisson p-val : "+maxPval);
writer.close();
}
public static List<Region> randomRegionPick(Genome gen, List<Region> blackList, int numSamples, int sampleSize){
List<Region> regs = new ArrayList<Region>();
Random rand = new Random();
int validSamples=0;
//First see how big the genome is:
int numChroms=0;
long genomeSize=0;
long [] chromoSize = new long[gen.getChromList().size()];
String [] chromoNames = new String[gen.getChromList().size()];
Iterator<NamedRegion> chroms = new ChromRegionIterator(gen);
while (chroms.hasNext()) {
NamedRegion currentChrom = chroms.next();
genomeSize += (double)currentChrom.getWidth();
chromoSize[numChroms]=currentChrom.getWidth();
chromoNames[numChroms]=currentChrom.getChrom();
numChroms++;
}
//Now, iteratively generate random positions and check if they are valid and not overlapping repeats.
while(validSamples<numSamples){
Region potential;
long randPos = (long)(1+(rand.nextDouble()*genomeSize));
//find the chr
boolean found=false;
long total=0;
for(int c=0; c<numChroms && !found; c++){
if(randPos<total+chromoSize[c]){
found=true;
if(randPos+sampleSize<total+chromoSize[c]){
potential = new Region(gen, chromoNames[c], (int)(randPos-total), (int)(randPos+sampleSize-total));
//is this region in the blacklist?
boolean valid=true;
if(blackList!=null){
for(Region r : blackList){
if(potential.overlaps(r)){valid=false;}
}
}
if(valid){
validSamples++;
regs.add(potential);
}
}
}total+=chromoSize[c];
}
}
return(regs);
}
public static void main(String[] args) throws FileNotFoundException{
ArgParser ap = new ArgParser(args);
GenomeConfig gconf = new GenomeConfig(args);
if (!ap.hasKey("gff") && !ap.hasKey("points")){
System.err.println("Usage:\n " +
"PointEnrichmentTester\n " +
"--geninfo <genome info file> \n " +
"--gff <gff containing site coordinates> OR --points <points containing site coordinates> \n " +
"--region <region of the genome for enrichment test> \n " +
"\nOPTIONS:\n " +
"--out <output directory (default = working directory)> \n " +
"--pseudo <pseudocounts to suppress telomere enrichment (default=0) > \n " +
"--ext <window size to merge gff points to prevent event double counts (default=20) > \n " +
"--print <flag to print number of random overlaps with region> \n " +
"");
System.exit(0);
}
List<Point> points = null;
if (ap.hasKey("gff"))
points = RegionFileUtilities.loadPointsFromGFFFile(ap.getKeyValue("gff"),gconf.getGenome());
else if (ap.hasKey("points"))
points = RegionFileUtilities.loadPointsFromFile(ap.getKeyValue("points"), gconf.getGenome());
if (points.size()==0){
System.err.println("peak files have zero hits.");
System.exit(0);
}
List<Region> reg = RegionFileUtilities.loadRegionsFromFile(ap.getKeyValue("region"),gconf.getGenome(),-1);
int pseudo = Args.parseInteger(args,"pseudo", 0);
int expand = Args.parseInteger(args,"ext", 20);
// Get outdir and outbase and make them;
String outbase = Args.parseString(args, "out", System.getProperty("user.dir"));
PointEnrichmentTester tester = new PointEnrichmentTester(outbase,gconf,points,reg);
tester.setNoise(pseudo);
tester.setExpansion(expand);
if (ap.hasKey("print")){tester.printRandOverlap();}
tester.execute();
}
}