package org.seqcode.projects.seed;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.seqcode.data.motifdb.WeightMatrix;
import org.seqcode.deepseq.StrandedBaseCount;
import org.seqcode.deepseq.experiments.ExperimentCondition;
import org.seqcode.deepseq.experiments.ExperimentManager;
import org.seqcode.deepseq.experiments.ExptConfig;
import org.seqcode.deepseq.experiments.Sample;
import org.seqcode.genome.GenomeConfig;
import org.seqcode.genome.location.Point;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.genome.sequence.SequenceUtils;
import org.seqcode.projects.seed.features.EnrichedFeature;
import org.seqcode.projects.seed.features.EnrichedPCSPeakFeature;
import org.seqcode.projects.seed.features.Feature;
/**
* Permanganate-ChIP-seq peak-finder
* @author mahony
*
*/
public class PCSPeakFinder extends PeakFinder{
public static String version = "0.1";
protected final float bubbleWindow=30;
protected int tagSeqWin = 20;
protected float[][][] tagSeqComposition; //Sequence composition around tag 5' ends; indexed by Sample, then by relative position around tag 5' end, then by base (ACGT)
public PCSPeakFinder(GenomeConfig gcon, ExptConfig econ, SEEDConfig scon, ExperimentManager man) {
super(gcon, econ, scon, man);
tagSeqComposition = new float[manager.getSamples().size()][tagSeqWin+1][4];
for(int s=0; s<manager.getSamples().size(); s++)
for(int i=0; i<=tagSeqWin; i++)
for(int j=0; j<4; j++)
tagSeqComposition[s][i][j]=0;
}
/**
* Return the class name
*/
public String getProgramName(){
return "org.seqcode.projects.seed.PCSPeakFinder";
}
/**
* Return a thread for this implementation
*/
public FeatureDetectionThread getMyThread(List<Region> regs){
return new PCSPeakFinderThread(regs);
}
/**
* @param args
*/
public static void main(String[] args) {
System.setProperty("java.awt.headless", "true");
System.err.println("Permanganate-ChIP-seq Peak Finder version "+PeakFinder.version+"\n\n");
GenomeConfig gcon = new GenomeConfig(args);
ExptConfig econ = new ExptConfig(gcon.getGenome(), args);
SEEDConfig scon = new SEEDConfig(gcon, args);
if(scon.helpWanted()){
//System.out.println(PeakFinder.getPCSPeakFinderArgs());
System.err.println(gcon.getArgsList()+
econ.getArgsList()+
scon.getArgsList());
}else{
ExperimentManager man = new ExperimentManager(econ);
PCSPeakFinder finder = new PCSPeakFinder(gcon, econ, scon, man);
System.err.println("\nBeginning peak finding...");
finder.execute();
man.close();
}
}
/**
* Return a help string
* @return
*/
public static String getPCSPeakFinderArgs() {
//TODO
return("TODO");
}
/**
* Multiple-hypothesis correction. Print the output files.
* Assumes that all peaks have been found.
* @return : final features
*/
public Map<ExperimentCondition, List<Feature>> postProcess() {
System.err.println("\nPeak finding complete.");
//Normalize the tag sequence compositions
for(int s=0; s<manager.getSamples().size(); s++)
for(int i=0; i<=tagSeqWin; i++){
float currTot=0;
for(int j=0; j<4; j++)
currTot+=tagSeqComposition[s][i][j];
for(int j=0; j<4; j++)
tagSeqComposition[s][i][j]/=currTot;
}
//Print tag sequence composition motifs
for(Sample s : manager.getSamples()){
String fName = sconfig.getOutputParentDir()+File.separator+sconfig.getOutBase()+s.getName()+"-tag-sequence-motif.motif";
String imName = sconfig.getOutputParentDir()+File.separator+sconfig.getOutBase()+s.getName()+"-tag-sequence-motif.png";
String motifLabel = s.getName()+" tag-sequence-motif";
WeightMatrix wm = makeTagSeqCompositionWeightMatrix(s);
try {
FileWriter fout = new FileWriter(fName);
fout.write(WeightMatrix.printTransfacMatrix(wm, motifLabel));
fout.close();
} catch (IOException e) {
e.printStackTrace();
}
org.seqcode.motifs.DrawMotifs.printMotifLogo(wm, new File(imName), 75, motifLabel); //Note that the painter converts the motif to log oddds
}
//Correct domains for multiple testing
for(ExperimentCondition cond : manager.getConditions()){
stats.benjaminiHochbergCorrection(features.get(cond));
}
Map<ExperimentCondition, List<Feature>> signifFeatures = this.filter(features, sconfig.perBinBinomialPThres, true);
//All domains
this.printEventsFile(features, ".all.pcspeaks.txt");
//Filtered by q-value
this.printEventsFile(signifFeatures, ".p"+sconfig.perBinBinomialPThres+".pcspeaks.txt");
//Summarize
for(ExperimentCondition cond : manager.getConditions())
System.err.println(cond.getName()+"\t"+features.get(cond).size()+" peaks\t"+signifFeatures.get(cond).size()+" peaks below threshold.");
return features;
}
protected WeightMatrix makeTagSeqCompositionWeightMatrix(Sample s){
WeightMatrix matrix = new WeightMatrix(tagSeqWin+1);
matrix.setNameVerType(s.getName()+"-tag-sequence-motif", "freq", "CUSTOM");
for (int i = 0; i <= tagSeqWin; i++) {
//Assume normalized already
matrix.matrix[i]['A'] = tagSeqComposition[s.getIndex()][i][0];
matrix.matrix[i]['C'] = tagSeqComposition[s.getIndex()][i][1];
matrix.matrix[i]['G'] = tagSeqComposition[s.getIndex()][i][2];
matrix.matrix[i]['T'] = tagSeqComposition[s.getIndex()][i][3];
}matrix.setLogOdds();
return matrix;
}
/**
* PeakFinderThread: thread that searches for domains
* @author mahony
*
*/
public class PCSPeakFinderThread extends PeakFinderThread {
protected char[] currRegionSeq;
protected char[] currRegionSeqRC;
protected SequenceGenerator<Region> seqgen;
public PCSPeakFinderThread(List<Region> regs) {
super(regs);
seqgen = gconfig.getSequenceGenerator();
}
/**
* findFeatures:
* 1) Count frequencies of bases around all tag 5' positions
* 2) Find potentially enriched domains on the genome by comparing all tag counts to a background model,
* 3) Find the most likely transcriptional bubble positions within the domain
*
* @param subRegion : region to run analysis on
* @return Map of Lists of Features in the subRegion, Indexed by ExperimentCondition
*/
public Map<ExperimentCondition, List<Feature>> findFeatures(Region subRegion) {
//Build the sequence model in the window around the tag 5' locations
currRegionSeq = seqgen.execute(subRegion).toCharArray();
currRegionSeqRC = currRegionSeq.clone();
SequenceUtils.reverseComplement(currRegionSeqRC);
//Tag sequence composition
calculateTagSequenceComposition(subRegion);
//Get the enriched domains using the functionality from the superclass (PeakFinderThread)
//and the over-ridden processDomain methods below
Map<ExperimentCondition, List<Feature>> peaks = super.findFeatures(subRegion);
return peaks;
}
/**
* ProcessDomains:
* - Trims feature coordinates back to agree with overlapping hits.
* - Counts hits in each feature, per sample
* - Finds the peak & bubble position in the domain according to a choice of methods.
*
* @param currFeatures
* @param current region
* @return : Lists of EnrichedFeatures, indexed by condition
*/
protected Map<ExperimentCondition, List<EnrichedFeature>> processDomains(Map<ExperimentCondition,List<EnrichedFeature>> currFeatures, Region currSubRegion){
Map<ExperimentCondition, List<EnrichedFeature>> peakFeatures = new HashMap<ExperimentCondition, List<EnrichedFeature>>();
for(ExperimentCondition cond : manager.getConditions())
peakFeatures.put(cond, new ArrayList<EnrichedFeature>());
for(ExperimentCondition currCondition : manager.getConditions()){
for(EnrichedFeature currDomain : currFeatures.get(currCondition)){
Map<Sample, List<StrandedBaseCount>> fHitsPos = overlappingHits(hitsPos, currDomain);
Map<Sample, List<StrandedBaseCount>> fHitsNeg = overlappingHits(hitsNeg, currDomain);
//Trim the coordinates
trimFeature(currDomain, fHitsPos, fHitsNeg, currCondition);
//Quantify the feature in each Sample and in the condition in which it was found
quantifyFeature(currDomain, fHitsPos, fHitsNeg, currCondition);
//Find the peaks
EnrichedPCSPeakFeature pcspeak= findPCSBubble(fHitsPos, fHitsNeg, (EnrichedFeature)currDomain, currCondition, currSubRegion);
if(pcspeak!=null)
peakFeatures.get(currCondition).add(pcspeak);
}
}
return(peakFeatures);
}
/**
* Calculate the sequence composition around every tag 5' position in the current region,
* and add the counts to the global models.
*
* Assumes that currRegionSeq and currRegionSeqRC have been properly initialized
* @param currReg
*/
protected void calculateTagSequenceComposition(Region currReg){
float[][][] localTagSeqComposition = new float[manager.getSamples().size()][tagSeqWin+1][4];
for(int s=0; s<manager.getSamples().size(); s++)
for(int i=0; i<=tagSeqWin; i++)
for(int j=0; j<4; j++)
localTagSeqComposition[s][i][j]=0;
int halfSeqWin = tagSeqWin/2;
for(Sample s : manager.getSamples()){
for(StrandedBaseCount sbc : hitsPos.get(s)){
int w=0;
for(int x=sbc.getCoordinate()-halfSeqWin-currReg.getStart(); x<=sbc.getCoordinate()+halfSeqWin-currReg.getStart(); x++){
if(x>=0 && x<currRegionSeq.length){
int y = SequenceUtils.char2int(currRegionSeq[x]);
if(y>=0)
localTagSeqComposition[s.getIndex()][w][y]+=sbc.getCount();
}
w++;
}
}
for(StrandedBaseCount sbc : hitsNeg.get(s)){
int w=0;
for(int x=currReg.getEnd()-sbc.getCoordinate()-halfSeqWin; x<=currReg.getEnd()-sbc.getCoordinate()+halfSeqWin; x++){
if(x>=0 && x<currRegionSeqRC.length){
int y = SequenceUtils.char2int(currRegionSeqRC[x]);
if(y>=0)
localTagSeqComposition[s.getIndex()][w][y]+=sbc.getCount();
}
w++;
}
}
}
synchronized(tagSeqComposition){
for(int s=0; s<manager.getSamples().size(); s++)
for(int i=0; i<=tagSeqWin; i++)
for(int j=0; j<4; j++)
tagSeqComposition[s][i][j]+=localTagSeqComposition[s][i][j];
}
}
/**
* Get the base preceding the 5' end of the tag
* @param a
* @param queryReg
* @return
*/
protected char getPrecedingBase(StrandedBaseCount a, Region queryReg){
char b = '.';
int wantedPos = a.getStrand()=='+' ?
a.getCoordinate()-1 :
a.getCoordinate()+1;
if(wantedPos>=queryReg.getStart() && wantedPos<queryReg.getEnd()){
b = a.getStrand()=='+' ? currRegionSeq[wantedPos-queryReg.getStart()] : currRegionSeqRC[queryReg.getEnd()-wantedPos];
}
return b;
}
/**
* Find the peak locations based on maximum overlapping read counts (T-preceding tags only).
* Also quantify the number of tags and bases in the central bubble region, for computing the bubble index.
*
* @return : Feature (EnrichedPCSPeakFeature)
*/
protected EnrichedPCSPeakFeature findPCSBubble(Map<Sample, List<StrandedBaseCount>> fHitsPos, Map<Sample, List<StrandedBaseCount>> fHitsNeg,
EnrichedFeature domain, ExperimentCondition currCondition, Region currSubRegion){
float [][] sum = new float[domain.getCoords().getWidth()+1][4];
for(int s=0; s<=domain.getCoords().getWidth(); s++)
for(int b=0; b<4; b++)
sum[s][b]=0;
for(Sample s : currCondition.getSignalSamples()){
if(!strandedEventDetection || domain.getCoords().getStrand()=='+')
for(StrandedBaseCount h : fHitsPos.get(s)){
int pbase = SequenceUtils.char2int(getPrecedingBase(h, currSubRegion));
if(pbase>=0){
int start = getLeft(h)-domain.getCoords().getStart();
int stop= getRight(h)-domain.getCoords().getStart();
for(int i=start; i<stop; i++)
if(i>=0 && i<=sum.length)
sum[i][pbase]+=h.getCount();
}
}
if(!strandedEventDetection || domain.getCoords().getStrand()=='-')
for(StrandedBaseCount h : fHitsNeg.get(s)){
int pbase = SequenceUtils.char2int(getPrecedingBase(h, currSubRegion));
if(pbase>=0){
int start = getLeft(h)-domain.getCoords().getStart();
int stop= getRight(h)-domain.getCoords().getStart();
for(int i=start; i<=stop; i++)
if(i>=0 && i<sum.length)
sum[i][pbase]+=h.getCount();
}
}
}
//Find the peak according to T tags
float max = 0; int maxPos = -1;
for(int s=0; s<sum.length; s++){
if(sum[s][3]>max){ //peaks should be determined only using T-preceding tags in permanganate-ChIP-seq
max= sum[s][3];
maxPos=s;
}
}
Point p = new Point(gen, domain.getCoords().getChrom(), maxPos+domain.getCoords().getStart());
//Calc the counts needed for the bubble index
float[] bubbleTags = new float[4];
float[] bubbleBases = new float[4];
for(int b=0; b<4; b++){bubbleTags[b]=0; bubbleBases[b]=0;}
int bubbleStart=(int)(maxPos-(bubbleWindow/2));
int bubbleEnd=(int)(maxPos+(bubbleWindow/2));
for(int z=bubbleStart; z<bubbleEnd; z++){
if(z>=0 && z<=domain.getCoords().getWidth()){
for(int b=0; b<4; b++){
bubbleTags[b]+=sum[z][b];
}
bubbleBases[SequenceUtils.char2int(currRegionSeq[z])]++;
bubbleBases[SequenceUtils.char2int(SequenceUtils.complementChar(currRegionSeq[z]))]++;
}
}
//Define the feature
EnrichedPCSPeakFeature epf=null;
try {
epf = new EnrichedPCSPeakFeature(domain.getCoords(), p, domain.getSampleCountsPos(), domain.getSampleCountsNeg(),
domain.getSignalCount(), domain.getControlCount(), domain.getScore(),
max, bubbleTags, bubbleBases);
} catch (Exception e) {}
return(epf);
}
}
}