/**
* Copyright Copyright 2014 Simon Andrews
*
* This file is part of BamQC.
*
* BamQC is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* BamQC is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with BamQC; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* Changelog:
* - Piero Dalle Pezze: Added two plots, added report, added information for regions without coverage.
* - Bart Ailey: Class creation.
*/
package uk.ac.babraham.BamQC.Modules;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import javax.swing.JPanel;
import javax.xml.stream.XMLStreamException;
import net.sf.samtools.SAMRecord;
import uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet;
import uk.ac.babraham.BamQC.DataTypes.Genome.Chromosome;
import uk.ac.babraham.BamQC.Graphs.LineWithHorizontalBarGraph;
import uk.ac.babraham.BamQC.Graphs.SeparateLineGraph;
import uk.ac.babraham.BamQC.Report.HTMLReportArchive;
import uk.ac.babraham.BamQC.Sequence.SequenceFile;
import uk.ac.babraham.BamQC.Statistics.SimpleStats;
import uk.ac.babraham.BamQC.Modules.ModuleConfig;
public class GenomeCoverage extends AbstractQCModule {
private int plotTypeChromosomesThreshold = ModuleConfig.getParam("GenomeCoverage_plot_type_chromosomes_threshold", "ignore").intValue();
private String [] chromosomeNames = null;
private double [][] binCounts = null;
private long [] coverage = null;
private double maxCoverage = 0.0;
private int maxBins = 1;
@Override
public void processSequence(SAMRecord read) { }
@Override
public void processFile(SequenceFile file) { }
@Override
public String name() {
return "Genome Coverage";
}
@Override
public String description() {
return "Genome Coverage";
}
@Override
public void reset() {
chromosomeNames = null;
binCounts = null;
coverage = null;
}
@Override
public boolean raisesError() {
return false;
}
@Override
public boolean raisesWarning() {
return false;
}
@Override
public boolean needsToSeeSequences() {
return false;
}
@Override
public boolean needsToSeeAnnotation() {
return true;
}
@Override
public void processAnnotationSet(AnnotationSet annotation) {
Chromosome [] chromosomes = annotation.chromosomeFactory().getAllChromosomes();
if(chromosomes.length <= plotTypeChromosomesThreshold) {
// This will plot the chromosomes from 1 (top) to n (bottom)
Arrays.sort(chromosomes, Collections.reverseOrder());
} else {
// This will plot the chromosomes from 1 (left) to n (right)
Arrays.sort(chromosomes);
}
chromosomeNames = new String [chromosomes.length];
binCounts = new double[chromosomes.length][];
// We'll plot everything on the same scale, which means we'll reduce everything to a
// common scale. Our limit is going to be that we'll put 200 points on the longest
// chromosome
maxBins = 1;
for (int c=0;c<chromosomes.length;c++) {
if(chromosomes[c].getBinCountData().length <= 1) {
} else if (chromosomes[c].getBinCountData().length>maxBins) {
maxBins = chromosomes[c].getBinCountData().length;
}
}
// configuration of how many bins per chromosome we want to plot.
// This is the number of bins per chromosome for the official plot getResultsPanel()
int plotBinsPerChromosome = 0;
if(chromosomeNames.length <= plotTypeChromosomesThreshold) {
plotBinsPerChromosome = ModuleConfig.getParam("GenomeCoverage_plot_bins_per_chromosome", "ignore").intValue();
} else {
plotBinsPerChromosome = ModuleConfig.getParam("GenomeCoverage_plot_bins_all_chromosomes", "ignore").intValue() / chromosomes.length;
}
int binsToUse = plotBinsPerChromosome;
double binRatio = maxBins/(double)plotBinsPerChromosome;
if (maxBins<plotBinsPerChromosome) {
binRatio = 1;
binsToUse = maxBins;
}
for (int c=0;c<chromosomes.length;c++) {
chromosomeNames[c] = chromosomes[c].name();
// System.out.println("Chromosome is " + chromosomes[c].name());
coverage = chromosomes[c].getBinCountData();
binCounts[c] = new double[binsToUse];
int [] replicateCounts = new int[binsToUse];
for (int i=0;i<coverage.length;i++) {
int thisIndex = (int)(i/binRatio);
++replicateCounts[thisIndex];
if (coverage[i] > 0) {
binCounts[c][thisIndex] += coverage[i];
}
}
int firstInvalidBin = 1 + (int)((coverage.length-1)/binRatio);
for (int i=firstInvalidBin;i<binsToUse;i++) {
binCounts[c][i] = Double.NaN;
}
// Now average the replicates
for (int i=0;i<replicateCounts.length;i++) {
if (replicateCounts[i]>0) {
binCounts[c][i] /= replicateCounts[i];
if(binCounts[c][i] <= 0) {
// Let's label these points having null coverage so that we don't miss them
binCounts[c][i] = Double.NEGATIVE_INFINITY;
continue;
}
// scale to log to enlarge the data differences. log_e makes it smaller than log_10.
if (binCounts[c][i] > 0) binCounts[c][i] = Math.log(binCounts[c][i]);
}
}
// Now convert to z-scores
double [] validValues = new double[firstInvalidBin];
for (int i=0;i<validValues.length;i++) {
validValues[i] = binCounts[c][i];
}
// The following two methods remove NaN and Infinite values from the computation.
double mean = SimpleStats.mean(validValues);
double sd = SimpleStats.stdev(validValues, mean);
for (int i=0;i<validValues.length;i++) {
if(Double.isInfinite(binCounts[c][i])) continue;
if(sd > 0) binCounts[c][i] = (binCounts[c][i]-mean)/sd;
if (binCounts[c][i] > maxCoverage) maxCoverage = binCounts[c][i];
if (0-binCounts[c][i] > maxCoverage) maxCoverage = 0-binCounts[c][i];
// if(binCounts[c][i] < 0) System.out.println(binCounts[c][i]);
}
}
}
@Override
public JPanel getResultsPanel() {
for(int i=0; i<chromosomeNames.length; i++) {
if(chromosomeNames[i].toLowerCase().startsWith("chr"))
chromosomeNames[i] = chromosomeNames[i].substring(3);
}
if(chromosomeNames.length <= plotTypeChromosomesThreshold) {
// plots the genome coverage for each chromosome separately
return getSeparateChromosomeResultsPanel();
}
// plots the genome coverage lining all chromosomes
return getAllChromosomeResultsPanel();
}
public JPanel getSeparateChromosomeResultsPanel() {
int maxBins = 0;
for (int i=0;i<binCounts.length;i++) {
if (binCounts[i].length > maxBins)
maxBins = binCounts[i].length;
}
String [] labels = new String[maxBins];
for (int i=0;i<maxBins;i++) {
labels[i] = ""+(i*Chromosome.COVERAGE_BIN_SIZE);
}
String title = "Genome Coverage ( red: z-score, blue: region with no coverage )";
String xLabel = "Genome Position";
String yLabel = "Chromosomes";
return new SeparateLineGraph(binCounts, 0-maxCoverage, maxCoverage, xLabel, yLabel, chromosomeNames, labels, title);
}
public JPanel getAllChromosomeResultsPanel() {
/* Set up for separate line chart representing chromosome coverages. */
int[] scaffoldLengths = new int[binCounts.length];
int fullBinCountsLength=0;
for(int i=0; i<binCounts.length; i++) {
boolean nanFound = false;
for(scaffoldLengths[i]=0; scaffoldLengths[i]<binCounts[i].length && !nanFound; scaffoldLengths[i]++) {
if(Double.isNaN(binCounts[i][scaffoldLengths[i]])) {
nanFound = true;
scaffoldLengths[i]--;
}
}
fullBinCountsLength = fullBinCountsLength + scaffoldLengths[i];
}
double[] fullBinCounts = new double[fullBinCountsLength];
double[] fullBinLengths = new double[binCounts.length];
int k=0;
for(int i=0; i<binCounts.length; i++) {
for(int j=0; j<scaffoldLengths[i]; j++) {
fullBinCounts[k] = binCounts[i][j];
k++;
}
fullBinLengths[i] = scaffoldLengths[i]*Chromosome.COVERAGE_BIN_SIZE;
}
int maxBins = fullBinCounts.length;
String[] labels = new String[maxBins];
for(int i=0; i<maxBins; i++) {
labels[i] = ""+(i*Chromosome.COVERAGE_BIN_SIZE);
}
/* Set up of a stacked row chart representing chromosome coverages. */
String title = "Genome Coverage ( red: z-score, blue: region with no coverage )";
/* plot the data */
JPanel resultsPanel = new JPanel();
resultsPanel.setLayout(new javax.swing.BoxLayout(resultsPanel, javax.swing.BoxLayout.PAGE_AXIS));
resultsPanel.add(new LineWithHorizontalBarGraph(fullBinLengths, fullBinCounts, 0-maxCoverage, maxCoverage, "Genome Position", chromosomeNames, "", labels, title, "Scaffold ( hover the mouse on the red bars for name:position )"));
return resultsPanel;
}
@Override
public boolean ignoreInReport() {
if(ModuleConfig.getParam("GenomeCoverage", "ignore") > 0 || chromosomeNames == null || chromosomeNames.length == 0 || maxBins == 1) {
return true;
}
return false;
}
@Override
public void makeReport(HTMLReportArchive report) throws XMLStreamException, IOException {
super.writeDefaultImage(report, "genome_coverage.png", "Genome Coverage", 800, 600);
if(chromosomeNames == null || chromosomeNames.length == 0 || maxBins == 1) { return; }
StringBuffer sb = report.dataDocument();
sb.append("Chromosome_name\tGenome_position\n");
sb.append("Name\t");
for (int i=0;i<binCounts[0].length;i++) {
sb.append(i*Chromosome.COVERAGE_BIN_SIZE).append("\t");
}
sb.append("\n");
for (int i=0;i<chromosomeNames.length;i++) {
sb.append(chromosomeNames[i]).append("\t");
for (int j=0;j<binCounts[i].length;j++) {
sb.append(binCounts[i][j]).append("\t");
}
sb.append("\n");
}
}
public String[] getChromosomeNames() {
return chromosomeNames;
}
public long[] getCoverage() {
return coverage;
}
}