/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.fastqc;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.DEFAULT_SINGLE_INPUT_PORT_NAME;
import static fr.ens.biologie.genomique.eoulsan.core.OutputPortsBuilder.singleOutputPort;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_FASTQ;
import static java.util.Collections.singletonList;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.xml.stream.XMLStreamException;
import com.google.common.collect.Lists;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopCompatible;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder;
import fr.ens.biologie.genomique.eoulsan.core.Modules;
import fr.ens.biologie.genomique.eoulsan.core.OutputPorts;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.core.Version;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFiles;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.data.DataFormatRegistry;
import fr.ens.biologie.genomique.eoulsan.data.DataFormats;
import fr.ens.biologie.genomique.eoulsan.modules.AbstractModule;
import fr.ens.biologie.genomique.eoulsan.util.FileUtils;
import uk.ac.babraham.FastQC.Modules.AbstractQCModule;
import uk.ac.babraham.FastQC.Modules.AdapterContent;
import uk.ac.babraham.FastQC.Modules.BasicStats;
import uk.ac.babraham.FastQC.Modules.KmerContent;
import uk.ac.babraham.FastQC.Modules.NContent;
import uk.ac.babraham.FastQC.Modules.OverRepresentedSeqs;
import uk.ac.babraham.FastQC.Modules.PerBaseQualityScores;
import uk.ac.babraham.FastQC.Modules.PerBaseSequenceContent;
import uk.ac.babraham.FastQC.Modules.PerSequenceGCContent;
import uk.ac.babraham.FastQC.Modules.PerSequenceQualityScores;
import uk.ac.babraham.FastQC.Modules.PerTileQualityScores;
import uk.ac.babraham.FastQC.Modules.QCModule;
import uk.ac.babraham.FastQC.Modules.SequenceLengthDistribution;
import uk.ac.babraham.FastQC.Report.HTMLReportArchive;
import uk.ac.babraham.FastQC.Sequence.Sequence;
import uk.ac.babraham.FastQC.Sequence.SequenceFile;
import uk.ac.babraham.FastQC.Sequence.SequenceFormatException;
/**
 * This class defines a module that computes a QC report using FastQC.
 * <p>
 * The module reads either FASTQ reads or SAM mapper results (selected with
 * the {@code input.format} parameter, FASTQ by default), feeds every sequence
 * to the FastQC QC modules and writes one HTML report per input file.
 * @author Sandrine Perrin
 * @since 2.0
 */
@HadoopCompatible
public class FastQCModule extends AbstractModule {

  /** Module name */
  private static final String MODULE_NAME = "fastqc";

  /** Input format key in parameters. */
  private static final String INPUT_FORMAT_PARAMETER_NAME = "input.format";

  /** Collector FastQC kmer size */
  public static final String FASTQC_KMER_SIZE_PARAMETER_NAME =
      "fastqc.kmer.size";

  /** Collector FastQC nogroup */
  public static final String FASTQC_NOGROUP_PARAMETER_NAME = "fastqc.nogroup";

  /** Use exponential base groups in graph */
  public static final String FASTQC_EXPGROUP_PARAMETER_NAME = "fastqc.expgroup";

  /** Format fastq type casava/Illumina */
  public static final String FASTQC_CASAVA_PARAMETER_NAME = "fastqc.casava";

  /** Option for filter fastq file if casava=true for all modules */
  public static final String FASTQC_NOFILTER_PARAMETER_NAME = "fastqc.nofilter";

  /** The input format, FASTQ reads by default. */
  private DataFormat inputFormat = DataFormats.READS_FASTQ;

  //
  // Module methods
  //

  @Override
  public String getName() {

    return MODULE_NAME;
  }

  @Override
  public String getDescription() {

    return "This module launch FastQC on FASTQ or SAM files and generate an html report";
  }

  @Override
  public Version getVersion() {

    return Globals.APP_VERSION;
  }

  /**
   * Declare the single input port of the module. Its format is the one
   * currently configured: FASTQ reads, or SAM mapper results otherwise.
   * @return the input ports of the module
   */
  @Override
  public InputPorts getInputPorts() {

    final InputPortsBuilder builder = new InputPortsBuilder();

    // Compare with equals(), as configure() does, rather than by identity
    if (READS_FASTQ.equals(this.inputFormat)) {
      builder.addPort(DEFAULT_SINGLE_INPUT_PORT_NAME, DataFormats.READS_FASTQ);
    } else {
      builder.addPort(DEFAULT_SINGLE_INPUT_PORT_NAME,
          DataFormats.MAPPER_RESULTS_SAM);
    }

    return builder.create();
  }

  @Override
  public OutputPorts getOutputPorts() {

    return singleOutputPort(DataFormats.FASTQC_REPORT_HTML);
  }

  /**
   * Configure the module from the step parameters. FastQC options are passed
   * to FastQC through JVM system properties, so they are shared by the whole
   * JVM.
   * @param context the configuration context
   * @param stepParameters the parameters of the step
   * @throws EoulsanException if a parameter is unknown or has an invalid value
   */
  @Override
  public void configure(final StepConfigurationContext context,
      final Set<Parameter> stepParameters) throws EoulsanException {

    // Define parameters of FastQC
    System.setProperty("java.awt.headless", "true");
    System.setProperty("fastqc.unzip", "true");

    // Parse step parameters to initialize module
    for (final Parameter p : stepParameters) {

      switch (p.getName()) {

      case INPUT_FORMAT_PARAMETER_NAME:

        // Set inputPort fastq/sam from parameters
        DataFormat format = DataFormatRegistry.getInstance()
            .getDataFormatFromNameOrAlias(p.getLowerStringValue());

        // Only SAM and FASTQ are accepted. NOTE(review): badParameterValue()
        // is expected to throw, so an invalid format is never stored below
        if (!(MAPPER_RESULTS_SAM.equals(format)
            || READS_FASTQ.equals(format))) {
          Modules.badParameterValue(context, p,
              "Unknown or format not supported as input format for FastQC");
        }

        this.inputFormat = format;
        break;

      case FASTQC_KMER_SIZE_PARAMETER_NAME:

        // Kmer Size, default FastQC value is 7
        System.setProperty("fastqc.kmer_size",
            "" + p.getIntValueGreaterOrEqualsTo(1));
        break;

      case FASTQC_NOGROUP_PARAMETER_NAME:

        // Set fastQC nogroup, default FastQC value false
        System.setProperty("fastqc.nogroup", "" + p.getBooleanValue());
        break;

      case FASTQC_EXPGROUP_PARAMETER_NAME:

        // Set fastQC expgroup, default FastQC value false
        System.setProperty("fastqc.expgroup", "" + p.getBooleanValue());
        break;

      case FASTQC_CASAVA_PARAMETER_NAME:

        // Set fastQC format fastq, default FastQC value false
        System.setProperty("fastqc.casava", "" + p.getBooleanValue());
        break;

      case FASTQC_NOFILTER_PARAMETER_NAME:

        // Set fastQC nofilter: when casava=true the fastq file is filtered
        // unless nofilter is enabled.
        // NOTE(review): the FastQC default is presumably false - confirm
        System.setProperty("fastqc.nofilter", "" + p.getBooleanValue());
        break;

      default:
        Modules.unknownParameter(context, p);
        break;
      }
    }
  }

  /**
   * Run FastQC on every file of the input data and write one report per file.
   * @param context the task context
   * @param status the task status
   * @return the result of the task, holding the error if one occurred
   */
  @Override
  public TaskResult execute(final TaskContext context,
      final TaskStatus status) {

    // Patch FastQC code on sequenceFile to make hadoop compatible
    try {
      FastQCRuntimePatcher.patchFastQC();
    } catch (EoulsanException e1) {
      return status.createTaskResult(e1);
    }

    // Get input data
    final Data inData = context.getInputData(this.inputFormat);

    // Get output data
    final Data outData =
        context.getOutputData(DataFormats.FASTQC_REPORT_HTML, inData);

    // Define the list of input files
    final List<DataFile> inputFiles = new ArrayList<>();

    if (inData.getFormat().getMaxFilesCount() > 1) {
      // Multi-file format: handle every file of the data
      for (int i = 0; i < inData.getDataFileCount(); i++) {
        inputFiles.add(inData.getDataFile(i));
      }
    } else {
      inputFiles.add(inData.getDataFile());
    }

    // Same format test as in getInputPorts()
    final boolean fastqFormat = READS_FASTQ.equals(this.inputFormat);

    // Process input files
    try {

      int i = 0;
      for (DataFile inputFile : inputFiles) {

        // Define the report output file, one report per input file
        final DataFile reportFile = outData.getDataFile(i++);

        // Launch FastQC analysis
        processFile(inputFile, fastqFormat, reportFile,
            context.getLocalTempDirectory(), status);
      }

      return status.createTaskResult();

    } catch (final SequenceFormatException e) {
      return status.createTaskResult(e,
          "Error with sequence file format: " + e.getMessage());
    } catch (final IOException e) {
      return status.createTaskResult(e,
          "Error while parsing file: " + e.getMessage());
    } catch (final XMLStreamException e) {
      return status.createTaskResult(e,
          "Error while writing final report: " + e.getMessage());
    }
  }

  /**
   * Process an input file by FastQC.
   * @param inputFile the input file
   * @param fastqFormat true if the format of the input file is FASTQ
   * @param outputFile the report output file
   * @param tmpDir the temporary directory
   * @param status the task status
   * @throws SequenceFormatException if an error occurs while processing
   *           sequences
   * @throws IOException if an error occurs while processing sequences
   * @throws XMLStreamException if an error occurs while creating report
   */
  private void processFile(DataFile inputFile, final boolean fastqFormat,
      final DataFile outputFile, final File tmpDir, final TaskStatus status)
      throws SequenceFormatException, IOException, XMLStreamException {

    // Set the description of the context
    status.setDescription("Process sequence of " + inputFile + " for FastQC");

    // Get the SequenceFile object
    final CounterSequenceFile seqFile;

    if (fastqFormat) {
      seqFile = new FastqSequenceFile(inputFile);
    } else {
      seqFile = new SAMSequenceFile(inputFile);
    }

    // Define modules list. The duplication level module is obtained from the
    // over-represented sequences module, so both are added to the list
    final OverRepresentedSeqs os = new OverRepresentedSeqs();

    final List<AbstractQCModule> modules = Lists.newArrayList(new BasicStats(),
        new PerBaseQualityScores(), new PerTileQualityScores(),
        new PerSequenceQualityScores(), new PerBaseSequenceContent(),
        new PerSequenceGCContent(), new NContent(),
        new SequenceLengthDistribution(), os.duplicationLevelModule(), os,
        new AdapterContent(), new KmerContent());

    // Process sequences
    processSequences(modules, seqFile);

    // If no entries in the input file use a dedicated module
    final List<AbstractQCModule> reportModules = seqFile.getCount() > 0
        ? modules
        : singletonList((AbstractQCModule) new EmptyFileQC(inputFile));

    // Set the description of the context
    status.setDescription(
        "Create FastQC report on " + inputFile + " in " + outputFile.getName());

    // Create the report
    createReport(reportModules, seqFile, outputFile, tmpDir);

    // Keep module data is now unnecessary
    modules.clear();
  }

  /**
   * Process sequences: every sequence of the file is submitted to every
   * module.
   * @param modules the modules
   * @param seqFile the sequence file
   * @throws SequenceFormatException the sequence format exception
   */
  private void processSequences(final List<AbstractQCModule> modules,
      final SequenceFile seqFile) throws SequenceFormatException {

    while (seqFile.hasNext()) {

      final Sequence seq = seqFile.next();

      for (final QCModule module : modules) {
        module.processSequence(seq);
      }
    }
  }

  /**
   * Creates the report. The report is first written in a temporary file of
   * the temporary directory, then copied to its final destination. The
   * temporary file and the FastQC by-products are removed in all cases, even
   * if the creation or the copy of the report fails.
   * @param modules the modules
   * @param seqFile the sequence file
   * @param reportFile the report file
   * @param tempDirectory temporary directory
   * @throws IOException Signals that an I/O exception has occurred.
   * @throws XMLStreamException the XML stream exception
   */
  private void createReport(final List<AbstractQCModule> modules,
      final SequenceFile seqFile, final DataFile reportFile,
      final File tempDirectory) throws IOException, XMLStreamException {

    // Get the report extension
    final String reportExtension =
        DataFormats.FASTQC_REPORT_HTML.getDefaultExtension();

    // Define the temporary output file
    final File reportTempFile =
        File.createTempFile("reportfile-", reportExtension, tempDirectory);

    try {

      // Create the output report (the report is written by the constructor)
      new HTMLReportArchive(seqFile,
          modules.toArray(new QCModule[modules.size()]), reportTempFile);

      // Copy the temporary file to the real output file
      DataFiles.copy(new DataFile(reportTempFile), reportFile);

    } finally {

      // Always clean the temporary directory, including on failure
      removeTemporaryReportFiles(reportTempFile, reportExtension);
    }
  }

  /**
   * Remove the temporary report file and the zip archive and directory that
   * FastQC creates next to it. A warning is logged for each existing file
   * that cannot be removed.
   * @param reportTempFile the temporary report file
   * @param reportExtension the extension of the report file
   */
  private static void removeTemporaryReportFiles(final File reportTempFile,
      final String reportExtension) {

    // The FastQC by-products are looked up beside the temporary report file,
    // so their names must be derived from it and not from the final output.
    // NOTE(review): assumes HTMLReportArchive names them after the HTML file
    // it is given - confirm against the FastQC source
    final String tempFilename = reportTempFile.getName();
    final String baseFilename = tempFilename.endsWith(reportExtension)
        ? tempFilename.substring(0,
            tempFilename.length() - reportExtension.length())
        : tempFilename;

    // Remove zip file
    final File zipFile =
        new File(reportTempFile.getParent(), baseFilename + ".zip");
    if (zipFile.exists() && !zipFile.delete()) {
      getLogger()
          .warning("Unable to remove FastQC output zip file: " + zipFile);
    }

    // Remove directory file
    final File zipDir = new File(reportTempFile.getParent(), baseFilename);
    if (zipDir.exists() && !FileUtils.recursiveDelete(zipDir)) {
      getLogger()
          .warning("Unable to remove FastQC output directory: " + zipDir);
    }

    // Remove the temporary file
    if (reportTempFile.exists() && !reportTempFile.delete()) {
      getLogger().warning(
          "Unable to remove FastQC temporary output file: " + reportTempFile);
    }
  }
}