/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mapping.local;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_FASTQ;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.INPUT_RAW_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.OUTPUT_FILTERED_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.READS_REJECTED_BY_FILTERS_COUNTER;
import java.io.FileNotFoundException;
import java.io.IOException;
import com.google.common.base.Joiner;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.annotations.LocalOnly;
import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.ReadSequence;
import fr.ens.biologie.genomique.eoulsan.bio.io.FastqReader;
import fr.ens.biologie.genomique.eoulsan.bio.io.FastqWriter;
import fr.ens.biologie.genomique.eoulsan.bio.readsfilters.MultiReadFilter;
import fr.ens.biologie.genomique.eoulsan.bio.readsfilters.ReadFilter;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractReadsFilterModule;
import fr.ens.biologie.genomique.eoulsan.util.LocalReporter;
import fr.ens.biologie.genomique.eoulsan.util.Reporter;
/**
* This class define a module for reads filtering.
* @since 1.0
* @author Laurent Jourdren
* @author Maria Bernard
*/
@LocalOnly
public class ReadsFilterLocalModule extends AbstractReadsFilterModule {
@Override
public TaskResult execute(final TaskContext context,
final TaskStatus status) {
// Create the reporter
final Reporter reporter = new LocalReporter();
try {
// Get input and output data
final Data inData = context.getInputData(READS_FASTQ);
final Data outData = context.getOutputData(READS_FASTQ, inData);
// Get FASTQ format
final FastqFormat fastqFormat = inData.getMetadata().getFastqFormat();
// get input file count for the sample
final int inFileCount = inData.getDataFileCount();
if (inFileCount < 1) {
throw new IOException("No reads file found.");
}
if (inFileCount > 2) {
throw new IOException(
"Cannot handle more than 2 reads files at the same time.");
}
// Get the read filter
final MultiReadFilter filter = getReadFilter(reporter, COUNTER_GROUP);
getLogger().info("Reads filters to apply: "
+ Joiner.on(", ").join(filter.getFilterNames()));
// Run the filter in single or pair-end mode
if (inFileCount == 1) {
singleEnd(inData, outData, fastqFormat, reporter, status, filter);
} else {
pairedEnd(inData, outData, fastqFormat, reporter, status, filter);
}
} catch (FileNotFoundException e) {
return status.createTaskResult(e, "File not found: " + e.getMessage());
} catch (IOException e) {
return status.createTaskResult(e,
"Error while filtering reads: " + e.getMessage());
} catch (EoulsanException e) {
return status.createTaskResult(e,
"Error while initializing filter: " + e.getMessage());
}
return status.createTaskResult();
}
/**
* Filter a sample data in single end mode.
* @param inData input Data
* @param outData output Data
* @param fastqFormat FASTQ format
* @param reporter reporter to use
* @param status step status
* @param filter reads filter to use
* @throws IOException if an error occurs while filtering reads
*/
private static void singleEnd(final Data inData, final Data outData,
final FastqFormat fastqFormat, final Reporter reporter,
final TaskStatus status, final ReadFilter filter) throws IOException {
// Get the source
final DataFile inFile = inData.getDataFile(0);
// Get the dest
final DataFile outFile = outData.getDataFile(0);
// Filter reads
filterFile(inFile, outFile, reporter, filter, fastqFormat);
// Set the description of the context
status.setDescription(
"Filter reads (" + inData.getName() + ", " + inFile.getName() + ")");
// Add counters for this sample to log file
status.setCounters(reporter, COUNTER_GROUP);
}
/**
* Filter a sample data in paired-end mode.
* @param inData input Data
* @param outData output Data
* @param fastqFormat FASTQ format
* @param reporter reporter to use
* @param filter reads filter to use
* @throws IOException if an error occurs while filtering reads
*/
private static void pairedEnd(final Data inData, final Data outData,
final FastqFormat fastqFormat, final Reporter reporter,
final TaskStatus status, final ReadFilter filter) throws IOException {
// Filter reads
filterFile(inData.getDataFile(0), inData.getDataFile(1),
outData.getDataFile(0), outData.getDataFile(1), reporter, filter,
fastqFormat);
// Set the description of the context
status.setDescription("Filter reads ("
+ inData.getName() + ", " + inData.getDataFile(0).getName() + ", "
+ inData.getDataFile(1).getName() + ")");
// Add counters for this sample to log file
status.setCounters(reporter, COUNTER_GROUP);
}
/**
* Filter a file in single end mode.
* @param inFile input file
* @param outFile output file
* @param reporter reporter to use
* @param filter reads filter to use
* @param fastqFormat FastqFormat
* @throws IOException if an error occurs while filtering data
*/
private static void filterFile(final DataFile inFile, final DataFile outFile,
final Reporter reporter, final ReadFilter filter,
final FastqFormat fastqFormat) throws IOException {
getLogger().info("Filter file: " + inFile);
getLogger().info("FastqFormat: " + fastqFormat);
try (FastqReader reader = new FastqReader(inFile.open());
FastqWriter writer = new FastqWriter(outFile.create())) {
for (final ReadSequence read : reader) {
// Set Fastq format
read.setFastqFormat(fastqFormat);
reporter.incrCounter(COUNTER_GROUP,
INPUT_RAW_READS_COUNTER.counterName(), 1);
if (filter.accept(read)) {
writer.write(read);
reporter.incrCounter(COUNTER_GROUP,
OUTPUT_FILTERED_READS_COUNTER.counterName(), 1);
} else {
reporter.incrCounter(COUNTER_GROUP,
READS_REJECTED_BY_FILTERS_COUNTER.counterName(), 1);
}
}
reader.throwException();
} catch (BadBioEntryException e) {
throw new IOException("Invalid Fastq format: "
+ e.getMessage() + " File: " + inFile + " Entry: " + e.getEntry());
}
}
/**
* Filter a file in pair-end mode.
* @param inFile1 first input file
* @param inFile2 second input file
* @param outFile1 first output file
* @param outFile2 second output file
* @param reporter reporter to use
* @param filter reads filter to use
* @param fastqFormat FastqFormat
* @throws IOException if an error occurs while filtering data
*/
private static void filterFile(final DataFile inFile1, final DataFile inFile2,
final DataFile outFile1, final DataFile outFile2, final Reporter reporter,
final ReadFilter filter, final FastqFormat fastqFormat)
throws IOException {
getLogger().info("Filter files: "
+ inFile1 + ", " + inFile2 + ", Fastq format: " + fastqFormat);
try (FastqReader reader2 = new FastqReader(inFile2.open());
FastqWriter writer1 = new FastqWriter(outFile1.create());
FastqWriter writer2 = new FastqWriter(outFile2.create());
FastqReader reader1 = new FastqReader(inFile1.open())) {
for (final ReadSequence read1 : reader1) {
// Test if the second read exists
if (!reader2.hasNext()) {
reader2.throwException();
throw new IOException("Excepted end of the second reads file.");
}
// Get the second read
final ReadSequence read2 = reader2.next();
// Set fastq format
read1.setFastqFormat(fastqFormat);
read2.setFastqFormat(fastqFormat);
reporter.incrCounter(COUNTER_GROUP,
INPUT_RAW_READS_COUNTER.counterName(), 1);
if (filter.accept(read1, read2)) {
writer1.write(read1);
writer2.write(read2);
reporter.incrCounter(COUNTER_GROUP,
OUTPUT_FILTERED_READS_COUNTER.counterName(), 1);
} else {
reporter.incrCounter(COUNTER_GROUP,
READS_REJECTED_BY_FILTERS_COUNTER.counterName(), 1);
}
}
reader1.throwException();
reader2.throwException();
if (reader2.hasNext()) {
throw new IOException("Excepted end of the first reads file.");
}
} catch (BadBioEntryException e) {
throw new IOException("Invalid Fastq format: "
+ e.getMessage() + " File 1: " + inFile1 + " File2:" + inFile2
+ " Entry: " + e.getEntry());
}
}
}