/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_FASTQ;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_TFQ;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.HadoopMappingUtils.addParametersToJobConf;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsFilterMapper.READ_FILTER_PARAMETER_KEY_PREFIX;
import static java.util.Collections.singletonList;
import static com.google.common.collect.Lists.newArrayList;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.google.common.base.Joiner;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqOutputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractReadsFilterModule;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;
/**
* This class is the main class for the filter reads program in hadoop mode.
* @since 1.0
* @author Laurent Jourdren
* @author Claire Wallon
*/
@HadoopOnly
public class ReadsFilterHadoopModule extends AbstractReadsFilterModule {
static final String OUTPUT_FILE1_KEY =
Globals.PARAMETER_PREFIX + ".filter.reads.output.file1";
static final String OUTPUT_FILE2_KEY =
Globals.PARAMETER_PREFIX + ".filter.reads.output.file2";
private static final String TEMP_DIR_SUFFIX = ".tmp";
//
// Module methods
//
@Override
public InputPorts getInputPorts() {
return allPortsRequiredInWorkingDirectory(super.getInputPorts());
}
@Override
public TaskResult execute(final TaskContext context,
final TaskStatus status) {
// Create configuration object
final Configuration conf = createConfiguration();
try {
// Get input and output data
final Data inData = context.getInputData(READS_FASTQ);
final Data outData = context.getOutputData(READS_FASTQ, inData);
final String dataName = inData.getName();
// Get FASTQ format
final FastqFormat fastqFormat = inData.getMetadata().getFastqFormat();
// Create the job to run
final Job job;
DataFile tfqFile = null;
// Pre-process paired-end files
if (inData.getDataFileCount() == 1) {
// Define input and output files
final DataFile inFile = inData.getDataFile(0);
final DataFile outFile = outData.getDataFile(0);
final List<String> filenames = singletonList(inFile.getName());
job = createJobConf(conf, dataName, inFile, filenames, READS_FASTQ,
fastqFormat, outFile);
} else {
// Define input and output files
final DataFile inFile1 = inData.getDataFile(0);
final DataFile inFile2 = inData.getDataFile(1);
final DataFile outFile1 = outData.getDataFile(0);
final DataFile outFile2 = outData.getDataFile(1);
final List<String> filenames =
newArrayList(inFile1.getName(), inFile2.getName());
tfqFile = new DataFile(inFile1.getParent(),
inFile1.getBasename() + READS_TFQ.getDefaultExtension());
// Convert FASTQ files to TFQ
MapReduceUtils.submitAndWaitForJob(
PairedEndFastqToTfq.convert(conf, inFile1, inFile2, tfqFile,
getReducerTaskCount()),
inData.getName(), CommonHadoop.CHECK_COMPLETION_TIME, status,
COUNTER_GROUP);
job = createJobConf(conf, dataName, tfqFile, filenames, READS_TFQ,
fastqFormat, outFile1, outFile2);
}
// Submit main job
MapReduceUtils.submitAndWaitForJob(job, inData.getName(),
CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);
// Cleanup paired-end
if (inData.getDataFileCount() > 1) {
final DataFile outFile1 = outData.getDataFile(0);
final DataFile outFile2 = outData.getDataFile(1);
final DataFile tmpDir =
new DataFile(outFile1.getSource() + TEMP_DIR_SUFFIX);
final DataFile outTmpFile1 = new DataFile(tmpDir, outFile1.getName());
final DataFile outTmpFile2 = new DataFile(tmpDir, outFile2.getName());
// Rename temporary file
outTmpFile1.renameTo(outFile1);
outTmpFile2.renameTo(outFile2);
// Remove TFQ temporary directory
tmpDir.delete(true);
}
return status.createTaskResult();
} catch (IOException | EoulsanException e) {
return status.createTaskResult(e,
"Error while running job: " + e.getMessage());
}
}
/**
* Create a filter reads job.
* @param parentConf Hadoop configuration
* @param dataName data name
* @param inFile input file in FASTQ or TFQ format
* @param filenames original input file names
* @param inputFormat input format (FASTQ or TFQ)
* @param fastqFormat FASTQ format
* @param outFiles output files
* @return a Job object
* @throws IOException if an error occurs while creating the job
*/
private Job createJobConf(final Configuration parentConf,
final String dataName, final DataFile inFile,
final List<String> filenames, final DataFormat inputFormat,
final FastqFormat fastqFormat, final DataFile... outFiles)
throws IOException {
final Configuration jobConf = new Configuration(parentConf);
// Set input path
final Path inputPath = new Path(inFile.getSource());
// Set counter group
jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);
// Set fastq format
jobConf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, fastqFormat.getName());
// Set read filter parameters
addParametersToJobConf(getReadFilterParameters(),
READ_FILTER_PARAMETER_KEY_PREFIX, jobConf);
// Set outputs
if (outFiles.length > 1) {
jobConf.set(OUTPUT_FILE1_KEY, outFiles[0].getName());
jobConf.set(OUTPUT_FILE2_KEY, outFiles[1].getName());
}
// Set Job name
// Create the job and its name
final Job job = Job.getInstance(jobConf, "Filter reads ("
+ dataName + ", " + Joiner.on(", ").join(filenames) + ")");
// Set the jar
job.setJarByClass(ReadsFilterHadoopModule.class);
// Set input path
FileInputFormat.addInputPath(job, inputPath);
// Set the input format
if (inputFormat == READS_FASTQ) {
job.setInputFormatClass(FastqInputFormat.class);
} else {
job.setInputFormatClass(KeyValueTextInputFormat.class);
}
// Set the Mapper class
job.setMapperClass(ReadsFilterMapper.class);
// Set the output format
job.setOutputFormatClass(FastqOutputFormat.class);
// Set the output key class
job.setOutputKeyClass(Text.class);
// Set the output value class
job.setOutputValueClass(Text.class);
// Set the number of reducers
job.setNumReduceTasks(0);
// Set output path
FileOutputFormat.setOutputPath(job, new Path(outFiles[0].getSource()
+ (outFiles.length > 1 ? TEMP_DIR_SUFFIX : "")));
return job;
}
}