/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;

import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_FASTQ;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_TFQ;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.HadoopMappingUtils.addParametersToJobConf;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsFilterMapper.READ_FILTER_PARAMETER_KEY_PREFIX;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.computeZipCheckSum;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.setZooKeeperJobConfiguration;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.SAMFilterReducer.MAP_FILTER_PARAMETER_KEY_PREFIX;
import static java.util.Collections.singletonList;
import static com.google.common.collect.Lists.newArrayList;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.google.common.base.Joiner;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMOutputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.Modules;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractFilterAndMapReadsModule;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;
/**
 * This class defines a Step that filters and maps reads in Hadoop mode.
 * @since 1.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class FilterAndMapReadsHadoopModule
    extends AbstractFilterAndMapReadsModule {

  @Override
  public InputPorts getInputPorts() {
    return allPortsRequiredInWorkingDirectory(super.getInputPorts());
  }

  @Override
  public void configure(final StepConfigurationContext context,
      final Set<Parameter> stepParameters) throws EoulsanException {

    super.configure(context, stepParameters);

    // Check if the mapper can be used with Hadoop
    if (!getMapper().isSplitsAllowed()) {
      Modules.invalidConfiguration(context,
          "The selected mapper cannot be used in hadoop mode as "
              + "computation cannot be parallelized: "
              + getMapper().getMapperName());
    }
  }

  @Override
  public TaskResult execute(final TaskContext context,
      final TaskStatus status) {

    // Create configuration object
    final Configuration conf = createConfiguration();

    try {

      // Get input and output data
      final Data readsData = context.getInputData(READS_FASTQ);
      final String dataName = readsData.getName();

      final DataFile samFile =
          context.getOutputData(MAPPER_RESULTS_SAM, readsData).getDataFile();
      final DataFile mapperIndex =
          context.getInputData(MAPPER_INDEX_PORT_NAME).getDataFile();

      // Get FASTQ format
      final FastqFormat fastqFormat = readsData.getMetadata().getFastqFormat();

      // The job to run
      final Job job;
      DataFile tfqFile = null;

      if (readsData.getDataFileCount() == 1) {

        // Single-end mode: map the FASTQ file directly
        final DataFile inFile = readsData.getDataFile(0);
        final List<String> filenames = singletonList(inFile.getName());

        job = createJobConf(conf, context, dataName, inFile, filenames, false,
            READS_FASTQ, fastqFormat, mapperIndex, samFile);

      } else {

        // Paired-end mode: pre-process the two FASTQ files by merging them
        // into a single TFQ file
        final DataFile inFile1 = readsData.getDataFile(0);
        final DataFile inFile2 = readsData.getDataFile(1);
        final List<String> filenames =
            newArrayList(inFile1.getName(), inFile2.getName());

        tfqFile = new DataFile(inFile1.getParent(),
            inFile1.getBasename() + READS_TFQ.getDefaultExtension());

        // Convert FASTQ files to TFQ
        MapReduceUtils.submitAndWaitForJob(
            PairedEndFastqToTfq.convert(conf, inFile1, inFile2, tfqFile,
                getReducerTaskCount()),
            readsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME, status,
            getCounterGroup());

        job = createJobConf(conf, context, dataName, tfqFile, filenames, true,
            READS_TFQ, fastqFormat, mapperIndex, samFile);
      }

      // Submit the filter and map job
      MapReduceUtils.submitAndWaitForJob(job, readsData.getName(),
          CommonHadoop.CHECK_COMPLETION_TIME, status, getCounterGroup());

      return status.createTaskResult();

    } catch (IOException | EoulsanException e) {

      return status.createTaskResult(e,
          "Error while running job: " + e.getMessage());
    }
  }
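
  /**
   * Create the Hadoop job that chains the read filter, read mapper and SAM
   * filter mappers for a sample.
   * @param parentConf parent Hadoop configuration
   * @param context current task context
   * @param dataName name of the data to process
   * @param inFile input reads file (FASTQ in single-end mode, TFQ in
   *          paired-end mode)
   * @param filenames names of the original input files, used to build the job
   *          name
   * @param pairedEnd true if the data is paired-end
   * @param inputFormat format of the input reads file
   * @param fastqFormat quality encoding of the reads
   * @param genomeIndexFile ZIP archive that contains the genome mapper index
   * @param outFile output SAM file
   * @return a configured Hadoop Job
   * @throws IOException if an error occurs while creating the job
   */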
  private Job createJobConf(final Configuration parentConf,
      final TaskContext context, final String dataName, final DataFile inFile,
      final List<String> filenames, final boolean pairedEnd,
      final DataFormat inputFormat, final FastqFormat fastqFormat,
      final DataFile genomeIndexFile, final DataFile outFile)
      throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    // Set input path
    final Path inputPath = new Path(inFile.getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, getCounterGroup());

    //
    // Reads filters parameters
    //

    // Set FASTQ format
    jobConf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, fastqFormat.getName());

    // Set read filter parameters
    addParametersToJobConf(getReadFilterParameters(),
        READ_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Reads mapping parameters
    //

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set paired-end or single-end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper, if configured
    if (getMapperHadoopThreads() > 0) {
      jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY,
          "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
      jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, getMapperArguments());
    }

    // Set mapper FASTQ format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY,
        "" + computeZipCheckSum(genomeIndexFile, parentConf));

    // Set task timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // Don't reuse JVM
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb",
        "" + getMapperHadoopMemoryRequired());

    // Set the memory required by the JVM (BWA needs more memory than the
    // other mappers for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    //
    // Alignment filtering
    //

    // Set SAM filter parameters
    addParametersToJobConf(getAlignmentsFilterParameters(),
        MAP_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Job creation
    //

    // Create the job and its name
    final Job job = Job.getInstance(jobConf, "Filter and map reads ("
        + dataName + ", " + Joiner.on(", ").join(filenames) + ")");

    // Set the jar
    job.setJarByClass(ReadsFilterHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Add the genome mapper index to the distributed cache
    final Path genomeIndex = new Path(genomeIndexFile.getSource());
    job.addCacheFile(genomeIndex.toUri());

    // Set the input format
    if (inputFormat == READS_FASTQ) {
      job.setInputFormatClass(FastqInputFormat.class);
    } else {
      job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the mapper classes using a chain mapper
    ChainMapper.addMapper(job, ReadsFilterMapper.class, Text.class, Text.class,
        Text.class, Text.class, jobConf);
    ChainMapper.addMapper(job, ReadsMapperMapper.class, Text.class, Text.class,
        Text.class, Text.class, jobConf);
    ChainMapper.addMapper(job, SAMFilterMapper.class, Text.class, Text.class,
        Text.class, Text.class, jobConf);

    // Set the reducer class
    job.setReducerClass(SAMFilterReducer.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
  }

}