/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;

import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.HadoopMappingUtils.addParametersToJobConf;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.SAMFilterReducer.MAP_FILTER_PARAMETER_KEY_PREFIX;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMInputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMOutputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractSAMFilterModule;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;

/**
 * This class defines a filter alignment step in Hadoop mode.
 * @since 1.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class SAMFilterHadoopModule extends AbstractSAMFilterModule {

  /** Value of the "mapreduce.task.timeout" job setting: 30 minutes in ms. */
  private static final int TASK_TIMEOUT_MS = 30 * 60 * 1000;

  /**
   * Get the input ports of the module. The ports declared by the parent class
   * are wrapped using
   * {@code InputPortsBuilder.allPortsRequiredInWorkingDirectory()}.
   * @return the input ports of the module
   */
  @Override
  public InputPorts getInputPorts() {

    return allPortsRequiredInWorkingDirectory(super.getInputPorts());
  }

  /**
   * Run the filter on the SAM data of the task: create the Hadoop job for the
   * input data, submit it and wait for its completion.
   * @param context the task context that provides the input and output data
   * @param status the task status used to create the result of the task
   * @return the result of the task. If an {@link IOException} or an
   *         {@link EoulsanException} is thrown, the result is created from
   *         that exception
   */
  @Override
  public TaskResult execute(final TaskContext context,
      final TaskStatus status) {

    // Create configuration object
    final Configuration conf = createConfiguration();

    try {

      final Data inData = context.getInputData(MAPPER_RESULTS_SAM);
      final Data outData = context.getOutputData(MAPPER_RESULTS_SAM, inData);

      // Create the job to run
      final Job job = createJob(conf, inData, outData);

      // Launch the job and wait for its completion
      MapReduceUtils.submitAndWaitForJob(job, inData.getName(),
          CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);

      return status.createTaskResult();

    } catch (IOException | EoulsanException e) {

      return status.createTaskResult(e,
          "Error while running job: " + e.getMessage());
    }
  }

  /**
   * Create the Hadoop job that filters the alignments of a sample.
   * @param parentConf configuration that is copied to create the job
   *          configuration; it is not modified by this method
   * @param inData input data
   * @param outData output data
   * @return a new Job object
   * @throws IOException if an error occurs while creating the job or while
   *           setting its input path
   */
  private Job createJob(final Configuration parentConf, final Data inData,
      final Data outData) throws IOException {

    // Work on a copy so that the settings below stay local to this job
    final Configuration jobConf = new Configuration(parentConf);

    // Set input path
    final Path inputPath = new Path(inData.getDataFile().getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

    // Set SAM filter parameters
    addParametersToJobConf(getAlignmentsFilterParameters(),
        MAP_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    // Set the task timeout
    jobConf.setInt("mapreduce.task.timeout", TASK_TIMEOUT_MS);

    // Create the job and its name
    final Job job = Job.getInstance(jobConf, "Filter SAM file ("
        + inData.getName() + ", " + inputPath.getName() + ")");

    // Set the jar: use this module's own class rather than an unrelated
    // module of the same package
    job.setJarByClass(SAMFilterHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Set input format
    job.setInputFormatClass(SAMInputFormat.class);

    // Set the Mapper class
    job.setMapperClass(SAMFilterMapper.class);

    // Set the reducer class
    job.setReducerClass(SAMFilterReducer.class);

    // Set the reducer task count, only when a positive count is defined
    final int reducerTaskCount = getReducerTaskCount();
    if (reducerTaskCount > 0) {
      job.setNumReduceTasks(reducerTaskCount);
    }

    // NOTE(review): custom partitioner and comparators are disabled below;
    // confirm whether they are still needed before removing these lines.
    // job.setPartitionerClass(SAMRecordsPartitioner.class);
    // job.setSortComparatorClass(SAMRecordsKeyComparator.class);
    // job.setGroupingComparatorClass(SAMRecordsGroupComparator.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set output path
    FileOutputFormat.setOutputPath(job,
        new Path(outData.getDataFile().getSource()));

    return job;
  }

}