/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;

import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.INPUT_RAW_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.OUTPUT_FILTERED_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.READS_REJECTED_BY_FILTERS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.HadoopMappingUtils.jobConfToParameters;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsFilterHadoopModule.OUTPUT_FILE1_KEY;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsFilterHadoopModule.OUTPUT_FILE2_KEY;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanLogger;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.HadoopEoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.ReadSequence;
import fr.ens.biologie.genomique.eoulsan.bio.readsfilters.MultiReadFilter;
import fr.ens.biologie.genomique.eoulsan.bio.readsfilters.MultiReadFilterBuilder;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.HadoopReporterIncrementer;

/**
 * This class defines a read filter mapper.
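 * <p>
 * The mapper consumes TFQ records (one read per line, tab-separated fields),
 * applies the configured read filters and writes accepted reads either to the
 * regular task output (when the mapper output is chained to another task) or
 * to one file per end through {@link MultipleOutputs}.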
 * @since 1.0
 * @author Laurent Jourdren
 */
public class ReadsFilterMapper extends Mapper<Text, Text, Text, Text> {

  // Parameters keys
  static final String FASTQ_FORMAT_KEY =
      Globals.PARAMETER_PREFIX + ".filter.reads.fastq.format";
  static final String READ_FILTER_PARAMETER_KEY_PREFIX =
      Globals.PARAMETER_PREFIX + ".filter.reads.parameter.";

  private static final Splitter TAB_SPLITTER = Splitter.on('\t').trimResults();

  private final List<String> fields = new ArrayList<>();

  private MultiReadFilter filter;
  private String counterGroup;

  private final ReadSequence read1 = new ReadSequence();
  private final ReadSequence read2 = new ReadSequence();

  private final Text outValue = new Text();

  private MultipleOutputs<Text, Text> out;
  private String outputFilename1;
  private String outputFilename2;

  //
  // Setup
  //

  @Override
  protected void setup(final Context context)
      throws IOException, InterruptedException {

    EoulsanLogger.initConsoleHandler();
    getLogger().info("Start of setup()");

    // Get configuration object
    final Configuration conf = context.getConfiguration();

    // Initialize Eoulsan Settings
    if (!EoulsanRuntime.isRuntime()) {
      HadoopEoulsanRuntime.newEoulsanRuntime(conf);
    }

    // Set the FastqFormat
    final FastqFormat fastqFormat =
        FastqFormat.getFormatFromName(conf.get(FASTQ_FORMAT_KEY,
            "" + EoulsanRuntime.getSettings().getDefaultFastqFormat()));
    this.read1.setFastqFormat(fastqFormat);
    this.read2.setFastqFormat(fastqFormat);

    // Counter group
    this.counterGroup = conf.get(CommonHadoop.COUNTER_GROUP_KEY);
    if (this.counterGroup == null) {
      throw new IOException("No counter group defined");
    }

    getLogger().info("Fastq format: " + fastqFormat);

    // Set the filters
    try {
      final MultiReadFilterBuilder mrfb = new MultiReadFilterBuilder();

      // Add the parameters from the job configuration to the builder
      mrfb.addParameters(
          jobConfToParameters(conf, READ_FILTER_PARAMETER_KEY_PREFIX));

      this.filter = mrfb.getReadFilter(new HadoopReporterIncrementer(context),
          this.counterGroup);

      getLogger().info("Reads filters to apply: "
          + Joiner.on(", ").join(this.filter.getFilterNames()));

    } catch (EoulsanException e) {
      throw new IOException(e);
    }

    // Set the output writers
    this.out = new MultipleOutputs<>(context);
    this.outputFilename1 = createOutputPath(conf, OUTPUT_FILE1_KEY);
    this.outputFilename2 = createOutputPath(conf, OUTPUT_FILE2_KEY);

    getLogger().info("End of setup()");
  }

  private static String createOutputPath(final Configuration conf,
      final String key) {

    if (conf == null || key == null) {
      return null;
    }

    final String value = conf.get(key);

    return value + "/part";
  }

  //
  // Map
  //

  /**
   * 'key': offset of the beginning of the line from the beginning of the TFQ
   * file. 'value': the TFQ line.
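   * <p>
   * A single-end record holds the three tab-separated fields
   * {@code name<TAB>sequence<TAB>quality}; a paired-end record concatenates
   * both ends into six fields, as handled by the parsing code below.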
   */
  @Override
  protected void map(final Text key, final Text value, final Context context)
      throws IOException, InterruptedException {

    context
        .getCounter(this.counterGroup, INPUT_RAW_READS_COUNTER.counterName())
        .increment(1);

    final String line = value.toString();

    this.fields.clear();
    for (String e : TAB_SPLITTER.split(line)) {
      this.fields.add(e);
    }

    final int fieldsSize = this.fields.size();

    if (fieldsSize == 3) {

      // Single end
      this.read1.setName(this.fields.get(0));
      this.read1.setSequence(this.fields.get(1));
      this.read1.setQuality(this.fields.get(2));

      if (this.filter.accept(this.read1)) {

        this.outValue.set(this.read1.toTFQ());
        context.write(key, this.outValue);
        context.getCounter(this.counterGroup,
            OUTPUT_FILTERED_READS_COUNTER.counterName()).increment(1);
      } else {
        context.getCounter(this.counterGroup,
            READS_REJECTED_BY_FILTERS_COUNTER.counterName()).increment(1);
      }

    } else if (fieldsSize == 6) {

      // First end
      this.read1.setName(this.fields.get(0));
      this.read1.setSequence(this.fields.get(1));
      this.read1.setQuality(this.fields.get(2));

      // Second end
      this.read2.setName(this.fields.get(3));
      this.read2.setSequence(this.fields.get(4));
      this.read2.setQuality(this.fields.get(5));

      if (this.filter.accept(this.read1, this.read2)) {

        if (this.outputFilename1 == null) {

          // Output of the mapper is chained
          this.outValue.set(this.read1.toTFQ() + '\t' + this.read2.toTFQ());
          context.write(key, this.outValue);
        } else {

          // The output of the mapper is not reused by another mapper or
          // reducer

          // Write read 1
          this.outValue.set(this.read1.toTFQ());
          this.out.write(key, this.outValue, this.outputFilename1);

          // Write read 2
          this.outValue.set(this.read2.toTFQ());
          this.out.write(key, this.outValue, this.outputFilename2);
        }

        context.getCounter(this.counterGroup,
            OUTPUT_FILTERED_READS_COUNTER.counterName()).increment(1);
      } else {
        context.getCounter(this.counterGroup,
            READS_REJECTED_BY_FILTERS_COUNTER.counterName()).increment(1);
      }
    }
  }

  @Override
  protected void cleanup(final Context context)
      throws IOException, InterruptedException {

    // Close the multiple output writer
    if (this.out != null) {
      this.out.close();
    }
  }
}
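
/*
 * Minimal driver sketch (illustration only, not part of Eoulsan). The key
 * names come from this class and from CommonHadoop; the format name value and
 * the choice of KeyValueTextInputFormat are assumptions, not taken from the
 * Eoulsan sources.
 *
 *   Configuration conf = new Configuration();
 *   conf.set(CommonHadoop.COUNTER_GROUP_KEY, "filterreads");
 *   conf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, "fastq-sanger");
 *   Job job = Job.getInstance(conf, "Filter reads");
 *   job.setMapperClass(ReadsFilterMapper.class);
 *   // Mapper input is Text/Text, so a key/value text input format is assumed
 *   job.setInputFormatClass(KeyValueTextInputFormat.class);
 *   job.setOutputKeyClass(Text.class);
 *   job.setOutputValueClass(Text.class);
 */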