/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.INPUT_RAW_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.OUTPUT_FILTERED_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.READS_REJECTED_BY_FILTERS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.HadoopMappingUtils.jobConfToParameters;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsFilterHadoopModule.OUTPUT_FILE1_KEY;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsFilterHadoopModule.OUTPUT_FILE2_KEY;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanLogger;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.HadoopEoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.ReadSequence;
import fr.ens.biologie.genomique.eoulsan.bio.readsfilters.MultiReadFilter;
import fr.ens.biologie.genomique.eoulsan.bio.readsfilters.MultiReadFilterBuilder;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.HadoopReporterIncrementer;
/**
* This class defines a read filter mapper.
* @since 1.0
* @author Laurent Jourdren
*/
public class ReadsFilterMapper extends Mapper<Text, Text, Text, Text> {

  // Parameters keys

  /** Configuration key for the FASTQ quality-score format of the input reads. */
  static final String FASTQ_FORMAT_KEY =
      Globals.PARAMETER_PREFIX + ".filter.reads.fastq.format";

  /** Prefix of the configuration keys that carry the read filter parameters. */
  static final String READ_FILTER_PARAMETER_KEY_PREFIX =
      Globals.PARAMETER_PREFIX + ".filter.reads.parameter.";

  /** Splitter for the tab-separated fields of a TFQ line. */
  private static final Splitter TAB_SPLITTER = Splitter.on('\t').trimResults();

  /** Reusable buffer holding the fields of the current TFQ line. */
  private final List<String> fields = new ArrayList<>();

  /** Chain of read filters applied to each read (or read pair). */
  private MultiReadFilter filter;

  /** Name of the Hadoop counter group used by this mapper. */
  private String counterGroup;

  // Reusable read objects to avoid a per-record allocation
  private final ReadSequence read1 = new ReadSequence();
  private final ReadSequence read2 = new ReadSequence();

  /** Reusable output value to avoid a per-record allocation. */
  private final Text outValue = new Text();

  /** Writer used to split paired-end output into two files. */
  private MultipleOutputs<Text, Text> out;

  // Output path prefixes for paired-end mode. When null, the output of this
  // mapper is chained to another mapper/reducer instead of written to files.
  private String outputFilename1;
  private String outputFilename2;

  //
  // Setup
  //

  /**
   * Configure the mapper: FASTQ format, counter group, read filter chain and
   * output writers.
   * @param context the Hadoop task context
   * @throws IOException if the counter group is not defined or if the filter
   *           chain cannot be built
   * @throws InterruptedException if the task is interrupted
   */
  @Override
  protected void setup(final Context context)
      throws IOException, InterruptedException {

    EoulsanLogger.initConsoleHandler();
    getLogger().info("Start of setup()");

    // Get configuration object
    final Configuration conf = context.getConfiguration();

    // Initialize Eoulsan Settings if not already done by another task in the
    // same JVM
    if (!EoulsanRuntime.isRuntime()) {
      HadoopEoulsanRuntime.newEoulsanRuntime(conf);
    }

    // Set the FastqFormat, falling back on the default format from the
    // Eoulsan settings when not set in the job configuration
    final FastqFormat fastqFormat =
        FastqFormat.getFormatFromName(conf.get(FASTQ_FORMAT_KEY,
            "" + EoulsanRuntime.getSettings().getDefaultFastqFormat()));
    this.read1.setFastqFormat(fastqFormat);
    this.read2.setFastqFormat(fastqFormat);

    // Counter group
    this.counterGroup = conf.get(CommonHadoop.COUNTER_GROUP_KEY);
    if (this.counterGroup == null) {
      throw new IOException("No counter group defined");
    }

    getLogger().info("Fastq format: " + fastqFormat);

    // Set the filters
    try {
      final MultiReadFilterBuilder mrfb = new MultiReadFilterBuilder();

      // Add the parameters from the job configuration to the builder
      mrfb.addParameters(
          jobConfToParameters(conf, READ_FILTER_PARAMETER_KEY_PREFIX));

      this.filter = mrfb.getReadFilter(new HadoopReporterIncrementer(context),
          this.counterGroup);

      getLogger().info("Reads filters to apply: "
          + Joiner.on(", ").join(this.filter.getFilterNames()));

    } catch (EoulsanException e) {
      throw new IOException(e);
    }

    // Set the output writers
    this.out = new MultipleOutputs<>(context);
    this.outputFilename1 = createOutputPath(conf, OUTPUT_FILE1_KEY);
    this.outputFilename2 = createOutputPath(conf, OUTPUT_FILE2_KEY);

    getLogger().info("End of setup()");
  }

  /**
   * Build the output path prefix for a MultipleOutputs writer.
   * @param conf the job configuration
   * @param key the configuration key that holds the output directory
   * @return the path prefix of the "part" files, or null when the key is not
   *         set (in this case the output of the mapper is chained)
   */
  private static String createOutputPath(final Configuration conf,
      final String key) {

    if (conf == null || key == null) {
      return null;
    }

    final String value = conf.get(key);

    // Return null when the key is unset so that map() correctly detects the
    // chained-output mode. Previously "null" + "/part" produced the bogus
    // literal path "null/part" and chained mode was never triggered.
    if (value == null) {
      return null;
    }

    return value + "/part";
  }

  //
  // Map
  //

  /**
   * Filter a single-end read (3 TFQ fields) or a read pair (6 TFQ fields).
   * 'key': offset of the beginning of the line from the beginning of the TFQ
   * file. 'value': the TFQ line.
   * @param key offset of the line in the TFQ file
   * @param value the TFQ line
   * @param context the Hadoop task context
   * @throws IOException if an error occurs while writing the output
   * @throws InterruptedException if the task is interrupted
   */
  @Override
  protected void map(final Text key, final Text value, final Context context)
      throws IOException, InterruptedException {

    context.getCounter(this.counterGroup, INPUT_RAW_READS_COUNTER.counterName())
        .increment(1);

    // Split the TFQ line into fields, reusing the same buffer for all records
    final String line = value.toString();
    this.fields.clear();
    for (String e : TAB_SPLITTER.split(line)) {
      this.fields.add(e);
    }

    final int fieldsSize = this.fields.size();

    if (fieldsSize == 3) {

      // Single end
      this.read1.setName(this.fields.get(0));
      this.read1.setSequence(this.fields.get(1));
      this.read1.setQuality(this.fields.get(2));

      if (this.filter.accept(this.read1)) {

        this.outValue.set(this.read1.toTFQ());
        context.write(key, this.outValue);
        context.getCounter(this.counterGroup,
            OUTPUT_FILTERED_READS_COUNTER.counterName()).increment(1);
      } else {
        context.getCounter(this.counterGroup,
            READS_REJECTED_BY_FILTERS_COUNTER.counterName()).increment(1);
      }

    } else if (fieldsSize == 6) {

      // First end
      this.read1.setName(this.fields.get(0));
      this.read1.setSequence(this.fields.get(1));
      this.read1.setQuality(this.fields.get(2));

      // Second end
      this.read2.setName(this.fields.get(3));
      this.read2.setSequence(this.fields.get(4));
      this.read2.setQuality(this.fields.get(5));

      if (this.filter.accept(this.read1, this.read2)) {

        if (this.outputFilename1 == null) {

          // Output of the mapper is chained
          this.outValue.set(this.read1.toTFQ() + '\t' + this.read2.toTFQ());
          context.write(key, this.outValue);
        } else {

          // The output of the mapper is not reused by another mapper or reducer

          // Write read 1
          this.outValue.set(this.read1.toTFQ());
          this.out.write(key, this.outValue, this.outputFilename1);

          // Write read 2
          this.outValue.set(this.read2.toTFQ());
          this.out.write(key, this.outValue, this.outputFilename2);
        }

        context.getCounter(this.counterGroup,
            OUTPUT_FILTERED_READS_COUNTER.counterName()).increment(1);
      } else {
        context.getCounter(this.counterGroup,
            READS_REJECTED_BY_FILTERS_COUNTER.counterName()).increment(1);
      }
    } else {

      // Malformed TFQ entry (neither single-end nor paired-end). Previously
      // such lines were silently dropped; log a warning so bad input is
      // visible in the task logs
      getLogger().warning(
          "Invalid TFQ entry with " + fieldsSize + " fields, entry skipped");
    }
  }

  /**
   * Close the multiple output writer at the end of the task.
   * @param context the Hadoop task context
   * @throws IOException if an error occurs while closing the writer
   * @throws InterruptedException if the task is interrupted
   */
  @Override
  protected void cleanup(final Context context)
      throws IOException, InterruptedException {

    // Close the multiple output writer
    if (this.out != null) {
      this.out.close();
    }
  }
}