/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.INPUT_RAW_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.MappingCounters.OUTPUT_PRETREATMENT_READS_COUNTER;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import com.google.common.base.Splitter;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanLogger;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.HadoopEoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.ReadSequence;
/**
 * This class defines a mapper for the pretreatment of paired-end data before
 * the reads filtering step.
 * <p>
 * Each input value is a tab-separated record whose first three fields are the
 * read name, the sequence and the quality string. The read name is split in
 * two parts: the first part becomes the output key and the second part is
 * emitted, followed by the sequence and the quality, as the output value.
 * @since 1.2
 * @author Claire Wallon
 */
public class PreTreatmentMapper extends Mapper<LongWritable, Text, Text, Text> {

  // Parameters keys
  static final String FASTQ_FORMAT_KEY =
      Globals.PARAMETER_PREFIX + ".pretreatment.fastq.format";

  /** Number of tab-separated fields read from each input record. */
  private static final int EXPECTED_FIELD_COUNT = 3;

  private static final Splitter TAB_SPLITTER = Splitter.on('\t').trimResults();

  private String counterGroup;

  // Buffers reused for every record to avoid per-record allocations
  private final List<String> fields = new ArrayList<>();
  private final ReadSequence read = new ReadSequence();
  private final Text outKey = new Text();
  private final Text outValue = new Text();

  //
  // Setup
  //

  /**
   * Initialize the mapper: the logger, the Eoulsan runtime (if not already
   * set), the FASTQ format of the reads and the counter group.
   * @param context the Hadoop context of the task
   * @throws IOException if no counter group is defined in the configuration
   */
  @Override
  protected void setup(final Context context)
      throws IOException, InterruptedException {

    EoulsanLogger.initConsoleHandler();
    getLogger().info("Start of setup()");

    // Get configuration object
    final Configuration conf = context.getConfiguration();

    // Initialize Eoulsan Settings
    if (!EoulsanRuntime.isRuntime()) {
      HadoopEoulsanRuntime.newEoulsanRuntime(conf);
    }

    // Set the FastqFormat, the default format of the settings is used when
    // the key is not set in the configuration
    final FastqFormat fastqFormat =
        FastqFormat.getFormatFromName(conf.get(FASTQ_FORMAT_KEY,
            "" + EoulsanRuntime.getSettings().getDefaultFastqFormat()));
    this.read.setFastqFormat(fastqFormat);
    getLogger().info("Fastq format: " + fastqFormat);

    // Counter group
    this.counterGroup = conf.get(CommonHadoop.COUNTER_GROUP_KEY);
    if (this.counterGroup == null) {
      throw new IOException("No counter group defined");
    }

    getLogger().info("End of setup()");
  }

  //
  // Map
  //

  /**
   * Split the name of a read and emit the two parts as key and value.
   * @param key offset of the beginning of the line from the beginning of the
   *          TFQ file
   * @param value the TFQ record
   * @param context the Hadoop context of the task
   * @throws IOException if the record has less than three fields or if the
   *           read name cannot be split in two parts
   */
  @Override
  protected void map(final LongWritable key, final Text value,
      final Context context) throws IOException, InterruptedException {

    context.getCounter(this.counterGroup, INPUT_RAW_READS_COUNTER.counterName())
        .increment(1);

    final String line = value.toString();

    this.fields.clear();
    for (String e : TAB_SPLITTER.split(line)) {
      this.fields.add(e);
    }

    // Fail with an explicit message instead of an IndexOutOfBoundsException
    if (this.fields.size() < EXPECTED_FIELD_COUNT) {
      throw new IOException("Invalid TFQ record at offset "
          + key + ": " + EXPECTED_FIELD_COUNT
          + " tab-separated fields expected but " + this.fields.size()
          + " found");
    }

    this.read.setName(this.fields.get(0));
    this.read.setSequence(this.fields.get(1));
    this.read.setQuality(this.fields.get(2));

    final String name = this.read.getName();
    final String[] nameParts;

    if (!this.fields.get(0).contains("/")) {
      // Illumina technology and Casava 1.8 format for the '@' line
      nameParts = splitReadName(name, " ", key);
      this.outKey.set(nameParts[0]);
    } else {
      // Before Casava 1.8 or technology other than Illumina
      nameParts = splitReadName(name, "/", key);
      this.outKey.set(nameParts[0] + "/");
    }

    this.outValue.set(nameParts[1]
        + "\t" + this.read.getSequence() + "\t" + this.read.getQuality());

    context.write(this.outKey, this.outValue);
    context.getCounter(this.counterGroup,
        OUTPUT_PRETREATMENT_READS_COUNTER.counterName()).increment(1);
  }

  /**
   * Split a read name around a separator and check that at least two parts
   * have been found.
   * @param name the name of the read
   * @param separator the separator to use, handled as a regular expression by
   *          {@link String#split(String)}
   * @param offset offset of the record in the input, only used for the error
   *          message
   * @return the parts of the name, the array contains at least two elements
   * @throws IOException if the name cannot be split in two parts
   */
  private static String[] splitReadName(final String name,
      final String separator, final LongWritable offset) throws IOException {

    final String[] parts = name.split(separator);

    if (parts.length < 2) {
      throw new IOException("Invalid read name at offset "
          + offset + ", unable to split the name around \"" + separator
          + "\": " + name);
    }

    return parts;
  }

  /**
   * Nothing to clean up for this mapper.
   * @param context the Hadoop context of the task
   */
  @Override
  protected void cleanup(final Context context)
      throws IOException, InterruptedException {
  }
}