/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop;

import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.INVALID_SAM_ENTRIES_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.SAM_RECORD_PAIRED_END_SERPARATOR;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.SAMHeaderHadoopUtils.createSAMSequenceDictionaryFromSAMHeader;
import static fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.SAMHeaderHadoopUtils.loadSAMHeaders;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import fr.ens.biologie.genomique.eoulsan.EoulsanLogger;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.bio.SAMComparator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.SAMLineParser;
import htsjdk.samtools.SAMRecord;

/**
 * This class defines a reducer for the pretreatment of paired-end data before
 * the expression estimation step.
 * @since 1.2
 * @author Claire Wallon
 */
public class PreTreatmentExpressionReducer
    extends Reducer<Text, Text, Text, Text> {

  /** Name of the counter group, read from the job configuration in setup(). */
  private String counterGroup;

  // Output objects reused for every call to reduce()
  private final Text outKey = new Text();
  private final Text outValue = new Text();

  /** SAM parser; its header receives the sequence dictionary in setup(). */
  private final SAMLineParser parser =
      new SAMLineParser(new SAMFileHeader());

  /** Working list of the parsed records, cleared at each call to reduce(). */
  private final List<SAMRecord> records = new ArrayList<>();

  /**
   * Initialize the reducer: load the SAM headers to set the sequence
   * dictionary of the parser and read the counter group name from the
   * configuration.
   * @param context the reducer context
   * @throws IOException if no counter group is defined in the configuration
   *           (or if a called method reports an I/O error)
   * @throws InterruptedException declared by the overridden method
   */
  @Override
  protected void setup(final Context context)
      throws IOException, InterruptedException {

    EoulsanLogger.initConsoleHandler();
    getLogger().info("Start of setup()");

    final Configuration conf = context.getConfiguration();

    // Set the chromosomes sizes in the parser
    final List<String> samHeader = loadSAMHeaders(context);
    this.parser.getFileHeader().setSequenceDictionary(
        createSAMSequenceDictionaryFromSAMHeader(samHeader));

    // Counter group
    this.counterGroup = conf.get(Globals.PARAMETER_PREFIX + ".counter.group");
    if (this.counterGroup == null) {
      throw new IOException("No counter group defined");
    }

    getLogger().info("End of setup()");
  }

  /**
   * 'key': the identifier of the aligned read without the integer indicating
   * the member of the pair. 'values': the rest of the paired alignments, i.e
   * the SAM line of the first paired alignment and the SAM line of the second
   * paired alignment.
   * <p>
   * Each value is appended to the key and parsed as a SAM line. The parsed
   * records are sorted with a {@link SAMComparator} and written as a single
   * key/value pair: the output key is the text before the first tabulation of
   * the first sorted record, the output value is the remainder of that record
   * followed by the other records. A record flagged as first of pair is
   * preceded by a newline, any other record by
   * {@code SAM_RECORD_PAIRED_END_SERPARATOR}.
   * <p>
   * If one value cannot be parsed, the invalid entries counter is incremented
   * and nothing is written for this key.
   * @param key the read identifier
   * @param values the SAM lines of the alignments, without the read identifier
   * @param context the reducer context
   * @throws IOException if an error occurs while writing the output
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  protected void reduce(final Text key, final Iterable<Text> values,
      final Context context) throws IOException, InterruptedException {

    this.records.clear();

    for (Text val : values) {

      // Rebuild the complete SAM line: identifier + rest of the line
      final String samLine = key.toString() + val.toString();

      try {
        this.records.add(this.parser.parseLine(samLine));
      } catch (SAMFormatException e) {

        context.getCounter(this.counterGroup,
            INVALID_SAM_ENTRIES_COUNTER.counterName()).increment(1);
        getLogger().info("Invalid SAM output entry: "
            + e.getMessage() + " line='" + samLine + "'");

        // One invalid alignment discards all the alignments of the read
        return;
      }
    }

    // Nothing to write if no value has been provided for this key
    if (this.records.isEmpty()) {
      return;
    }

    // sort alignments of the current read
    Collections.sort(this.records, new SAMComparator());

    // The first record provides the output key (text before the first
    // tabulation); its SAM string is computed only once
    final String firstSam = this.records.get(0).getSAMString();
    final int indexOfFirstTab = firstSam.indexOf('\t');
    final String strOutKey = firstSam.substring(0, indexOfFirstTab);

    // Newline characters are removed from each SAM string as the value
    // uses its own separators between records
    final StringBuilder strOutValue = new StringBuilder();
    strOutValue
        .append(firstSam.substring(indexOfFirstTab + 1).replace("\n", ""));

    // Append the other records, without modifying the list
    for (SAMRecord r : this.records.subList(1, this.records.size())) {

      if (r.getFirstOfPairFlag()) {
        strOutValue.append('\n');
      } else {
        strOutValue.append(SAM_RECORD_PAIRED_END_SERPARATOR);
      }

      strOutValue.append(r.getSAMString().replace("\n", ""));
    }

    this.outKey.set(strOutKey);
    this.outValue.set(strOutValue.toString());

    context.write(this.outKey, this.outValue);
  }
}