/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop; import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.AMBIGUOUS_ALIGNMENTS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.ELIMINATED_READS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.EMPTY_ALIGNMENTS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.INVALID_SAM_ENTRIES_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.LOW_QUAL_ALIGNMENTS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.NOT_ALIGNED_ALIGNMENTS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.NOT_UNIQUE_ALIGNMENTS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.TOTAL_ALIGNMENTS_COUNTER; import static fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.SAM_RECORD_PAIRED_END_SERPARATOR; import java.io.IOException; import java.net.URI; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import fr.ens.biologie.genomique.eoulsan.CommonHadoop; import fr.ens.biologie.genomique.eoulsan.EoulsanException; import fr.ens.biologie.genomique.eoulsan.EoulsanLogger; import fr.ens.biologie.genomique.eoulsan.Globals; import fr.ens.biologie.genomique.eoulsan.bio.GenomeDescription; import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray; import fr.ens.biologie.genomique.eoulsan.bio.GenomicInterval; import fr.ens.biologie.genomique.eoulsan.bio.SAMUtils; import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqUtils; import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode; import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage; import fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters; import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFormatException; import htsjdk.samtools.SAMLineParser; import htsjdk.samtools.SAMRecord; /** * Mapper for the expression estimation with htseq-count. * @since 1.2 * @author Claire Wallon */ public class HTSeqCountMapper extends Mapper<Text, Text, Text, LongWritable> { // Parameters keys static final String STRANDED_PARAM = Globals.PARAMETER_PREFIX + ".expression.stranded.parameter"; static final String OVERLAP_MODE_PARAM = Globals.PARAMETER_PREFIX + ".expression.overlapmode.parameter"; static final String REMOVE_AMBIGUOUS_CASES = Globals.PARAMETER_PREFIX + ".expression.no.ambiguous.cases"; private final GenomicArray<String> features = new GenomicArray<>(); private String counterGroup; private StrandUsage stranded; private OverlapMode overlapMode; private boolean removeAmbiguousCases; private final SAMLineParser parser = new SAMLineParser(new SAMFileHeader()); private final Pattern recordSplitterPattern = Pattern.compile("" + SAM_RECORD_PAIRED_END_SERPARATOR); private final Text outKey = new Text(); private final LongWritable outValue = new LongWritable(1L); @Override public void setup(final Context context) throws IOException, InterruptedException { EoulsanLogger.initConsoleHandler(); getLogger().info("Start of setup()"); try { final Configuration conf = context.getConfiguration(); final URI[] localCacheFiles = context.getCacheFiles(); if (localCacheFiles == null || localCacheFiles.length == 0) { throw new IOException("Unable to retrieve genome index"); } if (localCacheFiles.length > 1) { throw new IOException( "Retrieve more than one file in distributed cache"); } getLogger().info("Genome index compressed file (from distributed cache): " + localCacheFiles[0]); if (localCacheFiles == null || localCacheFiles.length == 0) { throw new IOException("Unable to retrieve annotation index"); } if (localCacheFiles.length > 1) { throw new IOException( "Retrieve more than one file in distributed cache"); } // Load features this.features.load( PathUtils.createInputStream(new Path(localCacheFiles[0]), conf)); // Counter group this.counterGroup = conf.get(CommonHadoop.COUNTER_GROUP_KEY); if (this.counterGroup == null) { throw new IOException("No counter group defined"); } // Get the genome description filename final String genomeDescFile = conf.get(ExpressionHadoopModule.GENOME_DESC_PATH_KEY); if (genomeDescFile == null) { throw new IOException("No genome desc file set"); } // Load genome description object final GenomeDescription genomeDescription = GenomeDescription .load(PathUtils.createInputStream(new Path(genomeDescFile), conf)); // Set the chromosomes sizes in the parser this.parser.getFileHeader().setSequenceDictionary( SAMUtils.newSAMSequenceDictionary(genomeDescription)); // Get the "stranded" parameter this.stranded = StrandUsage.getStrandUsageFromName(conf.get(STRANDED_PARAM)); // Get the "overlap mode" parameter this.overlapMode = OverlapMode.getOverlapModeFromName(conf.get(OVERLAP_MODE_PARAM)); // Get the "no ambiguous cases" parameter this.removeAmbiguousCases = conf.getBoolean(REMOVE_AMBIGUOUS_CASES, true); } catch (IOException e) { getLogger().severe( "Error while loading annotation data in Mapper: " + e.getMessage()); } getLogger().info("End of setup()"); } /** * 'key': offset of the beginning of the line from the beginning of the * alignment file. 'value': the SAM record, if data are in paired-end mode, * 'value' contains the two paired alignments separated by a '£' (TSAM * format). */ @Override public void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException { final String line = value.toString(); // Discard SAM headers if (line.length() > 0 && line.charAt(0) == '@') { return; } final String[] fields = recordSplitterPattern.split(line); final List<GenomicInterval> ivSeq; try { // Add intervals switch (fields.length) { // Single end data case 1: ivSeq = createSingleEndIntervals(context, fields[0]); break; // paired end data case 2: ivSeq = addPairedEndIntervals(context, fields[0], fields[1]); break; default: throw new EoulsanException( "Invalid number of SAM record(s) found in the entry: " + fields.length); } incrementCounter(context, TOTAL_ALIGNMENTS_COUNTER, fields.length); final Set<String> fs = null2empty(HTSeqUtils.featuresOverlapped(ivSeq, this.features, this.overlapMode, this.stranded)); switch (fs.size()) { case 0: incrementCounter(context, EMPTY_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); break; case 1: final String id1 = fs.iterator().next(); this.outKey.set(id1); context.write(this.outKey, this.outValue); break; default: if (this.removeAmbiguousCases) { // Ambiguous case will be removed incrementCounter(context, AMBIGUOUS_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); } else { // Ambiguous case will be used in the count for (String id2 : fs) { this.outKey.set(id2); context.write(this.outKey, this.outValue); } } break; } } catch (SAMFormatException | EoulsanException e) { incrementCounter(context, INVALID_SAM_ENTRIES_COUNTER); getLogger().info("Invalid SAM output entry: " + e.getMessage() + " line='" + line + "'"); } } @Override public void cleanup(final Context context) throws IOException { this.features.clear(); } // // Intervals creation methods // /** * Create single end intervals. * @param context Hadoop context * @param record the SAM record */ private List<GenomicInterval> createSingleEndIntervals(final Context context, final String record) { final List<GenomicInterval> ivSeq = new ArrayList<>(); final SAMRecord samRecord = this.parser.parseLine(record); // unmapped read if (samRecord.getReadUnmappedFlag()) { incrementCounter(context, NOT_ALIGNED_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); return ivSeq; } // multiple alignment if (samRecord.getAttribute("NH") != null && samRecord.getIntegerAttribute("NH") > 1) { incrementCounter(context, NOT_UNIQUE_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); return ivSeq; } // too low quality if (samRecord.getMappingQuality() < 0) { incrementCounter(context, LOW_QUAL_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); return ivSeq; } ivSeq.addAll(HTSeqUtils.addIntervals(samRecord, this.stranded)); return ivSeq; } /** * Create paired end intervals. * @param context Hadoop context * @param record1 the SAM record of the first end * @param record2 the SAM record of the second end */ private List<GenomicInterval> addPairedEndIntervals(final Context context, final String record1, final String record2) { final List<GenomicInterval> ivSeq = new ArrayList<>(); final SAMRecord samRecord1 = this.parser.parseLine(record1); final SAMRecord samRecord2 = this.parser.parseLine(record2); if (!samRecord1.getReadUnmappedFlag()) { ivSeq.addAll(HTSeqUtils.addIntervals(samRecord1, this.stranded)); } if (!samRecord2.getReadUnmappedFlag()) { ivSeq.addAll(HTSeqUtils.addIntervals(samRecord2, this.stranded)); } // unmapped read if (samRecord1.getReadUnmappedFlag() && samRecord2.getReadUnmappedFlag()) { incrementCounter(context, NOT_ALIGNED_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); return ivSeq; } // multiple alignment if ((samRecord1.getAttribute("NH") != null && samRecord1.getIntegerAttribute("NH") > 1) || (samRecord2.getAttribute("NH") != null && samRecord2.getIntegerAttribute("NH") > 1)) { incrementCounter(context, NOT_UNIQUE_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); return ivSeq; } // too low quality if (samRecord1.getMappingQuality() < 0 || samRecord2.getMappingQuality() < 0) { incrementCounter(context, LOW_QUAL_ALIGNMENTS_COUNTER); incrementCounter(context, ELIMINATED_READS_COUNTER); return ivSeq; } return ivSeq; } // // Other methods // /** * Increment an expression counter with a value of 1. * @param context the Hadoop context * @param counter the expression counter */ private void incrementCounter(final Context context, final ExpressionCounters counter) { incrementCounter(context, counter, 1); } /** * Increment an expression counter. * @param context the Hadoop context * @param counter the expression counter * @param increment the increment */ private void incrementCounter(final Context context, final ExpressionCounters counter, final int increment) { context.getCounter(this.counterGroup, counter.counterName()) .increment(increment); } /** * Return an empty set if the parameter is null. * @param set the set * @return an empty set if the parameter is null or the original set */ private static <E> Set<E> null2empty(final Set<E> set) { if (set == null) { return Collections.emptySet(); } return set; } }