/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.AMBIGUOUS_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.ELIMINATED_READS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.EMPTY_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.INVALID_SAM_ENTRIES_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.LOW_QUAL_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.NOT_ALIGNED_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.NOT_UNIQUE_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters.TOTAL_ALIGNMENTS_COUNTER;
import static fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.SAM_RECORD_PAIRED_END_SERPARATOR;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanLogger;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.bio.GenomeDescription;
import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray;
import fr.ens.biologie.genomique.eoulsan.bio.GenomicInterval;
import fr.ens.biologie.genomique.eoulsan.bio.SAMUtils;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqUtils;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage;
import fr.ens.biologie.genomique.eoulsan.modules.expression.ExpressionCounters;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.SAMLineParser;
import htsjdk.samtools.SAMRecord;
/**
 * Mapper for the expression estimation with htseq-count.
 * @since 1.2
 * @author Claire Wallon
 */
public class HTSeqCountMapper extends Mapper<Text, Text, Text, LongWritable> {

  // Parameters keys
  static final String STRANDED_PARAM =
      Globals.PARAMETER_PREFIX + ".expression.stranded.parameter";
  static final String OVERLAP_MODE_PARAM =
      Globals.PARAMETER_PREFIX + ".expression.overlapmode.parameter";
  static final String REMOVE_AMBIGUOUS_CASES =
      Globals.PARAMETER_PREFIX + ".expression.no.ambiguous.cases";

  /** Annotation features, loaded in setup() from the distributed cache. */
  private final GenomicArray<String> features = new GenomicArray<>();

  /** Hadoop counter group name, read from the job configuration. */
  private String counterGroup;

  /** htseq-count "stranded" parameter. */
  private StrandUsage stranded;

  /** htseq-count "overlap mode" parameter. */
  private OverlapMode overlapMode;

  /** If true, ambiguous alignments are discarded instead of counted. */
  private boolean removeAmbiguousCases;

  /** SAM parser; its header gets the genome dictionary in setup(). */
  private final SAMLineParser parser = new SAMLineParser(new SAMFileHeader());

  // Pattern is immutable and thread-safe: compile it once for the class
  private static final Pattern RECORD_SPLITTER_PATTERN =
      Pattern.compile("" + SAM_RECORD_PAIRED_END_SERPARATOR);

  // Reused writables, to avoid one allocation per emitted record
  private final Text outKey = new Text();
  private final LongWritable outValue = new LongWritable(1L);

  @Override
  public void setup(final Context context)
      throws IOException, InterruptedException {

    EoulsanLogger.initConsoleHandler();
    getLogger().info("Start of setup()");

    try {
      final Configuration conf = context.getConfiguration();

      // The distributed cache must contain exactly one file: the annotation
      // index used to fill this.features
      final URI[] localCacheFiles = context.getCacheFiles();

      if (localCacheFiles == null || localCacheFiles.length == 0) {
        throw new IOException("Unable to retrieve annotation index");
      }

      if (localCacheFiles.length > 1) {
        throw new IOException(
            "Retrieve more than one file in distributed cache");
      }

      getLogger()
          .info("Annotation index compressed file (from distributed cache): "
              + localCacheFiles[0]);

      // Load features
      this.features.load(
          PathUtils.createInputStream(new Path(localCacheFiles[0]), conf));

      // Counter group
      this.counterGroup = conf.get(CommonHadoop.COUNTER_GROUP_KEY);
      if (this.counterGroup == null) {
        throw new IOException("No counter group defined");
      }

      // Get the genome description filename
      final String genomeDescFile =
          conf.get(ExpressionHadoopModule.GENOME_DESC_PATH_KEY);

      if (genomeDescFile == null) {
        throw new IOException("No genome desc file set");
      }

      // Load genome description object
      final GenomeDescription genomeDescription = GenomeDescription
          .load(PathUtils.createInputStream(new Path(genomeDescFile), conf));

      // Set the chromosomes sizes in the parser
      this.parser.getFileHeader().setSequenceDictionary(
          SAMUtils.newSAMSequenceDictionary(genomeDescription));

      // Get the "stranded" parameter
      this.stranded =
          StrandUsage.getStrandUsageFromName(conf.get(STRANDED_PARAM));

      // Get the "overlap mode" parameter
      this.overlapMode =
          OverlapMode.getOverlapModeFromName(conf.get(OVERLAP_MODE_PARAM));

      // Get the "no ambiguous cases" parameter
      this.removeAmbiguousCases = conf.getBoolean(REMOVE_AMBIGUOUS_CASES, true);

    } catch (IOException e) {
      getLogger().severe(
          "Error while loading annotation data in Mapper: " + e.getMessage());

      // Fail fast: letting the task continue with an uninitialized mapper
      // would only trigger NullPointerExceptions later in map()
      throw e;
    }

    getLogger().info("End of setup()");
  }

  /**
   * 'key': offset of the beginning of the line from the beginning of the
   * alignment file. 'value': the SAM record, if data are in paired-end mode,
   * 'value' contains the two paired alignments separated by a '£' (TSAM
   * format).
   */
  @Override
  public void map(final Text key, final Text value, final Context context)
      throws IOException, InterruptedException {

    final String line = value.toString();

    // Discard SAM headers
    if (line.length() > 0 && line.charAt(0) == '@') {
      return;
    }

    final String[] fields = RECORD_SPLITTER_PATTERN.split(line);
    final List<GenomicInterval> ivSeq;

    try {

      // Add intervals
      switch (fields.length) {

      // Single end data
      case 1:
        ivSeq = createSingleEndIntervals(context, fields[0]);
        break;

      // paired end data
      case 2:
        ivSeq = createPairedEndIntervals(context, fields[0], fields[1]);
        break;

      default:
        throw new EoulsanException(
            "Invalid number of SAM record(s) found in the entry: "
                + fields.length);
      }

      incrementCounter(context, TOTAL_ALIGNMENTS_COUNTER, fields.length);

      final Set<String> fs = null2empty(HTSeqUtils.featuresOverlapped(ivSeq,
          this.features, this.overlapMode, this.stranded));

      switch (fs.size()) {

      case 0:
        // No overlapping feature: the read is not counted
        incrementCounter(context, EMPTY_ALIGNMENTS_COUNTER);
        incrementCounter(context, ELIMINATED_READS_COUNTER);
        break;

      case 1:
        // Exactly one feature: emit one count for it
        final String id1 = fs.iterator().next();
        this.outKey.set(id1);
        context.write(this.outKey, this.outValue);
        break;

      default:

        if (this.removeAmbiguousCases) {

          // Ambiguous case will be removed
          incrementCounter(context, AMBIGUOUS_ALIGNMENTS_COUNTER);
          incrementCounter(context, ELIMINATED_READS_COUNTER);
        } else {

          // Ambiguous case will be used in the count
          for (String id2 : fs) {
            this.outKey.set(id2);
            context.write(this.outKey, this.outValue);
          }
        }
        break;
      }

    } catch (SAMFormatException | EoulsanException e) {

      incrementCounter(context, INVALID_SAM_ENTRIES_COUNTER);
      getLogger().info("Invalid SAM output entry: "
          + e.getMessage() + " line='" + line + "'");
    }
  }

  @Override
  public void cleanup(final Context context) throws IOException {

    // Release the feature index memory
    this.features.clear();
  }

  //
  // Intervals creation methods
  //

  /**
   * Create single end intervals. Filtered reads (unmapped, multi-mapped or
   * low quality) are counted as eliminated and yield an empty list.
   * @param context Hadoop context
   * @param record the SAM record
   * @return the genomic intervals of the alignment, or an empty list if the
   *         read is eliminated
   */
  private List<GenomicInterval> createSingleEndIntervals(final Context context,
      final String record) {

    final List<GenomicInterval> ivSeq = new ArrayList<>();

    final SAMRecord samRecord = this.parser.parseLine(record);

    // unmapped read
    if (samRecord.getReadUnmappedFlag()) {
      incrementCounter(context, NOT_ALIGNED_ALIGNMENTS_COUNTER);
      incrementCounter(context, ELIMINATED_READS_COUNTER);
      return ivSeq;
    }

    // multiple alignment (NH optional SAM tag = number of reported hits)
    if (samRecord.getAttribute("NH") != null
        && samRecord.getIntegerAttribute("NH") > 1) {
      incrementCounter(context, NOT_UNIQUE_ALIGNMENTS_COUNTER);
      incrementCounter(context, ELIMINATED_READS_COUNTER);
      return ivSeq;
    }

    // too low quality
    // NOTE(review): MAPQ is never negative, so this filter is currently
    // inactive; a configurable minimal quality threshold is probably
    // intended here -- TODO confirm
    if (samRecord.getMappingQuality() < 0) {
      incrementCounter(context, LOW_QUAL_ALIGNMENTS_COUNTER);
      incrementCounter(context, ELIMINATED_READS_COUNTER);
      return ivSeq;
    }

    ivSeq.addAll(HTSeqUtils.addIntervals(samRecord, this.stranded));

    return ivSeq;
  }

  /**
   * Create paired end intervals. Filtered pairs (both ends unmapped, any end
   * multi-mapped or low quality) are counted as eliminated and yield an empty
   * list, like in the single end case.
   * @param context Hadoop context
   * @param record1 the SAM record of the first end
   * @param record2 the SAM record of the second end
   * @return the genomic intervals of both mapped ends, or an empty list if
   *         the pair is eliminated
   */
  private List<GenomicInterval> createPairedEndIntervals(final Context context,
      final String record1, final String record2) {

    final List<GenomicInterval> ivSeq = new ArrayList<>();

    final SAMRecord samRecord1 = this.parser.parseLine(record1);
    final SAMRecord samRecord2 = this.parser.parseLine(record2);

    // unmapped read (both ends)
    if (samRecord1.getReadUnmappedFlag() && samRecord2.getReadUnmappedFlag()) {
      incrementCounter(context, NOT_ALIGNED_ALIGNMENTS_COUNTER);
      incrementCounter(context, ELIMINATED_READS_COUNTER);
      return ivSeq;
    }

    // multiple alignment: filter BEFORE collecting intervals so that an
    // eliminated pair does not contribute intervals to the count (mirrors
    // the single end method; previously the already-collected intervals of
    // an eliminated pair were still returned and counted)
    if ((samRecord1.getAttribute("NH") != null
        && samRecord1.getIntegerAttribute("NH") > 1)
        || (samRecord2.getAttribute("NH") != null
            && samRecord2.getIntegerAttribute("NH") > 1)) {
      incrementCounter(context, NOT_UNIQUE_ALIGNMENTS_COUNTER);
      incrementCounter(context, ELIMINATED_READS_COUNTER);
      return ivSeq;
    }

    // too low quality
    // NOTE(review): MAPQ is never negative, so this filter is currently
    // inactive; a configurable minimal quality threshold is probably
    // intended here -- TODO confirm
    if (samRecord1.getMappingQuality() < 0
        || samRecord2.getMappingQuality() < 0) {
      incrementCounter(context, LOW_QUAL_ALIGNMENTS_COUNTER);
      incrementCounter(context, ELIMINATED_READS_COUNTER);
      return ivSeq;
    }

    // Collect the intervals of each mapped end
    if (!samRecord1.getReadUnmappedFlag()) {
      ivSeq.addAll(HTSeqUtils.addIntervals(samRecord1, this.stranded));
    }

    if (!samRecord2.getReadUnmappedFlag()) {
      ivSeq.addAll(HTSeqUtils.addIntervals(samRecord2, this.stranded));
    }

    return ivSeq;
  }

  //
  // Other methods
  //

  /**
   * Increment an expression counter with a value of 1.
   * @param context the Hadoop context
   * @param counter the expression counter
   */
  private void incrementCounter(final Context context,
      final ExpressionCounters counter) {

    incrementCounter(context, counter, 1);
  }

  /**
   * Increment an expression counter.
   * @param context the Hadoop context
   * @param counter the expression counter
   * @param increment the increment
   */
  private void incrementCounter(final Context context,
      final ExpressionCounters counter, final int increment) {

    context.getCounter(this.counterGroup, counter.counterName())
        .increment(increment);
  }

  /**
   * Return an empty set if the parameter is null.
   * @param set the set
   * @return an empty set if the parameter is null or the original set
   */
  private static <E> Set<E> null2empty(final Set<E> set) {

    if (set == null) {
      return Collections.emptySet();
    }

    return set;
  }

}