/*
* Eoulsan development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public License version 2.1 or
* later and CeCILL-C. This should be distributed with the code.
* If you do not have a copy, see:
*
* http://www.gnu.org/licenses/lgpl-2.1.txt
* http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
*
* Copyright for this code is held jointly by the Genomic platform
* of the Institut de Biologie de l'École normale supérieure and
* the individual authors. These should be listed in @author doc
* comments.
*
* For more information on the Eoulsan project and its aims,
* or to join the Eoulsan Google group, visit the home page
* at:
*
* http://outils.genomique.biologie.ens.fr/eoulsan
*
*/
package fr.ens.biologie.genomique.eoulsan.bio.expressioncounters;
import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode.INTERSECTION_NONEMPTY;
import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode.INTERSECTION_STRICT;
import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode.UNION;
import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage.REVERSE;
import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage.YES;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.base.Splitter;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException;
import fr.ens.biologie.genomique.eoulsan.bio.GFFEntry;
import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray;
import fr.ens.biologie.genomique.eoulsan.bio.GenomicInterval;
import fr.ens.biologie.genomique.eoulsan.bio.io.GFFReader;
import fr.ens.biologie.genomique.eoulsan.bio.io.GTFReader;
import fr.ens.biologie.genomique.eoulsan.util.GuavaCompatibility;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
/**
* This class groups HTSeq functions that are used in both local and distributed
* modes.
* @since 1.2
* @author Claire Wallon
*/
public class HTSeqUtils {
public static void storeAnnotation(final GenomicArray<String> features,
final InputStream annotationIs, final boolean gtfFormat,
final String featureType, final StrandUsage stranded,
final String attributeId, final boolean splitAttributeValues,
final Map<String, Integer> counts)
throws IOException, EoulsanException, BadBioEntryException {
final Splitter splitter = Splitter.on(',').omitEmptyStrings().trimResults();
// Splitter for parents string
try (final GFFReader gffReader =
gtfFormat ? new GTFReader(annotationIs) : new GFFReader(annotationIs)) {
// Read the annotation file
for (final GFFEntry gff : gffReader) {
if (featureType.equals(gff.getType())) {
final String featureId = gff.getAttributeValue(attributeId);
if (featureId == null) {
throw new EoulsanException("Feature "
+ featureType + " does not contain a " + attributeId
+ " attribute");
}
if ((stranded == StrandUsage.YES || stranded == StrandUsage.REVERSE)
&& '.' == gff.getStrand()) {
throw new EoulsanException("Feature "
+ featureType
+ " does not have strand information but you are running "
+ "htseq-count in stranded mode.");
}
// Addition to the list of features of a GenomicInterval object
// corresponding to the current annotation line
final List<String> featureIds;
if (splitAttributeValues) {
featureIds = GuavaCompatibility.splitToList(splitter, featureId);
} else {
featureIds = Collections.singletonList(featureId);
}
// Split parent if needed
for (String f : featureIds) {
features.addEntry(
new GenomicInterval(gff, stranded.isSaveStrandInfo()), f);
counts.put(f, 0);
}
}
}
gffReader.throwException();
}
}
/**
* Add intervals of a SAM record that are alignment matches (thanks to the
* CIGAR code).
* @param record the SAM record to treat.
* @param stranded strand to consider.
* @return the list of intervals of the SAM record.
*/
public static List<GenomicInterval> addIntervals(final SAMRecord record,
final StrandUsage stranded) {
if (record == null) {
return null;
}
List<GenomicInterval> result = new ArrayList<>();
// single-end mode or first read in the paired-end mode
if (!record.getReadPairedFlag()
|| (record.getReadPairedFlag() && record.getFirstOfPairFlag())) {
// the read has to be mapped to the opposite strand as the feature
if (stranded == REVERSE) {
result.addAll(parseCigar(record.getCigar(), record.getReferenceName(),
record.getAlignmentStart(),
record.getReadNegativeStrandFlag() ? '+' : '-'));
}
// stranded == "yes" (so the read has to be mapped to the same strand as
// the feature) or stranded == "no" (so the read is considered
// overlapping with a feature regardless of whether it is mapped to the
// same or the opposite strand as the feature)
else {
result.addAll(parseCigar(record.getCigar(), record.getReferenceName(),
record.getAlignmentStart(),
record.getReadNegativeStrandFlag() ? '-' : '+'));
}
}
// second read in the paired-end mode
else if (record.getReadPairedFlag() && !record.getFirstOfPairFlag()) {
// the read has to be mapped to the opposite strand as the feature
if (stranded == StrandUsage.REVERSE) {
result.addAll(parseCigar(record.getCigar(), record.getReferenceName(),
record.getAlignmentStart(),
record.getReadNegativeStrandFlag() ? '-' : '+'));
}
// stranded == "yes" (so the read has to be mapped to the same strand as
// the feature) or stranded == "no" (so the read is considered
// overlapping with a feature regardless of whether it is mapped to the
// same or the opposite strand as the feature)
else {
result.addAll(parseCigar(record.getCigar(), record.getReferenceName(),
record.getAlignmentStart(),
record.getReadNegativeStrandFlag() ? '+' : '-'));
}
} else {
return null;
}
return result;
}
/**
* Parse a CIGAR string to have intervals of a chromosome that are alignments
* matches.
* @param cigar CIGAR string to parse.
* @param chromosome chromosome that support the alignment.
* @param start start position of the alignment.
* @param strand strand to consider.
* @return the list of intervals that are alignments matches.
*/
public static final List<GenomicInterval> parseCigar(final Cigar cigar,
final String chromosome, final int start, final char strand) {
if (cigar == null) {
return null;
}
final List<GenomicInterval> result = new ArrayList<>();
int pos = start;
for (CigarElement ce : cigar.getCigarElements()) {
final int len = ce.getLength();
// the CIGAR element correspond to a mapped region
if (ce.getOperator() == CigarOperator.M) {
result.add(new GenomicInterval(chromosome, pos, pos + len - 1, strand));
pos += len;
}
// the CIGAR element did not correspond to a mapped region
else {
// regions coded by a 'I' (insertion) do not have to be counted
// (are there other cases like this one ?)
if (pos != start && ce.getOperator() != CigarOperator.I) {
pos += len;
}
}
}
return result;
}
/**
* Determine features that overlap genomic intervals.
* @param ivList the list of genomic intervals.
* @param features the list of features.
* @param mode the overlap mode.
* @return the set of features that overlap genomic intervals according to the
* overlap mode.
* @throws EoulsanException
*/
public static Set<String> featuresOverlapped(
final List<GenomicInterval> ivList, final GenomicArray<String> features,
final OverlapMode mode, final StrandUsage stranded)
throws EoulsanException, IOException {
Set<String> fs = null;
// Overlap mode "union"
if (mode == UNION) {
fs = new HashSet<>();
for (final GenomicInterval iv : ivList) {
final String chr = iv.getChromosome();
if (!features.containsChromosome(chr)) {
throw new EoulsanException("Unknown chromosome: " + chr);
}
// Get features that overlap the current interval of the read
final Map<GenomicInterval, Set<String>> intervals =
features.getEntries(chr, iv.getStart(), iv.getEnd());
// Filter intervals if necessary
if (stranded == YES || stranded == REVERSE) {
filterIntervalsStrands(intervals, iv.getStrand());
}
// At least one interval is found
if (intervals != null && intervals.size() > 0) {
for (Map.Entry<GenomicInterval, Set<String>> e : intervals
.entrySet()) {
if (e.getValue() != null) {
fs.addAll(e.getValue());
}
}
}
}
}
// Overlap modes : "intersection-nonempty" or "intersection-strict"
else if (mode == INTERSECTION_NONEMPTY || mode == INTERSECTION_STRICT) {
for (final GenomicInterval iv : ivList) {
final String chr = iv.getChromosome();
if (!features.containsChromosome(chr)) {
throw new EoulsanException("Unknown chromosome: " + chr);
}
// Get features that overlap the current interval of the read
final Map<GenomicInterval, Set<String>> intervals =
features.getEntries(chr, iv.getStart(), iv.getEnd());
// Filter intervals if necessary
if (stranded == StrandUsage.YES || stranded == StrandUsage.REVERSE) {
filterIntervalsStrands(intervals, iv.getStrand());
}
// If internal is empty, add an entry with requested iv as key and an
// empty set as value (HTSeq compatibility)
if (intervals.isEmpty()) {
final Set<String> emptySet = Collections.emptySet();
intervals.put(iv, emptySet);
}
// At least one interval is found
if (intervals.size() > 0) {
for (Map.Entry<GenomicInterval, Set<String>> i : intervals
.entrySet()) {
final Set<String> fs2 = i.getValue();
if (fs2.size() > 0 || mode == INTERSECTION_STRICT) {
if (fs == null) {
fs = new HashSet<>(fs2);
} else {
fs.retainAll(fs2);
}
}
}
}
}
} else {
throw new EoulsanException("Error : illegal overlap mode.");
}
return fs;
}
/**
* Filter the output of GenomicArray.getEntries() by keeping only features on
* a strand
* @param intervals intervals to filter
* @param strand strand to keep
*/
private static void filterIntervalsStrands(
final Map<GenomicInterval, Set<String>> intervals, final char strand) {
if (intervals == null) {
return;
}
final Set<GenomicInterval> toRemove = new HashSet<>();
for (Map.Entry<GenomicInterval, Set<String>> e : intervals.entrySet()) {
if (e.getKey().getStrand() != strand) {
toRemove.add(e.getKey());
}
}
for (GenomicInterval iv : toRemove) {
intervals.remove(iv);
}
}
}