/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.bio.expressioncounters; import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode.INTERSECTION_NONEMPTY; import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode.INTERSECTION_STRICT; import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode.UNION; import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage.REVERSE; import static fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage.YES; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import com.google.common.base.Splitter; import fr.ens.biologie.genomique.eoulsan.EoulsanException; import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException; import fr.ens.biologie.genomique.eoulsan.bio.GFFEntry; import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray; import fr.ens.biologie.genomique.eoulsan.bio.GenomicInterval; import fr.ens.biologie.genomique.eoulsan.bio.io.GFFReader; import fr.ens.biologie.genomique.eoulsan.bio.io.GTFReader; import fr.ens.biologie.genomique.eoulsan.util.GuavaCompatibility; import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; /** * This class groups HTSeq functions that are used in both local and distributed * modes. * @since 1.2 * @author Claire Wallon */ public class HTSeqUtils { public static void storeAnnotation(final GenomicArray<String> features, final InputStream annotationIs, final boolean gtfFormat, final String featureType, final StrandUsage stranded, final String attributeId, final boolean splitAttributeValues, final Map<String, Integer> counts) throws IOException, EoulsanException, BadBioEntryException { final Splitter splitter = Splitter.on(',').omitEmptyStrings().trimResults(); // Splitter for parents string try (final GFFReader gffReader = gtfFormat ? new GTFReader(annotationIs) : new GFFReader(annotationIs)) { // Read the annotation file for (final GFFEntry gff : gffReader) { if (featureType.equals(gff.getType())) { final String featureId = gff.getAttributeValue(attributeId); if (featureId == null) { throw new EoulsanException("Feature " + featureType + " does not contain a " + attributeId + " attribute"); } if ((stranded == StrandUsage.YES || stranded == StrandUsage.REVERSE) && '.' == gff.getStrand()) { throw new EoulsanException("Feature " + featureType + " does not have strand information but you are running " + "htseq-count in stranded mode."); } // Addition to the list of features of a GenomicInterval object // corresponding to the current annotation line final List<String> featureIds; if (splitAttributeValues) { featureIds = GuavaCompatibility.splitToList(splitter, featureId); } else { featureIds = Collections.singletonList(featureId); } // Split parent if needed for (String f : featureIds) { features.addEntry( new GenomicInterval(gff, stranded.isSaveStrandInfo()), f); counts.put(f, 0); } } } gffReader.throwException(); } } /** * Add intervals of a SAM record that are alignment matches (thanks to the * CIGAR code). * @param record the SAM record to treat. * @param stranded strand to consider. * @return the list of intervals of the SAM record. */ public static List<GenomicInterval> addIntervals(final SAMRecord record, final StrandUsage stranded) { if (record == null) { return null; } List<GenomicInterval> result = new ArrayList<>(); // single-end mode or first read in the paired-end mode if (!record.getReadPairedFlag() || (record.getReadPairedFlag() && record.getFirstOfPairFlag())) { // the read has to be mapped to the opposite strand as the feature if (stranded == REVERSE) { result.addAll(parseCigar(record.getCigar(), record.getReferenceName(), record.getAlignmentStart(), record.getReadNegativeStrandFlag() ? '+' : '-')); } // stranded == "yes" (so the read has to be mapped to the same strand as // the feature) or stranded == "no" (so the read is considered // overlapping with a feature regardless of whether it is mapped to the // same or the opposite strand as the feature) else { result.addAll(parseCigar(record.getCigar(), record.getReferenceName(), record.getAlignmentStart(), record.getReadNegativeStrandFlag() ? '-' : '+')); } } // second read in the paired-end mode else if (record.getReadPairedFlag() && !record.getFirstOfPairFlag()) { // the read has to be mapped to the opposite strand as the feature if (stranded == StrandUsage.REVERSE) { result.addAll(parseCigar(record.getCigar(), record.getReferenceName(), record.getAlignmentStart(), record.getReadNegativeStrandFlag() ? '-' : '+')); } // stranded == "yes" (so the read has to be mapped to the same strand as // the feature) or stranded == "no" (so the read is considered // overlapping with a feature regardless of whether it is mapped to the // same or the opposite strand as the feature) else { result.addAll(parseCigar(record.getCigar(), record.getReferenceName(), record.getAlignmentStart(), record.getReadNegativeStrandFlag() ? '+' : '-')); } } else { return null; } return result; } /** * Parse a CIGAR string to have intervals of a chromosome that are alignments * matches. * @param cigar CIGAR string to parse. * @param chromosome chromosome that support the alignment. * @param start start position of the alignment. * @param strand strand to consider. * @return the list of intervals that are alignments matches. */ public static final List<GenomicInterval> parseCigar(final Cigar cigar, final String chromosome, final int start, final char strand) { if (cigar == null) { return null; } final List<GenomicInterval> result = new ArrayList<>(); int pos = start; for (CigarElement ce : cigar.getCigarElements()) { final int len = ce.getLength(); // the CIGAR element correspond to a mapped region if (ce.getOperator() == CigarOperator.M) { result.add(new GenomicInterval(chromosome, pos, pos + len - 1, strand)); pos += len; } // the CIGAR element did not correspond to a mapped region else { // regions coded by a 'I' (insertion) do not have to be counted // (are there other cases like this one ?) if (pos != start && ce.getOperator() != CigarOperator.I) { pos += len; } } } return result; } /** * Determine features that overlap genomic intervals. * @param ivList the list of genomic intervals. * @param features the list of features. * @param mode the overlap mode. * @return the set of features that overlap genomic intervals according to the * overlap mode. * @throws EoulsanException */ public static Set<String> featuresOverlapped( final List<GenomicInterval> ivList, final GenomicArray<String> features, final OverlapMode mode, final StrandUsage stranded) throws EoulsanException, IOException { Set<String> fs = null; // Overlap mode "union" if (mode == UNION) { fs = new HashSet<>(); for (final GenomicInterval iv : ivList) { final String chr = iv.getChromosome(); if (!features.containsChromosome(chr)) { throw new EoulsanException("Unknown chromosome: " + chr); } // Get features that overlap the current interval of the read final Map<GenomicInterval, Set<String>> intervals = features.getEntries(chr, iv.getStart(), iv.getEnd()); // Filter intervals if necessary if (stranded == YES || stranded == REVERSE) { filterIntervalsStrands(intervals, iv.getStrand()); } // At least one interval is found if (intervals != null && intervals.size() > 0) { for (Map.Entry<GenomicInterval, Set<String>> e : intervals .entrySet()) { if (e.getValue() != null) { fs.addAll(e.getValue()); } } } } } // Overlap modes : "intersection-nonempty" or "intersection-strict" else if (mode == INTERSECTION_NONEMPTY || mode == INTERSECTION_STRICT) { for (final GenomicInterval iv : ivList) { final String chr = iv.getChromosome(); if (!features.containsChromosome(chr)) { throw new EoulsanException("Unknown chromosome: " + chr); } // Get features that overlap the current interval of the read final Map<GenomicInterval, Set<String>> intervals = features.getEntries(chr, iv.getStart(), iv.getEnd()); // Filter intervals if necessary if (stranded == StrandUsage.YES || stranded == StrandUsage.REVERSE) { filterIntervalsStrands(intervals, iv.getStrand()); } // If internal is empty, add an entry with requested iv as key and an // empty set as value (HTSeq compatibility) if (intervals.isEmpty()) { final Set<String> emptySet = Collections.emptySet(); intervals.put(iv, emptySet); } // At least one interval is found if (intervals.size() > 0) { for (Map.Entry<GenomicInterval, Set<String>> i : intervals .entrySet()) { final Set<String> fs2 = i.getValue(); if (fs2.size() > 0 || mode == INTERSECTION_STRICT) { if (fs == null) { fs = new HashSet<>(fs2); } else { fs.retainAll(fs2); } } } } } } else { throw new EoulsanException("Error : illegal overlap mode."); } return fs; } /** * Filter the output of GenomicArray.getEntries() by keeping only features on * a strand * @param intervals intervals to filter * @param strand strand to keep */ private static void filterIntervalsStrands( final Map<GenomicInterval, Set<String>> intervals, final char strand) { if (intervals == null) { return; } final Set<GenomicInterval> toRemove = new HashSet<>(); for (Map.Entry<GenomicInterval, Set<String>> e : intervals.entrySet()) { if (e.getKey().getStrand() != strand) { toRemove.add(e.getKey()); } } for (GenomicInterval iv : toRemove) { intervals.remove(iv); } } }