package htsjdk.samtools.util;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import static java.util.Arrays.asList;
/**
* Utility for determining the type of quality encoding/format (see FastqQualityFormat) used in a SAM/BAM or Fastq.
* <p/>
* To use this class, invoke the detect() method with a SAMFileReader or FastqReader, as appropriate. The consumer is
* responsible for closing readers.
*
* @author mccowan@broadinstitute.org
*/
public class QualityEncodingDetector {
private QualityRecordAggregator qualityAggregator = new QualityRecordAggregator();
/**
* The maximum number of records over which the detector will iterate before making a determination, by default.
*/
public static final long DEFAULT_MAX_RECORDS_TO_ITERATE = 10000;
private static final Log log = Log.getInstance(QualityEncodingDetector.class);
public enum FileContext {FASTQ, SAM}
static class Range {
final int low, high;
Range(final int low, final int high) {
this.low = low;
this.high = high;
}
boolean contains(final int value) {
return value <= high && value >= low;
}
}
/**
* Collection of data about the different quality formats and how they are interpreted.
*/
enum QualityScheme {
Phred(
new Range(0, 93), // Raw value range
new Range(33, 126), // ASCII value range
asList(new Range(33, 58)), // Ranges into which we expect at least one ASCII value to fall
FastqQualityFormat.Standard // Associated quality format
),
Solexa(
new Range(-5, 62),
new Range(59, 126),
new ArrayList<Range>(),
FastqQualityFormat.Solexa
),
Illumina(
new Range(0, 62),
new Range(64, 126),
new ArrayList<Range>(),
FastqQualityFormat.Illumina
);
final Range rawRange, asciiRange;
/**
* Ranges into which we expect at least one value to fall if this formatting is being used. For example, for
* Standard encoding, we expect to at least one ASCII value between 33 and 58 (0 and 25); otherwise, it's
* probably not Standard-encoded.
*/
final List<Range> expectedAsciiRanges;
final FastqQualityFormat qualityFormat;
QualityScheme(final Range rawRange, final Range asciiRange, final List<Range> expectedAsciiRanges, final FastqQualityFormat qualityFormat) {
this.rawRange = rawRange;
this.asciiRange = asciiRange;
this.expectedAsciiRanges = expectedAsciiRanges;
this.qualityFormat = qualityFormat;
}
}
/**
* Collecting reads and their quality scores for later analysis. Uses ASCII values since those are the inherently
* "raw-est", read unmodified from the file.
*/
private static class QualityRecordAggregator {
private Set<Integer> observedAsciiQualities = new HashSet<Integer>();
public Set<Integer> getObservedAsciiQualities() {
return Collections.unmodifiableSet(observedAsciiQualities);
}
/**
* Adds the FastqRecord's quality scores.
*/
public void add(final FastqRecord fastqRecord) {
addAsciiQuality(fastqRecord.getBaseQualityString().getBytes());
}
/**
* Adds the SAMRecord's quality scores.
* <p/>
* Does not assume Phred quality encoding (for obvious reasons); getBaseQualityString() is used to read the
* unmodified ASCII score. To elaborate, SAMFileReader, which is generating these SAMRecords, builds the
* SAMRecord by subtracting a value from each quality score and storing that transformed value internally.
* Since we desire original scores here (whatever was in the file to begin with), we effectively undo this
* transformation by asking SAMRecord to convert the quality back into the ASCII that was read in the file.
*/
public void add(final SAMRecord samRecord, final boolean useOriginalQualities) {
addAsciiQuality(useOriginalQualities && samRecord.getOriginalBaseQualities() != null
? SAMUtils.phredToFastq(samRecord.getOriginalBaseQualities()).getBytes()
: samRecord.getBaseQualityString().getBytes());
}
public void add(final SAMRecord samRecord) {
add(samRecord, false);
}
private void addAsciiQuality(final byte... asciiQualities) {
for (final byte asciiQuality : asciiQualities) {
observedAsciiQualities.add((int) asciiQuality);
}
}
}
/**
* Adds the provided reader's records to the detector.
* @return The number of records read
*/
public long add(final long maxRecords, final FastqReader... readers) {
final Iterator<FastqRecord> iterator = generateInterleavedFastqIterator(readers);
long recordCount = 0;
while (iterator.hasNext() && recordCount++ != maxRecords) {
this.add(iterator.next());
}
log.debug(String.format("Read %s records from %s.", recordCount, Arrays.toString(readers)));
return recordCount;
}
/**
* Adds the provided reader's records to the detector.
* @return The number of records read
*/
public long add(final long maxRecords, final SAMFileReader reader) {
return add(maxRecords, reader.iterator());
}
/**
* Adds the provided iterator's records (optionally using the original qualities) to the detector.
* @return The number of records read
*/
public long add(final long maxRecords, final CloseableIterator<SAMRecord> iterator, final boolean useOriginalQualities) {
long recordCount = 0;
try {
while (iterator.hasNext() && recordCount++ != maxRecords) {
this.add(iterator.next(), useOriginalQualities);
}
return recordCount;
} finally {
iterator.close();
}
}
public long add(final long maxRecords, final CloseableIterator<SAMRecord> iterator) {
return add(maxRecords, iterator, false);
}
/**
* Adds the provided record's qualities to the detector.
*/
public void add(final FastqRecord fastqRecord) {
this.qualityAggregator.add(fastqRecord);
}
/**
* Adds the provided record's qualities to the detector.
*/
public void add(final SAMRecord samRecord, final boolean useOriginalQualities) {
this.qualityAggregator.add(samRecord, useOriginalQualities);
}
public void add(final SAMRecord samRecord) {
this.add(samRecord, false);
}
/**
* Tests whether or not the detector can make a determination without guessing (i.e., if all but one quality format
* can be excluded using established exclusion conventions).
*
* @return True if more than one format is possible after exclusions; false otherwise
*/
public boolean isDeterminationAmbiguous() {
return this.generateCandidateQualities(true).size() > 1;
}
/**
* Processes collected quality data and applies rules to determine which quality formats are possible.
* <p/>
* Specifically, for each format's known range of possible values (its "quality scheme"), exclude formats if any
* observed values fall outside of that range. Additionally, exclude formats for which we expect to see at
* least one quality in a range of values, but do not. (For example, for Phred, we expect to eventually see
* a value below 58. If we never see such a value, we exclude Phred as a possible format unless the checkExpected
* flag is set to false in which case we leave Phred as a possible quality format.)
*/
public EnumSet<FastqQualityFormat> generateCandidateQualities(final boolean checkExpected) {
final EnumSet<FastqQualityFormat> candidateFormats = EnumSet.allOf(FastqQualityFormat.class);
final Set<Integer> observedAsciiQualities = this.qualityAggregator.getObservedAsciiQualities();
if (observedAsciiQualities.isEmpty())
throw new SAMException("Cannot determine candidate qualities: no qualities found.");
for (final QualityScheme scheme : QualityScheme.values()) {
final Iterator<Integer> qualityBinIterator = observedAsciiQualities.iterator();
final Collection<Range> remainingExpectedValueRanges = new ArrayList<Range>(scheme.expectedAsciiRanges);
while (qualityBinIterator.hasNext()) {
final int quality = qualityBinIterator.next();
if (!scheme.asciiRange.contains(quality)) {
candidateFormats.remove(scheme.qualityFormat);
}
final Iterator<Range> expectedValueRangeIterator = remainingExpectedValueRanges.iterator();
while (expectedValueRangeIterator.hasNext()) {
if (expectedValueRangeIterator.next().contains(quality)) {
expectedValueRangeIterator.remove();
}
}
}
/**
* We remove elements from this list as we observe values in the corresponding range; if the list isn't
* empty, we haven't seen a value in that range. In other words, we haven't seen a value we expected.
* Consequently, we remove the corresponding format from the running possibilities.
*/
if (!remainingExpectedValueRanges.isEmpty() && checkExpected) {
candidateFormats.remove(scheme.qualityFormat);
}
}
return candidateFormats;
}
/**
* Interleaves FastqReader iterators so that serial-iteration of the result cycles between the constituent iterators.
*/
private static Iterator<FastqRecord> generateInterleavedFastqIterator(final FastqReader... readers) {
return new Iterator<FastqRecord>() {
private Queue<Iterator<FastqRecord>> queue = new LinkedList<Iterator<FastqRecord>>();
{
for (final FastqReader reader : readers) {
queue.add(reader.iterator());
}
}
public boolean hasNext() {
// If this returns true, the head of the queue will have a next element
while (!queue.isEmpty()) {
if (queue.peek().hasNext()) {
return true;
}
queue.poll();
}
return false;
}
public FastqRecord next() {
if (!hasNext()) throw new NoSuchElementException();
final Iterator<FastqRecord> i = queue.poll();
final FastqRecord result = i.next();
queue.offer(i);
return result;
}
public void remove() {
throw new UnsupportedOperationException();
}
};
}
/**
* Reads through the records in the provided fastq reader and uses their quality scores to determine the quality
* format used in the fastq.
*
* @param readers The fastq readers from which qualities are to be read; at least one must be provided
* @param maxRecords The maximum number of records to read from the reader before making a determination (a guess,
* so more records is better)
* @return The determined quality format
*/
public static FastqQualityFormat detect(final long maxRecords, final FastqReader... readers) {
final QualityEncodingDetector detector = new QualityEncodingDetector();
final long recordCount = detector.add(maxRecords, readers);
log.debug(String.format("Read %s records from %s.", recordCount, Arrays.toString(readers)));
return detector.generateBestGuess(FileContext.FASTQ, null);
}
public static FastqQualityFormat detect(final FastqReader... readers) {
return detect(DEFAULT_MAX_RECORDS_TO_ITERATE, readers);
}
/**
* Reads through the records in the provided SAM reader and uses their quality scores to determine the quality
* format used in the SAM.
*
* @param iterator The iterator from which SAM records are to be read
* @param maxRecords The maximum number of records to read from the reader before making a determination (a guess,
* @param useOriginalQualities whether to use the original qualities (if available) rather than the current ones
* so more records is better)
* @return The determined quality format
*/
public static FastqQualityFormat detect(final long maxRecords, final CloseableIterator<SAMRecord> iterator, final boolean useOriginalQualities) {
final QualityEncodingDetector detector = new QualityEncodingDetector();
final long recordCount = detector.add(maxRecords, iterator, useOriginalQualities);
log.debug(String.format("Read %s records.", recordCount));
return detector.generateBestGuess(FileContext.SAM, null);
}
public static FastqQualityFormat detect(final long maxRecords, final CloseableIterator<SAMRecord> iterator) {
return detect(maxRecords, iterator, false);
}
public static FastqQualityFormat detect(final long maxRecords, final SAMFileReader reader) {
return detect(maxRecords, reader.iterator());
}
public static FastqQualityFormat detect(final SAMFileReader reader) {
return detect(DEFAULT_MAX_RECORDS_TO_ITERATE, reader);
}
/**
* Reads through the records in the provided SAM reader and uses their quality scores to sanity check the expected
* quality passed in. If the expected quality format is sane we just hand this back otherwise we throw a
* {@link SAMException}.
*/
public static FastqQualityFormat detect(final SAMFileReader reader, final FastqQualityFormat expectedQualityFormat) {
//sanity check expectedQuality
final QualityEncodingDetector detector = new QualityEncodingDetector();
final long recordCount = detector.add(DEFAULT_MAX_RECORDS_TO_ITERATE, reader.iterator());
log.debug(String.format("Read %s records from %s.", recordCount, reader));
return detector.generateBestGuess(FileContext.SAM, expectedQualityFormat);
}
/**
* Make the best guess at the quality format. If an expected quality is passed in the values are sanity checked
* (ignoring expected range) and if they are deemed acceptable the expected quality is passed back. Otherwise we use
* a set of heuristics to make our best guess.
*/
public FastqQualityFormat generateBestGuess(final FileContext context, final FastqQualityFormat expectedQuality) {
final EnumSet<FastqQualityFormat> possibleFormats;
if (null != expectedQuality) {
possibleFormats = this.generateCandidateQualities(false);
if (possibleFormats.contains(expectedQuality)) {
return expectedQuality;
} else {
throw new SAMException(
String.format("The quality values do not fall in the range appropriate for the expected quality of %s.",
expectedQuality.name()));
}
} else {
possibleFormats = this.generateCandidateQualities(true);
switch (possibleFormats.size()) {
case 1:
return possibleFormats.iterator().next();
case 2:
if (possibleFormats.equals(EnumSet.of(FastqQualityFormat.Illumina, FastqQualityFormat.Solexa))) {
return FastqQualityFormat.Illumina;
} else if (possibleFormats.equals(EnumSet.of(FastqQualityFormat.Illumina, FastqQualityFormat.Standard))) {
switch (context) {
case FASTQ:
return FastqQualityFormat.Illumina;
case SAM:
return FastqQualityFormat.Standard;
}
} else if (possibleFormats.equals(EnumSet.of(FastqQualityFormat.Standard, FastqQualityFormat.Solexa))) {
return FastqQualityFormat.Standard;
} else throw new SAMException("Unreachable code.");
case 3:
throw new SAMException("The quality format cannot be determined: no formats were excluded.");
case 0:
throw new SAMException("The quality format cannot be determined: all formats were excluded.");
default:
throw new SAMException("Unreachable code.");
}
}
}
}