/*******************************************************************************
* Copyright 2013 EMBL-EBI
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package htsjdk.samtools.cram.lossy;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.encoding.readfeatures.BaseQualityScore;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
import htsjdk.samtools.cram.ref.ReferenceTracks;
import htsjdk.samtools.cram.structure.CramCompressionRecord;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
public class QualityScorePreservation {
private final List<PreservationPolicy> policyList;
private enum TYPE {
LOSSLESS, LOSSY, DROP_ALL
};
private TYPE type;
@Deprecated
public QualityScorePreservation(final String specification) {
this(parsePolicies(specification));
}
public QualityScorePreservation(List<PreservationPolicy> policies) {
policyList = policies;
type = TYPE.LOSSY;
}
public static QualityScorePreservation lossless() {
QualityScorePreservation p = new QualityScorePreservation("");
p.type = TYPE.LOSSLESS;
return p;
}
public static QualityScorePreservation dropAll() {
QualityScorePreservation p = new QualityScorePreservation("");
p.type = TYPE.DROP_ALL;
return p;
}
public static QualityScorePreservation lossyFromSpec(String spec) {
return new QualityScorePreservation(parsePolicies(spec));
}
public List<PreservationPolicy> getPreservationPolicies() {
return policyList;
}
public boolean isLossless() {
return type == TYPE.LOSSLESS;
}
public boolean isDropAll() {
return type == TYPE.DROP_ALL;
}
@Override
public String toString() {
if (isLossless())
return "lossless";
if (isDropAll())
return "drop all";
StringBuffer sb = new StringBuffer();
for (PreservationPolicy p : policyList) {
sb.append("[");
if (p.readCategory != null) {
sb.append("reads:").append(p.readCategory.type.name());
switch (p.readCategory.type) {
case HIGHER_MAPPING_SCORE:
case LOWER_MAPPING_SCORE:
sb.append(",").append(p.readCategory.param);
break;
default:
break;
}
}
if (p.baseCategories != null && !p.baseCategories.isEmpty())
sb.append(" bases");
for (final BaseCategory c : p.baseCategories) {
sb.append(":").append(c.type.name());
switch (c.type) {
case LOWER_COVERAGE:
case PILEUP:
sb.append("(").append(c.param).append(")");
break;
default:
break;
}
}
sb.append(" treatment:");
switch (p.treatment.type) {
case BIN:
sb.append("BIN");
break;
case DROP:
sb.append("DROP");
break;
case PRESERVE:
sb.append("PRESERVE");
break;
default:
break;
}
sb.append("]");
}
return sb.toString();
}
private static int readParam(final LinkedList<Character> list) {
int value = 0;
while (!list.isEmpty() && Character.isDigit(list.getFirst()))
value = value * 10 + (list.removeFirst() - 48);
return value;
}
private static QualityScoreTreatment readTreatment(final LinkedList<Character> list) {
final int param = readParam(list);
final QualityScoreTreatment t;
switch (param) {
case 0:
t = QualityScoreTreatment.drop();
break;
case 40:
t = QualityScoreTreatment.preserve();
break;
default:
t = QualityScoreTreatment.bin(param);
break;
}
return t;
}
public static List<PreservationPolicy> parsePolicies(final String spec) {
final List<PreservationPolicy> policyList = new ArrayList<PreservationPolicy>();
for (final String s : spec.split("-")) {
if (s.length() == 0)
continue;
final PreservationPolicy policy = parseSinglePolicy(s);
policyList.add(policy);
}
Collections.sort(policyList, new Comparator<PreservationPolicy>() {
@Override
public int compare(final PreservationPolicy o1, final PreservationPolicy o2) {
final QualityScoreTreatment t1 = o1.treatment;
final QualityScoreTreatment t2 = o2.treatment;
final int result = t2.type.ordinal() - t1.type.ordinal();
if (result != 0)
return result;
return 0;
}
});
return policyList;
}
private static PreservationPolicy parseSinglePolicy(final String spec) {
final PreservationPolicy p = new PreservationPolicy();
final LinkedList<Character> list = new LinkedList<Character>();
for (final char b : spec.toCharArray())
list.add(b);
while (!list.isEmpty()) {
final char code = list.removeFirst();
switch (code) {
case 'R':
p.baseCategories.add(BaseCategory.match());
p.treatment = readTreatment(list);
break;
case 'N':
p.baseCategories.add(BaseCategory.mismatch());
p.treatment = readTreatment(list);
break;
case 'X':
final int coverage = readParam(list);
p.baseCategories.add(BaseCategory.lowerThanCoverage(coverage));
break;
case 'D':
p.baseCategories.add(BaseCategory.flankingDeletion());
p.treatment = readTreatment(list);
break;
case 'M':
int score = readParam(list);
p.readCategory = ReadCategory.higher_than_mapping_score(score);
break;
case 'm':
score = readParam(list);
p.readCategory = ReadCategory.lower_than_mapping_score(score);
break;
case 'U':
p.readCategory = ReadCategory.unplaced();
p.treatment = readTreatment(list);
break;
case 'P':
final int mismatches = readParam(list);
p.baseCategories.add(BaseCategory.pileup(mismatches));
p.treatment = readTreatment(list);
break;
case 'I':
p.baseCategories.add(BaseCategory.insertion());
p.treatment = readTreatment(list);
break;
case '_':
p.treatment = readTreatment(list);
break;
case '*':
p.readCategory = ReadCategory.all();
p.treatment = readTreatment(list);
break;
default:
throw new RuntimeException("Unknown read or base category: " + code);
}
if (p.treatment == null)
p.treatment = QualityScoreTreatment.preserve();
}
return p;
}
private static void applyBinning(final byte[] scores) {
for (int i = 0; i < scores.length; i++)
scores[i] = Binning.Illumina_binning_matrix[scores[i]];
}
private static byte applyTreatment(final byte score, final QualityScoreTreatment t) {
switch (t.type) {
case BIN:
return Binning.Illumina_binning_matrix[score];
case DROP:
return -1;
case PRESERVE:
return score;
}
throw new RuntimeException("Unknown quality score treatment type: " + t.type.name());
}
public void addQualityScores(final SAMRecord s, final CramCompressionRecord r, final ReferenceTracks t) {
if (s.getBaseQualities() == SAMRecord.NULL_QUALS) {
r.qualityScores = SAMRecord.NULL_QUALS;
r.setForcePreserveQualityScores(false);
return;
}
final byte[] scores = new byte[s.getReadLength()];
Arrays.fill(scores, (byte) -1);
for (final PreservationPolicy p : policyList)
addQS(s, r, scores, t, p);
if (!r.isForcePreserveQualityScores()) {
for (int i = 0; i < scores.length; i++) {
if (scores[i] > -1) {
if (r.readFeatures == null || r.readFeatures == Collections.EMPTY_LIST)
r.readFeatures = new LinkedList<ReadFeature>();
r.readFeatures.add(new BaseQualityScore(i + 1, scores[i]));
}
}
if (r.readFeatures != null)
Collections.sort(r.readFeatures, readFeaturePositionComparator);
}
r.qualityScores = scores;
}
private static final Comparator<ReadFeature> readFeaturePositionComparator = new Comparator<ReadFeature>() {
@Override
public int compare(final ReadFeature o1, final ReadFeature o2) {
return o1.getPosition() - o2.getPosition();
}
};
public boolean areReferenceTracksRequired() {
if (policyList == null || policyList.isEmpty())
return false;
for (final PreservationPolicy p : policyList) {
if (p.baseCategories == null || p.baseCategories.isEmpty())
continue;
for (final BaseCategory c : p.baseCategories) {
switch (c.type) {
case LOWER_COVERAGE:
case PILEUP:
return true;
default:
break;
}
}
}
return false;
}
private static void addQS(final SAMRecord s, final CramCompressionRecord r, final byte[] scores,
final ReferenceTracks t, final PreservationPolicy p) {
final int alSpan = s.getAlignmentEnd() - s.getAlignmentStart();
final byte[] qs = s.getBaseQualities();
// check if read is falling into the read category:
if (p.readCategory != null) {
@SuppressWarnings("UnusedAssignment")
boolean properRead = false;
switch (p.readCategory.type) {
case ALL:
properRead = true;
break;
case UNPLACED:
properRead = s.getReadUnmappedFlag();
break;
case LOWER_MAPPING_SCORE:
properRead = s.getMappingQuality() < p.readCategory.param;
break;
case HIGHER_MAPPING_SCORE:
properRead = s.getMappingQuality() > p.readCategory.param;
break;
default:
throw new RuntimeException("Unknown read category: " + p.readCategory.type.name());
}
if (!properRead) // nothing to do here:
return;
}
// apply treatment if there is no per-base policy:
if (p.baseCategories == null || p.baseCategories.isEmpty()) {
switch (p.treatment.type) {
case BIN:
if (r.qualityScores == null)
r.qualityScores = s.getBaseQualities();
System.arraycopy(s.getBaseQualities(), 0, scores, 0, scores.length);
applyBinning(scores);
r.setForcePreserveQualityScores(true);
break;
case PRESERVE:
System.arraycopy(s.getBaseQualities(), 0, scores, 0, scores.length);
r.setForcePreserveQualityScores(true);
break;
case DROP:
r.qualityScores = null;
r.setForcePreserveQualityScores(false);
break;
default:
throw new RuntimeException("Unknown quality score treatment type: " + p.treatment.type.name());
}
// nothing else to do here:
return;
}
// here we go, scan all bases to check if the policy applies:
final boolean[] mask = new boolean[qs.length];
final int alStart = s.getAlignmentStart();
// must be a mapped read at this point:
if (alStart == SAMRecord.NO_ALIGNMENT_START)
return;
t.ensureRange(alStart, alSpan);
for (final BaseCategory c : p.baseCategories) {
int pos;
int refPos;
switch (c.type) {
case FLANKING_DELETION:
pos = 0;
for (final CigarElement ce : s.getCigar().getCigarElements()) {
if (ce.getOperator() == CigarOperator.D) {
// if (pos > 0)
mask[pos] = true;
if (pos + 1 < mask.length)
mask[pos + 1] = true;
}
pos += ce.getOperator().consumesReadBases() ? ce.getLength() : 0;
}
break;
case MATCH:
case MISMATCH:
pos = 0;
refPos = s.getAlignmentStart();
for (final CigarElement ce : s.getCigar().getCigarElements()) {
if (ce.getOperator().consumesReadBases()) {
for (int i = 0; i < ce.getLength(); i++) {
final boolean match = s.getReadBases()[pos + i] == t.baseAt(refPos + i);
mask[pos + i] = (c.type == BaseCategoryType.MATCH && match)
|| (c.type == BaseCategoryType.MISMATCH && !match);
}
pos += ce.getLength();
}
refPos += ce.getOperator().consumesReferenceBases() ? ce.getLength() : 0;
}
break;
case INSERTION:
pos = 0;
for (final CigarElement ce : s.getCigar().getCigarElements()) {
switch (ce.getOperator()) {
case I:
for (int i = 0; i < ce.getLength(); i++)
mask[pos + i] = true;
break;
default:
break;
}
pos += ce.getOperator().consumesReadBases() ? ce.getLength() : 0;
}
break;
case LOWER_COVERAGE:
pos = 1;
refPos = s.getAlignmentStart();
for (final CigarElement ce : s.getCigar().getCigarElements()) {
switch (ce.getOperator()) {
case M:
case EQ:
case X:
for (int i = 0; i < ce.getLength(); i++)
mask[pos + i - 1] = t.coverageAt(refPos + i) < c.param;
break;
default:
break;
}
pos += ce.getOperator().consumesReadBases() ? ce.getLength() : 0;
refPos += ce.getOperator().consumesReferenceBases() ? ce.getLength() : 0;
}
break;
case PILEUP:
for (int i = 0; i < qs.length; i++)
if (t.mismatchesAt(alStart + i) > c.param)
mask[i] = true;
break;
default:
break;
}
}
int maskedCount = 0;
for (int i = 0; i < mask.length; i++)
if (mask[i]) {
scores[i] = applyTreatment(qs[i], p.treatment);
maskedCount++;
}
// safety latch, store all qs if there are too many individual score
// to store:
if (maskedCount > s.getReadLength() / 2)
r.setForcePreserveQualityScores(true);
}
}