/**
* ****************************************************************************
* Copyright 2013 EMBL-EBI
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ****************************************************************************
*/
package htsjdk.samtools.cram.build;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.encoding.readfeatures.BaseQualityScore;
import htsjdk.samtools.cram.encoding.readfeatures.Deletion;
import htsjdk.samtools.cram.encoding.readfeatures.InsertBase;
import htsjdk.samtools.cram.encoding.readfeatures.Insertion;
import htsjdk.samtools.cram.encoding.readfeatures.ReadBase;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
import htsjdk.samtools.cram.encoding.readfeatures.RefSkip;
import htsjdk.samtools.cram.encoding.readfeatures.SoftClip;
import htsjdk.samtools.cram.encoding.readfeatures.Substitution;
import htsjdk.samtools.cram.ref.CRAMReferenceSource;
import htsjdk.samtools.cram.structure.AlignmentSpan;
import htsjdk.samtools.cram.structure.CramCompressionRecord;
import htsjdk.samtools.cram.structure.SubstitutionMatrix;
import htsjdk.samtools.util.Log;
import net.sf.cram.ref.ReferenceRegion;
public class CramNormalizer {
private final SAMFileHeader header;
private int readCounter = 0;
private static Log log = Log.getInstance(CramNormalizer.class);
private CRAMReferenceSource referenceSource;
private CramNormalizer(final SAMFileHeader header) {
this.header = header;
}
public CramNormalizer(final SAMFileHeader header, final CRAMReferenceSource referenceSource) {
if (referenceSource == null) {
throw new IllegalArgumentException("A reference is required.");
}
this.header = header;
this.referenceSource = referenceSource;
}
public void normalize(final ArrayList<CramCompressionRecord> records, final byte[] ref,
final int refOffset_zeroBased, final SubstitutionMatrix substitutionMatrix) {
final int startCounter = readCounter;
for (final CramCompressionRecord record : records) {
record.index = ++readCounter;
if (record.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
record.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME;
record.alignmentStart = SAMRecord.NO_ALIGNMENT_START;
} else {
record.sequenceName = header.getSequence(record.sequenceId).getSequenceName();
}
}
{// restore pairing first:
for (final CramCompressionRecord record : records) {
if (!record.isMultiFragment() || record.isDetached()) {
record.recordsToNextFragment = -1;
record.next = null;
record.previous = null;
continue;
}
if (record.isHasMateDownStream()) {
final CramCompressionRecord downMate = records
.get(record.index + record.recordsToNextFragment - startCounter);
record.next = downMate;
downMate.previous = record;
}
}
for (final CramCompressionRecord record : records) {
if (record.previous != null)
continue;
if (record.next == null)
continue;
restoreMateInfo(record);
}
}
// assign some read names if needed:
for (final CramCompressionRecord record : records) {
if (record.readName == null) {
final String readNamePrefix = "";
final String name = readNamePrefix + record.index;
record.readName = name;
if (record.next != null)
record.next.readName = name;
if (record.previous != null)
record.previous.readName = name;
}
}
// resolve bases:
for (final CramCompressionRecord record : records) {
if (record.isSegmentUnmapped())
continue;
byte[] refBases = ref;
{
// ref could be supplied (aka forced) already or needs looking
// up:
// ref.length=0 is a special case of seqId=-2 (multiref)
if ((ref == null || ref.length == 0) && referenceSource != null)
refBases = referenceSource.getReferenceBases(header.getSequence(record.sequenceId), true);
}
if (record.isUnknownBases()) {
record.readBases = SAMRecord.NULL_SEQUENCE;
} else
record.readBases = restoreReadBases(record, refBases, refOffset_zeroBased, substitutionMatrix);
}
// restore quality scores:
final byte defaultQualityScore = '?' - '!';
restoreQualityScores(defaultQualityScore, records);
}
public void normalizeRecordsForReferenceSource(final ArrayList<CramCompressionRecord> records,
final net.sf.cram.ref.ReferenceSource referenceSource, final SubstitutionMatrix substitutionMatrix)
throws IOException {
final int startCounter = readCounter;
for (final CramCompressionRecord record : records) {
record.index = ++readCounter;
if (record.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
record.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME;
record.alignmentStart = SAMRecord.NO_ALIGNMENT_START;
} else {
record.sequenceName = header.getSequence(record.sequenceId).getSequenceName();
}
}
{// restore pairing first:
for (final CramCompressionRecord record : records) {
if (!record.isMultiFragment() || record.isDetached()) {
record.recordsToNextFragment = -1;
record.next = null;
record.previous = null;
continue;
}
if (record.isHasMateDownStream()) {
final CramCompressionRecord downMate = records
.get(record.index + record.recordsToNextFragment - startCounter);
record.next = downMate;
downMate.previous = record;
}
}
for (final CramCompressionRecord record : records) {
if (record.previous != null)
continue;
if (record.next == null)
continue;
restoreMateInfo(record);
}
}
// assign some read names if needed:
for (final CramCompressionRecord record : records) {
if (record.readName == null) {
final String readNamePrefix = "";
final String name = readNamePrefix + record.index;
record.readName = name;
if (record.next != null)
record.next.readName = name;
if (record.previous != null)
record.previous.readName = name;
}
}
// resolve bases:
restoreBases(records, header, referenceSource, substitutionMatrix);
// restore quality scores:
final byte defaultQualityScore = '?' - '!';
restoreQualityScores(defaultQualityScore, records);
}
public static void restoreBases(final List<CramCompressionRecord> records, SAMFileHeader header,
final net.sf.cram.ref.ReferenceSource referenceSource, final SubstitutionMatrix substitutionMatrix)
throws IOException {
Map<Integer, AlignmentSpan> map = new HashMap<Integer, AlignmentSpan>();
for (final CramCompressionRecord record : records) {
if (!record.isSegmentUnmapped() || record.readBases == null) {
int end = record.getAlignmentEnd();
if (record.alignmentStart > SAMRecord.NO_ALIGNMENT_START && end >= record.alignmentStart)
addSpan(record.sequenceId, record.alignmentStart, end - record.alignmentStart + 1, 1, map);
}
}
log.debug("Found refs: " + map.size());
for (Entry<Integer, AlignmentSpan> entry : map.entrySet()) {
SAMSequenceRecord sequence = header.getSequence(entry.getKey());
ReferenceRegion region = referenceSource.getRegion(sequence, entry.getValue().getStart(),
entry.getValue().getStart() + entry.getValue().getSpan() - 1);
if (region == null) {
throw new RuntimeException("Reference sequence required but not found: " + sequence.getSequenceName()
+ ", md5=" + sequence.getMd5());
}
for (final CramCompressionRecord record : records) {
if (record.sequenceId != sequence.getSequenceIndex())
continue;
if (record.isUnknownBases()) {
record.readBases = SAMRecord.NULL_SEQUENCE;
} else {
if (!record.isSegmentUnmapped() || record.readBases == null)
record.readBases = restoreReadBases(record, region.array, (int) region.alignmentStart - 1,
substitutionMatrix);
}
}
}
}
private static void addSpan(final int seqId, final int start, final int span, final int count,
final Map<Integer, AlignmentSpan> map) {
if (map.containsKey(seqId)) {
map.get(seqId).add(start, span, count);
} else {
map.put(seqId, new AlignmentSpan(start, span, count));
}
}
private static void restoreMateInfo(final CramCompressionRecord record) {
if (record.next == null) {
return;
}
CramCompressionRecord cur;
cur = record;
while (cur.next != null) {
setNextMate(cur, cur.next);
cur = cur.next;
}
// cur points to the last segment now:
final CramCompressionRecord last = cur;
setNextMate(last, record);
// record.setFirstSegment(true);
// last.setLastSegment(true);
final int templateLength = computeInsertSize(record, last);
record.templateSize = templateLength;
last.templateSize = -templateLength;
}
private static void setNextMate(final CramCompressionRecord record, final CramCompressionRecord next) {
record.mateAlignmentStart = next.alignmentStart;
record.setMateUnmapped(next.isSegmentUnmapped());
record.setMateNegativeStrand(next.isNegativeStrand());
record.mateSequenceID = next.sequenceId;
if (record.mateSequenceID == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX)
record.mateAlignmentStart = SAMRecord.NO_ALIGNMENT_START;
}
public static void restoreQualityScores(final byte defaultQualityScore, final List<CramCompressionRecord> records) {
for (final CramCompressionRecord record : records)
restoreQualityScores(defaultQualityScore, record);
}
private static byte[] restoreQualityScores(final byte defaultQualityScore, final CramCompressionRecord record) {
if (!record.isForcePreserveQualityScores()) {
boolean star = true;
final byte[] scores = new byte[record.readLength];
Arrays.fill(scores, defaultQualityScore);
if (record.readFeatures != null)
for (final ReadFeature feature : record.readFeatures) {
switch (feature.getOperator()) {
case BaseQualityScore.operator:
int pos = feature.getPosition();
scores[pos - 1] = ((BaseQualityScore) feature).getQualityScore();
star = false;
break;
case ReadBase.operator:
pos = feature.getPosition();
scores[pos - 1] = ((ReadBase) feature).getQualityScore();
star = false;
break;
default:
break;
}
}
if (star)
record.qualityScores = SAMRecord.NULL_QUALS;
else
record.qualityScores = scores;
} else {
final byte[] scores = record.qualityScores;
int missingScores = 0;
for (int i = 0; i < scores.length; i++)
if (scores[i] == -1) {
scores[i] = defaultQualityScore;
missingScores++;
}
if (missingScores == scores.length)
record.qualityScores = SAMRecord.NULL_QUALS;
}
return record.qualityScores;
}
private static byte[] restoreReadBases(final CramCompressionRecord record, final byte[] ref,
final int refOffsetZeroBased, final SubstitutionMatrix substitutionMatrix) {
if (record.isUnknownBases() || record.readLength == 0)
return SAMRecord.NULL_SEQUENCE;
final int readLength = record.readLength;
final byte[] bases = new byte[readLength];
int posInRead = 1;
final int alignmentStart = record.alignmentStart - 1;
int posInSeq = 0;
if (record.readFeatures == null || record.readFeatures.isEmpty()) {
if (ref.length + refOffsetZeroBased < alignmentStart + bases.length) {
Arrays.fill(bases, (byte) 'N');
try {
final int offsetOnRef = alignmentStart - refOffsetZeroBased;
final int availableOnRef = ref.length - offsetOnRef;
if (offsetOnRef > 0 && availableOnRef > 0)
System.arraycopy(ref, offsetOnRef, bases, 0, Math.min(bases.length, availableOnRef));
} catch (ArrayIndexOutOfBoundsException e) {
e.printStackTrace();
System.err.printf("ref.length=%d, refOffsetZeroBased=%d, alignmentStart=%d, bases.length=%d.\n",
ref.length, refOffsetZeroBased, alignmentStart, bases.length);
throw e;
}
} else
System.arraycopy(ref, alignmentStart - refOffsetZeroBased, bases, 0, bases.length);
return bases;
}
final List<ReadFeature> variations = record.readFeatures;
for (final ReadFeature variation : variations) {
for (; posInRead < variation.getPosition(); posInRead++) {
final int rp = alignmentStart + posInSeq++ - refOffsetZeroBased;
if (rp < 0)
System.err.printf("alignmentStart=%d, posInSeq=%d, refOffsetZeroBased=%d, posInRead=%d.\n",
alignmentStart, posInSeq, refOffsetZeroBased, posInRead);
bases[posInRead - 1] = getByteOrDefault(ref, rp, (byte) 'N');
}
switch (variation.getOperator()) {
case Substitution.operator:
final Substitution substitution = (Substitution) variation;
if (alignmentStart + posInSeq - refOffsetZeroBased < 0) {
System.out.println("gotcha");
}
byte refBase = getByteOrDefault(ref, alignmentStart + posInSeq - refOffsetZeroBased, (byte) 'N');
refBase = Utils.normalizeBase(refBase);
final byte base = substitutionMatrix.base(refBase, substitution.getCode());
substitution.setBase(base);
substitution.setReferenceBase(refBase);
bases[posInRead++ - 1] = base;
posInSeq++;
break;
case Insertion.operator:
final Insertion insertion = (Insertion) variation;
for (int i = 0; i < insertion.getSequence().length; i++)
bases[posInRead++ - 1] = insertion.getSequence()[i];
break;
case SoftClip.operator:
final SoftClip softClip = (SoftClip) variation;
for (int i = 0; i < softClip.getSequence().length; i++)
bases[posInRead++ - 1] = softClip.getSequence()[i];
break;
case Deletion.operator:
final Deletion deletion = (Deletion) variation;
posInSeq += deletion.getLength();
break;
case InsertBase.operator:
final InsertBase insert = (InsertBase) variation;
bases[posInRead++ - 1] = insert.getBase();
break;
case RefSkip.operator:
posInSeq += ((RefSkip) variation).getLength();
break;
}
}
for (; posInRead <= readLength
&& alignmentStart + posInSeq - refOffsetZeroBased < ref.length; posInRead++, posInSeq++) {
bases[posInRead - 1] = ref[alignmentStart + posInSeq - refOffsetZeroBased];
}
// ReadBase overwrites bases:
for (final ReadFeature variation : variations) {
switch (variation.getOperator()) {
case ReadBase.operator:
final ReadBase readBase = (ReadBase) variation;
bases[variation.getPosition() - 1] = readBase.getBase();
break;
default:
break;
}
}
for (int i = 0; i < bases.length; i++) {
bases[i] = Utils.normalizeBase(bases[i]);
}
return bases;
}
private static byte getByteOrDefault(final byte[] array, final int pos, final byte outOfBoundsValue) {
if (pos >= array.length)
return outOfBoundsValue;
else
return array[pos];
}
/**
* The method is similar in semantics to
* {@link htsjdk.samtools.SamPairUtil#computeInsertSize(SAMRecord, SAMRecord)
* computeInsertSize} but operates on CRAM native records instead of
* SAMRecord objects.
*
* @param firstEnd
* first mate of the pair
* @param secondEnd
* second mate of the pair
* @return template length
*/
public static int computeInsertSize(final CramCompressionRecord firstEnd, final CramCompressionRecord secondEnd) {
if (firstEnd.isSegmentUnmapped() || secondEnd.isSegmentUnmapped()) {
return 0;
}
if (firstEnd.sequenceId != secondEnd.sequenceId) {
return 0;
}
final int firstEnd5PrimePosition = firstEnd.isNegativeStrand() ? firstEnd.getAlignmentEnd()
: firstEnd.alignmentStart;
final int secondEnd5PrimePosition = secondEnd.isNegativeStrand() ? secondEnd.getAlignmentEnd()
: secondEnd.alignmentStart;
final int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1;
return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment;
}
}