/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import java.util.Arrays;
import java.util.List;
import static htsjdk.samtools.GenomicIndexUtil.MAX_BINS;
/**
* Builder for a BinningIndexContent object.
*/
public class BinningIndexBuilder {
    /**
     * Marker stored in {@link #index} for a 16k window that no feature has touched yet.
     * Zero cannot serve as the marker because a chunk that begins at the very start of the
     * file has a chunk start of 0, and that value must survive as the window's minimum.
     */
    private static final long UNINITIALIZED_WINDOW = -1;

    private final int referenceSequence;

    // the bins for the current reference
    private final Bin[] bins; // made only as big as needed for each reference
    private int binsSeen = 0;

    // linear index for the current reference: smallest chunk start seen per 16k window
    private final long[] index = new long[LinearIndex.MAX_LINEAR_INDEX_SIZE];
    private int largestIndexSeen = -1;

    /**
     * Creates a builder for one reference sequence.
     *
     * @param referenceSequence the reference sequence identifier that is passed on to every
     *                          {@link Bin}, the {@link LinearIndex} and the resulting
     *                          {@link BinningIndexContent}
     * @param sequenceLength 0 (or negative) implies unknown length. Known length will reduce memory use,
     *                       because the bin array is sized for that length instead of for MAX_BINS.
     */
    public BinningIndexBuilder(final int referenceSequence, final int sequenceLength) {
        this.referenceSequence = referenceSequence;
        final int numBins;
        if (sequenceLength <= 0) {
            numBins = MAX_BINS + 1;
        } else {
            numBins = AbstractBAMFileIndex.getMaxBinNumberForSequenceLength(sequenceLength) + 1;
        }
        bins = new Bin[numBins];
        // Every window starts out explicitly "unset" so that a genuine offset of 0 is never
        // mistaken for an empty slot.
        Arrays.fill(index, UNINITIALIZED_WINDOW);
    }

    /**
     * Creates a builder for a reference sequence of unknown length.
     *
     * @param referenceSequence see {@link #BinningIndexBuilder(int, int)}
     */
    public BinningIndexBuilder(final int referenceSequence) {
        this(referenceSequence, 0);
    }

    /**
     * A feature that can be added to the index.
     * Coordinates are 1-based, inclusive.
     */
    public interface FeatureToBeIndexed {
        /** @return start position of the feature */
        public int getStart();

        /**
         * @return end position of the feature. A value equal to
         * {@link GenomicIndexUtil#UNSET_GENOMIC_LOCATION} makes the linear index treat the feature
         * as occupying a single position; a value &lt;= 0 makes the bin computation treat it as
         * one base long.
         */
        public int getEnd();

        /** @return the bin number to file this feature under, or null to have it computed from start and end */
        public Integer getIndexingBin();

        /** @return the chunk (start and end file offsets) occupied by this feature */
        public Chunk getChunk();
    }

    /**
     * Records one feature: files its chunk under the appropriate bin (coalescing with the
     * bin's previous chunk when possible) and updates the linear index for every 16k window
     * the feature overlaps.
     *
     * @param feature the feature to add
     */
    public void processFeature(final FeatureToBeIndexed feature) {
        // process bins
        final Integer binNumber = feature.getIndexingBin();
        final int binNum = binNumber == null ? computeIndexingBin(feature) : binNumber;
        final Bin bin = getOrCreateBin(binNum);

        // process chunks
        final Chunk newChunk = feature.getChunk();
        final long chunkStart = newChunk.getChunkStart();
        final long chunkEnd = newChunk.getChunkEnd();

        if (!bin.containsChunks()) {
            bin.addInitialChunk(newChunk);
        } else {
            final Chunk lastChunk = bin.getLastChunk();
            // Coalesce chunks that are in the same or adjacent file blocks.
            // Similar to AbstractBAMFileIndex.optimizeChunkList,
            // but no need to copy the list, no minimumOffset, and maintain bin.lastChunk
            if (BlockCompressedFilePointerUtil.areInSameOrAdjacentBlocks(lastChunk.getChunkEnd(), chunkStart)) {
                lastChunk.setChunkEnd(chunkEnd); // coalesced
            } else {
                bin.getChunkList().add(newChunk);
                bin.setLastChunk(newChunk);
            }
        }

        // process linear index
        updateLinearIndex(feature, chunkStart);
    }

    /**
     * Creates the BinningIndexContent for this reference.
     * Requires all features of the reference have already been processed.
     *
     * @return the index content, or null if no feature was processed for this reference
     */
    public BinningIndexContent generateIndexContent() {
        // process bins
        if (binsSeen == 0) {
            return null; // no bins for this reference
        }

        // process chunks
        // nothing needed

        // process linear index
        // linear index will only be as long as the largest index seen
        final long[] newIndex = new long[largestIndexSeen + 1];

        // C (samtools index) also fills in untouched windows with the preceding value.
        // This seems unnecessary, but safe.
        // note, if you remove the fill-in BAMIndexWriterTest.compareTextual and compareBinary will have to change
        // The fill-in goes only into the copy: writing it back into the working array would
        // make an untouched window look like one that really holds that offset.
        long lastSetOffset = 0;
        for (int i = 0; i <= largestIndexSeen; i++) {
            if (index[i] == UNINITIALIZED_WINDOW) {
                newIndex[i] = lastSetOffset;
            } else {
                lastSetOffset = index[i];
                newIndex[i] = lastSetOffset;
            }
        }

        final LinearIndex linearIndex = new LinearIndex(referenceSequence, 0, newIndex);
        return new BinningIndexContent(referenceSequence, new BinningIndexContent.BinList(bins, binsSeen), linearIndex);
    }

    /** Returns the bin already stored for binNum, creating and counting a new one if there is none. */
    private Bin getOrCreateBin(final int binNum) {
        Bin bin = bins[binNum];
        if (bin == null) {
            bin = new Bin(referenceSequence, binNum);
            bins[binNum] = bin;
            binsSeen++;
        }
        return bin;
    }

    /** Lowers the linear index entry of every 16k window the feature overlaps to chunkStart where that is smaller. */
    private void updateLinearIndex(final FeatureToBeIndexed feature, final long chunkStart) {
        final int featureEnd = feature.getEnd();
        final int startWindow;
        final int endWindow;
        if (featureEnd == GenomicIndexUtil.UNSET_GENOMIC_LOCATION) { // assume feature uses one position
            // Start - 1 is for C (samtools index) compatibility. Differs only when on a window boundary
            startWindow = LinearIndex.convertToLinearIndexOffset(feature.getStart() - 1);
            endWindow = startWindow;
        } else {
            startWindow = LinearIndex.convertToLinearIndexOffset(feature.getStart());
            endWindow = LinearIndex.convertToLinearIndexOffset(featureEnd);
        }

        largestIndexSeen = Math.max(largestIndexSeen, endWindow);

        // keep the smallest file offset that appears in each 16k window
        for (int win = startWindow; win <= endWindow; win++) {
            if (index[win] == UNINITIALIZED_WINDOW || chunkStart < index[win]) {
                index[win] = chunkStart;
            }
        }
    }

    /** Computes the bin for a feature that did not supply one, from its start and end. */
    private int computeIndexingBin(final FeatureToBeIndexed feature) {
        // reg2bin has zero-based, half-open API
        final int start = feature.getStart() - 1;
        int end = feature.getEnd();
        if (end <= 0) {
            // If feature end cannot be determined (e.g. because a read is not really aligned),
            // then treat this as a one base feature for indexing purposes.
            end = start + 1;
        }
        return GenomicIndexUtil.reg2bin(start, end);
    }
}