/*
* The MIT License
*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sub-license, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;
import htsjdk.samtools.util.Log;
import java.io.File;
import java.io.OutputStream;
/**
* Class for both constructing BAM index content and writing it out.
* There are two usage patterns:
* 1) Building a bam index from an existing bam file
* 2) Building a bam index while building the bam file
* In both cases, processAlignment is called for each alignment record and
* finish() is called at the end.
*/
public class BAMIndexer {

    // The number of references (chromosomes) in the BAM file
    private final int numReferences;

    // output written as binary, or (for debugging) as text
    private final BAMIndexWriter outputWriter;

    // Index of the reference whose content has not yet been written out.
    private int currentReference = 0;

    // content is built up from the input bam file using this
    private final BAMIndexBuilder indexBuilder;

    /**
     * Prepare to index a BAM, writing the index to a file.
     *
     * @param output     binary BAM Index (.bai) file
     * @param fileHeader header for the corresponding bam file
     */
    public BAMIndexer(final File output, final SAMFileHeader fileHeader) {
        numReferences = fileHeader.getSequenceDictionary().size();
        indexBuilder = new BAMIndexBuilder(fileHeader.getSequenceDictionary());
        outputWriter = new BinaryBAMIndexWriter(numReferences, output);
    }

    /**
     * Prepare to index a BAM, writing the index to a stream.
     *
     * @param output     Index will be written here. output will be closed when finish() method is called.
     * @param fileHeader header for the corresponding bam file.
     */
    public BAMIndexer(final OutputStream output, final SAMFileHeader fileHeader) {
        numReferences = fileHeader.getSequenceDictionary().size();
        indexBuilder = new BAMIndexBuilder(fileHeader.getSequenceDictionary());
        outputWriter = new BinaryBAMIndexWriter(numReferences, output);
    }

    /**
     * Record any index information for a given BAM record.
     * If this alignment starts a new reference, write out the old reference.
     * Requires a non-null value for rec.getFileSource().
     *
     * @param rec The BAM record
     * @throws SAMException wrapping any exception raised while indexing the record
     */
    public void processAlignment(final SAMRecord rec) {
        try {
            final int reference = rec.getReferenceIndex();
            if (reference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && reference != currentReference) {
                // process any completed references
                advanceToReference(reference);
            }
            indexBuilder.processAlignment(rec);
        } catch (final Exception e) {
            throw new SAMException("Exception creating BAM index for record " + rec, e);
        }
    }

    /**
     * After all the alignment records have been processed, finish is called.
     * Writes any final information and closes the output file.
     * The output is closed even if writing the final information fails.
     */
    public void finish() {
        try {
            // process any remaining references
            advanceToReference(numReferences);
            outputWriter.writeNoCoordinateRecordCount(indexBuilder.getNoCoordinateRecordCount());
        } finally {
            // Close unconditionally so a failure above does not leak the underlying output.
            outputWriter.close();
        }
    }

    /**
     * Closes the output without writing any further content, ignoring failures from the close itself.
     * Used when indexing has already failed and the caller is propagating that failure.
     */
    private void closeAfterFailure() {
        try {
            outputWriter.close();
        } catch (final RuntimeException ignored) {
            // Deliberately ignored: the exception already in flight is the one the caller needs to see.
        }
    }

    /** write out any references between the currentReference and the nextReference */
    private void advanceToReference(final int nextReference) {
        while (currentReference < nextReference) {
            final BAMIndexContent content = indexBuilder.processReference(currentReference);
            outputWriter.writeReference(content);
            currentReference++;
            // Only start accumulating again if there is another reference left in the dictionary.
            if (currentReference < numReferences) {
                indexBuilder.startNewReference();
            }
        }
    }

    /**
     * Generates a BAM index file, either textual or binary, from an input BAI file.
     * Only used for testing, but located here for visibility into CachingBAMFileIndex.
     * Both the existing index and the output writer are closed before this method returns,
     * whether or not writing succeeds.
     *
     * @param input      existing BAM Index (.bai) file whose content is copied
     * @param output     BAM Index (.bai) file (or bai.txt file when text)
     * @param textOutput Whether to create text output or binary
     * @throws SAMException wrapping any exception raised while writing the index content
     */
    public static void createAndWriteIndex(final File input, final File output, final boolean textOutput) {
        // content is from an existing bai file.
        final CachingBAMFileIndex existingIndex = new CachingBAMFileIndex(input, null);
        try {
            final int numRefs = existingIndex.getNumberOfReferences();
            final BAMIndexWriter outputWriter;
            if (textOutput) {
                outputWriter = new TextualBAMIndexWriter(numRefs, output);
            } else {
                outputWriter = new BinaryBAMIndexWriter(numRefs, output);
            }

            try {
                try {
                    // write the content one reference at a time
                    for (int i = 0; i < numRefs; i++) {
                        outputWriter.writeReference(existingIndex.getQueryResults(i));
                    }
                    outputWriter.writeNoCoordinateRecordCount(existingIndex.getNoCoordinateCount());
                } finally {
                    // Close on both success and failure; a failing close is still wrapped below.
                    outputWriter.close();
                }
            } catch (final Exception e) {
                throw new SAMException("Exception creating BAM index", e);
            }
        } finally {
            // Release the source index as well; it is not needed once the copy is done.
            existingIndex.close();
        }
    }

    /**
     * Class for constructing BAM index files.
     * One instance is used to construct an entire index.
     * processAlignment is called for each alignment until a new reference is encountered, then
     * processReference is called when all records for the reference have been processed.
     * <p>
     * Declared static because it keeps its own state and never uses the enclosing BAMIndexer.
     */
    private static class BAMIndexBuilder {

        private final SAMSequenceDictionary sequenceDictionary;

        // Rebuilt for every reference by startNewReference(); stays null for an empty dictionary.
        private BinningIndexBuilder binningIndexBuilder;

        // Starts at -1 so that the first startNewReference() call moves it to reference 0.
        private int currentReference = -1;

        // information in meta data
        private final BAMIndexMetaData indexStats = new BAMIndexMetaData();

        /**
         * @param sequenceDictionary dictionary of the references to be indexed; when it is non-empty
         *                           the builder is immediately positioned on the first reference
         */
        BAMIndexBuilder(final SAMSequenceDictionary sequenceDictionary) {
            this.sequenceDictionary = sequenceDictionary;
            if (!sequenceDictionary.isEmpty()) {
                startNewReference();
            }
        }

        /**
         * Record any index information for a given BAM record
         *
         * @param rec The BAM record. Requires rec.getFileSource() is non-null.
         * @throws SAMException if the record has coordinates but belongs to a reference other than
         *                      the one currently being built
         */
        public void processAlignment(final SAMRecord rec) {
            // metadata
            indexStats.recordMetaData(rec);

            if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) {
                return; // do nothing for records without coordinates, but count them
            }

            // various checks
            final int reference = rec.getReferenceIndex();
            if (reference != currentReference) {
                throw new SAMException("Unexpected reference " + reference +
                        " when constructing index for " + currentReference + " for record " + rec);
            }

            // Adapt the record to the feature view the binning index builder consumes.
            binningIndexBuilder.processFeature(new BinningIndexBuilder.FeatureToBeIndexed() {
                @Override
                public int getStart() {
                    return rec.getAlignmentStart();
                }

                @Override
                public int getEnd() {
                    return rec.getAlignmentEnd();
                }

                @Override
                public Integer getIndexingBin() {
                    // Fall back to computing the bin when the record does not already carry one.
                    final Integer binNumber = rec.getIndexingBin();
                    return (binNumber == null ? rec.computeIndexingBin() : binNumber);
                }

                @Override
                public Chunk getChunk() {
                    final SAMFileSource source = rec.getFileSource();
                    if (source == null) {
                        throw new SAMException("No source (virtual file offsets); needed for indexing on BAM Record " + rec);
                    }
                    return ((BAMFileSpan) source.getFilePointer()).getSingleChunk();
                }
            });
        }

        /**
         * Creates the BAMIndexContent for this reference.
         * Requires all alignments of the reference have already been processed.
         *
         * @param reference index of the reference to finalize; must be the one currently being built
         * @return Null if there are no features for this reference.
         * @throws SAMException if reference is not the reference currently being built
         */
        public BAMIndexContent processReference(final int reference) {
            if (reference != currentReference) {
                throw new SAMException("Unexpected reference " + reference + " when constructing index for " + currentReference);
            }

            final BinningIndexContent indexContent = binningIndexBuilder.generateIndexContent();
            if (indexContent == null) {
                return null;
            }
            return new BAMIndexContent(indexContent.getReferenceSequence(), indexContent.getBins(),
                    indexStats, indexContent.getLinearIndex());
        }

        /**
         * @return the count of records with no coordinate positions
         */
        public long getNoCoordinateRecordCount() {
            return indexStats.getNoCoordinateRecordCount();
        }

        /**
         * reinitialize all data structures when the reference changes
         */
        void startNewReference() {
            ++currentReference;
            // I'm not crazy about recycling this object, but that is the way it was originally written and
            // it helps keep track of no-coordinate read count (which shouldn't be stored in this class anyway).
            indexStats.newReference();
            binningIndexBuilder = new BinningIndexBuilder(currentReference,
                    sequenceDictionary.getSequence(currentReference).getSequenceLength());
        }
    }

    /**
     * Generates a BAM index file from an input BAM file
     *
     * @param reader SAMFileReader for input BAM file
     * @param output File for output index file
     */
    public static void createIndex(SAMFileReader reader, File output) {
        createIndex(reader, output, null);
    }

    /**
     * Generates a BAM index file from an input BAM file
     *
     * @param reader SAMFileReader for input BAM file
     * @param output File for output index file
     * @param log    receives a progress message every 1,000,000 records; may be null to disable
     *               progress reporting
     */
    public static void createIndex(SAMFileReader reader, File output, Log log) {
        final BAMIndexer indexer = new BAMIndexer(output, reader.getFileHeader());
        try {
            reader.enableFileSource(true);

            // long rather than int: an int counter would wrap on inputs with more than
            // Integer.MAX_VALUE records and report negative progress.
            long totalRecords = 0;

            // create and write the content
            for (final SAMRecord rec : reader) {
                if (++totalRecords % 1000000 == 0 && log != null) {
                    log.info(totalRecords + " reads processed ...");
                }
                indexer.processAlignment(rec);
            }
        } catch (final RuntimeException e) {
            // Indexing failed part-way: release the output before propagating the failure.
            indexer.closeAfterFailure();
            throw e;
        }
        // finish() closes the output itself, including when it fails.
        indexer.finish();
    }
}