/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools.reference; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.IOUtil; import java.io.File; import java.io.FileNotFoundException; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import java.util.Scanner; import java.util.regex.MatchResult; /** * Reads a fasta index file (.fai), as generated by `samtools faidx`. */ public class FastaSequenceIndex implements Iterable<FastaSequenceIndexEntry> { /** * Store the entries. Use a LinkedHashMap for consistent iteration in insertion order. */ private final Map<String,FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<String,FastaSequenceIndexEntry>(); /** * Build a sequence index from the specified file. * @param indexFile File to open. * @throws FileNotFoundException if the index file cannot be found. */ public FastaSequenceIndex( File indexFile ) { IOUtil.assertFileIsReadable(indexFile); parseIndexFile(indexFile); } /** * Empty, protected constructor for unit testing. */ protected FastaSequenceIndex() {} /** * Add a new index entry to the list. Protected for unit testing. * @param indexEntry New index entry to add. */ protected void add(FastaSequenceIndexEntry indexEntry) { final FastaSequenceIndexEntry ret = sequenceEntries.put(indexEntry.getContig(),indexEntry); if (ret != null) { throw new SAMException("Contig '" + indexEntry.getContig() + "' already exists in fasta index."); } } /** * Renames the existing index entry to the new index entry with the specified name. * @param entry entry to update. * @param newName New name for the index entry. */ protected void rename(FastaSequenceIndexEntry entry,String newName) { sequenceEntries.remove(entry.getContig()); entry.setContig(newName); add(entry); } /** * Compare two FastaSequenceIndex objects for equality. * @param other Another FastaSequenceIndex to compare * @return True if index has the same entries as other instance, in the same order */ public boolean equals(Object other) { if(!(other instanceof FastaSequenceIndex)) return false; if (this == other) return true; FastaSequenceIndex otherIndex = (FastaSequenceIndex)other; if(this.size() != otherIndex.size()) return false; Iterator<FastaSequenceIndexEntry> iter = this.iterator(); Iterator<FastaSequenceIndexEntry> otherIter = otherIndex.iterator(); while (iter.hasNext()) { if (!otherIter.hasNext()) return false; if (!iter.next().equals(otherIter.next())) return false; } return true; } /** * Parse the contents of an index file, caching the results internally. * @param indexFile File to parse. * @throws FileNotFoundException Thrown if file could not be opened. */ private void parseIndexFile(File indexFile) { try { Scanner scanner = new Scanner(indexFile); int sequenceIndex = 0; while( scanner.hasNext() ) { // Tokenize and validate the index line. String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)"); if( result == null ) throw new SAMException("Found invalid line in index file:" + scanner.nextLine()); MatchResult tokens = scanner.match(); if( tokens.groupCount() != 5 ) throw new SAMException("Found invalid line in index file:" + scanner.nextLine()); // Skip past the line separator scanner.nextLine(); // Parse the index line. String contig = tokens.group(1); long size = Long.valueOf(tokens.group(2)); long location = Long.valueOf(tokens.group(3)); int basesPerLine = Integer.valueOf(tokens.group(4)); int bytesPerLine = Integer.valueOf(tokens.group(5)); contig = SAMSequenceRecord.truncateSequenceName(contig); // Build sequence structure add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) ); } scanner.close(); } catch (FileNotFoundException e) { throw new SAMException("Fasta index file should be found but is not: " + indexFile, e); } } /** * Does the given contig name have a corresponding entry? * @param contigName The contig name for which to search. * @return True if contig name is present; false otherwise. */ public boolean hasIndexEntry( String contigName ) { return sequenceEntries.containsKey(contigName); } /** * Retrieve the index entry associated with the given contig. * @param contigName Name of the contig for which to search. * @return Index entry associated with the given contig. * @throws SAMException if the associated index entry can't be found. */ public FastaSequenceIndexEntry getIndexEntry( String contigName ) { if( !hasIndexEntry(contigName) ) throw new SAMException("Unable to find entry for contig: " + contigName); return sequenceEntries.get(contigName); } /** * Creates an iterator which can iterate through all entries in a fasta index. * @return iterator over all fasta index entries. */ public Iterator<FastaSequenceIndexEntry> iterator() { return sequenceEntries.values().iterator(); } /** * Returns the number of elements in the index. * @return Number of elements in the index. */ public int size() { return sequenceEntries.size(); } } /** * Hold an individual entry in a fasta sequence index file. */ class FastaSequenceIndexEntry { private String contig; private long location; private long size; private int basesPerLine; private int bytesPerLine; private final int sequenceIndex; /** * Create a new entry with the given parameters. * @param contig Contig this entry represents. * @param location Location (byte coordinate) in the fasta file. * @param size The number of bases in the contig. * @param basesPerLine How many bases are on each line. * @param bytesPerLine How many bytes are on each line (includes newline characters). */ public FastaSequenceIndexEntry( String contig, long location, long size, int basesPerLine, int bytesPerLine, int sequenceIndex) { this.contig = contig; this.location = location; this.size = size; this.basesPerLine = basesPerLine; this.bytesPerLine = bytesPerLine; this.sequenceIndex = sequenceIndex; } /** * Gets the contig associated with this entry. * @return String representation of the contig. */ public String getContig() { return contig; } /** * Sometimes contigs need to be adjusted on-the-fly to * match sequence dictionary entries. Provide that capability * to other classes w/i the package. * @param contig New value for the contig. */ protected void setContig(String contig) { this.contig = contig; } /** * Gets the location of this contig within the fasta. * @return seek position within the fasta. */ public long getLocation() { return location; } /** * Gets the size, in bytes, of the data in the contig. * @return size of the contig bases in bytes. */ public long getSize() { return size; } /** * Gets the number of bases in a given line. * @return Number of bases in the fasta line. */ public int getBasesPerLine() { return basesPerLine; } /** * How many bytes (bases + whitespace) are consumed by the * given line? * @return Number of bytes in a line. */ public int getBytesPerLine() { return bytesPerLine; } public int getSequenceIndex() { return sequenceIndex; } /** * For debugging. Emit the contents of each contig line. * @return A string representation of the contig line. */ public String toString() { return String.format("contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d", contig, location, size, basesPerLine, bytesPerLine ); } /** * Compare this index entry to another index entry. * @param other another FastaSequenceIndexEntry * @return True if each has the same name, location, size, basesPerLine and bytesPerLine */ public boolean equals(Object other) { if(!(other instanceof FastaSequenceIndexEntry)) return false; if (this == other) return true; FastaSequenceIndexEntry otherEntry = (FastaSequenceIndexEntry)other; return (contig.equals(otherEntry.contig) && size == otherEntry.size && location == otherEntry.location && basesPerLine == otherEntry.basesPerLine && bytesPerLine == otherEntry.bytesPerLine); } /** * In general, we expect one entry per contig, so compute the hash based only on the contig. * @return A unique hash code representing this object. */ public int hashCode() { return contig.hashCode(); } }