/* * The MIT License (MIT) * * Copyright (c) 2007-2015 Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package org.broad.igv.feature.genome; import htsjdk.samtools.seekablestream.SeekableStream; import org.apache.log4j.Logger; import org.broad.igv.util.ParsingUtils; import org.broad.igv.util.stream.IGVSeekableStreamFactory; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * Implementation of Sequence backed by an indexed fasta file * * @author jrobinso * @date 8/7/11 */ public class FastaIndexedSequence implements Sequence { static Logger log = Logger.getLogger(FastaIndexedSequence.class); final FastaIndex index; final String path; final long contentLength; private final ArrayList<String> chromoNamesList; public FastaIndexedSequence(String path) throws IOException { this.path = path; contentLength = ParsingUtils.getContentLength(path); String indexPath = path + ".fai"; // The check below is not useful in the files have been copied or moved, which is always the case for our hosted // genomes. It causes lots of spurious warnings // if(ParsingUtils.getLastModified(path) > ParsingUtils.getLastModified(indexPath)){ // log.warn("Index file for " + path + " is older than the file it indexes"); // } index = new FastaIndex(indexPath); chromoNamesList = new ArrayList<String>(index.getSequenceNames()); } /** * Return the sequence for the query interval as a byte array. Coordinates are "ucsc" style (0 based) * <p/> * Example: 5 bases per line, 6 bytes per line * <p/> * Bases 0 1 2 3 4 * | 5 6 7 8 9 * | 10 11 12 13 14 * etc * Offset 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 * Bytes 0 1 2 3 4 5 | 6 7 8 9 10 | 11 12 13 14 15 16 * <p/> * query 9 - 13 * start line = 1 * base0 = 1*5 = 5 * offset = (9 - 5) = 4 * start byte = (1*6) + 3 = 10 * end line = 2 * * @param chr * @param qstart * @param qend * @return */ public byte[] getSequence(String chr, int qstart, int qend) { FastaIndex.FastaSequenceIndexEntry idxEntry = index.getIndexEntry(chr); if (idxEntry == null) { return null; } try { final int start = Math.max(0, qstart); // qstart should never be < 0 final int end = Math.min((int) idxEntry.getSize(), qend); final int bytesPerLine = idxEntry.getBytesPerLine(); final int basesPerLine = idxEntry.getBasesPerLine(); int nEndBytes = bytesPerLine - basesPerLine; int startLine = start / basesPerLine; int endLine = end / basesPerLine; int base0 = startLine * basesPerLine; // Base at beginning of start line int offset = start - base0; final long position = idxEntry.getPosition(); long startByte = position + startLine * bytesPerLine + offset; int base1 = endLine * basesPerLine; int offset1 = end - base1; long endByte = Math.min(contentLength, position + endLine * bytesPerLine + offset1); if (startByte >= endByte) { return null; } // Read all the bytes in the range. This will include endline characters byte[] allBytes = readBytes(startByte, endByte); // Create the array for the sequence -- this will be "allBytes" without the endline characters. ByteArrayOutputStream bos = new ByteArrayOutputStream(end - start); int srcPos = 0; int desPos = 0; // Copy first line final int allBytesLength = allBytes.length; if (offset > 0) { int nBases = Math.min(end - start, basesPerLine - offset); bos.write(allBytes, srcPos, nBases); srcPos += (nBases + nEndBytes); desPos += nBases; } while (srcPos < allBytesLength) { int nBases = Math.min(basesPerLine, allBytesLength - srcPos); bos.write(allBytes, srcPos, nBases); srcPos += (nBases + nEndBytes); desPos += nBases; } return bos.toByteArray(); } catch (IOException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. return null; } } @Override public byte getBase(String chr, int position) { throw new RuntimeException("getBase() is not implemented for class " + FastaIndexedSequence.class.getName()); } /** * Read the bytes between file position posStart and posEnd * * @throws IOException */ private byte[] readBytes(long posStart, long posEnd) throws IOException { SeekableStream ss = null; try { ss = IGVSeekableStreamFactory.getInstance().getStreamFor(path); int nBytes = (int) (posEnd - posStart); byte[] bytes = new byte[nBytes]; ss.seek(posStart); ss.readFully(bytes); return bytes; } finally { if (ss != null) { ss.close(); } } } @Override public List<String> getChromosomeNames() { return chromoNamesList; } @Override public int getChromosomeLength(String chrname) { return index.getSequenceSize(chrname); } }