/* * The MIT License (MIT) * * Copyright (c) 2007-2015 Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package org.broad.igv.bbfile; import htsjdk.samtools.seekablestream.SeekableStream; import org.apache.log4j.Logger; import org.broad.igv.util.LittleEndianInputStream; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOException; import java.util.ArrayList; /** * Created by IntelliJ IDEA. * User: martind * Date: Jan 6, 2010 * Time: 4:05:31 PM * To change this template use File | Settings | File Templates. */ /* * RPTree class will construct a R+ tree from a binary Bed/Wig BBFile. * (or by insertion of tree nodes - TBD see insert method) * * 1) RPTree will first read in the R+ tree header with RPTreeHeader class. * * 2) Starting with the root node, the readRPTreeNode method will read in the * node format, determine if the node contains child nodes (isLeaf = false) * or leaf items (isLeaf = true). * * 3) If the node is a leaf node, all leaf items are read in to the node's leaf array. * * 4) If node is a child node, readRPTreeNode will be called recursively, * until a leaf node is encountered, where step 3 is performed. * * 5) The child nodes will be populated with their child node items in reverse order * of recursion from step 4, until the tree is completely populated * back up to the root node. * * 6) The getChromosomeKey is provided to construct a valid key for B+ * chromosome tree searches, and getChromosomeID returns a chromosome ID for * searches in the R+ index tree. * **/ public class RPTree { private static Logger log = Logger.getLogger(RPTree.class); public static final int RPTREE_NODE_FORMAT_SIZE = 4; // node format size public static final int RPTREE_NODE_LEAF_ITEM_SIZE = 32; // leaf item size public static final int RPTREE_NODE_CHILD_ITEM_SIZE = 24; // child item size // R+ tree access variables - for reading in R+ tree nodes from a file private int uncompressBuffSize; // decompression buffer size; or 0 for uncompressed data private boolean isLowToHigh; // binary data low to high if true; else high to low private long rpTreeOffset; // file offset to the R+ tree // R+ tree index header - Table K private RPTreeHeader rpTreeHeader; // R+ tree header (Table K for BBFile) // R+ tree bounds private RPChromosomeRegion chromosomeBounds; // R+ tree's bounding chromosome region // R+ tree nodal variables private int order; // R+ tree order: maximum number of leaves per node private RPTreeNode rootNode; // root node for R+ tree private long nodeCount; // number of nodes defined in the R+ tree private long leafCount; // number of leaves in the R+ tree /* * Constructor for reading in a B+ tree from a BBFile/input stream. * */ /* * Constructor for R+ chromosome data locator tree * * Parameters: * fis - file input stream handle * fileOffset - location for R+ tree header * isLowToHigh - binary values are low to high if true; else high to low * uncompressBuffSize - buffer size for decompression; else 0 for uncompressed data * */ public RPTree(SeekableStream fis, long fileOffset, boolean isLowToHigh, int uncompressBuffSize, boolean forceDescend) { // save the seekable file handle and B+ Tree file offset // Note: the offset is the file position just after the B+ Tree Header // mBBFis = fis; rpTreeOffset = fileOffset; this.uncompressBuffSize = uncompressBuffSize; this.isLowToHigh = isLowToHigh; // read in R+ tree header - verify the R+ tree info exits rpTreeHeader = new RPTreeHeader(fis, rpTreeOffset, isLowToHigh); // log error if header not found and throw exception if (!rpTreeHeader.isHeaderOK()) { int badMagic = rpTreeHeader.getMagic(); log.error("Error reading R+ tree header: bad magic = " + badMagic); throw new RuntimeException("Error reading R+ tree header: bad magic = " + badMagic); } // assigns R+ tree organization from the header order = rpTreeHeader.getBlockSize(); chromosomeBounds = new RPChromosomeRegion(rpTreeHeader.getStartChromID(), rpTreeHeader.getStartBase(), rpTreeHeader.getEndChromID(), rpTreeHeader.getEndBase()); // populate the tree - read in the nodes long nodeOffset = rpTreeOffset + rpTreeHeader.getHeaderSize(); RPTreeNode parentNode = null; // parent node of the root is itself, or null // start constructing the R+ tree - get the root node rootNode = readRPTreeNode(fis, nodeOffset, isLowToHigh, forceDescend); } /* * Constructs an R+ Tree which conforms to the supplied information * order - the items per node factor, sometimes called the m factor, * where any node must have at least m/2 items and no more than m items. * keySize - the number of significant bytes in a item key. * */ public RPTree(int order) { // R+ tree node specification this.order = order; // Note: acknowledge no bounds specified as a null object chromosomeBounds = null; } /* * Method returns size of buffer required for data decompression. * * Returns: * Data decompression buffer size in bytes; else 0 for uncompressed data in file. * */ public int getUncompressBuffSize() { return uncompressBuffSize; } /* * Method returns if file contains formatted data in low to high byte order. * * Returns: * Returns true if data ordered in low to high byte order; false if high to low. * */ public boolean isIsLowToHigh() { return isLowToHigh; } /* * Method returns the R+ tree order, the maximum number of leaf items per node. * * Returns: * Maximum number of leaf items per node.. * */ public int getOrder() { return order; } /* * Method returns the R+ tree index header. * * Returns: * R+ tree index header. * */ public RPTreeHeader getRPTreeHeader() { return rpTreeHeader; } /* * Method returns the total number of chromosomes or contigs in the R+ tree. * * Returns: * Total number of chromosomes or contigs in the R+ tree. * */ public long getItemCount() { return rpTreeHeader.getItemCount(); } /* * Method returns the chromosome bounding region for all R+ tree data. * * Returns: * chromosome bounding region for all R+ tree data * */ public RPChromosomeRegion getChromosomeBounds() { return chromosomeBounds; } /* * Method returns the total node count for the R+ tree. * * Returns: * Node count for R+ tree; or 0 if tree was constructed without nodes. * */ public long getNodeCount() { return nodeCount; } /* * Method finds the bounding chromosome region in R+ tree for a chromosome ID range. * * Parameters: * startChromID - start chromosome for the region * endChromID - end chromosome for the region * * Returns: * Region which bounds the extremes of chromosome ID range * */ public RPChromosomeRegion getChromosomeRegion(int startChromID, int endChromID) { RPChromosomeRegion region; // Search the R+ tree to extract the chromosome region. RPTreeNode thisNode = rootNode; RPChromosomeRegion seedRegion = null; // null until a chromosome match region = findChromosomeRegion(thisNode, startChromID, endChromID, seedRegion); return region; } /* * Method returns list of all chromosome regions found for the chromosome ID range. * * Returns: * List of all chromosome regions in the chromosome ID range. * */ public ArrayList<RPChromosomeRegion> getAllChromosomeRegions() { // Search the R+ tree to extract the chromosome regions RPTreeNode thisNode = rootNode; ArrayList<RPChromosomeRegion> regionList = new ArrayList<RPChromosomeRegion>(); findAllChromosomeRegions(thisNode, regionList); return regionList; } /* * Method extracts a hit list of chromosome data file locations for a specified chromosome region. * * Parameters: * chromosomeRegion - chromosome region for feature extraction consists of: * startChromID - start chromosome ID for region * mStartBase - starting base for data extraction * endChromID - end chromosome ID for region * mEndBase - ending base for data extraction * contained - if true indicates all returned data must be * completely contained within the extraction region; * else if false, returns all intersecting region features * * Note: The selection region will be limited to accommodate maxLeafHits; which terminates * selection at the leaf node at which maxLeafHits is reached. Total number of selected * items may exceed maxLeafHits, but only by the number of leaves in the cutoff leaf node. * * Returns: * List of chromosome leaf items which identify file locations for bed data * of a chromosome region, or a sub-region subject to maxLeafHits. * * Check returned leaf item bounds for cutoff limits on selection region due to maxLeafHits. * */ public ArrayList<RPTreeLeafNodeItem> getChromosomeDataHits(RPChromosomeRegion selectionRegion, boolean contained) { ArrayList<RPTreeLeafNodeItem> leafHitItems = new ArrayList<RPTreeLeafNodeItem>(); // check for valid selection region - return empty collection if null if (selectionRegion == null) return leafHitItems; // limit the hit list size /* if(maxLeafHits > 0) mMaxLeafHits = maxLeafHits; else mMaxLeafHits = mRPTreeHeader.getBlockSize(); */ findChromosomeRegionItems(rootNode, selectionRegion, leafHitItems); return leafHitItems; } // prints out the R+ tree header, nodes, and leaves public void print() { // check if read in if (!rpTreeHeader.isHeaderOK()) { int badMagic = rpTreeHeader.getMagic(); log.error("Error reading R+ tree header: bad magic = " + badMagic); return; } // print R+ tree header rpTreeHeader.print(); // print R+ tree node and leaf items - recursively if (rootNode != null) rootNode.printItems(); } /* * Method finds and returns the bounding chromosome region for the specified * chromosome ID range. * * Parameters: * thisNode - tree node to start search * startChromID - start chromosome ID for region * endChromID - end chromosome ID for region * region - leaf region contains extremes for given chromosome ID range * * Note: region grows recursively to match the extremes found for the * specified chromosome ID range. Starting base comes from the startChromID * match and ending base comes form the endChromID match. * * Returns: * Chromosome region if found in the R+ tree node passed in; * else null region * */ private RPChromosomeRegion findChromosomeRegion(RPTreeNode thisNode, int startChromID, int endChromID, RPChromosomeRegion region) { int hitValue; RPChromosomeRegion bounds; // search down the tree recursively starting with the root node if (thisNode.isLeaf()) { int nLeaves = thisNode.getItemCount(); for (int index = 0; index < nLeaves; ++index) { RPTreeLeafNodeItem leaf = (RPTreeLeafNodeItem) thisNode.getItem(index); // get leaf region bounds bounds = leaf.getChromosomeBounds(); // test this leaf's chromosome ID's for chromosome hit, then include its base bounds if (startChromID >= bounds.getStartChromID() && startChromID <= bounds.getEndChromID() || endChromID >= bounds.getStartChromID() && endChromID <= bounds.getEndChromID()) { // Note: need a start region before comparing other regions for extremes if (region == null) region = new RPChromosomeRegion(bounds); // seed extreme region else region = region.getExtremes(bounds); // update seed extreme region } } } else { // check all child nodes int nNodes = thisNode.getItemCount(); for (int index = 0; index < nNodes; ++index) { RPTreeChildNodeItem childItem = (RPTreeChildNodeItem) thisNode.getItem(index); // get bounding region and compare chromosome ID's bounds = childItem.getChromosomeBounds(); // test node chromosome ID range for any leaf hits for either startChromID or endChromID if (startChromID >= bounds.getStartChromID() && startChromID <= bounds.getEndChromID() || endChromID >= bounds.getStartChromID() && endChromID <= bounds.getEndChromID()) { RPTreeNode childNode = childItem.getChildNode(); region = findChromosomeRegion(childNode, startChromID, endChromID, region); } // check next node } } return region; } /* * Method finds and returns all chromosome regions in the R+ chromosome data tree. * * parameters: * thisNode - tree node to start search * chromosomeList - list of all chromosome names found. * * Returns: * Adds chromosome regions if found in the chromosome region list passed in. * */ private void findAllChromosomeRegions(RPTreeNode thisNode, ArrayList<RPChromosomeRegion> regionList) { // search down the tree recursively starting with the root node if (thisNode.isLeaf()) { int nLeaves = thisNode.getItemCount(); for (int index = 0; index < nLeaves; ++index) { RPTreeLeafNodeItem leaf = (RPTreeLeafNodeItem) thisNode.getItem(index); // add all leaf regions RPChromosomeRegion region = leaf.getChromosomeBounds(); regionList.add(region); } } else { // get all child nodes int nNodes = thisNode.getItemCount(); for (int index = 0; index < nNodes; ++index) { RPTreeChildNodeItem childItem = (RPTreeChildNodeItem) thisNode.getItem(index); RPTreeNode childNode = childItem.getChildNode(); findAllChromosomeRegions(childNode, regionList); } } } /* * Method returns an array of chromosome leaf items for the chromosome test region. * * Note: At the leaf item level, any hit is valid. Use of contained is exercised * when the leaf item data region is examined. * Parameters: * * thisNode - tree node to start search * testRegion - bounding region for feature extraction consists of: * start chromosome ID for region * starting base for data extraction * end chromosome ID for region * ending base for data extraction * leafHitItems - array containing previous leaf hit items * * Note: leaf hit items will be limited to leaves in the current leaf node, * once the maximum number of leaf hits mMaxLeafHits is reached. * * Returns: * ArrayList of leaf hit items containing updated hit regions and data offsets; * else an empty array if hit regions not found. * */ private void findChromosomeRegionItems(RPTreeNode thisNode, RPChromosomeRegion selectionRegion, ArrayList<RPTreeLeafNodeItem> leafHitItems) { int hitValue; // check for valid selection region - ignore request if null if (selectionRegion == null) return; // check if node is disjoint hitValue = thisNode.compareRegions(selectionRegion); if (Math.abs(hitValue) >= 2) return; // search down the tree recursively starting with the root node if (thisNode.isLeaf()) { int nLeaves = thisNode.getItemCount(); for (int index = 0; index < nLeaves; ++index) { RPTreeLeafNodeItem leafItem = (RPTreeLeafNodeItem) thisNode.getItem(index); // compute the region hit value hitValue = leafItem.compareRegions(selectionRegion); // select contained or intersected leaf regions - item selection is by iterator if (Math.abs(hitValue) < 2) { leafHitItems.add(leafItem); } // ascending regions will continue to be disjoint so terminate nodal search else if (hitValue > 1) break; // check next leaf } } else { // check all child nodes int nNodes = thisNode.getItemCount(); for (int index = 0; index < nNodes; ++index) { RPTreeChildNodeItem childItem = (RPTreeChildNodeItem) thisNode.getItem(index); // check for region intersection at the node level hitValue = childItem.compareRegions(selectionRegion); // test this node and get any leaf hits; intersections and containing if (Math.abs(hitValue) < 2) { RPTreeNode childNode = childItem.getChildNode(); findChromosomeRegionItems(childNode, selectionRegion, leafHitItems); } // ascending regions will continue to be disjoint so terminate nodal search else if (hitValue > 1) break; } } } /* * Method reads in the R+ tree nodes recursively. * * Note: If node is a child node, the node is examined recursively, * until the leaves are found. * * Parameters: * fis - file input stream handle * fileOffset - file location for node specification (Table L) * parent - parent node of this node * isLowToHigh - indicates formatted data is low to high byte order if true; * else is high to low byte order * * Returns: * A tree node, for success, or null for failure to find the node information. * */ static RPTreeNode readRPTreeNode(SeekableStream fis, long fileOffset, boolean isLowToHigh, boolean forceDescend) { LittleEndianInputStream lbdis = null; // low o high byte stream reader DataInputStream bdis = null; // high to low byte stream reader byte[] buffer = new byte[RPTREE_NODE_FORMAT_SIZE]; RPTreeNode thisNode = null; try { // Read node format into a buffer fis.seek(fileOffset); fis.readFully(buffer); if (isLowToHigh) { lbdis = new LittleEndianInputStream(new ByteArrayInputStream(buffer)); } else { bdis = new DataInputStream(new ByteArrayInputStream(buffer)); } // find node type byte type; if (isLowToHigh) { type = lbdis.readByte(); } else { type = bdis.readByte(); } boolean isLeaf; int itemSize; if (type == 1) { isLeaf = true; itemSize = RPTREE_NODE_LEAF_ITEM_SIZE; thisNode = new RPTreeNode(true); } else { isLeaf = false; itemSize = RPTREE_NODE_CHILD_ITEM_SIZE; thisNode = new RPTreeNode(false); } //nodeCount++; int itemCount; if (isLowToHigh) { lbdis.readByte(); // reserved - not currently used itemCount = lbdis.readUShort(); } else { bdis.readByte(); // reserved - not currently used itemCount = bdis.readUnsignedShort(); } int itemBlockSize = itemCount * itemSize; buffer = new byte[itemBlockSize]; // allocate buffer for item sisze fis.readFully(buffer); if (isLowToHigh) lbdis = new LittleEndianInputStream(new ByteArrayInputStream(buffer)); else bdis = new DataInputStream(new ByteArrayInputStream(buffer)); // get the node items - leaves or child nodes int startChromID, endChromID; int startBase, endBase; for (int item = 0; item < itemCount; ++item) { // always extract the bounding rectangle if (isLowToHigh) { startChromID = lbdis.readInt(); startBase = lbdis.readInt(); endChromID = lbdis.readInt(); endBase = lbdis.readInt(); } else { startChromID = bdis.readInt(); startBase = bdis.readInt(); endChromID = bdis.readInt(); endBase = bdis.readInt(); } if (isLeaf) { long dataOffset; long dataSize; if (isLowToHigh) { dataOffset = lbdis.readLong(); dataSize = lbdis.readLong(); } else { dataOffset = bdis.readLong(); dataSize = bdis.readLong(); } thisNode.insertItem(new RPTreeLeafNodeItem(startChromID, startBase, endChromID, endBase, dataOffset, dataSize)); } else { // get the child node pointed to in the node item long nodeOffset; if (isLowToHigh) { nodeOffset = lbdis.readLong(); } else { nodeOffset = bdis.readLong(); } // Recursive call to read next child node // The test on chromIds is designed to stop the descent when the tree reaches the level of an // individual chromosome. These are loaded later on demand. RPTreeChildNodeItem childNodeItem; if (startChromID != endChromID || forceDescend) { RPTreeNode childNode = readRPTreeNode(fis, nodeOffset, isLowToHigh, forceDescend); childNodeItem = new RPTreeChildNodeItem(startChromID, startBase, endChromID, endBase, childNode); } else { RPTreeNodeProxy proxy = new RPTreeNodeProxy(fis, nodeOffset, isLowToHigh, startChromID); childNodeItem = new RPTreeChildNodeItem(startChromID, startBase, endChromID, endBase, proxy); } thisNode.insertItem(childNodeItem); } fileOffset += itemSize; } } catch (IOException ex) { log.error("Error reading in R+ tree nodes: " + ex); throw new RuntimeException("Error reading R+ tree nodes: \n", ex); } // return success return thisNode; } }