/* * Copyright (c) 2007-2010 by The Broad Institute, Inc. and the Massachusetts Institute of Technology. * All Rights Reserved. * * This software is licensed under the terms of the GNU Lesser General Public License (LGPL), Version 2.1 which * is available at http://www.opensource.org/licenses/lgpl-2.1.php. * * THE SOFTWARE IS PROVIDED "AS IS." THE BROAD AND MIT MAKE NO REPRESENTATIONS OR WARRANTIES OF * ANY KIND CONCERNING THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT * OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. IN NO EVENT SHALL THE BROAD OR MIT, OR THEIR * RESPECTIVE TRUSTEES, DIRECTORS, OFFICERS, EMPLOYEES, AND AFFILIATES BE LIABLE FOR ANY DAMAGES OF * ANY KIND, INCLUDING, WITHOUT LIMITATION, INCIDENTAL OR CONSEQUENTIAL DAMAGES, ECONOMIC * DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER THE BROAD OR MIT SHALL * BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE * FOREGOING. */ package htsjdk.tribble.index; import htsjdk.tribble.Tribble; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.LittleEndianInputStream; import htsjdk.tribble.util.LittleEndianOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** * <p/> * An abstract implementation of the index class. This class takes care of the basics that are common * to all of the current indexing classes; including the version information, common header properties, * and reading and writing the header to disk. * * @author jrobinso */ public abstract class AbstractIndex implements MutableIndex { public enum IndexType { LINEAR(1), INTERVAL_TREE(2); public final int fileHeaderTypeIdentifier; IndexType(int fileHeaderTypeIdentifier) { this.fileHeaderTypeIdentifier = fileHeaderTypeIdentifier; } } // todo -- up to version 4 and use ETag to detect out of date // todo -- inode number + size in bytes + modification time // todo -- remove MD5 // the current version of the index public static final int VERSION = 3; public static final int MAGIC_NUMBER = 1480870228; // byte[]{'T', 'I', 'D', 'X'}; private final static String NO_MD5 = ""; private final static long NO_FILE_SIZE = -1L; private final static long NO_TS = -1L; protected int version; // Our version value protected File indexedFile = null; // The file we've created this index for protected long indexedFileSize = NO_FILE_SIZE; // The size of the indexed file protected long indexedFileTS = NO_TS; // The timestamp protected String indexedFileMD5 = NO_MD5; // The MD5 value, generally not filled in (expensive to calc) protected int flags; public boolean hasFileSize() { return indexedFileSize != NO_FILE_SIZE; } public boolean hasTimestamp() { return indexedFileTS != NO_TS; } public boolean hasMD5() { return indexedFileMD5 != NO_MD5; } private LinkedHashMap<String, String> properties; /** * the map of our chromosome bins */ protected LinkedHashMap<String, ChrIndex> chrIndices; /** * Any flags we're using */ private static final int SEQUENCE_DICTIONARY_FLAG = 0x8000; // if we have a sequence dictionary in our header /** * @param obj * @return true if this and obj are 'effectively' equivalent data structures. */ public boolean equalsIgnoreProperties(final Object obj) { if (this == obj) return true; if (!(obj instanceof AbstractIndex)) { System.err.printf("equals: %s not instance of AbstractIndex", obj); return false; } final AbstractIndex other = (AbstractIndex) obj; if (version != other.version) { System.err.printf("equals version: this %d != other %d%n", version, other.version); return false; } if (indexedFile != other.indexedFile && (indexedFile == null || !indexedFile.equals(other.indexedFile))) { System.err.printf("equals indexedFile: this %s != other %s%n", indexedFile, other.indexedFile); return false; } if (indexedFileSize != other.indexedFileSize) { System.err.printf("equals indexedFileSize: this %d != other %d%n", indexedFileSize, other.indexedFileSize); return false; } if (!indexedFileMD5.equals(other.indexedFileMD5)) { System.err.printf("equals indexedFileMD5: this %s != other %s%n", indexedFileMD5, other.indexedFileMD5); return false; } if (flags != other.flags) { System.err.printf("equals flags: this %d != other %d%n", flags, other.flags); return false; } if (!chrIndices.equals(other.chrIndices)) { System.err.printf("equals chrIndeces: this %s != other %s%n", chrIndices, other.chrIndices); return false; } return true; } /** * create an abstract index, with defaults for the version value, and empty properties and chromosome lists */ public AbstractIndex() { this.version = VERSION; // <= is overriden when file is read this.properties = new LinkedHashMap<String, String>(); chrIndices = new LinkedHashMap(); } /** * create an index file from the target feature file * * @param featureFile the feature file to create an index from */ public AbstractIndex(final String featureFile) { this(new File(featureFile)); } public AbstractIndex(final File featureFile) { this(); this.indexedFile = featureFile; } public AbstractIndex(final AbstractIndex parent) { this(); this.version = parent.version; this.indexedFile = parent.indexedFile; this.indexedFileSize = parent.indexedFileSize; this.indexedFileTS = parent.indexedFileTS; this.indexedFileMD5 = parent.indexedFileMD5; this.flags = parent.flags; this.properties = (LinkedHashMap<String, String>) parent.properties.clone(); } protected void validateIndexHeader(final int indexType, final LittleEndianInputStream dis) throws IOException { final int magicNumber = dis.readInt(); if (magicNumber != MAGIC_NUMBER) { throw new TribbleException(String.format("Unexpected magic number %d", magicNumber)); } final int type = dis.readInt(); if (type != indexType) { throw new TribbleException(String.format("Unexpected index type %d", type)); } } /** * check the current version against the version we read in * * @return true if we're up to date, false otherwise */ public boolean isCurrentVersion() { return version == VERSION; } public File getIndexedFile() { return indexedFile; } public long getIndexedFileSize() { return indexedFileSize; } public long getIndexedFileTS() { return indexedFileTS; } public String getIndexedFileMD5() { return indexedFileMD5; } public int getFlags() { return flags; } public int getVersion() { return version; } public void setMD5(final String md5) { this.indexedFileMD5 = md5; } public boolean containsChromosome(final String chr) { return chrIndices.containsKey(chr); } public void finalizeIndex() { // these two functions must be called now because the file may be being written during on the fly indexing if (indexedFile != null) { this.indexedFileSize = indexedFile.length(); this.indexedFileTS = indexedFile.lastModified(); } } /** * write the header to the target output stream * * @param dos the little endian output stream * @throws IOException an exception when we can't write to the file */ private void writeHeader(final LittleEndianOutputStream dos) throws IOException { dos.writeInt(MAGIC_NUMBER); dos.writeInt(getType()); dos.writeInt(version); dos.writeString(indexedFile.getAbsolutePath()); dos.writeLong(indexedFileSize); dos.writeLong(indexedFileTS); dos.writeString(indexedFileMD5); dos.writeInt(flags); // Properties (Version 3 and later) dos.writeInt(properties.size()); for (final Map.Entry<String, String> prop : properties.entrySet()) { dos.writeString(prop.getKey()); dos.writeString(prop.getValue()); } } /** * read the header from the input stream * * @param dis the little endian input stream * @throws IOException if we fail to read from the file at any point */ private void readHeader(final LittleEndianInputStream dis) throws IOException { version = dis.readInt(); indexedFile = new File(dis.readString()); indexedFileSize = dis.readLong(); indexedFileTS = dis.readLong(); indexedFileMD5 = dis.readString(); flags = dis.readInt(); if (version < 3 && (flags & SEQUENCE_DICTIONARY_FLAG) == SEQUENCE_DICTIONARY_FLAG) { readSequenceDictionary(dis); } if (version >= 3) { int nProperties = dis.readInt(); while (nProperties-- > 0) { final String key = dis.readString(); final String value = dis.readString(); properties.put(key, value); } } } /** * Kept to maintain backward compatibility with pre version 3 indexes. The sequence dictionary is no longer * used, use getSequenceNames() instead. * * @param dis * @throws IOException */ private void readSequenceDictionary(final LittleEndianInputStream dis) throws IOException { final int size = dis.readInt(); if (size < 0) throw new IllegalStateException("Size of the sequence dictionary entries is negative"); for (int x = 0; x < size; x++) { dis.readString(); dis.readInt(); } } public List<String> getSequenceNames() { return new ArrayList<String>(chrIndices.keySet()); } public List<Block> getBlocks(final String chr, final int start, final int end) { return getChrIndex(chr).getBlocks(start, end); } public List<Block> getBlocks(final String chr) { return getChrIndex(chr).getBlocks(); } /** * @param chr * @return return the ChrIndex associated with chr, * @throws IllegalArgumentException if {@code chr} not found */ private final ChrIndex getChrIndex(final String chr) { final ChrIndex chrIdx = chrIndices.get(chr); if (chrIdx == null) { throw new IllegalArgumentException("getBlocks() called with of unknown contig " + chr); } else { return chrIdx; } } public void write(final LittleEndianOutputStream stream) throws IOException { writeHeader(stream); //# of chromosomes stream.writeInt(chrIndices.size()); for (final ChrIndex chrIdx : chrIndices.values()) { chrIdx.write(stream); } } @Override public void writeBasedOnFeatureFile(final File featureFile) throws IOException { if (!featureFile.isFile()) return; final LittleEndianOutputStream idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(featureFile))); write(idxStream); idxStream.close(); } public void read(final LittleEndianInputStream dis) throws IOException { try { readHeader(dis); int nChromosomes = dis.readInt(); chrIndices = new LinkedHashMap<String, ChrIndex>(nChromosomes); while (nChromosomes-- > 0) { final ChrIndex chrIdx = (ChrIndex) getChrIndexClass().newInstance(); chrIdx.read(dis); chrIndices.put(chrIdx.getName(), chrIdx); } } catch (final InstantiationException e) { throw new TribbleException.UnableToCreateCorrectIndexType("Unable to create class " + getChrIndexClass(), e); } catch (final IllegalAccessException e) { throw new TribbleException.UnableToCreateCorrectIndexType("Unable to create class " + getChrIndexClass(), e); } finally { dis.close(); } //printIndexInfo(); } protected void printIndexInfo() { System.out.println(String.format("Index for %s with %d indices", indexedFile, chrIndices.size())); final BlockStats stats = getBlockStats(true); System.out.println(String.format(" total blocks %d", stats.total)); System.out.println(String.format(" total empty blocks %d", stats.empty)); } protected static class BlockStats { long total = 0, empty = 0, objects = 0, size = 0; } protected BlockStats getBlockStats(final boolean logDetails) { final BlockStats stats = new BlockStats(); for (final Map.Entry<String, ChrIndex> elt : chrIndices.entrySet()) { final List<Block> blocks = elt.getValue().getBlocks(); if (blocks != null) { final int nBlocks = blocks.size(); int nEmptyBlocks = 0; for (final Block b : elt.getValue().getBlocks()) { if (b.getSize() == 0) nEmptyBlocks++; } stats.empty += nEmptyBlocks; stats.total += nBlocks; if (logDetails) System.out.println(String.format(" %s => %d blocks, %d empty, %.2f", elt.getKey(), nBlocks, nEmptyBlocks, (100.0 * nEmptyBlocks) / nBlocks)); } } return stats; } protected String statsSummary() { final BlockStats stats = getBlockStats(false); return String.format("%12d blocks (%12d empty (%.2f%%))", stats.total, stats.empty, (100.0 * stats.empty) / stats.total); } public void addProperty(final String key, final String value) { properties.put(key, value); } public void addProperties(final Map<String, String> properties) { this.properties.putAll(properties); } /** * return a mapping of name to property value * * @return the mapping of values as an unmodifiable map */ public Map<String, String> getProperties() { return Collections.unmodifiableMap(properties); } /** * get the index type * * @return The index type */ protected abstract int getType(); /** * returns the class for the index type * * @return a Class, from which a new instance can be created */ public abstract Class getChrIndexClass(); }