/*
 * The MIT License
 *
 * Copyright (c) 2013 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.tribble.index;

import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.LocationAware;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.tribble.FeatureCodecHeader;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.index.interval.IntervalIndexCreator;
import htsjdk.tribble.index.interval.IntervalTreeIndex;
import htsjdk.tribble.index.linear.LinearIndex;
import htsjdk.tribble.index.linear.LinearIndexCreator;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.tribble.index.tabix.TabixIndex;
import htsjdk.tribble.index.tabix.TabixIndexCreator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.tribble.util.LittleEndianInputStream;
import htsjdk.tribble.util.LittleEndianOutputStream;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.tribble.util.TabixUtils;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.zip.GZIPInputStream;

/**
 * Factory class for creating indexes. It is the responsibility of this class to determine and create the
 * correct index type from the input file or stream. Supported index types are {@link LinearIndex},
 * {@link IntervalTreeIndex} and {@link TabixIndex}.
 */
public class IndexFactory {
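    // A minimal usage sketch of the factory as a whole, assuming a Tribble-readable feature file and a
    // matching codec (the file names are hypothetical; VCFCodec from htsjdk.variant.vcf is just one
    // example of a FeatureCodec):
    //
    //   final File featureFile = new File("features.vcf");
    //   final Index index = IndexFactory.createDynamicIndex(featureFile, new VCFCodec());
    //   IndexFactory.writeIndex(index, new File("features.vcf.idx"));
    //   final Index reloaded = IndexFactory.loadIndex("features.vcf.idx");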
    /**
     * We can optimize index-file creation for different factors. As of this writing, those are
     * index-file size and seek time.
     */
    public enum IndexBalanceApproach {
        FOR_SIZE,
        FOR_SEEK_TIME
    }

    /**
     * An enum that contains all of the information about the index types, and how to create them.
     */
    public enum IndexType {
        LINEAR(LinearIndex.MAGIC_NUMBER, LinearIndex.INDEX_TYPE, LinearIndexCreator.class, LinearIndex.class, LinearIndexCreator.DEFAULT_BIN_WIDTH),
        INTERVAL_TREE(IntervalTreeIndex.MAGIC_NUMBER, IntervalTreeIndex.INDEX_TYPE, IntervalIndexCreator.class, IntervalTreeIndex.class, IntervalIndexCreator.DEFAULT_FEATURE_COUNT),
        // Tabix index initialization requires additional information, so generic construction won't work, thus indexCreatorClass is null.
        TABIX(TabixIndex.MAGIC_NUMBER, null, null, TabixIndex.class, -1);

        private final int magicNumber;
        private final Integer tribbleIndexType;
        private final Class<IndexCreator> indexCreatorClass;
        private final int defaultBinSize;
        private final Class<Index> indexType;

        public int getDefaultBinSize() {
            return defaultBinSize;
        }

        public IndexCreator getIndexCreator() {
            try {
                return indexCreatorClass.newInstance();
            } catch (final InstantiationException e) {
                throw new TribbleException("Couldn't make index creator in " + this, e);
            } catch (final IllegalAccessException e) {
                throw new TribbleException("Couldn't make index creator in " + this, e);
            }
        }

        public boolean canCreate() {
            return indexCreatorClass != null;
        }

        IndexType(final int magicNumber, final Integer tribbleIndexType, final Class creator, final Class indexClass, final int defaultBinSize) {
            this.magicNumber = magicNumber;
            this.tribbleIndexType = tribbleIndexType;
            indexCreatorClass = creator;
            indexType = indexClass;
            this.defaultBinSize = defaultBinSize;
        }

        public Integer getTribbleIndexType() {
            return tribbleIndexType;
        }

        public Class getIndexType() {
            return indexType;
        }

        public int getMagicNumber() {
            return magicNumber;
        }

        /**
         * @param is InputStream of the index. The stream will be reset to the position it was at when the method was invoked.
         * @return the {@code IndexType} determined from the magic number (and, for Tribble indexes, the type field) at the start of the stream
         * @throws TribbleException.UnableToCreateCorrectIndexType
         */
        public static IndexType getIndexType(final BufferedInputStream is) {
            // Currently only need 8 bytes, so this should be plenty
            is.mark(128);
            final LittleEndianInputStream dis = new LittleEndianInputStream(is);
            final int magicNumber;
            final int type;

            try {
                // Read the type and version, then create the appropriate type
                magicNumber = dis.readInt();
                // This is not appropriate for all types, but it doesn't hurt to read it.
                type = dis.readInt();
                is.reset();

                for (final IndexType indexType : IndexType.values()) {
                    if (indexType.magicNumber == magicNumber
                            && (indexType.tribbleIndexType == null || indexType.tribbleIndexType == type)) {
                        return indexType;
                    }
                }
            } catch (final IOException e) {
                throw new TribbleException("Problem detecting index type", e);
            }

            throw new TribbleException.UnableToCreateCorrectIndexType(
                    String.format("Unknown index type. magic number: 0x%x; type %d", magicNumber, type));
        }
    }
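    // A minimal sketch of detecting the type of an existing index file (hypothetical path). The stream
    // must be buffered because getIndexType() relies on mark()/reset():
    //
    //   try (final BufferedInputStream in =
    //                new BufferedInputStream(new FileInputStream("features.vcf.idx"))) {
    //       final IndexType detected = IndexType.getIndexType(in);
    //       System.out.println("Detected index type: " + detected);
    //   }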
    /**
     * Load an index from the specified file. The type of index (LinearIndex, IntervalTreeIndex or TabixIndex)
     * is determined at run time by reading the type flag in the file.
     *
     * @param indexFile from which to load the index
     */
    public static Index loadIndex(final String indexFile) {
        BufferedInputStream bufferedInputStream = null;
        try {
            InputStream inputStream = ParsingUtils.openInputStream(indexFile);
            if (indexFile.endsWith(".gz")) {
                inputStream = new GZIPInputStream(inputStream);
            } else if (indexFile.endsWith(TabixUtils.STANDARD_INDEX_EXTENSION)) {
                inputStream = new BlockCompressedInputStream(inputStream);
            }
            // Must be buffered, because getIndexType uses mark and reset
            bufferedInputStream = new BufferedInputStream(inputStream, Defaults.NON_ZERO_BUFFER_SIZE);
            final Class indexClass = IndexType.getIndexType(bufferedInputStream).getIndexType();
            final Constructor ctor = indexClass.getConstructor(InputStream.class);
            return (Index) ctor.newInstance(bufferedInputStream);
        } catch (final IOException ex) {
            throw new TribbleException.UnableToReadIndexFile("Unable to read index file", indexFile, ex);
        } catch (final Exception ex) {
            throw new RuntimeException(ex);
        } finally {
            try {
                if (bufferedInputStream != null) bufferedInputStream.close();
            } catch (final IOException e) {
                // Ignore errors encountered while closing the index stream.
            }
        }
    }

    /**
     * A helper method for creating a linear binned index with the default bin size.
     *
     * @param inputFile the input file to load features from
     * @param codec     the codec to use for decoding records
     */
    public static LinearIndex createLinearIndex(final File inputFile, final FeatureCodec codec) {
        return createLinearIndex(inputFile, codec, LinearIndexCreator.DEFAULT_BIN_WIDTH);
    }

    /**
     * A helper method for creating a linear binned index.
     *
     * @param inputFile the input file to load features from
     * @param codec     the codec to use for decoding records
     * @param binSize   the bin size
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> LinearIndex createLinearIndex(final File inputFile,
                                                                                            final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec,
                                                                                            final int binSize) {
        final LinearIndexCreator indexCreator = new LinearIndexCreator(inputFile, binSize);
        return (LinearIndex) createIndex(inputFile, new FeatureIterator<FEATURE_TYPE, SOURCE_TYPE>(inputFile, codec), indexCreator);
    }

    /**
     * Create an interval-tree index with the default features-per-bin count.
     *
     * @param inputFile the file containing the features
     * @param codec     to decode the features
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> IntervalTreeIndex createIntervalIndex(final File inputFile,
                                                                                                    final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec) {
        return createIntervalIndex(inputFile, codec, IntervalIndexCreator.DEFAULT_FEATURE_COUNT);
    }

    /**
     * A helper method for creating an interval-tree index.
     *
     * @param inputFile           the input file to load features from
     * @param codec               the codec to use for decoding records
     * @param featuresPerInterval the number of features to store per interval
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> IntervalTreeIndex createIntervalIndex(final File inputFile,
                                                                                                    final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec,
                                                                                                    final int featuresPerInterval) {
        final IntervalIndexCreator indexCreator = new IntervalIndexCreator(inputFile, featuresPerInterval);
        return (IntervalTreeIndex) createIndex(inputFile, new FeatureIterator<FEATURE_TYPE, SOURCE_TYPE>(inputFile, codec), indexCreator);
    }
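    // A minimal sketch of building the two Tribble index flavours explicitly rather than dynamically
    // (hypothetical file name; VCFCodec from htsjdk.variant.vcf is just one example of a FeatureCodec):
    //
    //   final File vcf = new File("features.vcf");
    //   final LinearIndex linear = IndexFactory.createLinearIndex(vcf, new VCFCodec());
    //   final IntervalTreeIndex tree = IndexFactory.createIntervalIndex(vcf, new VCFCodec());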
    /**
     * Create a dynamic index with the default balancing approach.
     *
     * @param inputFile the input file to load features from
     * @param codec     the codec to use for decoding records
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> Index createDynamicIndex(final File inputFile,
                                                                                        final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec) {
        return createDynamicIndex(inputFile, codec, IndexBalanceApproach.FOR_SEEK_TIME);
    }

    /**
     * Create an index of the specified type with default binning parameters.
     *
     * @param inputFile the input file to load features from
     * @param codec     the codec to use for decoding records
     * @param type      the type of index to create
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> Index createIndex(final File inputFile,
                                                                                final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec,
                                                                                final IndexType type) {
        switch (type) {
            case INTERVAL_TREE:
                return createIntervalIndex(inputFile, codec);
            case LINEAR:
                return createLinearIndex(inputFile, codec);
            // Tabix index initialization requires additional information, so this construction method won't work.
            case TABIX:
                throw new UnsupportedOperationException("Tabix indices cannot be created through a generic interface");
        }
        throw new IllegalArgumentException("Unrecognized IndexType " + type);
    }

    /**
     * Write the index to a file; little endian.
     *
     * @param idx     the index to write
     * @param idxFile the file to write to
     * @throws IOException
     */
    public static void writeIndex(final Index idx, final File idxFile) throws IOException {
        LittleEndianOutputStream stream = null;
        try {
            stream = new LittleEndianOutputStream(new BufferedOutputStream(new FileOutputStream(idxFile)));
            idx.write(stream);
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }

    /**
     * Create a dynamic index, given an input file, codec, and balance approach.
     *
     * @param inputFile the input file to load features from
     * @param codec     the codec to use for decoding records
     * @param iba       the index balancing approach
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> Index createDynamicIndex(final File inputFile,
                                                                                        final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec,
                                                                                        final IndexBalanceApproach iba) {
        // get a list of index creators
        final DynamicIndexCreator indexCreator = new DynamicIndexCreator(inputFile, iba);
        return createIndex(inputFile, new FeatureIterator<FEATURE_TYPE, SOURCE_TYPE>(inputFile, codec), indexCreator);
    }

    /**
     * @param inputFile          The file to be indexed.
     * @param codec              Mechanism for reading inputFile.
     * @param tabixFormat        Header fields for the TabixIndex to be produced.
     * @param sequenceDictionary May be null, but if present may reduce memory footprint for index creation. Features
     *                           in inputFile must be in the order defined by sequenceDictionary, if it is present.
     */
    public static <FEATURE_TYPE extends Feature, SOURCE_TYPE> TabixIndex createTabixIndex(final File inputFile,
                                                                                          final FeatureCodec<FEATURE_TYPE, SOURCE_TYPE> codec,
                                                                                          final TabixFormat tabixFormat,
                                                                                          final SAMSequenceDictionary sequenceDictionary) {
        final TabixIndexCreator indexCreator = new TabixIndexCreator(sequenceDictionary, tabixFormat);
        return (TabixIndex) createIndex(inputFile, new FeatureIterator<FEATURE_TYPE, SOURCE_TYPE>(inputFile, codec), indexCreator);
    }
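    // A minimal sketch of producing a Tabix index. The file name is hypothetical; TabixFormat.VCF and
    // VCFCodec are assumed to match the input, and passing null for the sequence dictionary is allowed
    // but may increase the memory footprint during creation:
    //
    //   final TabixIndex tabixIndex = IndexFactory.createTabixIndex(
    //           new File("features.vcf.gz"), new VCFCodec(), TabixFormat.VCF, null);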
    private static Index createIndex(final File inputFile, final FeatureIterator iterator, final IndexCreator creator) {
        Feature lastFeature = null;
        Feature currentFeature;
        final Map<String, Feature> visitedChromos = new HashMap<String, Feature>(40);
        while (iterator.hasNext()) {
            final long position = iterator.getPosition();
            currentFeature = iterator.next();

            checkSorted(inputFile, lastFeature, currentFeature);

            // should only visit chromosomes once
            final String curChr = currentFeature.getChr();
            final String lastChr = lastFeature != null ? lastFeature.getChr() : null;
            if (!curChr.equals(lastChr)) {
                if (visitedChromos.containsKey(curChr)) {
                    String msg = "Input file must have contiguous chromosomes.";
                    msg += " Saw feature " + featToString(visitedChromos.get(curChr));
                    msg += " followed later by " + featToString(lastFeature);
                    msg += " and then " + featToString(currentFeature);
                    throw new TribbleException.MalformedFeatureFile(msg, inputFile.getAbsolutePath());
                } else {
                    visitedChromos.put(curChr, currentFeature);
                }
            }

            creator.addFeature(currentFeature, position);

            lastFeature = currentFeature;
        }

        iterator.close();
        return creator.finalizeIndex(iterator.getPosition());
    }

    private static String featToString(final Feature feature) {
        return feature.getChr() + ":" + feature.getStart() + "-" + feature.getEnd();
    }

    private static void checkSorted(final File inputFile, final Feature lastFeature, final Feature currentFeature) {
        // If the current feature starts before the previous feature on the same chromosome, the input is unsorted.
        if (lastFeature != null && currentFeature.getStart() < lastFeature.getStart() && lastFeature.getChr().equals(currentFeature.getChr()))
            throw new TribbleException.MalformedFeatureFile("Input file is not sorted by start position. \n" +
                    "We saw a record with a start of " + currentFeature.getChr() + ":" + currentFeature.getStart() +
                    " after a record with a start of " + lastFeature.getChr() + ":" + lastFeature.getStart(),
                    inputFile.getAbsolutePath());
    }
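    // Illustration of the ordering rules enforced above (chromosome names and positions are made up):
    //
    //   accepted:  chr1:100-200, chr1:150-300, chr2:50-60       (sorted by start, each chromosome in one contiguous block)
    //   rejected:  chr1:150-300, chr1:100-200                   (start positions go backwards on the same chromosome)
    //   rejected:  chr1:100-200, chr2:50-60, chr1:300-400       (chr1 appears again after chr2)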
    /**
     * Iterator for reading features from a file, given a {@code FeatureCodec}.
     */
    static class FeatureIterator<FEATURE_TYPE extends Feature, SOURCE> implements CloseableTribbleIterator<Feature> {
        // the stream we use to get features
        private final SOURCE source;
        // the next feature
        private Feature nextFeature;
        // our codec
        private final FeatureCodec<FEATURE_TYPE, SOURCE> codec;
        private final File inputFile;

        // we also need to cache our position
        private long cachedPosition;

        /**
         * @param inputFile The file from which to read. A stream for reading is opened on construction.
         * @param codec     the codec to use for decoding records
         */
        public FeatureIterator(final File inputFile, final FeatureCodec<FEATURE_TYPE, SOURCE> codec) {
            this.codec = codec;
            this.inputFile = inputFile;
            final FeatureCodecHeader header = readHeader();
            source = (SOURCE) codec.makeIndexableSourceFromStream(initStream(inputFile, header.getHeaderEnd()));
            readNextFeature();
        }

        /**
         * Some codecs, e.g. for VCF files, need the header to decode features. This is a rather poor design:
         * the internal header is set as a side-effect of reading it, but we have to live with it for now.
         */
        private FeatureCodecHeader readHeader() {
            try {
                final SOURCE source = this.codec.makeSourceFromStream(initStream(inputFile, 0));
                final FeatureCodecHeader header = this.codec.readHeader(source);
                codec.close(source);
                return header;
            } catch (final IOException e) {
                throw new TribbleException.InvalidHeader("Error reading header " + e.getMessage());
            }
        }

        private PositionalBufferedStream initStream(final File inputFile, final long skip) {
            try {
                final FileInputStream is = new FileInputStream(inputFile);
                final PositionalBufferedStream pbs = new PositionalBufferedStream(is);
                if (skip > 0) pbs.skip(skip);
                return pbs;
            } catch (final FileNotFoundException e) {
                throw new TribbleException.FeatureFileDoesntExist("Unable to open the input file, most likely the file doesn't exist.",
                        inputFile.getAbsolutePath());
            } catch (final IOException e) {
                throw new TribbleException.MalformedFeatureFile("Error initializing stream", inputFile.getAbsolutePath(), e);
            }
        }

        @Override
        public boolean hasNext() {
            return nextFeature != null;
        }

        @Override
        public Feature next() {
            final Feature ret = nextFeature;
            readNextFeature();
            return ret;
        }

        /**
         * @throws UnsupportedOperationException
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("We cannot remove");
        }

        /**
         * @return the file position from the underlying reader
         */
        public long getPosition() {
            return (hasNext()) ? cachedPosition : ((LocationAware) source).getPosition();
        }

        @Override
        public Iterator<Feature> iterator() {
            return this;
        }

        @Override
        public void close() {
            codec.close(source);
        }

        /**
         * Read the next feature from the stream, caching the stream position as it was before the read.
         *
         * @throws TribbleException.MalformedFeatureFile
         */
        private void readNextFeature() {
            cachedPosition = ((LocationAware) source).getPosition();
            try {
                nextFeature = null;
                while (nextFeature == null && !codec.isDone(source)) {
                    nextFeature = codec.decodeLoc(source);
                }
            } catch (final IOException e) {
                throw new TribbleException.MalformedFeatureFile("Unable to read a line from the file", inputFile.getAbsolutePath(), e);
            }
        }
    }
}