/* * Copyright (c) 2010, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.tribble.index; import htsjdk.tribble.Feature; import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.interval.IntervalIndexCreator; import htsjdk.tribble.index.linear.LinearIndexCreator; import htsjdk.tribble.util.MathUtils; import java.io.File; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; import java.util.TreeMap; /** * A DynamicIndexCreator creates the proper index based on an {@link IndexFactory.IndexBalanceApproach} and * the characteristics of the file. Ultimately this is either a LinearIndex or an IntervalTreeIndex, with index * parameters based on whether seek time or file size is to be minimized. */ public class DynamicIndexCreator extends TribbleIndexCreator { IndexFactory.IndexBalanceApproach iba; Map<IndexFactory.IndexType,TribbleIndexCreator> creators; /** * we're interested in two stats: * the longest feature and the density of features */ int longestFeatureLength = 0; long featureCount = 0; MathUtils.RunningStat stats = new MathUtils.RunningStat(); long basesSeen = 0; Feature lastFeature = null; File inputFile; public DynamicIndexCreator(final File inputFile, final IndexFactory.IndexBalanceApproach iba) { this.iba = iba; // get a list of index creators this.inputFile = inputFile; creators = getIndexCreators(inputFile,iba); } public Index finalizeIndex(final long finalFilePosition) { // finalize all of the indexes // return the score of the indexes we've generated final Map<Double,TribbleIndexCreator> mapping = scoreIndexes((double)featureCount/(double)basesSeen, creators, longestFeatureLength, iba); final TribbleIndexCreator creator = getMinIndex(mapping, this.iba); for (final Map.Entry<String, String> entry : properties.entrySet()) { creator.addProperty(entry.getKey(), entry.getValue()); } // add our statistics to the file creator.addProperty("FEATURE_LENGTH_MEAN",String.valueOf(stats.mean())); creator.addProperty("FEATURE_LENGTH_STD_DEV",String.valueOf(stats.standardDeviation())); creator.addProperty("MEAN_FEATURE_VARIANCE",String.valueOf(stats.variance())); // add the feature count creator.addProperty("FEATURE_COUNT",String.valueOf(featureCount)); // Now let's finalize and create the index itself return creator.finalizeIndex(finalFilePosition); } /** * create a list of index creators (initialized) representing the common index types we'd suspect they'd like to use * @param inputFile the input file to use to create the indexes * @return a map of index type to the best index for that balancing approach */ private Map<IndexFactory.IndexType,TribbleIndexCreator> getIndexCreators(final File inputFile, final IndexFactory.IndexBalanceApproach iba) { final Map<IndexFactory.IndexType,TribbleIndexCreator> creators = new HashMap<IndexFactory.IndexType,TribbleIndexCreator>(); if (iba == IndexFactory.IndexBalanceApproach.FOR_SIZE) { // add a linear index with the default bin size final LinearIndexCreator linearNormal = new LinearIndexCreator(inputFile, LinearIndexCreator.DEFAULT_BIN_WIDTH); creators.put(IndexFactory.IndexType.LINEAR,linearNormal); // create a tree index with the default size final IntervalIndexCreator treeNormal = new IntervalIndexCreator(inputFile, IntervalIndexCreator.DEFAULT_FEATURE_COUNT); creators.put(IndexFactory.IndexType.INTERVAL_TREE,treeNormal); } // this section is a little more arbitrary; we're creating indexes with a bin size that's a portion of the default; these // values were determined experimentally if (iba == IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME) { // create a linear index with a small bin size final LinearIndexCreator linearSmallBin = new LinearIndexCreator(inputFile, Math.max(200, LinearIndexCreator.DEFAULT_BIN_WIDTH / 4)); creators.put(IndexFactory.IndexType.LINEAR,linearSmallBin); // create a tree index with a small index size final IntervalIndexCreator treeSmallBin = new IntervalIndexCreator(inputFile, Math.max(20, IntervalIndexCreator.DEFAULT_FEATURE_COUNT / 8)); creators.put(IndexFactory.IndexType.INTERVAL_TREE,treeSmallBin); } return creators; } public void addFeature(final Feature f, final long filePosition) { // protected static Map<Double,Index> createIndex(FileBasedFeatureIterator<Feature> iterator, Map<IndexType,IndexCreator> creators, IndexBalanceApproach iba) { // feed each feature to the indexes we've created // first take care of the stats featureCount++; // calculate the number of bases seen - we have to watch out for the situation where the last record was on the previous chromosome basesSeen = (lastFeature == null) ? basesSeen + f.getStart() : ((f.getStart() - lastFeature.getStart() >= 0) ? basesSeen + (f.getStart() - lastFeature.getStart()) : basesSeen + f.getStart()); longestFeatureLength = Math.max(longestFeatureLength,(f.getEnd()-f.getStart()) + 1); // push the longest feature to the running stats stats.push(longestFeatureLength); // now feed the feature to each of our creators for (final IndexCreator creator : creators.values()) { creator.addFeature(f,filePosition); } //Redundant check, done in IndexFactory // if the last feature is after the current feature, exception out // if (lastFeature != null && f.getStart() < lastFeature.getStart() && lastFeature.getChr().equals(f.getChr())) // throw new TribbleException.MalformedFeatureFile("We saw a record with a start of " + f.getChr() + ":" + f.getStart() + // " after a record with a start of " + lastFeature.getChr() + ":" + lastFeature.getStart(), inputFile.getAbsolutePath()); // save the last feature lastFeature = f; } /** * score the available indexes for the specified density and feature lengths * * The scoring method is trying to determine how many features would be returned for a sample one base query; or: * (features/seek). For the interval index this is clear: it's the bin size (interval is binned by feature count). * for Linear indexes it's the density of features X the number of bins we need to retrieve (which is determined * by the bin size X the longest feature). * * @param densityOfFeatures the density of features (features/base) * @param indexes Map from IndexType -> IndexCreator * @param longestFeature the longest feature we've found * @param iba the index balancing approach * @return the best index available for the target indexes */ protected static LinkedHashMap<Double,TribbleIndexCreator> scoreIndexes(final double densityOfFeatures, final Map<IndexFactory.IndexType,TribbleIndexCreator> indexes, final int longestFeature, final IndexFactory.IndexBalanceApproach iba) { if (indexes.size() < 1) throw new IllegalArgumentException("Please specify at least one index to evaluate"); final LinkedHashMap<Double,TribbleIndexCreator> scores = new LinkedHashMap<Double,TribbleIndexCreator>(); for (final Map.Entry<IndexFactory.IndexType,TribbleIndexCreator> entry : indexes.entrySet()) { // we have different scoring if (entry.getValue() instanceof LinearIndexCreator) { final double binSize = ((LinearIndexCreator)(entry.getValue())).getBinSize(); scores.put(binSize * densityOfFeatures * Math.ceil((double) longestFeature / binSize), entry.getValue()); } else if (entry.getValue() instanceof IntervalIndexCreator) { scores.put((double) ((IntervalIndexCreator)entry.getValue()).getFeaturesPerInterval(), entry.getValue()); } else { throw new TribbleException.UnableToCreateCorrectIndexType("Unknown index type, we don't have a scoring method for " + entry.getValue().getClass()); } } return scores; } /** * utility function to find the min of a list * @param scores the list of scaled features/bin scores for each index type * @return the best score <b>index value</b> */ private TribbleIndexCreator getMinIndex(final Map<Double,TribbleIndexCreator> scores, final IndexFactory.IndexBalanceApproach iba) { final TreeMap<Double,TribbleIndexCreator> map = new TreeMap<Double,TribbleIndexCreator>(); map.putAll(scores); // if we are optimizing for seek time, choose the lowest score (adjusted features/bin value), if for storage size, choose the opposite final TribbleIndexCreator idx = (iba != IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME) ? map.get(map.lastKey()) : map.get(map.firstKey()); return idx; } @Override public void addProperty(final String key, final String value) { for (final TribbleIndexCreator creator : creators.values()) { creator.addProperty(key, value); } } }