/**
 * Copyright Copyright 2014 Simon Andrews
 *
 *    This file is part of BamQC.
 *
 *    BamQC is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    BamQC is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with BamQC; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
/*
 * Changelog:
 * - Piero Dalle Pezze: Optimised data structures (removed unneeded concurrency), optimised algorithm.
 *                      Merged with SeqMonk:AnnotationSet, use of ShortRead for caching.
 * - Simon Andrews: Class creation.
 */
package uk.ac.babraham.BamQC.DataTypes.Genome;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import uk.ac.babraham.BamQC.Modules.ModuleConfig;
import net.sf.samtools.SAMRecord;

/**
 * Holds the features of an annotation, grouped by feature type, and feeds
 * aligned reads to the matching chromosomes and feature classes.
 *
 * <p>Reads passed to {@link #processSequence(SAMRecord)} are buffered as
 * {@link ShortRead} objects and processed in sorted batches. Reads that are
 * still buffered are only processed once {@link #flushCache()} is called, so
 * callers must flush after the last read.
 *
 * @author Simon Andrews
 * @author Piero Dalle Pezze
 */
public class AnnotationSet {

	/** The reference file for this annotation set */
	private File file = null;

	/** Source of the Chromosome objects that reads are dispatched to. */
	private final ChromosomeFactory factory = new ChromosomeFactory();

	/** One FeatureClass per feature type, keyed by {@code Feature.type()}. */
	private final HashMap<String, FeatureClass> features = new HashMap<String, FeatureClass>();

	/** Every feature added through {@link #addFeature(Feature)}. */
	private final HashSet<Feature> allFeatures = new HashSet<Feature>();

	/** Number of reads buffered before the cache is sorted and processed. */
	private final int cacheCapacity = ModuleConfig.getParam("AnnotationSet_annotation_cache_capacity", "ignore").intValue();

	/** Buffer of reads waiting to be sorted and processed by {@link #flushCache()}. */
	private final List<ShortRead> readCache = new ArrayList<ShortRead>(cacheCapacity);

	/** Creates an empty annotation set with no reference file. */
	public AnnotationSet() { }

	/**
	 * @return the reference file for this annotation set, or {@code null} if none was set
	 */
	public File getFile() {
		return file;
	}

	/**
	 * @param file the reference file for this annotation set
	 */
	public void setFile(File file) {
		this.file = file;
	}

	/**
	 * @return the chromosome factory owned by this annotation set
	 */
	public ChromosomeFactory chromosomeFactory () {
		return factory;
	}

	/**
	 * Registers a feature under its type, creating the FeatureClass for that
	 * type on first use.
	 *
	 * @param f the feature to add
	 */
	public void addFeature (Feature f) {
		String type = f.type();
		FeatureClass featureClass = features.get(type);
		if (featureClass == null) {
			featureClass = new FeatureClass(this);
			features.put(type, featureClass);
		}
		// Record every feature, not only the first one seen for each type,
		// so that getAllFeatures() really returns all of them.
		allFeatures.add(f);
		featureClass.addFeature(f);
	}

	/**
	 * @return a new array holding every feature added to this set
	 */
	public Feature[] getAllFeatures() {
		return allFeatures.toArray(new Feature[0]);
	}

	/**
	 * @return {@code true} if at least one feature has been added
	 */
	public boolean hasFeatures () {
		return !features.isEmpty();
	}

	/**
	 * @return a new array with the feature types seen so far, in no particular order
	 */
	public String [] listFeatureTypes () {
		return features.keySet().toArray(new String [0]);
	}

	/**
	 * @param type a feature type
	 * @return the FeatureClass registered for that type, or {@code null} if there is none
	 */
	public FeatureClass getFeatureClassForType (String type) {
		return features.get(type);
	}

	/**
	 * Processes a read immediately, bypassing the read cache.
	 *
	 * @param r the aligned read to process
	 */
	public void processSequenceNoCache(SAMRecord r) {
		// implementation using ShortRead
		processCachedSequence(new ShortRead(r.getReferenceName(), r.getAlignmentStart(), r.getAlignmentEnd()));
	}

	/**
	 * Buffers a read and, once the cache has reached its configured capacity,
	 * sorts and processes the whole batch.
	 *
	 * <p>Reads remain buffered until the cache fills up; call
	 * {@link #flushCache()} after the last read to process the remainder.
	 *
	 * @param r the aligned read to process
	 */
	public void processSequence (SAMRecord r) {
		// implementation using ShortRead
		// Always store the read first: flushing without storing it would
		// silently drop the read that triggered the flush.
		readCache.add(new ShortRead(r.getReferenceName(), r.getAlignmentStart(), r.getAlignmentEnd()));
		if (readCache.size() >= cacheCapacity) {
			flushCache();
		}
	}

	/**
	 * Sorts the buffered reads, processes each of them and empties the cache.
	 * Does nothing beyond that when the cache is already empty.
	 */
	public void flushCache() {
		// sort the cache
		Collections.sort(readCache);
		// now parse the sorted cache
		for (ShortRead read : readCache) {
			processCachedSequence(read);
		}
		// Clear and reuse the same list rather than allocating a new one on
		// every flush; whether clearing or reallocating is cheaper has not
		// been measured.
		readCache.clear();
	}

	/**
	 * Dispatches one read to its chromosome and to every feature class.
	 *
	 * <p>A read whose reference name is {@code "*"} is not passed to any
	 * chromosome, but it is still passed to all feature classes.
	 *
	 * @param r the read to process
	 */
	private void processCachedSequence(ShortRead r) {
		if (!r.getReferenceName().equals("*")) {
			Chromosome c = factory.getChromosome(r.getReferenceName());
			c.processSequence(r);
		}
		for (FeatureClass value : features.values()) {
			value.processSequence(r);
		}
	}

}