package org.genedb.crawl.bam; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.log4j.Logger; import org.genedb.crawl.model.BioDataFile; import org.genedb.crawl.model.MappedSAMSequence; import org.springframework.util.StringUtils; /** * A store of data files that facilitates querying by sequence, organism and fileID. * * @author gv1 * * @param <T> */ public class BioDataFileStore <T extends BioDataFile> { private static Logger logger = Logger.getLogger(BioDataFileStore.class); private Integer fileID = 0; private List<T> files; private Map<String, String> sequences; /** * Setup empty arrays if nothing passed. */ BioDataFileStore () { files = new ArrayList<T>(); sequences = new HashMap<String, String>(); } BioDataFileStore (List<T> files, Map<String, String> sequences) throws IOException { this.files = files; this.sequences = sequences; generateMetaFields(); assignFileIDs(); initialiseReaders(); } public Map<String, String> getSequences() { return sequences; } void generateMetaFields() { Set<String> found = new HashSet<String>(); Set<String> uniques = new HashSet<String>(); for (BioDataFile file : files) { String[] metas = file.file.split("/"); for (String meta : metas) { if (found.contains(meta)) { if (uniques.contains(meta)) { uniques.remove(meta); } continue; } found.add(meta); uniques.add(meta); } } for (BioDataFile file : files) { // do not bother if a meta has been supplied if (file.meta != null) { continue; } String[] metas = file.file.split("/"); List<String> path_elements = new ArrayList<String>(); for (String meta : metas) { if (uniques.contains(meta)) { path_elements.add(meta); } } file.meta = StringUtils.collectionToDelimitedString(path_elements, " > "); } } void assignFileIDs() { for (BioDataFile file : files) { file.fileID = fileID++; } } private void initialiseReaders() throws IOException { for (BioDataFile file : files) { file.init(); logger.info(String.format("%d (%s) %s", file.fileID, file.getClass().getName() , file.file)); } } public T getFile(int fileID) { if (fileID < files.size()) { return files.get(fileID); } return null; } public List<T> getFiles() { return files; } public List<T> listfororganism(String organism) { List<T> list = new ArrayList<T>(); for (T file : getFiles()) { if (file.organism.equals(organism)) { list.add(file); } } return list; } public String getActualSequenceName(int fileID, String sequenceName) throws Exception { for (MappedSAMSequence sequence : getSequences(fileID)) { String currentName = sequence.name; //logger.info(String.format("%s = %s", currentName, sequenceName)); if (currentName.equals(sequenceName)) { return currentName; } if (sequences.containsKey(sequenceName)) { return sequences.get(sequenceName); } } return null; } public String getAlignmentFromName(String sequenceName) { if (sequences.containsKey(sequenceName)) { return sequences.get(sequenceName); } return sequenceName; } public String getReferenceFromName(String sequenceName) { for (Entry<String, String> entry : sequences.entrySet()) { if (entry.getValue().equals(sequenceName)) { return entry.getKey(); } } return sequenceName; } public List<MappedSAMSequence> getSequences(int fileID) throws IOException { return getFile(fileID).getSequences(); } public List<T> listforsequence(String sequence) throws Exception { Map<Integer, T> map = new HashMap<Integer, T>(); for (T file : files) { Integer fileID = file.fileID; if (map.containsKey(fileID)) { continue; } String actualSequenceName = getActualSequenceName(fileID, sequence); if (actualSequenceName == null) { continue; } for (MappedSAMSequence fileSequence : getSequences(fileID)) { if (sequence.equals(fileSequence.name) || actualSequenceName.equals(fileSequence.name)) { map.put(fileID, file); } } } return new ArrayList<T>(map.values()); } }