package com.compomics.util.experiment.identification.protein_inference.proteintree; import com.compomics.util.Util; import com.compomics.util.db.DerbyUtil; import com.compomics.util.exceptions.ExceptionHandler; import com.compomics.util.experiment.biology.AminoAcid; import com.compomics.util.experiment.biology.AminoAcidPattern; import com.compomics.util.experiment.biology.AminoAcidSequence; import com.compomics.util.experiment.biology.Enzyme; import com.compomics.util.experiment.biology.Peptide; import com.compomics.util.experiment.biology.Protein; import com.compomics.util.experiment.identification.protein_sequences.SequenceFactory; import com.compomics.util.experiment.identification.protein_sequences.SequenceFactory.ProteinIterator; import com.compomics.util.experiment.identification.TagFactory; import com.compomics.util.experiment.identification.amino_acid_tags.Tag; import com.compomics.util.experiment.identification.amino_acid_tags.TagComponent; import com.compomics.util.experiment.identification.amino_acid_tags.matchers.TagMatcher; import com.compomics.util.experiment.identification.protein_inference.PeptideMapper; import com.compomics.util.experiment.identification.protein_inference.PeptideProteinMapping; import com.compomics.util.math.BasicMathFunctions; import com.compomics.util.preferences.SequenceMatchingPreferences; import com.compomics.util.preferences.SequenceMatchingPreferences.MatchingType; import com.compomics.util.preferences.UtilitiesUserPreferences; import com.compomics.util.waiting.WaitingHandler; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.sql.SQLException; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; /** * This class sorts the proteins into 
groups. * * @author Marc Vaudel */ public class ProteinTree implements PeptideMapper { /** * The memory allocation in MB. */ private int memoryAllocation; /** * Approximate number of accession*node one can store in a GB of memory * (empirical value). */ private static final long cacheScale = 10000; /** * Instance of the sequence factory. */ private SequenceFactory sequenceFactory = SequenceFactory.getInstance(); /** * The tree containing the accessions indexed by sequence tags. */ private HashMap<String, Node> tree = new HashMap<String, Node>(); /** * List of the nodes in tree. */ private ArrayDeque<String> tagsInTree = new ArrayDeque<String>(); /** * The size of the tree in memory in accession*node. */ private long treeSize = 0; /** * Indicates whether a debug file with speed metrics shall be created. */ private boolean debugSpeed = false; /** * Indicates whether the number of passages shall be displayed. */ private boolean debugPassages = false; /** * The writer used to send the output to a debug file. */ private BufferedWriter debugSpeedWriter = null; /** * The node factory when operating in indexed mode. */ private ProteinTreeComponentsFactory componentsFactory = null; /** * Size of the cache of the most queried peptides. */ private int cacheSize = 100; /** * Indicates whether the cache should be used. */ private boolean useCache = true; /** * Cache of the last queried peptides. */ private HashMap<String, ArrayList<PeptideProteinMapping>> lastQueriedPeptidesCache; /** * Peptide sequences in cache. */ private ArrayDeque<String> lastQueriedPeptidesCacheContent = new ArrayDeque<String>(cacheSize); /** * Time in ms after which a query is considered as slow. */ private int queryTimeThreshold = 50; /** * Cache of the last queried peptides where the query took long. */ private HashMap<String, ArrayList<PeptideProteinMapping>> lastSlowQueriedPeptidesCache; /** * Peptide sequences in slow cache. 
*/ private ArrayDeque<String> lastSlowQueriedPeptidesCacheContent = new ArrayDeque<String>(cacheSize); /** * The version of the protein tree. */ public static final String version = "1.1.2"; /** * The sequence matching preferences of the matches in cache. */ private SequenceMatchingPreferences cacheSequenceMatchingPreferences = null; /** * Indicates whether the main thread is listening or preparing to wait. */ private boolean listening = true; /** * The number of proteins which should be imported at a time. */ public static final int proteinBatchSize = 100; /** * Cache for the protein lengths. */ private HashMap<String, Integer> proteinLengthsCache = new HashMap<String, Integer>(); /** * Creates a tree based on the proteins present in the sequence factory. * * @param memoryAllocation the number of MB available for the tree in * memory. * @param cacheSize the peptide queries caches size (note, there are two of * them) * * @throws IOException if an IOException occurs */ public ProteinTree(int memoryAllocation, int cacheSize) throws IOException { this.memoryAllocation = memoryAllocation; this.cacheSize = cacheSize; lastSlowQueriedPeptidesCache = new HashMap<String, ArrayList<PeptideProteinMapping>>(cacheSize); lastQueriedPeptidesCache = new HashMap<String, ArrayList<PeptideProteinMapping>>(cacheSize); if (debugSpeed) { try { debugSpeedWriter = new BufferedWriter(new FileWriter(new File("treeSpeed.txt"))); } catch (Exception e) { e.printStackTrace(); } } } /** * Returns the memory allocation. * * @return the memory allocation */ public int getMemoryAllocation() { return memoryAllocation; } /** * Sets the memory allocation. * * @param memoryAllocation the memory allocation */ public void setMemoryAllocation(int memoryAllocation) { this.memoryAllocation = memoryAllocation; } /** * Initiates the tree. * * @param nThreads the number of threads to use * @param initialTagSize the initial tag size * @param maxNodeSize the maximal size of a node. 
large nodes will be fast * to initiate but slow to query. I typically use 500 giving an approximate * query time <20ms. * @param maxPeptideSize the maximum peptide size * @param waitingHandler the waiting handler used to display progress to the * user and cancel the process. Can be null but strongly recommended. * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param printExpectedImportTime if true the expected import time will be * printed to the waiting handler * @param displayProgress display progress * * @throws IOException if an IOException occurs * @throws ClassNotFoundException if a ClassNotFoundException occurs * @throws InterruptedException if an InterruptedException occurs * @throws IllegalArgumentException if an IllegalArgumentException occurs * @throws SQLException if an SQLException occurs */ public void initiateTree(int initialTagSize, int maxNodeSize, int maxPeptideSize, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean printExpectedImportTime, boolean displayProgress, int nThreads) throws IOException, IllegalArgumentException, InterruptedException, ClassNotFoundException, SQLException { initiateTree(initialTagSize, maxNodeSize, maxPeptideSize, null, waitingHandler, exceptionHandler, printExpectedImportTime, displayProgress, nThreads); } /** * Initiates the tree. Note: speed and memory are calibrated for the no * enzyme case. * * @param nThreads the number of threads to use * @param initialTagSize the initial size of peptide tag. Large initial size * are fast to query, low initial size are fast to initiate. I typically use * 3 for databases containing less than 100 000 proteins giving an * approximate initiation time of 60ms per accession. * @param maxNodeSize the maximal size of a node. large nodes will be fast * to initiate but slow to query. I typically use 500 giving an approximate * query time <20ms. 
* @param maxPeptideSize the maximum peptide size * @param enzyme the enzyme used to select peptides. If null all possible * peptides will be indexed * @param waitingHandler the waiting handler used to display progress to the * user and cancel the process. Can be null but strongly recommended. * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param printExpectedImportTime if true the expected import time will be * printed to the waiting handler * @param displayProgress display progress * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ public void initiateTree(int initialTagSize, int maxNodeSize, int maxPeptideSize, Enzyme enzyme, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean printExpectedImportTime, boolean displayProgress, int nThreads) throws IOException, InterruptedException, IOException, InterruptedException, ClassNotFoundException, SQLException { tree.clear(); componentsFactory = ProteinTreeComponentsFactory.getInstance(); try { boolean needImport; try { needImport = !componentsFactory.initiate(); if (!needImport) { componentsFactory.loadParameters(); if (componentsFactory.isCorrupted()) { throw new IllegalArgumentException("Index is corrupted. Database will be reindexed."); } if (!componentsFactory.importComplete()) { throw new IllegalArgumentException("Database import was not successfully completed. 
Database will be reindexed."); } String tempVersion = componentsFactory.getVersion(); if (tempVersion == null || !tempVersion.equals(version)) { throw new IllegalArgumentException("Database index version " + tempVersion + " obsolete. Database will be reindexed."); } if (initialTagSize != componentsFactory.getInitialSize()) { throw new IllegalArgumentException("Different initial size. Database will be reindexed."); } } } catch (Exception e) { e.printStackTrace(); needImport = true; DerbyUtil.closeConnection(); componentsFactory.delete(); componentsFactory.initiate(); } if (needImport) { importDb(initialTagSize, maxNodeSize, maxPeptideSize, enzyme, waitingHandler, exceptionHandler, printExpectedImportTime, displayProgress, nThreads); } } catch (IOException e) { componentsFactory.delete(); throw e; } catch (IllegalArgumentException e) { componentsFactory.delete(); throw e; } catch (InterruptedException e) { componentsFactory.delete(); throw e; } catch (ClassNotFoundException e) { componentsFactory.delete(); throw e; } catch (SQLException e) { componentsFactory.delete(); throw e; } if (waitingHandler != null && waitingHandler.isRunCanceled()) { return; } try { componentsFactory.loadTags(); } catch (Exception e) { // ignore, tree will just be slower if (waitingHandler == null || !waitingHandler.isRunCanceled()) { e.printStackTrace(); } } } /** * Try to delete the current database. Note: The delete method will attempt * to close the connection. It is thus not needed (and not advised) to close * the connection before deleting. * * @return true of the deletion was a success */ public boolean deleteDb() { try { return componentsFactory.delete(); } catch (Exception ex) { ex.printStackTrace(); return false; } } /** * Imports the db which is in the sequence factory into the tree and saves * it in the nodeFactory. * * @param initialTagSize the initial size of peptide tag. Large initial size * are slow to query, low initial size are slow to initiate. 
I typically use * 3 for databases containing less than 100 000 proteins. * @param maxNodeSize the maximal size of a node. large nodes will be fast * to initiate but slow to query. I typically use 5000. * @param maxPeptideSize the maximum peptide size * @param enzyme the enzyme used to select peptides. If null all possible * peptides will be indexed * @param waitingHandler the waiting handler used to display progress to the * user and cancel the process. Can be null but strongly recommended. * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param printExpectedImportTime if true the expected import time will be * printed to the waiting handler * @param nThreads the number of threads to use * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. 
*/ private void importDb(int initialTagSize, int maxNodeSize, int maxPeptideSize, Enzyme enzyme, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean printExpectedImportTime, boolean displayProgress, int nThreads) throws IOException, InterruptedException, IOException, InterruptedException, ClassNotFoundException, SQLException { if (printExpectedImportTime) { int nSeconds = getExpectedImportTime(); String report = "Estimated import time: "; if (nSeconds < 120) { report += nSeconds + " seconds."; } else { int nMinutes = nSeconds / 60; if (nMinutes < 120) { report += nMinutes + " minutes."; } else { int nHours = nMinutes / 60; report += nHours + " hours."; } } if (waitingHandler != null && waitingHandler.isReport()) { waitingHandler.appendReport(report, true, true); waitingHandler.appendReport(" See http://compomics.github.io/compomics-utilities/wiki/proteininference.html.", true, true); } else { System.out.println(report); System.out.println(" See http://compomics.github.io/compomics-utilities/wiki/proteininference.html."); } } componentsFactory.saveInitialSize(initialTagSize); ArrayList<String> tags = TagFactory.getAminoAcidCombinations(initialTagSize); int nAccessions; if (sequenceFactory.isDefaultReversed()) { nAccessions = sequenceFactory.getNTargetSequences(); } else { nAccessions = sequenceFactory.getNSequences(); } long tagsSize = 500; // The space needed for tags in percent (empirical value) long criticalSize = tagsSize * nAccessions; // try to estimate the number of tags we can process at a time given the memory settings. 
We might want to fine tune this long capacity = memoryAllocation * cacheScale; long estimatedTreeSize = 6 * criticalSize; // as far as I tested, 6% of the proteins are covered by a tag in general (ie median) int ratio = (int) (estimatedTreeSize / capacity); if (ratio == 0) { ratio = 1; } int nPassages = (int) (ratio); if (tags.size() % ratio != 0) { nPassages += 1; } int nTags; if (ratio > 0) { nTags = tags.size() / ratio; if (nTags == 0) { nTags = 1; } } else { nTags = tags.size(); } if (nPassages > 1) { Collections.shuffle(tags); } if (debugPassages) { System.out.println("Estimated tree size: " + estimatedTreeSize); System.out.println(new Date() + " " + nPassages + " passages needed (" + nTags + " tags of " + tags.size() + " per passage)"); } if (debugSpeed) { debugSpeedWriter.write("Critical size: " + criticalSize); System.out.println("Critical size: " + criticalSize); estimatedTreeSize = estimatedTreeSize / 100; debugSpeedWriter.write("Estimated tree size: " + estimatedTreeSize); debugSpeedWriter.write(new Date() + " " + nPassages + " passages needed (" + nTags + " tags of " + tags.size() + " per passage)"); debugSpeedWriter.newLine(); debugSpeedWriter.flush(); } if (waitingHandler != null && displayProgress && !waitingHandler.isRunCanceled()) { waitingHandler.setSecondaryProgressCounterIndeterminate(false); int totalProgress = (int) (nPassages * nAccessions + tags.size() * 2); waitingHandler.setMaxSecondaryProgressCounter(totalProgress); waitingHandler.setSecondaryProgressCounter(0); } if (waitingHandler != null && waitingHandler.isRunCanceled()) { return; } long time0 = System.currentTimeMillis(); ArrayList<String> tempTags = new ArrayList<String>(nTags); int tagsLoaded = 0; boolean first = true; for (String tag : tags) { if (tempTags.size() == nTags) { loadTags(tempTags, initialTagSize, maxNodeSize, maxPeptideSize, enzyme, nThreads, waitingHandler, exceptionHandler, displayProgress); if (first) { first = false; } tagsLoaded += tempTags.size(); 
tempTags.clear(); if (debugSpeed) { debugSpeedWriter.write(new Date() + " " + tagsLoaded + " tags of " + tags.size() + " loaded."); System.out.println(new Date() + " " + tagsLoaded + " tags of " + tags.size() + " loaded."); debugSpeedWriter.newLine(); debugSpeedWriter.flush(); } } tempTags.add(tag); if (waitingHandler != null && waitingHandler.isRunCanceled()) { return; } } if (!tempTags.isEmpty()) { loadTags(tempTags, initialTagSize, maxNodeSize, maxPeptideSize, enzyme, nThreads, waitingHandler, exceptionHandler, displayProgress); if (debugSpeed) { debugSpeedWriter.write(new Date() + " " + tagsLoaded + " tags of " + tags.size() + " loaded."); System.out.println(new Date() + " " + tagsLoaded + " tags of " + tags.size() + " loaded."); debugSpeedWriter.newLine(); debugSpeedWriter.flush(); } } tagsInTree.addAll(tree.keySet()); for (Node node : tree.values()) { treeSize += node.getSize(); } if (waitingHandler != null && waitingHandler.isRunCanceled()) { return; } componentsFactory.setVersion(version); componentsFactory.setFastaFilePath(sequenceFactory.getCurrentFastaFile().getAbsolutePath()); componentsFactory.setImportComplete(true); long time1 = System.currentTimeMillis(); long initiationTime = time1 - time0; if (sequenceFactory.getNSequences() > 1000) { UtilitiesUserPreferences utilitiesUserPreferences = UtilitiesUserPreferences.loadUserPreferences(); utilitiesUserPreferences.addProteinTreeImportTime(sequenceFactory.getCurrentFastaFile().length(), initiationTime); UtilitiesUserPreferences.saveUserPreferences(utilitiesUserPreferences); } if (debugSpeed) { debugSpeedWriter.write("tree initiation: " + initiationTime + " ms."); System.out.println("tree initiation: " + initiationTime + " ms."); debugSpeedWriter.newLine(); debugSpeedWriter.flush(); } } /** * Estimates the import time for the database in the sequence factory. 
* * @return the import time in seconds */ private int getExpectedImportTime() { UtilitiesUserPreferences utilitiesUserPreferences = UtilitiesUserPreferences.loadUserPreferences(); HashMap<Long, ArrayList<Long>> importTimeMap = utilitiesUserPreferences.getProteinTreeImportTime(); if (importTimeMap.isEmpty()) { return sequenceFactory.getNTargetSequences() * 16 / 1000; } else { ArrayList<Double> ratios = new ArrayList<Double>(); for (Long size : importTimeMap.keySet()) { for (Long time : importTimeMap.get(size)) { double ratio = (double) (size / time); ratios.add(ratio); } } double ratio = BasicMathFunctions.percentile(ratios, 0.05); int timeInSeconds = (int) (1.2 * sequenceFactory.getCurrentFastaFile().length() / (1000 * ratio)); timeInSeconds = Math.max(timeInSeconds, 1); return timeInSeconds; } } /** * Loads the tags found in the given proteins in the tree and saves the end * nodes in the NodeFactory if not null. * * @param tags the tags of interest * @param waitingHandler the waiting handler used to display progress to the * user and cancel the process. Can be null but strongly recommended. * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param enzyme the enzyme restriction * @param loadLengths boolean indicating whether protein lengths should be * loaded in the db * @param loadedLengths the accessions of the proteins from which the length * is already saved * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. 
*/ private synchronized void loadTags(ArrayList<String> tags, int initialTagSize, int maxNodeSize, int maxPeptideSize, Enzyme enzyme, int nThreads, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean displayProgress) throws IOException, InterruptedException, ClassNotFoundException, SQLException { // find the tags in the proteins and create a node per tag found if (nThreads == 1) { indexProteinsSingleThread(tags, initialTagSize, enzyme, waitingHandler, displayProgress); } else { indexProteins(tags, initialTagSize, enzyme, waitingHandler, exceptionHandler, displayProgress, nThreads); } // split the nodes and save them in the db if (nThreads == 1) { processRawNodesSingleThread(tags, maxNodeSize, maxPeptideSize, waitingHandler, displayProgress); } else { processRawNodes(maxNodeSize, maxPeptideSize, waitingHandler, exceptionHandler, displayProgress, nThreads); } // clear memory before further processing tree.clear(); System.gc(); } /** * Iterates all the proteins and indexes the given tags in their sequences * by batches of proteinBatchSize using a SequenceIndexer in a separate * thread. When sequence indexers are finished, a node per tag is created * and stored in the tree map. * * @param tags the tags to index * @param waitingHandler waiting handler providing feedback on the process * and allowing canceling the process * @param initialTagSize the initial tag size * @param enzyme enzyme to use. Can be null * @param loadLengths boolean indicating whether protein lengths should be * loaded in the db * @param displayProgress boolean indicating whether progress shall be * displayed using the waiting handler * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. 
* @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ private void indexProteinsSingleThread(ArrayList<String> tags, int initialTagSize, Enzyme enzyme, WaitingHandler waitingHandler, boolean displayProgress) throws IOException, InterruptedException, ClassNotFoundException, SQLException { ProteinIterator proteinIterator = sequenceFactory.getProteinIterator(sequenceFactory.isDefaultReversed()); while (proteinIterator.hasNext()) { Protein protein = proteinIterator.getNextProtein(); String accession = protein.getAccession(); if (protein.getLength() > 0) { // ignore empty protein sequences HashMap<String, ArrayList<Integer>> indexesMap = getTagToIndexesMap(protein.getSequence(), tags, enzyme, waitingHandler); for (String tag : indexesMap.keySet()) { ArrayList<Integer> indexes = indexesMap.get(tag); if (!indexes.isEmpty()) { Node node = tree.get(tag); if (node == null) { node = new Node(initialTagSize); tree.put(tag, node); } node.addAccession(accession, indexes); } if (waitingHandler != null && waitingHandler.isRunCanceled()) { break; } } if (displayProgress && waitingHandler != null) { waitingHandler.increaseSecondaryProgressCounter(); } if (waitingHandler != null && waitingHandler.isRunCanceled()) { tree.clear(); return; } } } } /** * Iterates all the proteins and indexes the given tags in their sequences * by batches of proteinBatchSize using a SequenceIndexer in a separate * thread. When sequence indexers are finished, a node per tag is created * and stored in the tree map. * * @param tags the tags to index * @param waitingHandler waiting handler providing feedback on the process * and allowing canceling the process * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param initialTagSize the initial tag size * @param enzyme enzyme to use. 
Can be null * @param loadLengths boolean indicating whether protein lengths should be * loaded in the db * @param displayProgress boolean indicating whether progress shall be * displayed using the waiting handler * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ private void indexProteins(ArrayList<String> tags, int initialTagSize, Enzyme enzyme, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean displayProgress, int nThreads) throws IOException, InterruptedException, ClassNotFoundException, SQLException { ArrayList<Protein> sequenceBuffer = new ArrayList<Protein>(proteinBatchSize); ArrayList<SequenceIndexer> sequenceIndexers = new ArrayList<SequenceIndexer>(nThreads); ExecutorService pool = Executors.newFixedThreadPool(nThreads); ProteinIterator proteinIterator = sequenceFactory.getProteinIterator(sequenceFactory.isDefaultReversed()); while (proteinIterator.hasNext()) { Protein protein = proteinIterator.getNextProtein(); sequenceBuffer.add(protein); if (sequenceBuffer.size() == proteinBatchSize) { while (sequenceIndexers.size() == nThreads) { processFinishedIndexers(sequenceIndexers, initialTagSize); } SequenceIndexer sequenceIndexer = new SequenceIndexer(sequenceBuffer, tags, enzyme, waitingHandler, exceptionHandler, displayProgress); pool.submit(new Thread(sequenceIndexer, "sequence indexing")); sequenceBuffer = new ArrayList<Protein>(proteinBatchSize); sequenceIndexers.add(sequenceIndexer); } if (waitingHandler != null) { if (waitingHandler.isRunCanceled() || waitingHandler.isRunFinished()) { pool.shutdownNow(); emptyCache(); return; } } } if 
(!sequenceBuffer.isEmpty()) { SequenceIndexer sequenceIndexer = new SequenceIndexer(sequenceBuffer, tags, enzyme, waitingHandler, exceptionHandler, displayProgress); pool.submit(new Thread(sequenceIndexer, "sequence indexing")); sequenceIndexers.add(sequenceIndexer); } while (!sequenceIndexers.isEmpty()) { processFinishedIndexers(sequenceIndexers, initialTagSize); } pool.shutdown(); } /** * Splits the raw nodes and saves them in the database. * * @param tags the tags indexed * @param maxNodeSize the maximal size allowed for a node * @param maxPeptideSize the maximal peptide length allowed * @param waitingHandler waiting handler providing feedback on the process * and allowing canceling the process * @param displayProgress boolean indicating whether progress shall be * displayed using the waiting handler * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. 
*/ private void processRawNodesSingleThread(ArrayList<String> tags, int maxNodeSize, int maxPeptideSize, WaitingHandler waitingHandler, boolean displayProgress) throws IOException, InterruptedException, ClassNotFoundException, SQLException { int batchSize = (int) Math.ceil(tree.size() / 3); batchSize = Math.min(10000, batchSize); batchSize = Math.max(1000, batchSize); HashMap<String, Object> splittedNodes = new HashMap<String, Object>(batchSize); for (String tag : tags) { Node node = tree.get(tag); if (node != null) { node.splitNode(maxNodeSize, maxPeptideSize); splittedNodes.put(tag, node); if (splittedNodes.size() == batchSize) { componentsFactory.saveNodes(splittedNodes, waitingHandler); splittedNodes.clear(); } } if (waitingHandler != null) { if (displayProgress) { waitingHandler.increaseSecondaryProgressCounter(); if (node == null) { waitingHandler.increaseSecondaryProgressCounter(); } } if (waitingHandler.isRunCanceled() || waitingHandler.isRunFinished()) { emptyCache(); return; } } } if (!splittedNodes.isEmpty()) { componentsFactory.saveNodes(splittedNodes, waitingHandler); splittedNodes.clear(); } } /** * Splits the raw nodes and saves them in the database * * @param maxNodeSize the maximal size allowed for a node * @param maxPeptideSize the maximal peptide length allowed * @param waitingHandler waiting handler providing feedback on the process * and allowing canceling the process * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param displayProgress boolean indicating whether progress shall be * displayed using the waiting handler * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while creating the tree. 
* @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ private void processRawNodes(int maxNodeSize, int maxPeptideSize, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean displayProgress, int nThreads) throws IOException, InterruptedException, ClassNotFoundException, SQLException { ArrayList<NodeSplitter> nodeSplitters = new ArrayList<NodeSplitter>(nThreads); ExecutorService pool = Executors.newFixedThreadPool(nThreads); for (String tag : tree.keySet()) { Node node = tree.get(tag); while (nodeSplitters.size() == nThreads) { processFinishedNodeSplitters(nodeSplitters, null); // @TODO: add waiting handler } NodeSplitter nodeSplitter = new NodeSplitter(tag, node, maxNodeSize, maxPeptideSize, waitingHandler, exceptionHandler, displayProgress); pool.submit(new Thread(nodeSplitter, "Node splitting of tag " + tag)); nodeSplitters.add(nodeSplitter); if (waitingHandler != null) { if (waitingHandler.isRunCanceled() || waitingHandler.isRunFinished()) { emptyCache(); pool.shutdownNow(); return; } } } while (!nodeSplitters.isEmpty()) { processFinishedNodeSplitters(nodeSplitters, null); // @TODO: add waiting handler } pool.shutdown(); } /** * Clears the finished raw node splitters from a given list or wait for one * to finish and batch saves the splitted nodes. * * @param nodeProcessors the node processors of interest * @param waitingHandler the waiting handler * * @throws InterruptedException exception thrown whenever a threading issue * occurred while splitting nodes. 
*/ private synchronized void processFinishedNodeSplitters(ArrayList<NodeSplitter> nodeSplitters, WaitingHandler waitingHandler) throws InterruptedException, SQLException, IOException { listening = false; ArrayList<NodeSplitter> done = new ArrayList<NodeSplitter>(); for (NodeSplitter nodeSplitter : nodeSplitters) { if (nodeSplitter.isFinished()) { done.add(nodeSplitter); } } if (done.isEmpty()) { listening = true; wait(); for (NodeSplitter nodeSplitter : nodeSplitters) { if (nodeSplitter.isFinished()) { done.add(nodeSplitter); } } } listening = true; HashMap<String, Object> splittedNodes = new HashMap<String, Object>(done.size()); for (NodeSplitter nodeSplitter : done) { splittedNodes.put(nodeSplitter.getTag(), nodeSplitter.getNode()); nodeSplitter.clear(); } componentsFactory.saveNodes(splittedNodes, waitingHandler); nodeSplitters.removeAll(done); } /** * Stores the result of the finished indexers and updates the list. Waits if * none is finished. * * @param sequenceIndexers the sequence indexers * @param initialTagSize the initial tag size * * @throws InterruptedException exception thrown whenever a threading issue * occurred while processing finished indexers. 
*/ private synchronized void processFinishedIndexers(ArrayList<SequenceIndexer> sequenceIndexers, int initialTagSize) throws InterruptedException { listening = false; ArrayList<SequenceIndexer> done = new ArrayList<SequenceIndexer>(); for (SequenceIndexer sequenceIndexer : sequenceIndexers) { if (sequenceIndexer.isFinished()) { done.add(sequenceIndexer); } } if (done.isEmpty()) { listening = true; wait(); for (SequenceIndexer sequenceIndexer : sequenceIndexers) { if (sequenceIndexer.isFinished()) { done.add(sequenceIndexer); } } } listening = true; for (SequenceIndexer sequenceIndexer : done) { HashMap<String, HashMap<String, ArrayList<Integer>>> tagToIndexesMap = sequenceIndexer.getIndexes(); for (String accession : tagToIndexesMap.keySet()) { for (String tag : tagToIndexesMap.get(accession).keySet()) { ArrayList<Integer> indexes = tagToIndexesMap.get(accession).get(tag); if (!indexes.isEmpty()) { Node node = tree.get(tag); if (node == null) { node = new Node(initialTagSize); tree.put(tag, node); } node.addAccession(accession, indexes); } } } sequenceIndexer.clear(); } sequenceIndexers.removeAll(done); } @Override public ArrayList<PeptideProteinMapping> getProteinMapping(String peptideSequence, SequenceMatchingPreferences proteinInferencePreferences) throws IOException, InterruptedException, ClassNotFoundException, SQLException { long time0 = 0; if (debugSpeed) { time0 = System.currentTimeMillis(); } ArrayList<PeptideProteinMapping> result = getProteinMapping(peptideSequence, proteinInferencePreferences, false); if (debugSpeed) { long time1 = System.currentTimeMillis(); long queryTime = time1 - time0; debugSpeedWriter.write(peptideSequence + "\t" + result.size() + "\t" + queryTime); debugSpeedWriter.newLine(); debugSpeedWriter.flush(); } return result; } /** * Returns the protein mapping in the sequence factory for the given peptide * sequence. peptide sequence > protein accession > index in the * protein. An empty map if not. 
 *
 * @param peptideSequence the peptide sequence
 * @param sequenceMatchingPreferences the sequence matching preferences
 * @param reversed boolean indicating whether we are looking at a reversed
 * peptide sequence
 *
 * @return the peptide to protein mapping: Accession > list of indexes
 * where the peptide can be found on the sequence
 *
 * @throws IOException exception thrown whenever an error occurs while
 * reading or writing a file.
 * @throws ClassNotFoundException exception thrown whenever an error occurs
 * while deserializing an object.
 * @throws InterruptedException exception thrown whenever a threading issue
 * occurred while interacting with the tree.
 * @throws SQLException if an SQLException exception thrown whenever a
 * problem occurred while interacting with the tree database.
 */
private ArrayList<PeptideProteinMapping> getProteinMapping(String peptideSequence, SequenceMatchingPreferences sequenceMatchingPreferences, boolean reversed) throws IOException, InterruptedException, ClassNotFoundException, SQLException {
    // Invalidate the caches if they were filled under different matching preferences.
    // NOTE(review): cacheSequenceMatchingPreferences is only updated when already
    // non-null here; presumably it is initialized elsewhere — verify.
    if (useCache && this.cacheSequenceMatchingPreferences != null && !this.cacheSequenceMatchingPreferences.isSameAs(sequenceMatchingPreferences)) {
        emptyCache();
        this.cacheSequenceMatchingPreferences = sequenceMatchingPreferences;
    }
    // Look up the sequence in the fast cache, then in the slow-query cache.
    ArrayList<PeptideProteinMapping> result = null;
    if (useCache) {
        result = lastQueriedPeptidesCache.get(peptideSequence);
    }
    if (result == null) {
        if (useCache) {
            result = lastSlowQueriedPeptidesCache.get(peptideSequence);
        }
        if (result == null) {
            // When decoys are default-reversed, a cached result for the mirrored
            // sequence can be reused by reversing its mappings.
            if (sequenceFactory.isDefaultReversed() && useCache) {
                String reversedSequence = SequenceFactory.reverseSequence(peptideSequence);
                result = lastQueriedPeptidesCache.get(reversedSequence);
                if (result == null) {
                    result = lastSlowQueriedPeptidesCache.get(reversedSequence);
                }
                if (result != null) {
                    return getReversedResults(result, reversedSequence);
                }
            }
            long timeStart = System.currentTimeMillis();
            int initialTagSize = componentsFactory.getInitialSize();
            if (peptideSequence.length() < initialTagSize) {
                throw new IllegalArgumentException("Peptide (" + peptideSequence + ") should be at least of length " + initialTagSize + ".");
            }
            result = new ArrayList<PeptideProteinMapping>(2);
            AminoAcidSequence peptideAminoAcidSequence = new AminoAcidSequence(peptideSequence);
            // Scale the X-share limit to the initial tag length: only the first
            // initialTagSize residues are screened by getInitialTags.
            Double limitX = null;
            if (sequenceMatchingPreferences.hasLimitX()) {
                limitX = sequenceMatchingPreferences.getLimitX() * peptideSequence.length() / initialTagSize;
            }
            // Query the tree once per candidate initial tag and accumulate the hits.
            HashSet<String> initialTags = getInitialTags(peptideAminoAcidSequence, sequenceMatchingPreferences, limitX);
            for (String tag : initialTags) {
                Node node = getNode(tag);
                if (node != null) {
                    ArrayList<PeptideProteinMapping> tagResults = node.getProteinMapping(peptideAminoAcidSequence, tag, sequenceMatchingPreferences);
                    result.addAll(tagResults);
                }
            }
            // Derive decoy mappings from the reversed sequence (recursing once with
            // reversed == true to prevent infinite recursion).
            if (sequenceFactory.isDefaultReversed() && !reversed) {
                String reversedSequence = SequenceFactory.reverseSequence(peptideSequence);
                ArrayList<PeptideProteinMapping> reversedResult;
                if (!reversedSequence.equals(peptideSequence)) {
                    reversedResult = getProteinMapping(reversedSequence, sequenceMatchingPreferences, true);
                    reversedResult = getReversedResults(reversedResult, reversedSequence);
                } else {
                    // Palindromic sequence: reuse the forward result directly.
                    reversedResult = getReversedResults(result, reversedSequence);
                }
                result.addAll(reversedResult);
            }
            // Only top-level (non-reversed) queries are cached; the elapsed time
            // decides between the fast and the slow cache.
            if (!reversed && useCache) {
                long timeEnd = System.currentTimeMillis();
                long queryTime = timeEnd - timeStart;
                addToCache(peptideSequence, result, queryTime);
            }
        }
    }
    return result;
}

/**
 * Adds a mapping to the cache.
* * @param peptideSequence the newly mapped peptide sequence * @param mapping the protein mapping * @param queryTime the mapping time */ private synchronized void addToCache(String peptideSequence, ArrayList<PeptideProteinMapping> mapping, long queryTime) { if (queryTime <= queryTimeThreshold) { lastQueriedPeptidesCache.put(peptideSequence, mapping); lastQueriedPeptidesCacheContent.add(peptideSequence); if (lastQueriedPeptidesCacheContent.size() > cacheSize) { String key = lastQueriedPeptidesCacheContent.pollLast(); lastQueriedPeptidesCache.remove(key); } } else { lastSlowQueriedPeptidesCache.put(peptideSequence, mapping); lastSlowQueriedPeptidesCacheContent.add(peptideSequence); if (lastSlowQueriedPeptidesCacheContent.size() > cacheSize) { String key = lastSlowQueriedPeptidesCacheContent.pollLast(); lastSlowQueriedPeptidesCache.remove(key); } } } @Override public ArrayList<PeptideProteinMapping> getProteinMapping(Tag tag, TagMatcher tagMatcher, SequenceMatchingPreferences sequenceMatchingPreferences, Double massTolerance) throws IOException, InterruptedException, ClassNotFoundException, SQLException { int initialTagSize = componentsFactory.getInitialSize(); AminoAcidPattern longestAminoAcidPattern = null; AminoAcidSequence longestAminoAcidSequence = null; int componentIndex = -1; for (int i = 0; i < tag.getContent().size(); i++) { TagComponent tagComponent = tag.getContent().get(i); if (tagComponent instanceof AminoAcidPattern) { AminoAcidPattern aminoAcidPattern = (AminoAcidPattern) tagComponent; if (aminoAcidPattern.length() >= initialTagSize && (longestAminoAcidPattern == null || aminoAcidPattern.length() > longestAminoAcidPattern.length()) && (longestAminoAcidSequence == null || aminoAcidPattern.length() > longestAminoAcidSequence.length())) { componentIndex = i; longestAminoAcidPattern = aminoAcidPattern; longestAminoAcidSequence = null; } } else if (tagComponent instanceof AminoAcidSequence) { AminoAcidSequence aminoAcidSequence = (AminoAcidSequence) 
tagComponent; if (aminoAcidSequence.length() >= initialTagSize && (longestAminoAcidPattern == null || aminoAcidSequence.length() > longestAminoAcidPattern.length()) && (longestAminoAcidSequence == null || aminoAcidSequence.length() > longestAminoAcidSequence.length())) { componentIndex = i; longestAminoAcidSequence = aminoAcidSequence; longestAminoAcidPattern = null; } } } if (componentIndex == -1) { throw new IllegalArgumentException("No amino acid sequence longer than " + initialTagSize + " was found for tag " + tag + "."); } ArrayList<PeptideProteinMapping> seeds = new ArrayList<PeptideProteinMapping>(); if (longestAminoAcidPattern != null) { for (String peptideSequence : longestAminoAcidPattern.getAllPossibleSequences()) { double xShare = ((double) Util.getOccurrence(peptideSequence, 'X')) / peptideSequence.length(); if (!sequenceMatchingPreferences.hasLimitX() || xShare <= sequenceMatchingPreferences.getLimitX()) { seeds.addAll(getProteinMapping(peptideSequence, sequenceMatchingPreferences)); } } } else { seeds.addAll(getProteinMapping(longestAminoAcidSequence.getSequence(), sequenceMatchingPreferences)); } ArrayList<PeptideProteinMapping> results = new ArrayList<PeptideProteinMapping>(seeds.size()); for (PeptideProteinMapping peptideProteinMapping : seeds) { String accession = peptideProteinMapping.getProteinAccession(); String proteinSequence = sequenceFactory.getProtein(accession).getSequence(); int seedIndex = peptideProteinMapping.getIndex(); ArrayList<PeptideProteinMapping> tagMapping = tagMatcher.getPeptideMatches(tag, accession, proteinSequence, seedIndex, componentIndex, massTolerance); results.addAll(tagMapping); } return results; } /** * Returns a list of possible initial tags. * * @param aminoAcidSequence the peptide sequence * @param sequenceMatchingPreferences the sequence matching preferences to * use * * @return a list of possible initial tags. * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. 
* @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ private HashSet<String> getInitialTags(AminoAcidSequence aminoAcidSequence, SequenceMatchingPreferences sequenceMatchingPreferences, Double limitX) throws SQLException, IOException, ClassNotFoundException, InterruptedException { int initialTagSize = componentsFactory.getInitialSize(); HashSet<String> result = new HashSet<String>(); for (int i = 0; i < initialTagSize; i++) { AminoAcid aminoAcid = aminoAcidSequence.getAminoAcidAt(i); if (result.isEmpty()) { if (sequenceMatchingPreferences.getSequenceMatchingType() == MatchingType.string) { String originalAa = aminoAcid.singleLetterCode; result.add(originalAa); } else { for (char originalAa : aminoAcid.getSubAminoAcids()) { String newTag = String.valueOf(originalAa); result.add(newTag); } for (char combinationAa : aminoAcid.getCombinations()) { String newTag = String.valueOf(combinationAa); result.add(newTag); } if (sequenceMatchingPreferences.getSequenceMatchingType() == MatchingType.indistiguishableAminoAcids && (aminoAcid == AminoAcid.I || aminoAcid == AminoAcid.J || aminoAcid == AminoAcid.L)) { result.add("I"); result.add("J"); result.add("L"); } } } else { HashSet<String> newResults = new HashSet<String>(); for (String sequence : result) { if (sequenceMatchingPreferences.getSequenceMatchingType() == MatchingType.string) { String originalAa = aminoAcid.singleLetterCode; newResults.add(sequence + aminoAcid.singleLetterCode); } else { for (char originalAa : aminoAcid.getSubAminoAcids()) { String newTag = sequence + originalAa; newResults.add(newTag); } for (char newAa : aminoAcid.getCombinations()) { String newTag = sequence + newAa; newResults.add(newTag); } if 
(sequenceMatchingPreferences.getSequenceMatchingType() == MatchingType.indistiguishableAminoAcids && (aminoAcid == AminoAcid.I || aminoAcid == AminoAcid.J || aminoAcid == AminoAcid.L)) { String newTag = sequence + "I"; newResults.add(newTag); newTag = sequence + "J"; newResults.add(newTag); newTag = sequence + "L"; newResults.add(newTag); } } } result = newResults; } } if (limitX != null && limitX < 1) { HashSet<String> filtered = new HashSet<String>(); for (String sequence : result) { double xShare = ((double) Util.getOccurrence(sequence, 'X')) / sequence.length(); if (xShare <= limitX) { filtered.add(sequence); } } result = filtered; } return result; } /** * Reverts the indexes and the protein accessions of the given mapping. * * @param forwardResults the given mapping * @param sequence the sequence of interest * * @return the reversed indexes * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. 
*/ private ArrayList<PeptideProteinMapping> getReversedResults(ArrayList<PeptideProteinMapping> forwardResults, String sequence) throws SQLException, ClassNotFoundException, IOException, InterruptedException { ArrayList<PeptideProteinMapping> results = new ArrayList<PeptideProteinMapping>(forwardResults.size()); for (PeptideProteinMapping peptideProteinMapping : forwardResults) { int peptideLength = sequence.length(); String reversedSequence = SequenceFactory.reverseSequence(sequence); String accession = peptideProteinMapping.getProteinAccession(); String reversedAccession; Integer proteinLength; if (accession.endsWith(SequenceFactory.getDefaultDecoyAccessionSuffix())) { reversedAccession = SequenceFactory.getDefaultTargetAccession(accession); proteinLength = getProteinLength(reversedAccession); if (proteinLength == null) { throw new IllegalArgumentException("Length of protein " + reversedAccession + " not found."); } } else { reversedAccession = SequenceFactory.getDefaultDecoyAccession(accession); proteinLength = getProteinLength(accession); if (proteinLength == null) { throw new IllegalArgumentException("Length of protein " + accession + " not found."); } } int forwardIndex = peptideProteinMapping.getIndex(); int reversedIndex = proteinLength - forwardIndex - peptideLength; if (reversedIndex < 0 || reversedIndex >= proteinLength) { throw new IllegalArgumentException("Wrong index found for peptide " + reversedSequence + " in protein " + reversedAccession + ": " + reversedIndex + "."); } PeptideProteinMapping reversedMapping = new PeptideProteinMapping(reversedAccession, reversedSequence, reversedIndex); results.add(reversedMapping); } return results; } /** * Returns a node related to a tag and updates the cache. Null if not found. * * @param tag the tag of interest * * @return the corresponding node * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. 
* @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ private Node getNode(String tag) throws SQLException, ClassNotFoundException, IOException, InterruptedException { Node result = tree.get(tag); if (result == null) { result = getNodeSynchronized(tag); } return result; } /** * Returns a node related to a tag and updates the cache. Null if not found. * * @param tag the tag of interest * * @return the corresponding node * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. 
 */
private synchronized Node getNodeSynchronized(String tag) throws SQLException, ClassNotFoundException, IOException, InterruptedException {
    // Re-check under the lock: another thread may have loaded the node already.
    Node result = tree.get(tag);
    if (result == null) {
        result = componentsFactory.getNode(tag);
        if (result != null) {
            // Evict least-recently-loaded nodes (FIFO: addFirst/pollLast) until
            // the cache fits the configured memory budget again.
            long capacity = memoryAllocation * cacheScale;
            while (treeSize > capacity && !tagsInTree.isEmpty()) {
                String tempTag = tagsInTree.pollLast();
                Node tempNode = tree.get(tempTag);
                treeSize -= tempNode.getSize();
                tree.remove(tempTag);
            }
            tree.put(tag, result);
            treeSize += result.getSize();
            tagsInTree.addFirst(tag);
        }
    }
    return result;
}

@Override
public void close() throws IOException, SQLException {
    // Best-effort shutdown: flush the debug writer, drop the caches, close the
    // backing database and delete outdated trees without failing the close.
    if (debugSpeed) {
        try {
            debugSpeedWriter.flush();
            debugSpeedWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    emptyCache();
    componentsFactory.close();
    // delete outdated trees
    try {
        ProteinTreeComponentsFactory.deletOutdatedTrees();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Returns the size of the cache used for peptide mappings (note that there
 * are two of them).
 *
 * @return the size of the cache used for peptide mappings
 */
public int getCacheSize() {
    return cacheSize;
}

/**
 * Sets the size of the cache used for peptide mappings (note that there are
 * two of them).
 *
 * @param cacheSize the size of the cache used for peptide mappings
 */
public void setCacheSize(int cacheSize) {
    this.cacheSize = cacheSize;
}

/**
 * Empties the cache.
 */
public void emptyCache() {
    // Clears the node cache, both peptide-mapping caches and the protein-length
    // cache. NOTE(review): treeSize is not reset here although tree is cleared —
    // confirm whether that is intended.
    tree.clear();
    tagsInTree.clear();
    lastQueriedPeptidesCache.clear();
    lastQueriedPeptidesCacheContent.clear();
    lastSlowQueriedPeptidesCache.clear();
    lastSlowQueriedPeptidesCacheContent.clear();
    proteinLengthsCache.clear();
}

/**
 * Reduces the node cache size by the given share. If less than 100 nodes
 * are left they will all be removed.
 *
 * @param share the share of the cache to remove.
 * 0.5 means 50%
 */
public synchronized void reduceNodeCacheSize(double share) {
    // With more than 100 cached nodes, remove only the requested share;
    // otherwise remove them all (limit stays at tree.size()).
    double limit = tree.size();
    if (limit > 100) {
        limit = share * limit;
    }
    for (int i = 0; i < limit; i++) {
        String tempTag = tagsInTree.pollLast();
        Node tempNode = tree.get(tempTag);
        if (tempNode == null) {
            // another thread already reduced the cache size
            break;
        }
        treeSize -= tempNode.getSize();
        tree.remove(tempTag);
    }
}

/**
 * Returns the number of nodes currently loaded in cache.
 *
 * @return the number of nodes currently loaded in cache
 */
public int getNodesInCache() {
    return tree.size();
}

/**
 * Returns a PeptideIterator which iterates alphabetically all peptides
 * corresponding to the end of a branch in the tree.
 *
 * @return a PeptideIterator which iterates alphabetically all peptides
 * corresponding to the end of a branch in the tree
 *
 * @throws IOException exception thrown whenever an error occurs while
 * reading or writing a file.
 * @throws ClassNotFoundException exception thrown whenever an error occurs
 * while deserializing an object.
 * @throws InterruptedException exception thrown whenever a threading issue
 * occurred while interacting with the tree.
 * @throws SQLException if an SQLException exception thrown whenever a
 * problem occurred while interacting with the tree database.
 */
public PeptideIterator getPeptideIterator() throws SQLException, IOException, ClassNotFoundException, InterruptedException {
    return new PeptideIterator();
}

/**
 * Notifies the tree that a runnable has finished working.
 *
 * @throws InterruptedException exception thrown whenever a threading issue
 * occurred while waiting.
 */
private synchronized void runnableFinished() throws InterruptedException {
    // Spin (in 10 ms slices) until the coordinator is ready to receive the
    // notification, then wake it up.
    while (!listening) {
        wait(10);
    }
    notify();
}

/**
 * Alphabetical iterator for the tree.
 */
public class PeptideIterator implements Iterator {

    /**
     * The initial tag size of the tree.
     */
    private Integer initialTagSize;
    /**
     * The list of possible initial tags.
     */
    private ArrayList<String> tags;
    /**
     * The current node.
     */
    private Node currentNode = null;
    /**
     * The parent node.
     */
    private Node parentNode = null;
    /**
     * The current peptide sequence.
     */
    private String currentSequence = null;
    /**
     * List of amino acids found in the current node subtree if any.
     */
    private ArrayList<Character> aas = null;
    /**
     * The current iterator position in the tags.
     */
    private int i = -1;
    /**
     * The current iterator position in the amino acid list.
     */
    private int j = 0;

    /**
     * Constructor.
     *
     * @throws IOException exception thrown whenever an error occurs while
     * reading or writing a file.
     * @throws ClassNotFoundException exception thrown whenever an error
     * occurs while deserializing an object.
     * @throws InterruptedException exception thrown whenever a threading
     * issue occurred while interacting with the tree.
     * @throws SQLException if an SQLException exception thrown whenever a
     * problem occurred while interacting with the tree database.
     */
    private PeptideIterator() throws SQLException, IOException, ClassNotFoundException, InterruptedException {
        initialTagSize = componentsFactory.getInitialSize();
        tags = TagFactory.getAminoAcidCombinations(initialTagSize);
    }

    @Override
    public boolean hasNext() {
        // Depth-first, alphabetical walk over the tree. The iterator keeps its
        // position in (i) the initial-tag list, (j) the sorted child list of the
        // current parent, and advances by mutating currentNode/parentNode/
        // currentSequence. NOTE: hasNext() advances state — it is not idempotent.
        try {
            if (currentNode != null && currentNode.getDepth() == initialTagSize && currentNode.getAccessions() != null && i < tags.size() - 1) {
                // ok we're done with this node
                parentNode = null;
                aas = null;
                j = 0;
                currentSequence = tags.get(++i);
                currentNode = getNode(currentSequence);
            }
            // Skip initial tags that have no node in the tree.
            while (++i < tags.size() && currentNode == null && parentNode == null) {
                currentSequence = tags.get(i);
                currentNode = getNode(currentSequence);
            }
            if (i < tags.size()) {
                if (aas != null) {
                    // We are inside a subtree: move to the next sibling of the
                    // current node under parentNode.
                    int parentDepth = currentSequence.length() - 1;
                    currentSequence = currentSequence.substring(0, parentDepth);
                    if (++j == aas.size()) {
                        // All children visited; the parent's termini (peptides
                        // ending exactly here) come last.
                        if (!parentNode.getTermini().isEmpty()) {
                            currentNode = null;
                            return true;
                        } else {
                            j++;
                        }
                    }
                    if (j == aas.size() + 1) {
                        if (parentDepth <= initialTagSize) {
                            // ok we're done with this node
                            currentSequence = null;
                            currentNode = null;
                            parentNode = null;
                            aas = null;
                            j = 0;
                        } else {
                            // Backtrack one level: re-resolve the grandparent and
                            // resume iteration at the current branch's letter.
                            parentDepth = currentSequence.length() - 1;
                            String parentSequence = currentSequence.substring(0, parentDepth);
                            char aa = currentSequence.charAt(parentDepth);
                            if (parentDepth == initialTagSize) {
                                parentNode = getNode(parentSequence);
                            } else {
                                String tag = parentSequence.substring(0, initialTagSize);
                                parentNode = getNode(tag).getSubNode(parentSequence);
                            }
                            currentNode = parentNode.getSubtree().get(aa);
                            aas = new ArrayList<Character>(parentNode.getSubtree().keySet());
                            Collections.sort(aas);
                            j = aas.indexOf(aa);
                        }
                        return hasNext();
                    }
                    char aa = aas.get(j);
                    currentSequence += aa;
                    currentNode = parentNode.getSubtree().get(aa);
                }
                // Descend alphabetically until a node carrying accessions (a
                // peptide end) is reached, or an empty subtree yields termini.
                while (currentNode.getAccessions() == null) {
                    j = 0;
                    aas = new ArrayList<Character>(currentNode.getSubtree().keySet());
                    parentNode = currentNode;
                    if (!aas.isEmpty()) {
                        Collections.sort(aas);
                        char aa = aas.get(j);
                        currentSequence += aa;
                        currentNode = currentNode.getSubtree().get(aa);
                    } else {
                        currentNode = null;
                        return true;
                    }
                }
                return true;
            }
            return false;
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("An error occurred while iterating the tree. See previous exception.");
        }
    }

    @Override
    public Object next() {
        return currentSequence;
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException("ProteinTrees are not editable.");
    }

    /**
     * Returns the protein mapping of the current peptide.
     *
     * @return the protein mapping of the current peptide.
     */
    public HashMap<String, ArrayList<Integer>> getMapping() {
        // currentNode == null signals a terminus: the mapping then comes from
        // the parent's termini map.
        if (currentNode != null) {
            return currentNode.getAccessions();
        } else {
            return parentNode.getTermini();
        }
    }
}

/**
 * Returns all the positions of the given tags on the given sequence in a
 * map: tag > list of indexes in the sequence.
* * @param sequence the sequence of interest * @param tags the tags of interest * @param enzyme the enzyme restriction * @param waitingHandler waiting handler * * @return all the positions of the given tags * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database.F */ private HashMap<String, ArrayList<Integer>> getTagToIndexesMap(String sequence, ArrayList<String> tags, Enzyme enzyme, WaitingHandler waitingHandler) throws SQLException, IOException, ClassNotFoundException, InterruptedException { HashMap<String, ArrayList<Integer>> tagToIndexesMap = new HashMap<String, ArrayList<Integer>>(tags.size()); Integer initialTagSize = componentsFactory.getInitialSize(); for (String tag : tags) { tagToIndexesMap.put(tag, new ArrayList<Integer>()); } for (int i = 0; i < sequence.length() - initialTagSize; i++) { if (enzyme == null || i == 0 || enzyme.isCleavageSite(sequence.charAt(i - 1), sequence.charAt(i))) { char[] tagValue = new char[initialTagSize]; for (int j = 0; j < initialTagSize; j++) { char aa = sequence.charAt(i + j); tagValue[j] = aa; } String tag = new String(tagValue); ArrayList<Integer> tempIndexes = tagToIndexesMap.get(tag); if (tempIndexes != null) { tempIndexes.add(i); } } if (waitingHandler != null && waitingHandler.isRunCanceled()) { break; } } return tagToIndexesMap; } /** * Retrieves the length of a protein. * * @param accession the accession of the protein of interest * * @return the length of this protein * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. 
* @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ public Integer getProteinLength(String accession) throws SQLException, ClassNotFoundException, IOException, InterruptedException { Integer length = proteinLengthsCache.get(accession); if (length == null) { return getProteinLengthSynchronized(accession); } return length; } /** * Retrieves the length of a protein. * * @param accession the accession of the protein of interest * * @return the length of this protein * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. * @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ private synchronized Integer getProteinLengthSynchronized(String accession) throws SQLException, ClassNotFoundException, IOException, InterruptedException { Integer length = proteinLengthsCache.get(accession); if (length == null) { Protein protein = sequenceFactory.getProtein(accession); if (protein != null) { length = protein.getLength(); } else { throw new IllegalArgumentException("Length of protein " + accession + " not found."); } proteinLengthsCache.put(accession, length); } return length; } /** * Returns the initial tag size of the tree. * * @return the initial tag size of the tree * * @throws IOException exception thrown whenever an error occurs while * reading or writing a file. 
* @throws ClassNotFoundException exception thrown whenever an error occurs * while deserializing an object. * @throws InterruptedException exception thrown whenever a threading issue * occurred while interacting with the tree. * @throws SQLException if an SQLException exception thrown whenever a * problem occurred while interacting with the tree database. */ public Integer getInitialTagSize() throws SQLException, IOException, ClassNotFoundException, InterruptedException { return componentsFactory.getInitialSize(); } /** * Runnable used for the indexing of a protein sequence. */ private class SequenceIndexer implements Runnable { /** * The proteins to process. */ private ArrayList<Protein> proteins; /** * Boolean indicating whether the thread shall be interrupted. */ private boolean finished = false; /** * List of tags to inspect. */ private ArrayList<String> tags; /** * The enzyme to use. */ private Enzyme enzyme; /** * The result of the indexing. */ private HashMap<String, HashMap<String, ArrayList<Integer>>> indexes = new HashMap<String, HashMap<String, ArrayList<Integer>>>(proteinBatchSize); /** * The waiting handler. */ private WaitingHandler waitingHandler; /** * Boolean indicating whether progress should be displayed. */ private boolean displayProgress; /** * Handler for the exceptions. */ private ExceptionHandler exceptionHandler; /** * Constructor. 
* * @param proteins the proteins to process * @param tags the tags to process * @param enzyme enzyme to use (can be null) * @param waitingHandler waiting handler providing feedback on the * process and allowing canceling the process * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param displayProgress boolean indicating whether progress shall be * displayed on the progress bar of the waiting handler */ public SequenceIndexer(ArrayList<Protein> proteins, ArrayList<String> tags, Enzyme enzyme, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean displayProgress) { this.proteins = proteins; this.tags = tags; this.enzyme = enzyme; this.waitingHandler = waitingHandler; this.exceptionHandler = exceptionHandler; this.displayProgress = displayProgress; } @Override public synchronized void run() { try { for (Protein protein : proteins) { if (waitingHandler != null && waitingHandler.isRunCanceled()) { return; } indexes.put(protein.getAccession(), getTagToIndexesMap(protein.getSequence(), tags, enzyme, waitingHandler)); if (displayProgress && waitingHandler != null && !waitingHandler.isRunCanceled()) { waitingHandler.increaseSecondaryProgressCounter(); } if (waitingHandler != null && waitingHandler.isRunCanceled()) { return; } } } catch (Exception ex) { if (exceptionHandler != null) { exceptionHandler.catchException(ex); } else { ex.printStackTrace(); } } finished = true; try { runnableFinished(); } catch (InterruptedException ex) { ex.printStackTrace(); } } /** * Indicates whether the run is finished. * * @return true if the thread is finished. */ public boolean isFinished() { return finished; } /** * Returns the indexes: protein accession > tag > indexes of the * tag on the protein sequence * * @return the indexes */ public HashMap<String, HashMap<String, ArrayList<Integer>>> getIndexes() { return indexes; } /** * Clears the content of the runnable. 
*/ public void clear() { proteins.clear(); tags = null; indexes.clear(); } } /** * Runnable used to process raw nodes and store them in the database. */ private class NodeSplitter implements Runnable { /** * The tag of the node. */ private String tag; /** * The node. */ private Node node; /** * the max node size. */ private int maxNodeSize; /** * The max peptide size. */ private int maxPeptideSize; /** * Boolean indicating whether the thread shall be interrupted. */ private boolean finished = false; /** * The waiting handler. */ private WaitingHandler waitingHandler; /** * Boolean indicating whether progress should be displayed. */ private boolean displayProgress; /** * Handler for the exceptions. */ private ExceptionHandler exceptionHandler; /** * Constructor. * * * @param maxNodeSize the maximal size allowed for a node * @param maxPeptideSize the maximal peptide length allowed * @param waitingHandler waiting handler providing feedback on the * process and allowing canceling the process * @param exceptionHandler handler for the exceptions encountered while * creating the tree * @param displayProgress boolean indicating whether progress shall be * displayed using the waiting handler */ public NodeSplitter(String tag, Node node, int maxNodeSize, int maxPeptideSize, WaitingHandler waitingHandler, ExceptionHandler exceptionHandler, boolean displayProgress) { this.tag = tag; this.node = node; this.waitingHandler = waitingHandler; this.exceptionHandler = exceptionHandler; this.displayProgress = displayProgress; } @Override public synchronized void run() { try { node.splitNode(maxNodeSize, maxPeptideSize); } catch (Exception ex) { if (exceptionHandler != null) { exceptionHandler.catchException(ex); } else { ex.printStackTrace(); } } finished = true; if (displayProgress && waitingHandler != null && !waitingHandler.isRunCanceled()) { waitingHandler.increaseSecondaryProgressCounter(); } try { runnableFinished(); } catch (Exception ex) { if (exceptionHandler != null) { 
exceptionHandler.catchException(ex); } else { ex.printStackTrace(); } } } /** * Indicates whether the run is finished. * * @return true if the thread is finished. */ public boolean isFinished() { return finished; } /** * Clears the content of the runnable. */ public void clear() { node = null; } /** * Returns the tag of the split node. * * @return the tag of the split node */ public String getTag() { return tag; } /** * Returns the split node. * * @return the split node */ public Node getNode() { return node; } } }