package com.compomics.util.experiment.identification.protein_sequences;
import com.compomics.util.Util;
import com.compomics.util.exceptions.ExceptionHandler;
import com.compomics.util.experiment.biology.AminoAcid;
import com.compomics.util.experiment.biology.Protein;
import com.compomics.util.experiment.biology.taxonomy.SpeciesFactory;
import com.compomics.util.experiment.identification.identification_parameters.PtmSettings;
import com.compomics.util.experiment.identification.protein_inference.PeptideMapper;
import com.compomics.util.experiment.identification.protein_inference.PeptideMapperType;
import com.compomics.util.experiment.identification.protein_inference.fm_index.FMIndex;
import com.compomics.util.experiment.identification.protein_inference.proteintree.ProteinTree;
import com.compomics.util.waiting.WaitingHandler;
import com.compomics.util.io.SerializationUtils;
import com.compomics.util.preferences.IdentificationParameters;
import com.compomics.util.preferences.PeptideVariantsPreferences;
import com.compomics.util.preferences.SequenceMatchingPreferences;
import com.compomics.util.preferences.UtilitiesUserPreferences;
import com.compomics.util.protein.Header;
import java.io.*;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.Semaphore;
import javax.swing.JProgressBar;
import uk.ac.ebi.pride.tools.braf.BufferedRandomAccessFile;
/**
* Factory retrieving the information of the loaded FASTA file.
*
* @author Marc Vaudel
* @author Harald Barsnes
* @author Dominik Kopczynski
*/
public class SequenceFactory {
/**
* Instance of the factory.
*/
private static SequenceFactory instance = null;
/**
* Map of the currently loaded Headers.
*/
private HashMap<String, Header> currentHeaderMap = new HashMap<String, Header>();
/**
* Map of the currently loaded proteins.
*/
private HashMap<String, Protein> currentProteinMap = new HashMap<String, Protein>();
/**
* Index of the FASTA file.
*/
private FastaIndex fastaIndex = null;
/**
* Random access file of the current FASTA file.
*/
private BufferedRandomAccessFile currentRandomAccessFile = null;
/**
* The FASTA file currently loaded.
*/
private File currentFastaFile = null;
/**
* Number of proteins to keep in cache. By default 1000000, which
* corresponds to approx. 120MB.
*/
private int nCache = 1000000;
/**
* List of accessions of the loaded proteins.
*/
private ArrayList<String> loadedProteins = new ArrayList<String>();
/**
* Recognized flags for a decoy protein.
*/
private static final String[] DECOY_FLAGS = {"REVERSED", "RND", "SHUFFLED", "DECOY"};
/**
* HashMap of the currently calculated protein molecular weights.
*/
private HashMap<String, Double> molecularWeights = new HashMap<String, Double>();
/**
* The default peptide to protein mapper.
*/
private PeptideMapper defaultPeptideMapper = null;
/**
* Boolean indicating that the factory is reading the file.
*/
private boolean reading = false;
/**
* The time out in milliseconds when querying the file.
*/
public final static long TIME_OUT = 10000;
/**
* Indicates whether the decoy hits should be kept in memory.
*/
private boolean decoyInMemory = true;
/**
* The minimal protein count required for reliable target/decoy based
* statistics.
*/
public static int minProteinCount = 1000; // @TODO: use a better metric
/**
* Constructor.
*/
private SequenceFactory() {
}
/**
* Static method returning the instance of the factory.
*
* @return the instance of the factory
*/
public static SequenceFactory getInstance() {
if (instance == null) {
instance = new SequenceFactory();
}
return instance;
}
/**
* Returns the instance of the factory with the specified cache size.
*
* @param nCache the new cache size
* @return the instance of the factory with the specified cache size
*/
public static SequenceFactory getInstance(int nCache) {
if (instance == null) {
instance = new SequenceFactory();
}
instance.setnCache(nCache);
return instance;
}
/**
* Indicates whether the database contained enough protein sequences for
* reliability of the target/decoy based statistics.
*
* @return a boolean indicating whether the database contained enough
* protein sequences for reliability of the target/decoy based statistics
*/
public boolean hasEnoughSequences() {
return getNTargetSequences() > minProteinCount;
}
/**
* Clears the factory getInstance() needs to be called afterwards.
*
* @throws IOException if an IOException occurs
* @throws SQLException if an SQLException occurs
*/
public void clearFactory() throws IOException, SQLException {
closeFile();
defaultPeptideMapper = null;
currentHeaderMap.clear();
currentProteinMap.clear();
fastaIndex = null;
currentRandomAccessFile = null;
currentFastaFile = null;
loadedProteins.clear();
molecularWeights.clear();
}
/**
* Empties the cache of the factory.
*/
public void emptyCache() {
currentHeaderMap.clear();
currentProteinMap.clear();
loadedProteins.clear();
molecularWeights.clear();
if (defaultPeptideMapper != null) {
defaultPeptideMapper.emptyCache();
}
}
/**
* Returns the desired protein. If the protein is not found, the database
* will be re-indexed.
*
* @param accession accession of the desired protein
* @return the desired protein
* @throws IOException thrown whenever an error is encountered while reading
* the FASTA file
* @throws IllegalArgumentException thrown whenever an error is encountered
* while reading the FASTA file
* @throws InterruptedException if an InterruptedException occurs
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public Protein getProtein(String accession) throws IOException, IllegalArgumentException, InterruptedException, FileNotFoundException {
return getProtein(accession, true);
}
/**
* Returns the desired protein. Eventually re-indexes the database if the
* protein is not found.
*
* @param accession accession of the desired protein
* @param reindex a boolean indicating whether the database should be
* re-indexed in case the protein is not found.
* @return the desired protein
* @throws IOException thrown whenever an error is encountered while reading
* the FASTA file
* @throws IllegalArgumentException thrown whenever an error is encountered
* while reading the FASTA file
* @throws InterruptedException if an InterruptedException occurs
*/
private Protein getProtein(String accession, boolean reindex) throws IOException, IllegalArgumentException, InterruptedException, FileNotFoundException {
if (fastaIndex == null) {
throw new IllegalArgumentException("Protein sequences not loaded in the sequence factory.");
}
Protein currentProtein = currentProteinMap.get(accession);
if (currentProtein == null && isDefaultReversed() && isDecoyAccession(accession)) {
if (decoyInMemory) {
currentProtein = getDecoyProteinFromTargetSynchronized(accession, reindex);
} else {
currentProtein = getDecoyProteinFromTarget(accession, reindex);
}
}
if (currentProtein == null) {
currentProtein = getProteinSynchronized(accession, reindex);
}
if (currentProtein == null) {
throw new IllegalArgumentException("Protein not found: " + accession + ".");
}
return currentProtein;
}
/**
* Returns a decoy protein from a target protein or looks for the sequence
* in the cache if not found.
*
* @param accession the accession of the decoy protein to look for
* @param reindex a boolean indicating whether the database should be
* re-indexed in case the protein is not found.
*
* @return the protein of interest, null if not found
*
* @throws IOException if an IOException occurs
* @throws IllegalArgumentException if an IllegalArgumentException occurs
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public synchronized Protein getDecoyProteinFromTargetSynchronized(String accession, boolean reindex)
throws IOException, IllegalArgumentException, FileNotFoundException {
// check whether another thread did the job already
Protein currentProtein = currentProteinMap.get(accession);
if (currentProtein == null) {
currentProtein = getDecoyProteinFromTarget(accession, reindex);
}
return currentProtein;
}
/**
* Returns a decoy protein from a target protein or looks for the sequence
* in the cache if not found.
*
* @param accession the accession of the decoy protein to look for
* @param reindex a boolean indicating whether the database should be
* re-indexed in case the protein is not found.
*
* @return the protein of interest, null if not found
*
* @throws IOException if an IOException occurs
* @throws IllegalArgumentException if an IllegalArgumentException occurs
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public Protein getDecoyProteinFromTarget(String accession, boolean reindex) throws IOException, IllegalArgumentException, FileNotFoundException {
Protein currentProtein = null;
String targetAccession = getDefaultTargetAccession(accession);
try {
Protein targetProtein = currentProteinMap.get(targetAccession);
if (targetProtein == null && decoyInMemory) {
currentProtein = getProteinSynchronized(accession, reindex);
} else {
if (targetProtein == null) {
targetProtein = getProtein(targetAccession, reindex);
}
currentProtein = new Protein(accession, targetProtein.getDatabaseType(), reverseSequence(targetProtein.getSequence()), true);
if (decoyInMemory) {
addProteinToCache(accession, currentProtein);
}
}
} catch (Exception e) {
// ignore
}
return currentProtein;
}
/**
* Returns the desired protein. Eventually re-indexes the database if the
* protein is not found. Synchronized version serving as a queue for
* threads.
*
* @param accession accession of the desired protein
* @param reindex a boolean indicating whether the database should be
* re-indexed in case the protein is not found.
* @return the desired protein
* @throws IOException thrown whenever an error is encountered while reading
* the FASTA file
* @throws IllegalArgumentException thrown whenever an error is encountered
* while reading the FASTA file
* @throws InterruptedException
*/
private synchronized Protein getProteinSynchronized(String accession, boolean reindex) throws IOException, IllegalArgumentException, InterruptedException, FileNotFoundException {
Protein currentProtein = currentProteinMap.get(accession);
if (currentProtein == null) {
Long index = fastaIndex.getIndex(accession);
if (index == null) {
if (reindex) {
fastaIndex = getFastaIndex(true, null);
return getProtein(accession, false);
}
throw new IllegalArgumentException("Protein not found: " + accession + ".");
}
return getProtein(accession, index, 1);
}
return currentProtein;
}
/**
* Returns the protein indexed by the given index. It can be that the IO is
* busy (especially when working on distant servers) thus returning an
* error. The method will then retry after waiting waitingTime milliseconds.
* The waitingTime is doubled for the next try. The method throws an
* exception after timeout (see timeOut attribute).
*
* @param index the index where to look at
* @param waitingTime the waiting time before retry
* @return the header indexed by the given index
* @throws InterruptedException
* @throws IOException
* @throws IllegalArgumentException
*/
private synchronized Protein getProtein(String accession, long index, long waitingTime) throws IOException, IllegalArgumentException, InterruptedException, FileNotFoundException {
if (waitingTime <= 0) {
throw new IllegalArgumentException("Waiting time should be a positive number.");
}
try {
if (reading) {
throw new IllegalStateException("Attempting to read new line before current read operation is completed.");
}
reading = true;
currentRandomAccessFile.seek(index);
String line;
StringBuilder sequence = new StringBuilder();
Header currentHeader = currentHeaderMap.get(accession);
boolean headerFound = false;
while ((line = currentRandomAccessFile.readLine()) != null) {
line = line.trim();
if (line.startsWith(">")) {
if (sequence.length() != 0 || headerFound) {
break;
}
if (currentHeader == null) {
currentHeader = Header.parseFromFASTA(line);
if (currentHeader == null) {
throw new IllegalArgumentException("Could not parse FASTA header \"" + line + "\".");
}
currentHeaderMap.put(accession, currentHeader);
}
headerFound = true;
} else {
sequence.append(line.trim());
}
}
Protein currentProtein = new Protein(accession, currentHeader.getDatabaseType(), importSequenceFromFasta(sequence), isDecoyAccession(accession));
addProteinToCache(accession, currentProtein);
reading = false;
return currentProtein;
} catch (IOException e) {
reading = false;
if (waitingTime < TIME_OUT) {
wait(waitingTime);
e.printStackTrace();
return getProtein(accession, index, 2 * waitingTime);
} else {
throw e;
}
}
}
/**
* Processes the sequence as present in the FASTA file.
*
* @param fastaSequence the sequence as present in the FASTA file
*
* @return the protein sequence
*/
public static String importSequenceFromFasta(StringBuilder fastaSequence) {
String sequence;
if (fastaSequence.charAt(fastaSequence.length() - 1) == '*') {
sequence = fastaSequence.substring(0, fastaSequence.length() - 1);
} else {
sequence = fastaSequence.toString();
}
return sequence;
}
/**
* Verifies that the sequence can be parsed into a series of amino acids.
*
* @param proteinSequence the protein sequence
*/
public static void validateSequence(String proteinSequence) {
for (int i = 0; i < proteinSequence.length(); i++) {
char aa = proteinSequence.charAt(i);
AminoAcid aminoAcid = AminoAcid.getAminoAcid(aa);
if (aminoAcid == null) {
throw new IllegalArgumentException("Found character in protein sequence that cannot be mapped to an amino acid (" + aa + ").");
}
}
}
/**
* Adds a protein to the cache and keeps it under the desired size.
*
* @param accession the accession of the protein to add
* @param protein the protein to add
*/
private synchronized void addProteinToCache(String accession, Protein protein) {
while (loadedProteins.size() >= nCache) {
String accessionToRemove = loadedProteins.get(0);
currentProteinMap.remove(accessionToRemove);
currentHeaderMap.remove(accessionToRemove);
loadedProteins.remove(0);
}
loadedProteins.add(accession);
currentProteinMap.put(accession, protein);
}
/**
* Returns the desired header for the protein in the FASTA file.
*
* @param accession accession of the desired protein
*
* @return the corresponding header
*
* @throws IOException exception thrown whenever an error occurred while
* reading the FASTA file
* @throws java.lang.InterruptedException exception thrown whenever an error
* occurred while waiting for the connection to the FASTA file to recover.
*/
public Header getHeader(String accession) throws IOException, InterruptedException {
return getHeader(accession, true);
}
/**
* Returns the desired header for the protein in the FASTA file.
*
* @param accession accession of the desired protein
*
* @return the corresponding header
*
* @throws IOException exception thrown whenever an error occurred while
* reading the FASTA file
* @throws java.lang.InterruptedException exception thrown whenever an error
* occurred while waiting for the connection to the FASTA file to recover.
*/
private Header getHeader(String accession, boolean reindex) throws IOException, InterruptedException {
Header result = currentHeaderMap.get(accession);
if (result == null) {
Long index = fastaIndex.getIndex(accession);
if (index == null) {
if (reindex) {
fastaIndex = getFastaIndex(true, null);
result = getHeader(accession, false);
}
throw new IllegalArgumentException("Protein not found: " + accession + ".");
}
result = getHeader(index, 0);
currentHeaderMap.put(accession, result);
}
return result;
}
/**
* Returns the header indexed by the given index. It can be that the IO is
* busy (especially when working on distant servers) thus returning an
* error. The method will then try 100 times at 0.01 second intervals.
*
* @param index the index where to look at
* @param nTries the number of tries already made
*
* @return the header indexed by the given index
*
* @throws IOException exception thrown whenever an error occurred while
* reading the FASTA file
* @throws java.lang.InterruptedException exception thrown whenever an error
* occurred while waiting for the connection to the FASTA file to recover.
*/
private synchronized Header getHeader(long index, int nTries) throws InterruptedException, IOException {
if (reading) {
throw new IllegalStateException("Attempting to read new line before current read operation is completed.");
}
try {
reading = true;
currentRandomAccessFile.seek(index);
Header result = Header.parseFromFASTA(currentRandomAccessFile.readLine());
reading = false;
return result;
} catch (IOException e) {
reading = false;
if (nTries <= 100) {
wait(10);
return getHeader(index, nTries + 1);
} else {
throw e;
}
}
}
/**
* Loads a new FASTA file in the factory. Only one FASTA file can be loaded
* at a time.
*
* @param fastaFile the FASTA file to load
*
* @throws IOException exception thrown if an error occurred while reading
* the FASTA file
* @throws ClassNotFoundException exception thrown whenever an error
* occurred while deserializing the file index
*/
public void loadFastaFile(File fastaFile) throws IOException, ClassNotFoundException {
loadFastaFile(fastaFile, null);
}
/**
* Loads a new FASTA file in the factory. Only one FASTA file can be loaded
* at a time.
*
* @param fastaFile the FASTA file to load
* @param waitingHandler a waitingHandler showing the progress
*
* @throws IOException exception thrown if an error occurred while reading
* the FASTA file
* @throws ClassNotFoundException exception thrown whenever an error
* occurred while deserializing the file index
*/
public void loadFastaFile(File fastaFile, WaitingHandler waitingHandler) throws IOException, ClassNotFoundException {
if (!fastaFile.exists()) {
throw new FileNotFoundException("The FASTA file \'" + fastaFile.getAbsolutePath() + "\' could not be found!");
}
defaultPeptideMapper = null;
currentFastaFile = fastaFile;
currentRandomAccessFile = new BufferedRandomAccessFile(fastaFile, "r", 1024 * 100);
fastaIndex = getFastaIndex(false, waitingHandler);
}
/**
* Indicates whether the connection to the random access file has been
* closed.
*
* @return a boolean indicating whether the connection to the random access
* file has been closed.
*/
public boolean isClosed() {
return currentFastaFile == null;
}
/**
* Resets the connection to the random access file.
*
* @throws IOException if an IOException occurs
*/
public void resetConnection() throws IOException {
currentRandomAccessFile.close();
currentRandomAccessFile = new BufferedRandomAccessFile(currentFastaFile, "r", 1024 * 100);
}
/**
* Returns the file index of a FASTA file.
*
* @param fastaFile the FASTA file
*
* @return the index of the FASTA file
*
* @throws IOException exception thrown if an error occurred while reading
* the FASTA file
* @throws ClassNotFoundException exception thrown whenever an error
* occurred while deserializing the file index
*/
private FastaIndex getFastaIndex() throws IOException, ClassNotFoundException {
return getFastaIndex(false, null);
}
/**
* Returns the file index of the FASTA file loaded in the factory (see
* currentFastaFile attribute). If a deserialization problem occurs the file
* will be automatically overwritten and the stacktrace printed.
*
* @param overwrite boolean indicating whether the index .cui file shall be
* overwritten if present, even if the file has not been changed
* @param waitingHandler a waitingHandler showing the progress
*
* @return the index of the FASTA file
*
* @throws IOException exception thrown if an error occurred while reading
* the FASTA file
* @throws StringIndexOutOfBoundsException thrown if issues occur during the
* parsing of the protein headers
* @throws IllegalArgumentException if non unique accession numbers are
* found
*/
private synchronized FastaIndex getFastaIndex(boolean overwrite, WaitingHandler waitingHandler) throws IOException, StringIndexOutOfBoundsException {
return getFastaIndex(currentFastaFile, overwrite, waitingHandler);
}
/**
* Returns the file index of the given FASTA file. If a problem occurs while
* reading an older index the file will be automatically overwritten and the
* stacktrace printed.
*
* @param fastaFile the FASTA file to index
* @param overwrite boolean indicating whether the index .cui file shall be
* overwritten if present, even if the file has not been changed
* @param waitingHandler a waitingHandler showing the progress
*
* @return the index of the FASTA file
*
* @throws IOException exception thrown if an error occurred while reading
* the FASTA file
* @throws StringIndexOutOfBoundsException thrown if issues occur during the
* parsing of the protein headers
* @throws IllegalArgumentException if non unique accession numbers are
* found
*/
public static synchronized FastaIndex getFastaIndex(File fastaFile, boolean overwrite, WaitingHandler waitingHandler) throws IOException, StringIndexOutOfBoundsException {
FastaIndex tempFastaIndex;
String fileName = fastaFile.getName();
if (!overwrite) {
File indexFile = new File(fastaFile.getParent(), fastaFile.getName() + ".cui");
if (indexFile.exists()) {
try {
tempFastaIndex = (FastaIndex) SerializationUtils.readObject(indexFile);
Long indexLastModified = tempFastaIndex.getLastModified();
if (indexLastModified != null) {
long fileLastModified = fastaFile.lastModified();
if (indexLastModified == fileLastModified) {
return tempFastaIndex;
} else {
System.err.println("Reindexing: " + fileName + ". (changes in the file detected)");
}
}
} catch (InvalidClassException e) {
System.err.println("Reindexing: " + fileName + ". (Reason: " + e.getLocalizedMessage() + ")");
} catch (Exception e) {
System.err.println("Reindexing: " + fileName + ". (Reason: " + e.getLocalizedMessage() + ")");
}
}
}
// try to rescue user settings
String decoyTag = null;
String name = null;
String version = null;
String description = null;
String accessionParsingRule = null;
File indexFile = new File(fastaFile.getParent(), fileName + ".cui");
if (indexFile.exists()) {
try {
tempFastaIndex = (FastaIndex) SerializationUtils.readObject(indexFile);
decoyTag = tempFastaIndex.getDecoyTag();
version = tempFastaIndex.getVersion();
name = tempFastaIndex.getName();
description = tempFastaIndex.getDescription();
accessionParsingRule = tempFastaIndex.getAccessionParsingRule();
} catch (Exception e) {
// Fail silently
}
}
System.out.println("Reindexing: " + fileName + ".");
tempFastaIndex = createFastaIndex(fastaFile, name, decoyTag, version, waitingHandler);
tempFastaIndex.setDescription(description);
tempFastaIndex.setAccessionParsingRule(accessionParsingRule);
if (waitingHandler == null || !waitingHandler.isRunCanceled()) {
try {
writeIndex(tempFastaIndex, fastaFile.getParentFile());
} catch (Exception e) {
e.printStackTrace();
}
}
return tempFastaIndex;
}
/**
* Static method to create a FASTA index for a FASTA file. Non-valid fasta
* files will throw an exception.
*
* @param fastaFile the FASTA file
* @param progressBar a progress bar showing the progress
* @param decoyTag the decoy tag. Will be inferred if null.
* @param mainDatabaseType the main database type. Will be inferred if null.
* @param version the version. last modification of the file will be used if
* null.
* @param name the name of the database. Set to file name if null.
*
* @return the corresponding FASTA index
*
* @throws IOException exception thrown whenever an error occurred while
* reading the file
*/
private static FastaIndex createFastaIndex(File fastaFile, String name, String decoyTag, String version,
WaitingHandler waitingHandler) throws IOException {
HashMap<String, Long> indexes = new HashMap<String, Long>();
HashSet<String> decoyAccessions = new HashSet<String>();
BufferedRandomAccessFile bufferedRandomAccessFile = new BufferedRandomAccessFile(fastaFile, "r", 1024 * 100);
if (waitingHandler != null) {
waitingHandler.resetSecondaryProgressCounter();
waitingHandler.setMaxSecondaryProgressCounter(100);
}
long progressUnit = bufferedRandomAccessFile.length() / 100;
String line;
boolean decoy = false, defaultReversed = false;
int nTarget = 0;
long index = bufferedRandomAccessFile.getFilePointer();
// a map of the database header types
HashMap<Header.DatabaseType, Integer> databaseTypes = new HashMap<Header.DatabaseType, Integer>();
// a map of the species
HashMap<String, Integer> species = new HashMap<String, Integer>();
StringBuilder sequenceBuilder = new StringBuilder();
String accession = null;
int lineNumber = 0;
while ((line = bufferedRandomAccessFile.readLine()) != null) {
lineNumber++;
if (line.startsWith(">")) {
if (sequenceBuilder.length() != 0 && accession != null) {
String sequence = importSequenceFromFasta(sequenceBuilder);
try {
validateSequence(sequence);
} catch (Exception e) {
throw new IllegalArgumentException("An error occurred while parsing the sequence of " + accession + " at line " + lineNumber + ": " + e.toString());
}
}
Header fastaHeader = Header.parseFromFASTA(line);
accession = fastaHeader.getAccessionOrRest();
if (accession == null) {
throw new IllegalArgumentException("No accession found for header at line " + lineNumber + ".");
}
// if (fastaHeader.getStartLocation() != -1) {
// accession += " (" + fastaHeader.getStartLocation() + "-" + fastaHeader.getEndLocation() + ")"; // special dbtoolkit pattern
// }
// if (indexes.containsKey(accession)) {
// throw new IllegalArgumentException("Non unique accession number found \'" + accession + "\'!\nPlease check the FASTA file.");
// }
indexes.put(accession, index);
if (decoyTag == null) {
decoyTag = getDecoyFlag(accession);
}
if (decoyTag == null || !isDecoy(accession, decoyTag)) {
nTarget++;
// get the database type
Header.DatabaseType tempDatabaseType = fastaHeader.getDatabaseType();
Integer typeCounter = databaseTypes.get(tempDatabaseType);
if (typeCounter == null) {
databaseTypes.put(tempDatabaseType, 1);
} else {
databaseTypes.put(tempDatabaseType, typeCounter + 1);
}
// get the species
String taxonomy = fastaHeader.getTaxonomy();
if (taxonomy == null || taxonomy.equals("")) {
taxonomy = SpeciesFactory.UNKNOWN;
}
Integer occurrence = species.get(taxonomy);
if (occurrence == null) {
species.put(taxonomy, 1);
} else {
species.put(taxonomy, occurrence + 1);
}
} else {
decoyAccessions.add(accession);
if (!decoy) {
decoy = true;
if (accession.endsWith(getDefaultDecoyAccessionSuffix())) {
defaultReversed = true;
}
}
}
if (waitingHandler != null && progressUnit != 0) {
waitingHandler.setSecondaryProgressCounter((int) (index / progressUnit));
if (waitingHandler.isRunCanceled()) {
break;
}
}
index = bufferedRandomAccessFile.getFilePointer();
sequenceBuilder = new StringBuilder();
} else {
index = bufferedRandomAccessFile.getFilePointer();
sequenceBuilder.append(line.trim());
}
}
if (waitingHandler != null) {
waitingHandler.setSecondaryProgressCounterIndeterminate(true);
}
bufferedRandomAccessFile.close();
long lastModified = fastaFile.lastModified();
if (version == null) {
version = FastaIndex.getDefaultVersion(lastModified);
}
String fileName = fastaFile.getName();
if (name == null) {
name = Util.removeExtension(fileName);
}
// find the main database type
Header.DatabaseType mainDatabaseType = null;
int maxCounter = 0;
Iterator<Header.DatabaseType> iterator = databaseTypes.keySet().iterator();
while (iterator.hasNext()) {
Header.DatabaseType tempDatabaseType = iterator.next();
if (databaseTypes.get(tempDatabaseType) > maxCounter) {
maxCounter = databaseTypes.get(tempDatabaseType);
mainDatabaseType = tempDatabaseType;
}
}
return new FastaIndex(indexes, decoyAccessions, fileName, name, decoy, defaultReversed, nTarget, lastModified, mainDatabaseType, databaseTypes, decoyTag, version, species);
}
/**
* Serializes the FASTA file index in a given directory.
*
* @param fastaIndex the index of the FASTA file
* @param directory the directory where to write the file
*
* @throws IOException exception thrown whenever an error occurred while
* writing the file
*/
public static void writeIndex(FastaIndex fastaIndex, File directory) throws IOException {
// Serialize the file index as compomics utilities index
File destinationFile = new File(directory, getIndexName(fastaIndex.getFileName()));
SerializationUtils.writeObject(fastaIndex, destinationFile);
}
/**
* Returns the name of the FASTA index corresponding to the given FASTA file
* name.
*
* @param fastaName the name of the FASTA file
*
* @return the name of the index
*/
public static String getIndexName(String fastaName) {
return fastaName + ".cui";
}
/**
* Saves the index.
*
* @throws IOException if an IOException occurs
*/
public void saveIndex() throws IOException {
writeIndex(fastaIndex, currentFastaFile.getParentFile());
}
/**
* Closes the opened file.
*
* @throws IOException exception thrown whenever an error occurred while
* closing the file
* @throws SQLException if an SQLException occurs
*/
public void closeFile() throws IOException, SQLException {
if (currentRandomAccessFile != null) {
currentRandomAccessFile.close();
currentFastaFile = null;
}
if (defaultPeptideMapper != null) {
defaultPeptideMapper.close();
}
}
/**
* Returns a boolean indicating whether a protein is decoy or not based on
* the protein accession and a given decoy flag. Note: in most cases the
* faster isDecoyAccession method should be used instead!
*
* @param proteinAccession The accession of the protein
* @param decoyFlag the decoy flag
* @return a boolean indicating whether the protein is Decoy.
*/
public static boolean isDecoy(String proteinAccession, String decoyFlag) {
// test if the decoy tag is empty, and return false if it is
if (decoyFlag == null || decoyFlag.isEmpty()) {
return false;
}
String start = decoyFlag + ".*";
String end = ".*" + decoyFlag;
return proteinAccession.matches(start) || proteinAccession.matches(end);
}
/**
* Returns the default tag matched in the sequence if any. Null else.
*
* @param proteinAccession the protein accession
*
* @return the decoy tag matched by this protein
*/
private static String getDecoyFlag(String proteinAccession) {
for (String flag : DECOY_FLAGS) {
if (isDecoy(proteinAccession, flag)) {
return flag;
}
}
return null;
}
/**
* Indicates whether a protein is a decoy in the selected loaded FASTA file.
*
* @param proteinAccession the protein accession of interest.
* @return true if decoy
*/
public boolean isDecoyAccession(String proteinAccession) {
return fastaIndex.isDecoy(proteinAccession);
}
/**
* Indicates whether the database loaded contains decoy sequences.
*
* @return a boolean indicating whether the database loaded contains decoy
* sequences
*/
public boolean concatenatedTargetDecoy() {
return fastaIndex.isConcatenatedTargetDecoy();
}
/**
* Indicates whether the decoy sequences are reversed versions of the target
* and the decoy accessions built based on the sequence factory methods. See
* getDefaultDecoyAccession(String targetAccession).
*
* @return true if the the decoy sequences are reversed versions of the
* target and the decoy accessions built based on the sequence factory
* method
*/
public boolean isDefaultReversed() {
return fastaIndex.isDefaultReversed();
}
/**
* Returns the number of target sequences in the database.
*
* @return the number of target sequences in the database
*/
public int getNTargetSequences() {
return fastaIndex.getNTarget();
}
/**
* Returns the number of sequences in the FASTA file.
*
* @return the number of sequences in the FASTA file
*/
public int getNSequences() {
return fastaIndex.getNSequences();
}
/**
* Appends decoy sequences to the desired file.
*
* @param destinationFile the destination file
*
* @throws IOException exception thrown whenever an error occurred while
* reading or writing a file
* @throws InterruptedException if an InterruptedException occurs
* @throws ClassNotFoundException if an ClassNotFoundException occurs
*/
public void appendDecoySequences(File destinationFile) throws IOException, InterruptedException, ClassNotFoundException {
appendDecoySequences(destinationFile, null);
}
/**
* Appends decoy sequences to the desired file while displaying progress.
*
* @param destinationFile the destination file
* @param waitingHandler the waiting handler
*
* @throws IOException exception thrown whenever an error occurred while
* reading or writing a file
* @throws InterruptedException if an InterruptedException occurs
* @throws ClassNotFoundException if an ClassNotFoundException occurs
*/
public void appendDecoySequences(File destinationFile, WaitingHandler waitingHandler)
throws IOException, InterruptedException, ClassNotFoundException {
if (waitingHandler != null) {
waitingHandler.resetSecondaryProgressCounter();
waitingHandler.setMaxSecondaryProgressCounter(fastaIndex.getNTarget());
}
// first create the new target-decoy file
BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(destinationFile));
String lineBreak = System.getProperty("line.separator");
try {
ProteinIterator proteinIterator = getProteinIterator(true);
while (proteinIterator.hasNext()) {
if (waitingHandler != null && waitingHandler.isRunCanceled()) {
break;
}
if (waitingHandler != null) {
waitingHandler.increaseSecondaryProgressCounter();
}
Protein currentProtein = proteinIterator.getNextProtein();
String accession = currentProtein.getAccession();
Header currentHeader = getHeader(accession);
String decoyAccession = getDefaultDecoyAccession(currentProtein.getAccession());
// String decoyAccession ="rev_" + currentProtein.getAccession(); Header for the TPP
String currentRawHeader = currentHeader.getRawHeader();
// replace the accession number
String escapedString = java.util.regex.Pattern.quote(accession);
currentRawHeader = currentRawHeader.replaceAll(escapedString, decoyAccession);
// add decoy to the description
if (currentHeader.getDescription() != null && !currentHeader.getDescription().isEmpty()) {
escapedString = java.util.regex.Pattern.quote(currentHeader.getDescription());
currentRawHeader = currentRawHeader.replaceAll(escapedString, getDefaultDecoyDescription(currentHeader.getDescription()));
}
// write the target protein to the fasta file
bufferedWriter.write(currentHeader.getRawHeader() + lineBreak);
bufferedWriter.write(currentProtein.getSequence() + lineBreak);
// write the decoy protein to the fasta file
bufferedWriter.write(currentRawHeader + lineBreak);
bufferedWriter.write(reverseSequence(currentProtein.getSequence()) + lineBreak);
// possible fix for the dbtoolkit uniprot format
// Protein currentProtein = getProtein(accession);
// Header currentHeader = getHeader(accession);
// String reversedSequence = reverseSequence(currentProtein.getSequence());
//
// bufferedWriter.write(currentHeader.toString() + System.getProperty("line.separator"));
// bufferedWriter.write(currentProtein.getSequence() + System.getProperty("line.separator"));
//
// // @TODO: this might not be the best way of doing this, but was easier than trying to change the parsing in the Header class...
// if (currentHeader.toString("_" + decoyFlags[0]).equalsIgnoreCase(currentHeader.toString())) {
// currentHeader.setRest(currentProtein.getAccession() + "_" + decoyFlags[0]);
// }
//
// bufferedWriter.write(currentHeader.toString("_" + decoyFlags[0]) + System.getProperty("line.separator"));
// bufferedWriter.write(reversedSequence + System.getProperty("line.separator"));
}
} finally {
bufferedWriter.close();
}
if (waitingHandler != null) {
waitingHandler.setSecondaryProgressCounterIndeterminate(true);
}
boolean indexFile = true;
if (waitingHandler != null) {
if (waitingHandler.isRunCanceled()) {
indexFile = false;
}
}
if (indexFile) {
// now (re-)index the new target-decoy file
loadFastaFile(destinationFile, waitingHandler);
} else {
destinationFile.delete();
}
}
/**
* Reverses a protein sequence.
*
* @param sequence the protein sequence
* @return the reversed protein sequence
*/
public static String reverseSequence(String sequence) {
return new StringBuilder(sequence).reverse().toString();
}
/**
* Returns the sequences present in the database. An empty list if no file
* is loaded.
*
* @return the sequences present in the database
*/
public Set<String> getAccessions() {
Set<String> setToFill = new HashSet<String>();
if (fastaIndex != null) {
setToFill = fastaIndex.getIndexes().keySet();
}
return setToFill;
}
/**
* Returns the size of the cache.
*
* @return the size of the cache
*/
public int getnCache() {
return nCache;
}
/**
* Sets the size of the cache.
*
* @param nCache the new size of the cache
*/
public void setnCache(int nCache) {
this.nCache = nCache;
}
/**
* Returns the occurrence of every amino acid in the database.
*
* @param progressBar a progress bar, can be null
*
* @return a map containing all amino acid occurrence in the database
*
* @throws IOException exception thrown whenever an error occurred while
* reading the database
* @throws InterruptedException if an InterruptedException occurs
* @throws ClassNotFoundException if an ClassNotFoundException occurs
*/
public HashMap<String, Integer> getAAOccurrences(JProgressBar progressBar) throws IOException, InterruptedException, ClassNotFoundException {
HashMap<String, Integer> aaMap = new HashMap<String, Integer>();
Set<String> accessions = getAccessions();
if (progressBar != null) {
progressBar.setIndeterminate(false);
progressBar.setMaximum(accessions.size());
progressBar.setValue(0);
}
for (String accession : accessions) {
if (!isDecoyAccession(accession)) {
Protein protein = getProtein(accession);
for (String aa : protein.getSequence().split("")) {
Integer n = aaMap.get(aa);
if (n == null) {
n = 0;
}
aaMap.put(aa, n + 1);
}
}
if (progressBar != null) {
progressBar.setValue(progressBar.getValue() + 1);
}
}
if (progressBar != null) {
progressBar.setIndeterminate(true);
}
return aaMap;
}
/**
* Returns the protein's molecular weight in kDa.
*
* @param accession the protein's accession number
*
* @return the protein's molecular weight
*
* @throws IOException exception thrown whenever an error occurred while
* reading the protein sequence
* @throws InterruptedException exception thrown whenever an error occurred
* while reading the protein sequence
* @throws ClassNotFoundException exception thrown whenever an error
* occurred while reading the protein sequence
*/
public double computeMolecularWeight(String accession) throws IOException, InterruptedException, ClassNotFoundException {
if (isDefaultReversed() && isDecoyAccession(accession)) {
// Don't really see where we would need that...
try {
return computeMolecularWeight(getDefaultTargetAccession(accession));
} catch (Exception e) {
// back to standard mode
}
}
// see if we've already calculated the weight of this protein
Double molecularWeight = molecularWeights.get(accession);
if (molecularWeight == null) {
Protein protein = getProtein(accession);
molecularWeight = protein.computeMolecularWeight() / 1000;
molecularWeights.put(accession, molecularWeight);
}
return molecularWeight;
}
/**
* Returns the name of the loaded FASTA file. Null if none loaded.
*
* @return the name of the loaded FASTA file
*/
public String getFileName() {
if (fastaIndex == null) {
return null;
}
return fastaIndex.getFileName();
}
/**
* Returns the currently loaded FASTA file.
*
* @return the currently loaded FASTA file
*/
public File getCurrentFastaFile() {
return currentFastaFile;
}
/**
* Returns the default suffix for a decoy accession.
*
* @return the default suffix for a decoy accession
*/
public static String getDefaultDecoyAccessionSuffix() {
return "_" + DECOY_FLAGS[0];
}
/**
* Returns the default decoy accession for a target accession.
*
* @param targetAccession the target accession
* @return the default decoy accession
*/
public static String getDefaultDecoyAccession(String targetAccession) {
return targetAccession + getDefaultDecoyAccessionSuffix();
}
/**
* Returns the default description for a decoy protein.
*
* @param targetDescription the description of a target protein
* @return the default description of the decoy protein
*/
public static String getDefaultDecoyDescription(String targetDescription) {
return targetDescription + "-" + DECOY_FLAGS[0];
}
/**
* Returns the default target accession of a given decoy protein. Note:
* works only for the accessions constructed according to
* getDefaultDecoyAccession(String targetAccession).
*
* @param decoyAccession the decoy accession
*
* @return the target accession
*/
public static String getDefaultTargetAccession(String decoyAccession) {
return decoyAccession.substring(0, decoyAccession.length() - getDefaultDecoyAccessionSuffix().length());
}
/**
* Returns the FASTA index of the currently loaded file.
*
* @return the FASTA index of the currently loaded file
*/
public FastaIndex getCurrentFastaIndex() {
return fastaIndex;
}
/**
* Returns the default peptide mapper. Null if none created.
*
* @return the default peptide mapper
*/
public PeptideMapper getDefaultPeptideMapper() {
return defaultPeptideMapper;
}
/**
* Returns the default peptide to protein mapper for the database loaded in
* factory according to the sequence matching preferences. Creates a new one
* if none found.
*
* @param sequenceMatchingPreferences the sequences matching preferences
* @param ptmSettings contains modification parameters for identification
* @param peptideVariantsPreferences the peptide variants preferences set by
* the user
* @param waitingHandler waiting handler displaying progress to the user
* during the indexation of the database
* @param exceptionHandler handler for the exceptions encountered while
* indexing the database
*
* @return the default peptide mapper
*
* @throws IOException exception thrown whenever an error occurs while
* reading or writing a file
* @throws ClassNotFoundException exception thrown whenever an error occurs
* while deserializing an object
* @throws InterruptedException exception thrown whenever a threading issue
* occurred while indexing the database
* @throws SQLException exception thrown whenever a problem occurred while
* interacting with an SQL database
*/
public PeptideMapper getDefaultPeptideMapper(SequenceMatchingPreferences sequenceMatchingPreferences, PtmSettings ptmSettings, PeptideVariantsPreferences peptideVariantsPreferences,
WaitingHandler waitingHandler, ExceptionHandler exceptionHandler) throws IOException, InterruptedException, ClassNotFoundException, SQLException {
int nThreads = Math.max(Runtime.getRuntime().availableProcessors(), 1);
return getDefaultPeptideMapper(sequenceMatchingPreferences, ptmSettings, peptideVariantsPreferences, waitingHandler, exceptionHandler, true, nThreads);
}
/**
* Returns the default peptide to protein mapper for the database loaded in
* factory according to the sequence matching preferences. Creates a new one
* if none found.
*
* @param waitingHandler waiting handler displaying progress to the user
* during the indexation of the database
* @param exceptionHandler handler for the exceptions encountered while
* indexing the database
* @param nThreads the number of threads to use during indexing
* @param identificationParameters the identification parameters
*
* @return the default peptide mapper
*
* @throws IOException exception thrown whenever an error occurs while
* reading or writing a file
* @throws ClassNotFoundException exception thrown whenever an error occurs
* while deserializing an object
* @throws InterruptedException exception thrown whenever a threading issue
* occurred while indexing the database
* @throws SQLException exception thrown whenever a problem occurred while
* interacting with an SQL database
*/
public PeptideMapper getDefaultPeptideMapper(WaitingHandler waitingHandler,
ExceptionHandler exceptionHandler, int nThreads, IdentificationParameters identificationParameters) throws IOException, InterruptedException, ClassNotFoundException, SQLException {
return getDefaultPeptideMapper(identificationParameters.getSequenceMatchingPreferences(), identificationParameters.getSearchParameters().getPtmSettings(), identificationParameters.getPeptideVariantsPreferences(), waitingHandler, exceptionHandler, true, nThreads); // @TODO: why add identificationParameters? it's not used?
}
/**
* Returns the default peptide to protein mapper for the database loaded in
* factory according to the sequence matching preferences. Creates a new one
* if none found.
*
* @param sequenceMatchingPreferences the sequences matching preferences
* @param waitingHandler waiting handler displaying progress to the user
* during the indexation of the database
* @param peptideVariantsPreferences the peptide variants preferences set by
* the user
* @param exceptionHandler handler for the exceptions encountered while
* indexing the database
* @param displayProgress boolean indicating whether the progress of the
* indexing should be displayed
* @param nThreads the number of threads to use during indexing
* @param ptmSettings contains modification parameters for identification
*
* @return the default peptide mapper
*
* @throws IOException exception thrown whenever an error occurs while
* reading or writing a file
* @throws ClassNotFoundException exception thrown whenever an error occurs
* while deserializing an object
* @throws InterruptedException exception thrown whenever a threading issue
* occurred while indexing the database
* @throws SQLException exception thrown whenever a problem occurred while
* interacting with an SQL database
*/
public synchronized PeptideMapper getDefaultPeptideMapper(SequenceMatchingPreferences sequenceMatchingPreferences, PtmSettings ptmSettings, PeptideVariantsPreferences peptideVariantsPreferences, WaitingHandler waitingHandler,
ExceptionHandler exceptionHandler, boolean displayProgress, int nThreads) throws IOException, InterruptedException, ClassNotFoundException, SQLException {
if (defaultPeptideMapper == null) {
PeptideMapperType peptideMapperType = sequenceMatchingPreferences.getPeptideMapperType();
switch (peptideMapperType) {
case fm_index:
defaultPeptideMapper = new FMIndex(waitingHandler, displayProgress, ptmSettings, peptideVariantsPreferences);
break;
case tree:
UtilitiesUserPreferences userPreferences = UtilitiesUserPreferences.loadUserPreferences();
int memoryPreference = userPreferences.getMemoryPreference();
int memoryAllocated = 3 * memoryPreference / 4;
int cacheSize = 250000;
if (memoryPreference < 2500) {
cacheSize = 5000;
} else if (memoryPreference < 10000) {
cacheSize = 25000;
}
ProteinTree defaultProteinTree = new ProteinTree(memoryAllocated, cacheSize);
int tagLength = 3;
defaultProteinTree.initiateTree(tagLength, 50, 50, waitingHandler, exceptionHandler, true, displayProgress, nThreads);
emptyCache();
int treeSize = memoryPreference / 4;
defaultProteinTree.setMemoryAllocation(treeSize);
// close and delete the database if the process was canceled
if (waitingHandler != null && waitingHandler.isRunCanceled()) {
defaultProteinTree.deleteDb();
}
defaultPeptideMapper = defaultProteinTree;
break;
default:
throw new UnsupportedOperationException("Peptide mapper type " + peptideMapperType + " not supported.");
}
}
return defaultPeptideMapper;
}
/**
* Returns an iterator of all the headers in the FASTA file. Note: when
* reaching the end of the file the connection will be closed. Do it using
* the close() method if the end is never reached.
*
* @param targetOnly boolean indicating whether only target accessions shall
* be iterated
* @return a header iterator
*
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public HeaderIterator getHeaderIterator(boolean targetOnly) throws FileNotFoundException {
return new HeaderIterator(currentFastaFile, targetOnly);
}
/**
* Returns an iterator of all the proteins in the FASTA file. Note: when
* reaching the end of the file the connection will be closed. Do it using
* the close() method if the end is never reached.
*
* @param targetOnly boolean indicating whether only target accessions shall
* be iterated
*
* @return a protein iterator
*
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public ProteinIterator getProteinIterator(boolean targetOnly) throws FileNotFoundException {
return new ProteinIterator(currentFastaFile, targetOnly);
}
/**
* Returns whether decoys should be kept in memory.
*
* @return true if decoys should be kept in memory
*/
public boolean isDecoyInMemory() {
return decoyInMemory;
}
/**
* Sets whether decoys should be kept in memory.
*
* @param decoyInMemory true if decoys should be kept in memory
*/
public void setDecoyInMemory(boolean decoyInMemory) {
this.decoyInMemory = decoyInMemory;
}
/**
* Convenience iterator iterating the headers of a FASTA file without using
* the cache. The order is the one in the FASTA file.
*/
public class HeaderIterator {
/**
* The header of the next protein.
*/
private Header nextHeader = null;
/**
* Semaphore avoiding the overwriting of the nextHeader.
*/
private final Semaphore nextHeaderMutex = new Semaphore(1);
/**
* Semaphore avoiding multiple threads to use the reader.
*/
private final Semaphore readerMutex = new Semaphore(1);
/**
* The buffered reader.
*/
private BufferedReader br;
/**
* Boolean indicating whether target protein only should be iterated.
*/
private final boolean targetOnly;
/**
* Constructor.
*
* @param targetOnly if true only target proteins will be iterated
* @param file the FASTA file to iterate
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public HeaderIterator(File file, boolean targetOnly) throws FileNotFoundException {
this.targetOnly = targetOnly;
br = new BufferedReader(new FileReader(file));
}
/**
* Returns true if there is a next header. Note: all other threads
* calling hasNext() are locked until getNext() is called.
*
* @return true if there is a next header
*
* @throws IOException if a IOException occurs
* @throws java.lang.InterruptedException exception thrown if a
* threading issue occurred
*/
public boolean hasNext() throws IOException, InterruptedException {
readerMutex.acquire();
Header threadNextHeader = null;
String line;
while ((line = br.readLine()) != null) {
line = line.trim();
if (!line.equals("")) {
if (line.startsWith(">")) {
threadNextHeader = Header.parseFromFASTA(line);
if (!targetOnly || !isDecoyAccession(threadNextHeader.getAccession())) {
break;
} else {
threadNextHeader = null;
}
}
}
}
readerMutex.release();
if (threadNextHeader != null) {
nextHeaderMutex.acquire();
nextHeader = threadNextHeader;
return true;
} else {
close();
return false;
}
}
/**
* Returns the next header in the FASTA file.
*
* @return the next header in the FASTA file
*/
public Header getNext() {
Header result = nextHeader;
nextHeaderMutex.release();
return result;
}
/**
* Closes the connection to the file.
*
* @throws IOException if a IOException occurs
*/
public void close() throws IOException {
br.close();
}
}
/**
* Convenience iterator iterating all proteins in a FASTA file without using
* index or cache.
*/
public class ProteinIterator {
/**
* The header of the next protein.
*/
private Header nextHeader = null;
/**
* The next protein.
*/
private Protein nextProtein = null;
/**
* Semaphore avoiding the overwriting of the nextProtein.
*/
private final Semaphore nextProteinMutex = new Semaphore(1);
/**
* Semaphore avoiding multiple threads to use the reader.
*/
private final Semaphore readerMutex = new Semaphore(1);
/**
* The buffered reader.
*/
private final BufferedReader br;
/**
* Boolean indicating whether target protein only should be iterated.
*/
private final boolean targetOnly;
/**
* Constructor.
*
* @param targetOnly if true only target proteins will be iterated
* @param file the FASTA file.
*
* @throws FileNotFoundException if a FileNotFoundException occurs
*/
public ProteinIterator(File file, boolean targetOnly) throws FileNotFoundException {
this.targetOnly = targetOnly;
br = new BufferedReader(new FileReader(file));
}
/**
* Returns true if there is another protein. Note: all other threads
* calling hasNext() are locked until getNext() is called.
*
* @return true if there is another protein
*
* @throws IOException if an IOException occurs
* @throws java.lang.InterruptedException exception thrown if a
* threading issue occurred
*/
public boolean hasNext() throws IOException, InterruptedException {
readerMutex.acquire();
String line = br.readLine();
// reached end of file
if (line == null) {
readerMutex.release();
return false;
}
StringBuilder sequence = new StringBuilder();
Header header = nextHeader;
boolean newHeaderFound = false;
while (line != null) {
if (line.startsWith(">")) {
Header tempHeader = Header.parseFromFASTA(line);
String accession = tempHeader.getAccessionOrRest();
if (targetOnly && isDecoyAccession(accession)) {
while ((line = br.readLine()) != null) {
if (line.startsWith(">")) {
tempHeader = Header.parseFromFASTA(line);
if (!isDecoyAccession(tempHeader.getAccessionOrRest())) {
break;
}
}
}
if (line == null) {
break;
}
}
if (header == null) {
header = tempHeader;
} else {
nextHeader = tempHeader;
newHeaderFound = true;
break;
}
} else {
sequence.append(line.trim());
}
line = br.readLine();
}
readerMutex.release();
if (newHeaderFound || line == null) { // line == null means that we read the last protein
String accession = header.getAccessionOrRest();
Header.DatabaseType databaseType = header.getDatabaseType();
String cleanedSequence = importSequenceFromFasta(sequence);
boolean decoy = isDecoyAccession(accession);
nextProteinMutex.acquire();
nextProtein = new Protein(accession, databaseType, cleanedSequence, decoy);
return true;
} else {
close();
return false;
}
}
/**
* Returns the next protein.
*
* @return the next protein
*/
public Protein getNextProtein() {
Protein result = nextProtein;
nextProteinMutex.release();
return result;
}
/**
* Closes the connection to the file.
*
* @throws IOException if an IOException occurs
*/
public void close() throws IOException {
br.close();
}
}
}