package com.compomics.util.experiment.identification.amino_acid_tags.matchers;
import com.compomics.util.experiment.biology.AminoAcid;
import com.compomics.util.experiment.biology.AminoAcidPattern;
import com.compomics.util.experiment.biology.AminoAcidSequence;
import com.compomics.util.experiment.biology.PTM;
import com.compomics.util.experiment.biology.PTMFactory;
import com.compomics.util.experiment.biology.Peptide;
import com.compomics.util.experiment.identification.amino_acid_tags.SequenceSegment;
import com.compomics.util.experiment.identification.matches.ModificationMatch;
import com.compomics.util.experiment.identification.amino_acid_tags.Tag;
import com.compomics.util.experiment.identification.amino_acid_tags.TagComponent;
import com.compomics.util.experiment.biology.MassGap;
import com.compomics.util.experiment.identification.protein_inference.PeptideProteinMapping;
import com.compomics.util.preferences.SequenceMatchingPreferences;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
/**
* This class matches tags to peptides.
*
* @author Marc Vaudel
*/
public class TagMatcher {
/**
* Mass of the fixed peptide N-term modifications.
*/
private double fixedNTermPeptideModificationsMass = 0;
/**
* Mass of the fixed peptide C-term modifications.
*/
private double fixedCTermPeptideModificationsMass = 0;
/**
* Mass of the fixed protein N-term modifications.
*/
private double fixedNTermProteinModificationsMass = 0;
/**
* Mass of the fixed protein C-term modifications.
*/
private double fixedCTermProteinModificationsMass = 0;
/**
* Map of the masses of the fixed modifications at specific amino acids:
* targeted amino acid > summed mass of the fixed modifications.
*/
private HashMap<Character, Double> fixedAaModificationsMasses = new HashMap<Character, Double>(1);
/**
* Map of the masses of the fixed modifications at specific amino acids on
* peptide N-terminus: targeted amino acid > summed mass of the fixed
* modifications.
*/
private HashMap<Character, Double> fixedAaModificationsPeptideNtermMasses = new HashMap<Character, Double>(1);
/**
* Map of the masses of the fixed modifications at specific amino acids on
* protein N-terminus: targeted amino acid > summed mass of the fixed
* modifications.
*/
private HashMap<Character, Double> fixedAaModificationsProteinNtermMasses = new HashMap<Character, Double>(1);
/**
* Map of the masses of the fixed modifications at specific amino acids on
* peptide C-terminus: targeted amino acid > summed mass of the fixed
* modifications.
*/
private HashMap<Character, Double> fixedAaModificationsPeptideCtermMasses = new HashMap<Character, Double>(1);
/**
* Map of the masses of the fixed modifications at specific amino acids on
* protein C-terminus: targeted amino acid > summed mass of the fixed
* modifications.
*/
private HashMap<Character, Double> fixedAaModificationsProteinCtermMasses = new HashMap<Character, Double>(1);
/**
* Map of variable N-terminal peptide modifications: modification name >
* mass. Null until such a modification is imported.
*/
private HashMap<String, Double> variableNTermPeptideModifications = null;
/**
* Map of variable C-terminal peptide modifications: modification name >
* mass. Null until such a modification is imported.
*/
private HashMap<String, Double> variableCTermPeptideModifications = null;
/**
* Map of variable N-terminal protein modifications: modification name >
* mass. Null until such a modification is imported.
*/
private HashMap<String, Double> variableNTermProteinModifications = null;
/**
* Map of variable C-terminal protein modifications: modification name >
* mass. Null until such a modification is imported.
*/
private HashMap<String, Double> variableCTermProteinModifications = null;
/**
* Map of the variable modifications at specific amino acid: possible target
* > modification name > mass.
*/
private HashMap<Character, HashMap<String, Double>> variableAaModifications = new HashMap<Character, HashMap<String, Double>>(1);
/**
* Map of the variable modifications at specific amino acid on peptide
* N-terminus: possible target > modification name > mass.
*/
private HashMap<Character, HashMap<String, Double>> variableAaModificationsAtPeptideNterm = new HashMap<Character, HashMap<String, Double>>(1);
/**
* Map of the variable modifications at specific amino acid on protein
* N-terminus: possible target > modification name > mass.
*/
private HashMap<Character, HashMap<String, Double>> variableAaModificationsAtProteinNterm = new HashMap<Character, HashMap<String, Double>>(1);
/**
* Map of the variable modifications at specific amino acid on peptide
* C-terminus: possible target > modification name > mass.
*/
private HashMap<Character, HashMap<String, Double>> variableAaModificationsAtPeptideCterm = new HashMap<Character, HashMap<String, Double>>(1);
/**
* Map of the variable modifications at specific amino acid on protein
* C-terminus: possible target > modification name > mass.
*/
private HashMap<Character, HashMap<String, Double>> variableAaModificationsAtProteinCterm = new HashMap<Character, HashMap<String, Double>>(1);
/**
* Smallest variable modification mass to account for when sequencing to the
* N-terminus. Starts at 0 and is only lowered by peptide N-terminal
* variable modifications of smaller mass.
*/
private double minNtermMod = 0;
/**
* Smallest variable modification mass to account for when sequencing to the
* C-terminus. Starts at 0 and is only lowered by peptide C-terminal
* variable modifications of smaller mass.
*/
private double minCtermMod = 0;
/**
* Biggest variable modification mass to account for when sequencing to the
* N-terminus. Starts at 0 and is only raised by peptide N-terminal
* variable modifications of bigger mass.
*/
private double maxNtermMod = 0;
/**
* Biggest variable modification mass to account for when sequencing to the
* C-terminus. Starts at 0 and is only raised by peptide C-terminal
* variable modifications of bigger mass.
*/
private double maxCtermMod = 0;
/**
* If true the possible sequence segments will be stored in caches.
*/
private boolean useCache = true;
/**
* If true the indexing of the sequence will be executed in a synchronized
* method. Use this in case different threads might attempt to sequence the
* same sequence at the same index at the same time.
*/
private boolean synchronizedIndexing = false;
/**
* The sequence segments cache for N-term sequencing.
*
* Protein accession > starting index on protein > amino acid index on
* protein > list of sequence segments.
*/
private HashMap<String, HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>> nTermCache = new HashMap<String, HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>>();
/**
* The sequence segments cache for C-term sequencing.
*
* Protein accession > starting index on protein > amino acid index on
* protein > list of sequence segments.
*/
private HashMap<String, HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>> cTermCache = new HashMap<String, HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>>();
/**
* The sequence matching preferences.
*/
private SequenceMatchingPreferences sequenceMatchingPreferences;
/**
* Writer for debugging output, null by default. When not null, the length
* of the segment at which the mapping of a mass gap stopped is written to
* it, one value per line.
*/
private BufferedWriter debugbw = null;
/**
 * Creates a tag matcher for the given modification settings.
 *
 * @param fixedModifications names of the fixed modifications
 * @param variableModifications names of the variable modifications
 * @param sequenceMatchingPreferences the preferences to use when matching
 * amino acid patterns and sequences to the protein sequence
 */
public TagMatcher(ArrayList<String> fixedModifications, ArrayList<String> variableModifications, SequenceMatchingPreferences sequenceMatchingPreferences) {
    // The modification import does not read the matching preferences, so the two steps are independent
    importModificationMapping(fixedModifications, variableModifications);
    this.sequenceMatchingPreferences = sequenceMatchingPreferences;
}
/**
 * Imports the modifications in the attribute maps.
 *
 * Fixed modifications are summed into a single mass per target (terminus
 * or amino acid). Variable modifications are stored by name together with
 * their mass, and the peptide terminal ones update the minimal and maximal
 * terminal modification masses. Modifications are retrieved from the PTM
 * factory by name.
 *
 * @param fixedModifications list of fixed modifications
 * @param variableModifications list of variable modifications
 *
 * @throws UnsupportedOperationException if a fixed modification targets a
 * pattern longer than one amino acid
 */
private void importModificationMapping(ArrayList<String> fixedModifications, ArrayList<String> variableModifications) {

    // Fixed modifications: accumulate the mass on every possible target
    for (String modificationName : fixedModifications) {
        PTM ptm = PTMFactory.getInstance().getPTM(modificationName);
        if (ptm.getType() == PTM.MODN) {
            fixedNTermProteinModificationsMass += ptm.getMass();
        } else if (ptm.getType() == PTM.MODC) {
            fixedCTermProteinModificationsMass += ptm.getMass();
        } else if (ptm.getType() == PTM.MODNP) {
            fixedNTermPeptideModificationsMass += ptm.getMass();
        } else if (ptm.getType() == PTM.MODCP) {
            fixedCTermPeptideModificationsMass += ptm.getMass();
        } else if (ptm.getType() == PTM.MODAA) {
            addFixedAminoAcidMasses(ptm, fixedAaModificationsMasses);
        } else if (ptm.getType() == PTM.MODNAA) {
            addFixedAminoAcidMasses(ptm, fixedAaModificationsProteinNtermMasses);
        } else if (ptm.getType() == PTM.MODNPAA) {
            addFixedAminoAcidMasses(ptm, fixedAaModificationsPeptideNtermMasses);
        } else if (ptm.getType() == PTM.MODCAA) {
            addFixedAminoAcidMasses(ptm, fixedAaModificationsProteinCtermMasses);
        } else if (ptm.getType() == PTM.MODCPAA) {
            addFixedAminoAcidMasses(ptm, fixedAaModificationsPeptideCtermMasses);
        }
    }

    // Variable modifications: keep name > mass for every possible target
    for (String modificationName : variableModifications) {
        PTM ptm = PTMFactory.getInstance().getPTM(modificationName);
        if (ptm.getType() == PTM.MODNP) {
            if (variableNTermPeptideModifications == null) {
                variableNTermPeptideModifications = new HashMap<String, Double>(1);
            }
            variableNTermPeptideModifications.put(modificationName, ptm.getMass());
            if (ptm.getMass() < minNtermMod) {
                minNtermMod = ptm.getMass();
            }
            if (ptm.getMass() > maxNtermMod) {
                maxNtermMod = ptm.getMass();
            }
        } else if (ptm.getType() == PTM.MODCP) {
            if (variableCTermPeptideModifications == null) {
                variableCTermPeptideModifications = new HashMap<String, Double>(1);
            }
            variableCTermPeptideModifications.put(modificationName, ptm.getMass());
            if (ptm.getMass() < minCtermMod) {
                minCtermMod = ptm.getMass();
            }
            if (ptm.getMass() > maxCtermMod) {
                maxCtermMod = ptm.getMass();
            }
        } else if (ptm.getType() == PTM.MODN) {
            if (variableNTermProteinModifications == null) {
                variableNTermProteinModifications = new HashMap<String, Double>(1);
            }
            variableNTermProteinModifications.put(modificationName, ptm.getMass());
        } else if (ptm.getType() == PTM.MODC) {
            if (variableCTermProteinModifications == null) {
                variableCTermProteinModifications = new HashMap<String, Double>(1);
            }
            variableCTermProteinModifications.put(modificationName, ptm.getMass());
        } else if (ptm.getType() == PTM.MODAA) {
            registerVariableAminoAcidModification(ptm, modificationName, variableAaModifications);
        } else if (ptm.getType() == PTM.MODNAA) {
            registerVariableAminoAcidModification(ptm, modificationName, variableAaModificationsAtProteinNterm);
        } else if (ptm.getType() == PTM.MODNPAA) {
            registerVariableAminoAcidModification(ptm, modificationName, variableAaModificationsAtPeptideNterm);
            if (ptm.getMass() < minNtermMod) {
                minNtermMod = ptm.getMass();
            }
            if (ptm.getMass() > maxNtermMod) {
                maxNtermMod = ptm.getMass();
            }
        } else if (ptm.getType() == PTM.MODCAA) {
            // Protein C-terminal amino acid targets belong in the protein C-term map,
            // which is the one consulted when sequencing reaches the protein C-terminus
            registerVariableAminoAcidModification(ptm, modificationName, variableAaModificationsAtProteinCterm);
        } else if (ptm.getType() == PTM.MODCPAA) {
            // Peptide C-terminal amino acid targets belong in the peptide C-term map
            registerVariableAminoAcidModification(ptm, modificationName, variableAaModificationsAtPeptideCterm);
            if (ptm.getMass() < minCtermMod) {
                minCtermMod = ptm.getMass();
            }
            if (ptm.getMass() > maxCtermMod) {
                maxCtermMod = ptm.getMass();
            }
        }
    }
}

/**
 * Adds the mass of a fixed modification to every amino acid it targets in
 * the given map.
 *
 * @param ptm the fixed modification
 * @param targetMasses the map to update: targeted amino acid > summed mass
 *
 * @throws UnsupportedOperationException if the modification pattern is
 * longer than one amino acid
 */
private static void addFixedAminoAcidMasses(PTM ptm, HashMap<Character, Double> targetMasses) {
    AminoAcidPattern ptmPattern = ptm.getPattern();
    if (ptmPattern.length() > 1) {
        throw new UnsupportedOperationException("Fixed modifications on patterns is not supported, try variable.");
    }
    for (Character aa : ptmPattern.getAminoAcidsAtTarget()) {
        Double fixedMass = targetMasses.get(aa);
        if (fixedMass == null) {
            fixedMass = 0.0;
        }
        targetMasses.put(aa, fixedMass + ptm.getMass());
    }
}

/**
 * Registers a variable modification for every amino acid it targets in the
 * given map, creating the per amino acid map when needed.
 *
 * @param ptm the variable modification
 * @param modificationName the name of the modification
 * @param targetModifications the map to update: possible target >
 * modification name > mass
 */
private static void registerVariableAminoAcidModification(PTM ptm, String modificationName, HashMap<Character, HashMap<String, Double>> targetModifications) {
    for (Character aa : ptm.getPattern().getAminoAcidsAtTarget()) {
        HashMap<String, Double> ptmMap = targetModifications.get(aa);
        if (ptmMap == null) {
            ptmMap = new HashMap<String, Double>(1);
            targetModifications.put(aa, ptmMap);
        }
        ptmMap.put(modificationName, ptm.getMass());
    }
}
/**
 * Maps a tag on a protein sequence around a seed component already located
 * on that sequence, and returns the peptides which can be built from it.
 * The components before the seed are mapped toward the N-terminus, the
 * components after it toward the C-terminus. An empty list is returned as
 * soon as one component cannot be mapped. Note: PTMs must be in the PTM
 * factory. PTMs are considered at a target amino acid only, longer patterns
 * are not taken into account.
 *
 * @param tag the tag to look for
 * @param accession the accession of the protein
 * @param sequence the sequence where to look for the tag
 * @param tagIndex the index where the tag is located
 * @param componentIndex the index of the component of the tag indexed by
 * tagIndex in the content list
 * @param massTolerance the ms2 tolerance
 *
 * @return the possible peptides which can be created on this sequence
 *
 * @throws UnsupportedOperationException if the seed component is neither an
 * amino acid pattern nor an amino acid sequence
 */
public ArrayList<PeptideProteinMapping> getPeptideMatches(Tag tag, String accession, String sequence, Integer tagIndex,
        Integer componentIndex, double massTolerance) {

    final ArrayList<TagComponent> tagComponents = tag.getContent();
    final TagComponent seed = tagComponents.get(componentIndex);

    // Length and modifications of the seed component
    final int seedLength;
    final HashMap<Integer, ArrayList<ModificationMatch>> seedModifications;
    if (seed instanceof AminoAcidPattern) {
        AminoAcidPattern seedPattern = (AminoAcidPattern) seed;
        seedLength = seedPattern.length();
        seedModifications = seedPattern.getModificationMatches();
    } else if (seed instanceof AminoAcidSequence) {
        AminoAcidSequence seedAminoAcids = (AminoAcidSequence) seed;
        seedLength = seedAminoAcids.length();
        seedModifications = seedAminoAcids.getModificationMatches();
    } else {
        throw new UnsupportedOperationException("Tag mapping not supported for tag component " + seed.getClass() + ".");
    }

    final int seedStart = tagIndex;
    final int seedEnd = seedStart + seedLength;
    final String seedSequence = sequence.substring(seedStart, seedEnd);

    // Walk the components preceding the seed, from the closest one to the first one
    ArrayList<SequenceSegment> nTermSegments = new ArrayList<SequenceSegment>(1);
    nTermSegments.add(new SequenceSegment(seedStart, true));
    final int closestNTermComponent = componentIndex - 1;
    int position = closestNTermComponent;
    while (position >= 0) {
        TagComponent component = tagComponents.get(position);
        // the cache is only used for the component adjacent to the seed
        boolean cacheAllowed = useCache && position == closestNTermComponent;
        nTermSegments = mapTagComponent(accession, sequence, component, nTermSegments, massTolerance, cacheAllowed, true, position == 0);
        if (nTermSegments.isEmpty()) {
            return new ArrayList<PeptideProteinMapping>(0);
        }
        position--;
    }

    // Walk the components following the seed, from the closest one to the last one
    ArrayList<SequenceSegment> cTermSegments = new ArrayList<SequenceSegment>(1);
    cTermSegments.add(new SequenceSegment(seedEnd - 1, false));
    final int closestCTermComponent = componentIndex + 1;
    final int lastPosition = tagComponents.size() - 1;
    position = closestCTermComponent;
    while (position <= lastPosition) {
        TagComponent component = tagComponents.get(position);
        boolean cacheAllowed = useCache && position == closestCTermComponent;
        cTermSegments = mapTagComponent(accession, sequence, component, cTermSegments, massTolerance, cacheAllowed, false, position == lastPosition);
        if (cTermSegments.isEmpty()) {
            return new ArrayList<PeptideProteinMapping>(0);
        }
        position++;
    }

    // Combine every N-terminal segment with the seed and every C-terminal segment
    return buildPeptides(accession, sequence, nTermSegments, seedSequence, cTermSegments, seedModifications, 0);
}
/**
 * Builds one peptide for every combination of an N-terminal segment, the
 * seed sequence, and a C-terminal segment. The modification sites of the
 * segments and of the seed are translated to sites on the peptide.
 *
 * @param accession the protein accession
 * @param sequence the protein sequence
 * @param nTermPossibleSequences the N-terminal possible segments
 * @param seedSequence the seed sequence
 * @param cTermPossibleSequences the C-terminal possible segments
 * @param modificationsAtIndex the seed modifications, can be null
 * @param mutationsAtIndex the seeds mutations, not used by this method
 *
 * @return the list of possible peptides mapped on the protein
 */
public ArrayList<PeptideProteinMapping> buildPeptides(String accession, String sequence, ArrayList<SequenceSegment> nTermPossibleSequences, String seedSequence, ArrayList<SequenceSegment> cTermPossibleSequences, HashMap<Integer, ArrayList<ModificationMatch>> modificationsAtIndex, int mutationsAtIndex) {

    final int nCombinations = nTermPossibleSequences.size() * cTermPossibleSequences.size();
    final ArrayList<PeptideProteinMapping> peptides = new ArrayList<PeptideProteinMapping>(nCombinations);

    for (SequenceSegment nTermSegment : nTermPossibleSequences) {

        // N-terminal segment followed by the seed, shared by all C-terminal segments
        StringBuilder prefix = new StringBuilder(nTermSegment.length() + seedSequence.length());
        prefix.append(nTermSegment.getSegmentSequence(sequence)).append(seedSequence);

        for (SequenceSegment cTermSegment : cTermPossibleSequences) {

            StringBuilder peptideSequence = new StringBuilder(nTermSegment.length() + seedSequence.length() + cTermSegment.length());
            peptideSequence.append(prefix);

            ArrayList<ModificationMatch> peptideModifications = new ArrayList<ModificationMatch>(1);

            // Modifications of the N-terminal segment: the site is counted from the other end of the segment
            HashMap<Integer, String> nTermSegmentModifications = nTermSegment.getModificationMatches();
            if (nTermSegmentModifications != null) {
                for (Integer segmentSite : nTermSegmentModifications.keySet()) {
                    String modificationName = nTermSegmentModifications.get(segmentSite);
                    int peptideSite = nTermSegment.length() + 1 - segmentSite;
                    peptideModifications.add(new ModificationMatch(modificationName, true, peptideSite));
                }
            }

            // Modifications of the seed: shifted by the length of the N-terminal segment
            if (modificationsAtIndex != null) {
                for (Integer seedSite : modificationsAtIndex.keySet()) {
                    for (ModificationMatch seedModification : modificationsAtIndex.get(seedSite)) {
                        peptideModifications.add(new ModificationMatch(seedModification.getTheoreticPtm(), seedModification.isVariable(), nTermSegment.length() + seedSite));
                    }
                }
            }

            peptideSequence.append(cTermSegment.getSegmentSequence(sequence));

            // Modifications of the C-terminal segment: shifted by the N-terminal segment and the seed
            HashMap<Integer, String> cTermSegmentModifications = cTermSegment.getModificationMatches();
            if (cTermSegmentModifications != null) {
                for (Integer segmentSite : cTermSegmentModifications.keySet()) {
                    String modificationName = cTermSegmentModifications.get(segmentSite);
                    int peptideSite = nTermSegment.length() + seedSequence.length() + segmentSite;
                    peptideModifications.add(new ModificationMatch(modificationName, true, peptideSite));
                }
            }

            Integer nTermIndex = nTermSegment.getTerminalIndex() + 1;
            peptides.add(new PeptideProteinMapping(accession, peptideSequence.toString(), nTermIndex, peptideModifications));
        }
    }

    return peptides;
}
/**
* Maps a tag component on the protein sequence and returns the
* corresponding possible sequence segments.
*
* Amino acid patterns and amino acid sequences are compared to the stretch
* of protein sequence next to every previous segment. Mass gaps are mapped
* by walking the protein sequence one amino acid at a time from the terminal
* index of every previous segment and collecting the segments validated
* against the mass of the gap.
*
* @param accession the accession of the protein
* @param sequence the protein sequence
* @param tagComponent the tag component to map
* @param terminalPreviousSequences the possible previous terminal sequences
* @param massTolerance the ms2 mass tolerance to use
* @param useCache if true the possible segments of a mass gap are taken
* from, and stored in, the index cache as long as the segment is not longer
* than 12 amino acids
* @param nTerminus if true the sequencing will go toward the N-terminus, to
* the C-terminus otherwise
* @param lastComponent if true the segments found for a mass gap are
* returned as such, otherwise a copy of every segment is made and the
* previous terminal segment is appended to it
*
* @return the possible sequence fragment of this tag component appended to
* the given previous segments
*
* @throws IllegalArgumentException if the tag component is neither an amino
* acid pattern, an amino acid sequence, nor a mass gap
*/
private ArrayList<SequenceSegment> mapTagComponent(String accession, String sequence, TagComponent tagComponent, ArrayList<SequenceSegment> terminalPreviousSequences, double massTolerance, boolean useCache, boolean nTerminus, boolean lastComponent) {
if (tagComponent instanceof AminoAcidPattern) {
// NOTE(review): segments which do not match are left untouched in the returned list, confirm this is intended
for (SequenceSegment terminalSequence : terminalPreviousSequences) {
Integer aaIndex = terminalSequence.getTerminalIndex();
AminoAcidPattern aminoAcidPattern = (AminoAcidPattern) tagComponent;
// Stretch of protein sequence to compare to the pattern, null if it does not fit on the protein
String subSequence = null;
if (nTerminus) {
Integer startIndex = aaIndex - aminoAcidPattern.length();
if (startIndex >= 0) {
subSequence = sequence.substring(startIndex, aaIndex);
}
} else {
Integer endIndex = aaIndex + aminoAcidPattern.length();
// NOTE(review): the end index of substring is exclusive, so this bound excludes a stretch ending on the last amino acid of the protein, confirm
if (endIndex <= sequence.length() - 1) {
subSequence = sequence.substring(aaIndex, endIndex);
}
}
if (subSequence != null && aminoAcidPattern.matches(subSequence, sequenceMatchingPreferences)) {
// NOTE(review): relies on the tag component being castable to SequenceSegment, verify against the class hierarchy
terminalSequence.appendTerminus((SequenceSegment) tagComponent);
}
}
return terminalPreviousSequences;
} else if (tagComponent instanceof AminoAcidSequence) {
// Same procedure as for amino acid patterns, using the amino acid sequence for the comparison
for (SequenceSegment terminalSequence : terminalPreviousSequences) {
Integer aaIndex = terminalSequence.getTerminalIndex();
AminoAcidSequence aminoAcidPattern = (AminoAcidSequence) tagComponent;
String subSequence = null;
if (nTerminus) {
Integer startIndex = aaIndex - aminoAcidPattern.length();
if (startIndex >= 0) {
subSequence = sequence.substring(startIndex, aaIndex);
}
} else {
Integer endIndex = aaIndex + aminoAcidPattern.length();
if (endIndex <= sequence.length() - 1) {
subSequence = sequence.substring(aaIndex, endIndex);
}
}
if (subSequence != null && aminoAcidPattern.matches(subSequence, sequenceMatchingPreferences)) {
terminalSequence.appendTerminus((SequenceSegment) tagComponent);
}
}
return terminalPreviousSequences;
} else if (tagComponent instanceof MassGap) {
double massGap = tagComponent.getMass();
ArrayList<SequenceSegment> newSequences = new ArrayList<SequenceSegment>(1);
for (int i = 0; i < terminalPreviousSequences.size(); i++) {
SequenceSegment terminalSequence = terminalPreviousSequences.get(i);
int aaIndex = terminalSequence.getTerminalIndex();
// currentIndex keeps the starting point of the walk, aaIndex is the moving position
Integer currentIndex = aaIndex;
ArrayList<SequenceSegment> possibleSequences = null;
ArrayList<SequenceSegment> validSequences = new ArrayList<SequenceSegment>(1);
// The index cache is retrieved, and created if missing, even when useCache is false
HashMap<Integer, ArrayList<SequenceSegment>> indexCache = getIndexCache(accession, currentIndex, nTerminus);
// Step to the first amino acid next to the previous segment
if (nTerminus) {
aaIndex--;
} else {
aaIndex++;
}
while (aaIndex >= 0 && aaIndex < sequence.length()) {
char sequenceAa = sequence.charAt(aaIndex);
AminoAcid sequenceAminoAcid = AminoAcid.getAminoAcid(sequenceAa);
int segmentLength = Math.abs(aaIndex - currentIndex);
if (useCache && segmentLength <= 12) {
// Short segments: reuse the cached segments for this position, computing and caching them if absent
possibleSequences = indexCache.get(aaIndex);
if (possibleSequences == null) {
if (synchronizedIndexing) {
possibleSequences = addSequenceSegmentsToCacheSynchronized(indexCache, sequence, sequenceAminoAcid, currentIndex, aaIndex, nTerminus);
} else {
possibleSequences = addSequenceSegmentsToCache(indexCache, sequence, sequenceAminoAcid, currentIndex, aaIndex, nTerminus);
}
}
} else {
// No cache: extend the segments of the previous iteration with the current amino acid
possibleSequences = getCombinationsForAminoAcid(sequence, possibleSequences, sequenceAminoAcid, currentIndex, aaIndex, nTerminus);
}
// A true result ends the walk for this previous segment; presumably it means that extending further
// cannot match the mass gap (validateSegments is defined elsewhere, verify)
if (validateSegments(possibleSequences, validSequences, massGap, massTolerance, sequence, sequenceAa, nTerminus)) {
if (debugbw != null) {
try {
debugbw.write(segmentLength + "\n");
debugbw.flush();
} catch (IOException ex) {
// Debug output only: the failure is printed and the mapping goes on
ex.printStackTrace();
}
}
break;
}
if (nTerminus) {
aaIndex--;
} else {
aaIndex++;
}
}
if (!lastComponent) {
// Work on copies so that the segments given to validSequences are not modified when appending the previous segment
for (SequenceSegment validSegment : validSequences) {
SequenceSegment sequenceSegment = new SequenceSegment(validSegment);
sequenceSegment.appendTerminus(terminalSequence);
newSequences.add(sequenceSegment);
}
} else {
// NOTE(review): for the last component the previous terminal segment is not appended, confirm this is intended
newSequences.addAll(validSequences);
}
}
return newSequences;
} else {
throw new IllegalArgumentException("Tag component " + tagComponent.getClass() + " not implemented for sequence matching.");
}
}
/**
 * Returns the segment cache for the given protein and index, creating the
 * protein cache and the index cache when they do not exist yet. The
 * synchronized variants of the creation methods are used when synchronized
 * indexing is enabled.
 *
 * @param accession the accession of the protein
 * @param currentIndex the index on the protein
 * @param nTerminus boolean indicating whether the N or C terminus cache
 * should be used
 *
 * @return the index cache
 */
public HashMap<Integer, ArrayList<SequenceSegment>> getIndexCache(String accession, Integer currentIndex, boolean nTerminus) {

    // First level: cache of the protein in the cache of the requested terminus
    HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> cacheForProtein = (nTerminus ? nTermCache : cTermCache).get(accession);
    if (cacheForProtein == null) {
        cacheForProtein = synchronizedIndexing
                ? addProteinCacheSynchronized(accession, currentIndex, nTerminus)
                : addProteinCache(accession, currentIndex, nTerminus);
    }

    // Second level: cache of the index on the protein
    HashMap<Integer, ArrayList<SequenceSegment>> cacheForIndex = cacheForProtein.get(currentIndex);
    if (cacheForIndex != null) {
        return cacheForIndex;
    }
    return synchronizedIndexing
            ? addIndexCacheSynchronized(cacheForProtein, currentIndex)
            : addIndexCache(cacheForProtein, currentIndex);
}
/**
 * Synchronized version of the protein cache creation. The cache of the
 * protein is looked up again once inside the synchronized method and a new
 * one is only created if it is still missing.
 *
 * @param accession the accession of the protein
 * @param currentIndex the index on the protein
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return a cache for the given protein
 */
private synchronized HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> addProteinCacheSynchronized(String accession, Integer currentIndex, boolean nTerminus) {
    HashMap<String, HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>> terminusCache = nTerminus ? nTermCache : cTermCache;
    HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> existingCache = terminusCache.get(accession);
    if (existingCache != null) {
        return existingCache;
    }
    return addProteinCache(accession, currentIndex, nTerminus);
}
/**
 * Creates a cache for the given protein, registers it in the N or C
 * terminus cache, and adds to it an empty cache for the given index. An
 * already registered cache for this protein is replaced.
 *
 * @param accession the accession of the protein
 * @param currentIndex the index on the protein
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return a cache for the given protein
 */
private HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> addProteinCache(String accession, Integer currentIndex, boolean nTerminus) {
    HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> newProteinCache = new HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>();

    // Register the protein cache first, then fill in the cache of the index
    HashMap<String, HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>>> terminusCache = nTerminus ? nTermCache : cTermCache;
    terminusCache.put(accession, newProteinCache);
    newProteinCache.put(currentIndex, new HashMap<Integer, ArrayList<SequenceSegment>>(1));

    return newProteinCache;
}
/**
 * Synchronized version of the index cache creation. The cache of the index
 * is looked up again once inside the synchronized method and a new one is
 * only created if it is still missing. The returned cache is never null.
 *
 * @param proteinCache the protein cache
 * @param currentIndex the index of interest
 *
 * @return a cache for the given index
 */
private synchronized HashMap<Integer, ArrayList<SequenceSegment>> addIndexCacheSynchronized(HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> proteinCache, Integer currentIndex) {
    HashMap<Integer, ArrayList<SequenceSegment>> indexCache = proteinCache.get(currentIndex);
    if (indexCache == null) {
        // keep the newly created cache, it is the value to return
        indexCache = addIndexCache(proteinCache, currentIndex);
    }
    return indexCache;
}
/**
 * Creates an empty cache for the given index, stores it in the given
 * protein cache, and returns it. An already stored cache for this index is
 * replaced.
 *
 * @param proteinCache the protein cache
 * @param currentIndex the index of interest
 *
 * @return a cache for the given index
 */
private HashMap<Integer, ArrayList<SequenceSegment>> addIndexCache(HashMap<Integer, HashMap<Integer, ArrayList<SequenceSegment>>> proteinCache, Integer currentIndex) {
    final HashMap<Integer, ArrayList<SequenceSegment>> emptyCache = new HashMap<Integer, ArrayList<SequenceSegment>>(1);
    proteinCache.put(currentIndex, emptyCache);
    return emptyCache;
}
/**
 * Synchronized version of the caching of the possible sequence segments
 * obtained when appending the given amino acid. If the cache already
 * contains segments for the amino acid index, they are returned and nothing
 * is computed.
 *
 * @param indexCache the cache for this index on the sequence
 * @param sequence the protein sequence
 * @param aminoAcid the amino acid object
 * @param currentIndex the current indexing level on the protein sequence
 * @param aaIndex the amino acid index
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return the new possible sequences
 */
public synchronized ArrayList<SequenceSegment> addSequenceSegmentsToCacheSynchronized(HashMap<Integer, ArrayList<SequenceSegment>> indexCache, String sequence, AminoAcid aminoAcid, Integer currentIndex, Integer aaIndex, boolean nTerminus) {
    // another thread may have filled the cache while this one was waiting
    ArrayList<SequenceSegment> alreadyCached = indexCache.get(aaIndex);
    if (alreadyCached != null) {
        return alreadyCached;
    }
    return addSequenceSegmentsToCache(indexCache, sequence, aminoAcid, currentIndex, aaIndex, nTerminus);
}
/**
 * Computes the possible sequence segments obtained when appending the given
 * amino acid to the segments cached for the previous amino acid index,
 * stores them in the cache at the amino acid index, and returns them. The
 * previous index is the next one on the sequence when sequencing toward the
 * N-terminus, the preceding one otherwise.
 *
 * @param indexCache the cache for this index on the sequence
 * @param sequence the protein sequence
 * @param aminoAcid the amino acid object
 * @param currentIndex the current indexing level on the protein sequence
 * @param aaIndex the amino acid index
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return the new possible sequences
 */
public ArrayList<SequenceSegment> addSequenceSegmentsToCache(HashMap<Integer, ArrayList<SequenceSegment>> indexCache, String sequence, AminoAcid aminoAcid, Integer currentIndex, Integer aaIndex, boolean nTerminus) {
    final int previousIndex = nTerminus ? aaIndex + 1 : aaIndex - 1;
    final ArrayList<SequenceSegment> segmentsAtPreviousIndex = indexCache.get(previousIndex);
    final ArrayList<SequenceSegment> segmentsAtIndex = getCombinationsForAminoAcid(sequence, segmentsAtPreviousIndex, aminoAcid, currentIndex, aaIndex, nTerminus);
    indexCache.put(aaIndex, segmentsAtIndex);
    return segmentsAtIndex;
}
/**
 * Returns the possible sequence segments obtained when appending the given
 * amino acid to the given previous segments. When no previous segments are
 * given (null), a new segment is started at the given amino acid index.
 * For every segment, one unmodified version and one version per applicable
 * variable modification are returned. The segments given as input are not
 * modified, copies are extended instead.
 *
 * @param sequence the protein sequence
 * @param possibleSequences the possible previous sequences, null if a new
 * segment should be started
 * @param aminoAcid the amino acid object
 * @param currentIndex the current indexing level on the protein sequence
 * (not used by this method)
 * @param aaIndex the amino acid index
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return the new possible sequences
 */
public ArrayList<SequenceSegment> getCombinationsForAminoAcid(String sequence, ArrayList<SequenceSegment> possibleSequences, AminoAcid aminoAcid, Integer currentIndex, Integer aaIndex, boolean nTerminus) {
    char aa = aminoAcid.getSingleLetterCodeAsChar();
    boolean atProteinNTerm = nTerminus && aaIndex == 0;
    boolean atProteinCTerm = !nTerminus && aaIndex == sequence.length() - 1;
    // The fixed mass only depends on the amino acid and its position, not
    // on the segment being extended, so it is computed once.
    double modificationMass = getFixedMassForAppendedAa(aa, atProteinNTerm, atProteinCTerm);
    HashMap<String, Double> variableModificationsAtAa = variableAaModifications.get(aa);
    if (possibleSequences == null) {
        // Nothing to extend: start a new segment at this amino acid.
        ArrayList<SequenceSegment> startingSequences = new ArrayList<SequenceSegment>(2);
        SequenceSegment sequenceSegment = new SequenceSegment(aaIndex, nTerminus);
        startingSequences.add(sequenceSegment);
        sequenceSegment.appendTerminus(aminoAcid);
        sequenceSegment.addMass(modificationMass);
        addVariableSegmentsForAppendedAa(aa, variableModificationsAtAa, sequenceSegment, startingSequences, atProteinNTerm, atProteinCTerm);
        return startingSequences;
    }
    ArrayList<SequenceSegment> newPossibleSequences = new ArrayList<SequenceSegment>(possibleSequences.size());
    for (int i = 0; i < possibleSequences.size(); i++) {
        // Work on a copy so that the previous segments remain untouched.
        SequenceSegment newSegment = new SequenceSegment(possibleSequences.get(i));
        newPossibleSequences.add(newSegment);
        newSegment.appendTerminus(aminoAcid);
        // The amino acid specific fixed protein terminal masses are now
        // accounted for here as well, as is done when starting a segment.
        newSegment.addMass(modificationMass);
        addVariableSegmentsForAppendedAa(aa, variableModificationsAtAa, newSegment, newPossibleSequences, atProteinNTerm, atProteinCTerm);
    }
    return newPossibleSequences;
}
/**
 * Returns the sum of the fixed modification masses applying to an amino
 * acid appended to a segment: the fixed modification at this amino acid
 * and, at a protein terminus, the fixed terminal modification and the
 * fixed terminal modification specific to this amino acid.
 *
 * @param aa the single letter code of the appended amino acid
 * @param atProteinNTerm true if the amino acid is the protein N-terminus
 * reached when sequencing toward the N-terminus
 * @param atProteinCTerm true if the amino acid is the protein C-terminus
 * reached when sequencing toward the C-terminus
 *
 * @return the mass of the fixed modifications to add to the segment
 */
private double getFixedMassForAppendedAa(char aa, boolean atProteinNTerm, boolean atProteinCTerm) {
    double modificationMass = 0;
    Double fixedMass = fixedAaModificationsMasses.get(aa);
    if (fixedMass != null) {
        modificationMass += fixedMass;
    }
    if (atProteinNTerm) {
        modificationMass += fixedNTermProteinModificationsMass;
        if (!fixedAaModificationsProteinNtermMasses.isEmpty()) {
            Double aaTerminalMass = fixedAaModificationsProteinNtermMasses.get(aa);
            if (aaTerminalMass != null) {
                modificationMass += aaTerminalMass;
            }
        }
    } else if (atProteinCTerm) {
        modificationMass += fixedCTermProteinModificationsMass;
        if (!fixedAaModificationsProteinCtermMasses.isEmpty()) {
            Double aaTerminalMass = fixedAaModificationsProteinCtermMasses.get(aa);
            if (aaTerminalMass != null) {
                modificationMass += aaTerminalMass;
            }
        }
    }
    return modificationMass;
}
/**
 * Adds to the given list the variably modified versions of a segment to
 * which an amino acid was just appended: modifications at this amino acid
 * and, at a protein terminus, the terminal modifications and the terminal
 * modifications specific to this amino acid.
 *
 * @param aa the single letter code of the appended amino acid
 * @param variableModificationsAtAa the variable modifications at this
 * amino acid, can be null
 * @param segment the segment without variable modification on the
 * appended amino acid
 * @param segments the list where to add the modified segments
 * @param atProteinNTerm true if the amino acid is the protein N-terminus
 * reached when sequencing toward the N-terminus
 * @param atProteinCTerm true if the amino acid is the protein C-terminus
 * reached when sequencing toward the C-terminus
 */
private void addVariableSegmentsForAppendedAa(char aa, HashMap<String, Double> variableModificationsAtAa, SequenceSegment segment, ArrayList<SequenceSegment> segments, boolean atProteinNTerm, boolean atProteinCTerm) {
    addVariableModifications(variableModificationsAtAa, segment, segments);
    if (atProteinNTerm) {
        addVariableModifications(variableNTermProteinModifications, segment, segments);
        if (!variableAaModificationsAtProteinNterm.isEmpty()) {
            HashMap<String, Double> aaTerminalModifications = variableAaModificationsAtProteinNterm.get(aa);
            if (aaTerminalModifications != null) {
                addVariableModifications(aaTerminalModifications, segment, segments);
            }
        }
    } else if (atProteinCTerm) {
        addVariableModifications(variableCTermProteinModifications, segment, segments);
        if (!variableAaModificationsAtProteinCterm.isEmpty()) {
            HashMap<String, Double> aaTerminalModifications = variableAaModificationsAtProteinCterm.get(aa);
            if (aaTerminalModifications != null) {
                addVariableModifications(aaTerminalModifications, segment, segments);
            }
        }
    }
}
/**
 * Inspects the possible sequences against the mass gap and transfers the
 * segments matching it, within the mass tolerance, to the valid sequences.
 * The fixed peptide terminal modifications of the terminus being sequenced
 * are added to the mass of every segment before comparison. Returns a
 * boolean indicating whether the sequence iteration should be terminated,
 * i.e. whether every segment was either validated or is already too heavy
 * for the mass gap.
 *
 * @param possibleSequences the possible sequences
 * @param validSequences the valid sequences
 * @param massGap the mass gap
 * @param massTolerance the mass tolerance to use
 * @param sequence the protein sequence
 * @param sequenceAa the amino acid at terminus on the protein sequence
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return if true no more sequence segment can be mapped
 */
public boolean validateSegments(ArrayList<SequenceSegment> possibleSequences, ArrayList<SequenceSegment> validSequences, double massGap, double massTolerance, String sequence, char sequenceAa, boolean nTerminus) {
    boolean allInspected = true;
    for (int segmentIndex = 0; segmentIndex < possibleSequences.size(); segmentIndex++) {
        SequenceSegment candidate = possibleSequences.get(segmentIndex);
        double candidateMass = candidate.getMass();
        double lowestTerminalModification;
        double highestTerminalModification;
        if (nTerminus) {
            candidateMass += fixedNTermPeptideModificationsMass;
            if (!fixedAaModificationsPeptideNtermMasses.isEmpty()) {
                Double aaSpecificMass = fixedAaModificationsPeptideNtermMasses.get(sequenceAa);
                if (aaSpecificMass != null) {
                    candidateMass += aaSpecificMass;
                }
            }
            lowestTerminalModification = minNtermMod;
            highestTerminalModification = maxNtermMod;
        } else {
            candidateMass += fixedCTermPeptideModificationsMass;
            if (!fixedAaModificationsPeptideCtermMasses.isEmpty()) {
                Double aaSpecificMass = fixedAaModificationsPeptideCtermMasses.get(sequenceAa);
                if (aaSpecificMass != null) {
                    candidateMass += aaSpecificMass;
                }
            }
            lowestTerminalModification = minCtermMod;
            highestTerminalModification = maxCtermMod;
        }
        // Even with the lightest terminal modification the segment must not
        // exceed the upper bound of the gap, otherwise it is over the gap.
        boolean withinGap = candidateMass + lowestTerminalModification <= massGap + massTolerance;
        // The segment is only worth validating if the heaviest terminal
        // modification can bring it up to the lower bound of the gap.
        boolean found = withinGap
                && candidateMass + highestTerminalModification >= massGap - massTolerance
                && validateSegment(validSequences, candidate, candidateMass, massGap, massTolerance, sequenceAa, nTerminus);
        if (withinGap && !found) {
            // This segment can still grow into a match.
            allInspected = false;
        }
    }
    return allInspected;
}
/**
 * Validates a sequence segment without considering any mutation. Delegates
 * to the mutation aware implementation with a mutation index of 0, no mass
 * variation and no mutation products.
 *
 * @param validSequences the retained sequences
 * @param sequenceSegment the sequence segment to validate
 * @param sequenceMass the mass of the sequence without mutation
 * @param massGap the mass gap to fill
 * @param massTolerance the mass tolerance to use
 * @param sequenceAa the amino acid at index on the sequence
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return a boolean indicating the segment was validated
 */
private boolean validateSegment(ArrayList<SequenceSegment> validSequences, SequenceSegment sequenceSegment, double sequenceMass, double massGap, double massTolerance, char sequenceAa, boolean nTerminus) {
    // Placeholder values standing for "no mutation".
    final int noMutationIndex = 0;
    final double noMutationDelta = 0;
    final HashSet<Character> noMutationProducts = null;
    return validateSegment(validSequences, sequenceSegment, sequenceMass, massGap, massTolerance, sequenceAa, noMutationIndex, noMutationDelta, noMutationProducts, nTerminus);
}
/**
 * Validates a sequence segment. The segment is retained if its mass,
 * corrected by the mutation mass variation, matches the mass gap within
 * the tolerance. Otherwise the variable peptide terminal modifications of
 * the terminus being sequenced are tried, first the ones applying to any
 * amino acid, then the ones specific to the given amino acid; the first
 * modification allowing a match is retained. When mutation products are
 * given, one retained segment is created per product.
 *
 * @param validSequences the retained sequences
 * @param sequenceSegment the sequence segment to validate
 * @param sequenceMass the mass of the sequence without mutation
 * @param massGap the mass gap to fill
 * @param massTolerance the mass tolerance to use
 * @param sequenceAa the amino acid at index on the sequence
 * @param mutatedIndex the index of the mutation on the sequence segment
 * @param deltaMutation the mass variation induced by the mutation
 * @param mutated the list of potential products of the mutation
 * @param nTerminus indicates whether the sequencing goes toward the N
 * (true) or the C (false) terminus
 *
 * @return a boolean indicating the segment was validated
 */
private boolean validateSegment(ArrayList<SequenceSegment> validSequences, SequenceSegment sequenceSegment, double sequenceMass, double massGap, double massTolerance, char sequenceAa, int mutatedIndex, double deltaMutation, HashSet<Character> mutated, boolean nTerminus) {
    // Direct match, no terminal modification needed.
    if (Math.abs(sequenceMass + deltaMutation - massGap) <= massTolerance) {
        if (mutated != null) {
            for (char mutationProduct : mutated) {
                SequenceSegment mutatedSegment = new SequenceSegment(sequenceSegment);
                mutatedSegment.addMutation(mutatedIndex, mutationProduct);
                validSequences.add(mutatedSegment);
            }
        } else {
            validSequences.add(sequenceSegment);
        }
        return true;
    }
    if (nTerminus) {
        if (validateWithTerminalModifications(variableNTermPeptideModifications, validSequences, sequenceSegment, sequenceMass, massGap, massTolerance, mutatedIndex, deltaMutation, mutated)) {
            return true;
        }
        if (!variableAaModificationsAtPeptideNterm.isEmpty()) {
            HashMap<String, Double> modificationsAtAa = variableAaModificationsAtPeptideNterm.get(sequenceAa);
            return validateWithTerminalModifications(modificationsAtAa, validSequences, sequenceSegment, sequenceMass, massGap, massTolerance, mutatedIndex, deltaMutation, mutated);
        }
        return false;
    }
    if (validateWithTerminalModifications(variableCTermPeptideModifications, validSequences, sequenceSegment, sequenceMass, massGap, massTolerance, mutatedIndex, deltaMutation, mutated)) {
        return true;
    }
    if (!variableAaModificationsAtPeptideCterm.isEmpty()) {
        HashMap<String, Double> modificationsAtAa = variableAaModificationsAtPeptideCterm.get(sequenceAa);
        return validateWithTerminalModifications(modificationsAtAa, validSequences, sequenceSegment, sequenceMass, massGap, massTolerance, mutatedIndex, deltaMutation, mutated);
    }
    return false;
}
/**
 * Tries the given terminal modifications on a sequence segment and retains
 * the segment carrying the first modification which makes its mass match
 * the mass gap within the tolerance.
 *
 * @param terminalModifications the modifications to try, modification name
 * > mass, can be null
 * @param validSequences the retained sequences
 * @param sequenceSegment the sequence segment to validate
 * @param sequenceMass the mass of the sequence without mutation
 * @param massGap the mass gap to fill
 * @param massTolerance the mass tolerance to use
 * @param mutatedIndex the index of the mutation on the sequence segment
 * @param deltaMutation the mass variation induced by the mutation
 * @param mutated the list of potential products of the mutation
 *
 * @return a boolean indicating whether a modified segment was retained
 */
private boolean validateWithTerminalModifications(HashMap<String, Double> terminalModifications, ArrayList<SequenceSegment> validSequences, SequenceSegment sequenceSegment, double sequenceMass, double massGap, double massTolerance, int mutatedIndex, double deltaMutation, HashSet<Character> mutated) {
    if (terminalModifications == null) {
        return false;
    }
    for (String modificationName : terminalModifications.keySet()) {
        double modifiedMass = sequenceMass + terminalModifications.get(modificationName);
        if (Math.abs(modifiedMass + deltaMutation - massGap) <= massTolerance) {
            if (mutated != null) {
                for (char mutationProduct : mutated) {
                    SequenceSegment mutatedSegment = new SequenceSegment(sequenceSegment);
                    mutatedSegment.addMutation(mutatedIndex, mutationProduct);
                    mutatedSegment.addModificationTerminus(modificationName);
                    validSequences.add(mutatedSegment);
                }
            } else {
                SequenceSegment modifiedSegment = new SequenceSegment(sequenceSegment);
                modifiedSegment.addModificationTerminus(modificationName);
                validSequences.add(modifiedSegment);
            }
            // Only the first matching modification is retained.
            return true;
        }
    }
    return false;
}
/**
 * Creates, for every given variable modification, a copy of the given
 * segment carrying this modification on its terminus and adds the copy to
 * the given list of possible segments. Nothing is done when no
 * modification map is given.
 *
 * @param variableModifications the variable modifications to add,
 * modification name > mass, can be null
 * @param noModSegment the sequence segment without modification
 * @param possibleSegments the possible segment where to add the modified
 * segments
 */
public void addVariableModifications(HashMap<String, Double> variableModifications, SequenceSegment noModSegment, ArrayList<SequenceSegment> possibleSegments) {
    if (variableModifications == null) {
        return;
    }
    for (java.util.Map.Entry<String, Double> modification : variableModifications.entrySet()) {
        // The unmodified segment is left untouched, a copy is modified.
        SequenceSegment segmentCopy = new SequenceSegment(noModSegment);
        Double modificationMass = modification.getValue();
        segmentCopy.addModificationTerminus(modification.getKey(), modificationMass);
        possibleSegments.add(segmentCopy);
    }
}
/**
 * Clears the cache: empties both the N-terminal cache (nTermCache) and the
 * C-terminal cache (cTermCache).
 */
public void clearCache() {
nTermCache.clear();
cTermCache.clear();
}
/**
 * Sets whether a cache should be used. This method only stores the flag,
 * it neither fills nor clears the cache.
 *
 * @param useCache if true a cache will be used
 */
public void setUseCache(boolean useCache) {
this.useCache = useCache;
}
/**
 * Sets whether the indexing of the sequence should be executed in a
 * synchronized method. Use this in case different threads might attempt to
 * sequence the same sequence at the same index at the same time. This
 * method only stores the flag.
 *
 * @param synchronizedIndexing true if the indexing of the sequence should
 * be executed in a synchronized method
 */
public void setSynchronizedIndexing(boolean synchronizedIndexing) {
this.synchronizedIndexing = synchronizedIndexing;
}
}