package com.compomics.util.experiment.identification.protein_inference.proteintree;
import com.compomics.util.Util;
import com.compomics.util.experiment.biology.AminoAcid;
import com.compomics.util.experiment.biology.AminoAcidSequence;
import com.compomics.util.experiment.biology.Protein;
import com.compomics.util.experiment.identification.protein_inference.PeptideProteinMapping;
import com.compomics.util.experiment.identification.protein_sequences.SequenceFactory;
import com.compomics.util.preferences.SequenceMatchingPreferences;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
/**
* A node of the protein tree.
*
* @author Marc Vaudel
*/
public class Node implements Serializable {
/**
* Serial number for backward compatibility.
*/
static final long serialVersionUID = 8936868785405252371L;
/**
* The depth of the node in the tree.
*/
private int depth;
/**
* List of accessions contained in this node.
*/
private HashMap<String, ArrayList<Integer>> accessions = new HashMap<String, ArrayList<Integer>>();
/**
* In case of splitting, the terminal mappings are put here.
*/
private HashMap<String, ArrayList<Integer>> termini = new HashMap<String, ArrayList<Integer>>();
/**
* Subtree starting from this node.
*/
private HashMap<Character, Node> subtree = null;
/**
* The number of proteins which should be imported at a time.
*/
public static final int proteinBatchSize = 100;
/**
* Indicates whether the main thread is listening or preparing to wait.
*/
private boolean listening = true;
/**
* Constructor.
*
* @param depth the depth of the node
*/
public Node(int depth) {
this.depth = depth;
}
/**
* Constructor.
*
* @param depth the depth of the node
* @param accessions the accessions of the node
*/
public Node(int depth, HashMap<String, ArrayList<Integer>> accessions) {
this.depth = depth;
this.accessions = accessions;
}
/**
* Returns the protein mappings for the given peptide sequence. peptide
* sequence > protein accession > index in the protein. An empty map
* if not found.
*
* @param query the given amino acid sequence to query the tree
* @param currentSequence the sequence found until now
* @param sequenceMatchingPreferences the sequence matching preferences
*
* @return the protein mapping for the given peptide sequence
*
* @throws IOException if an IOException occurs
* @throws ClassNotFoundException if a ClassNotFoundException occurs
* @throws InterruptedException if an InterruptedException occurs
*/
public ArrayList<PeptideProteinMapping> getProteinMapping(AminoAcidSequence query, String currentSequence,
SequenceMatchingPreferences sequenceMatchingPreferences) throws IOException, InterruptedException, ClassNotFoundException {
ArrayList<PeptideProteinMapping> result = new ArrayList<PeptideProteinMapping>(1);
if (depth == query.length()) {
HashMap<String, ArrayList<Integer>> mapping = getAllMappings();
for (String accession : mapping.keySet()) {
for (Integer site : mapping.get(accession)) {
double xShare = ((double) Util.getOccurrence(currentSequence, 'X')) / currentSequence.length();
if (!sequenceMatchingPreferences.hasLimitX() || xShare <= sequenceMatchingPreferences.getLimitX()) {
PeptideProteinMapping peptideProteinMapping = new PeptideProteinMapping(accession, currentSequence, site);
result.add(peptideProteinMapping);
}
}
}
} else if (accessions != null) {
SequenceFactory sequenceFactory = SequenceFactory.getInstance();
HashMap<String, HashMap<String, ArrayList<Integer>>> indexes = new HashMap<String, HashMap<String, ArrayList<Integer>>>(1);
for (String accession : accessions.keySet()) {
Protein protein = sequenceFactory.getProtein(accession);
indexes.put(accession, matchInProtein(protein, accessions.get(accession), query, sequenceMatchingPreferences));
}
for (String accession : indexes.keySet()) {
HashMap<String, ArrayList<Integer>> accessionIndexes = indexes.get(accession);
for (String tempSequence : accessionIndexes.keySet()) {
for (Integer index : accessionIndexes.get(tempSequence)) {
double xShare = ((double) Util.getOccurrence(tempSequence, 'X')) / tempSequence.length();
if (!sequenceMatchingPreferences.hasLimitX() || xShare <= sequenceMatchingPreferences.getLimitX()) {
PeptideProteinMapping peptideProteinMapping = new PeptideProteinMapping(accession, tempSequence, index);
result.add(peptideProteinMapping);
}
}
}
}
} else {
for (char aa : getNextAminoAcids(query, sequenceMatchingPreferences)) {
Node node = subtree.get(aa);
if (node != null) {
String newSequence = currentSequence + aa;
double xShare = ((double) Util.getOccurrence(newSequence, 'X')) / newSequence.length();
if (!sequenceMatchingPreferences.hasLimitX() || xShare <= sequenceMatchingPreferences.getLimitX()) {
result.addAll(node.getProteinMapping(query, newSequence, sequenceMatchingPreferences));
}
}
}
}
return result;
}
/**
* Returns the possible next amino acids.
*
* @param peptideSequence the peptide sequence as amino acid pattern
* @param sequenceMatchingPreferences the sequence matching preferences
*
* @return the possible next amino acids
*/
private HashSet<Character> getNextAminoAcids(AminoAcidSequence peptideSequence, SequenceMatchingPreferences sequenceMatchingPreferences) {
HashSet<Character> result = new HashSet<Character>();
char aa = peptideSequence.charAt(depth);
AminoAcid aminoAcid = AminoAcid.getAminoAcid(aa);
if (sequenceMatchingPreferences.getSequenceMatchingType() == SequenceMatchingPreferences.MatchingType.string) {
result.add(aa);
} else {
for (char aaChar : aminoAcid.getSubAminoAcids()) {
result.add(aaChar);
}
for (char aaChar : aminoAcid.getCombinations()) {
result.add(aaChar);
}
if (sequenceMatchingPreferences.getSequenceMatchingType() == SequenceMatchingPreferences.MatchingType.indistiguishableAminoAcids
&& (aminoAcid == AminoAcid.I || aminoAcid == AminoAcid.J || aminoAcid == AminoAcid.L)) {
result.add('I');
result.add('J');
result.add('L');
}
}
return result;
}
/**
* Splits the node into subnode if its size is larger than the maxNodeSize
* and does the same for every sub node.
*
* @param maxNodeSize the maximal node size allowed when splitting
* @param maxDepth the maximum depth
*
* @return returns true if the node was actually split and thus needs to be
* saved in indexed mode
*
* @throws IOException if an IOException occurs
* @throws ClassNotFoundException if a ClassNotFoundException occurs
* @throws InterruptedException if an InterruptedException occurs
* @throws IllegalArgumentException if an IllegalArgumentException occurs
*/
public boolean splitNode(int maxNodeSize, int maxDepth) throws IOException, IllegalArgumentException, InterruptedException, ClassNotFoundException {
if (accessions.size() > maxNodeSize && depth <= maxDepth) {
subtree = new HashMap<Character, Node>();
for (String accession : accessions.keySet()) {
HashMap<Character, ArrayList<Integer>> indexes = getAA(accession, accessions.get(accession), depth);
if (indexes.isEmpty()) {
indexes = getAA(accession, accessions.get(accession), depth);
}
for (char aa : indexes.keySet()) {
if (!subtree.containsKey(aa)) {
subtree.put(aa, new Node(depth + 1));
}
Node node = subtree.get(aa);
node.addAccession(accession, indexes.get(aa));
}
}
accessions.clear();
accessions = null;
for (Node node : subtree.values()) {
node.splitNode(maxNodeSize, maxDepth);
}
return true;
}
return false;
}
/**
* Adds an accession to the node.
*
* @param accession the accession to add
* @param indexes the indexes in this accession where the key can be found.
* Any prior entry will be silently overwritten
*/
public void addAccession(String accession, ArrayList<Integer> indexes) {
accessions.put(accession, indexes);
}
/**
* Returns the size of the node in accession*tag.
*
* @return the size of the node
*/
public long getSize() {
if (accessions != null) {
return accessions.size();
} else {
long result = 0;
for (Node node : subtree.values()) {
result += node.getSize();
}
return result;
}
}
/**
* Returns the accessions attribute.
*
* @return the accessions attribute
*/
public HashMap<String, ArrayList<Integer>> getAccessions() {
return accessions;
}
/**
* Returns the terminal mappings (they are not in the subtree).
*
* @return the terminal mappings
*/
public HashMap<String, ArrayList<Integer>> getTermini() {
return termini;
}
/**
* Returns the subtree. Null if end of the tree.
*
* @return the subtree
*/
public HashMap<Character, Node> getSubtree() {
return subtree;
}
/**
* Clears the accessions of this node.
*/
public void clearAccessions() {
accessions.clear();
}
/**
* Indicates whether the node is empty.
*
* @return whether the node is empty
*/
public boolean isEmpty() {
return subtree == null && accessions.isEmpty();
}
/**
* Returns the depth of the node in the tree.
*
* @return the depth of the node in the tree
*/
public int getDepth() {
return depth;
}
/**
* Returns all the protein mapping of the node.
*
* @return all the protein mappings of the node
* @throws IOException if an IOException occurs
*/
public HashMap<String, ArrayList<Integer>> getAllMappings() throws IOException {
if (accessions != null) {
HashMap<String, ArrayList<Integer>> result = new HashMap<String, ArrayList<Integer>>(accessions.size());
for (String accession : accessions.keySet()) {
ArrayList<Integer> indexes = new ArrayList<Integer>(accessions.get(accession));
result.put(accession, indexes);
}
return result;
} else {
HashMap<String, ArrayList<Integer>> result = new HashMap<String, ArrayList<Integer>>();
for (Node node : subtree.values()) {
HashMap<String, ArrayList<Integer>> subResult = node.getAllMappings();
for (String accession : subResult.keySet()) {
ArrayList<Integer> indexes = result.get(accession);
if (indexes == null) {
indexes = new ArrayList<Integer>(subResult.get(accession));
result.put(accession, indexes);
} else {
indexes.addAll(subResult.get(accession));
Collections.sort(indexes);
int previousIndex = -1;
ArrayList<Integer> singleIndexes = new ArrayList<Integer>(indexes.size());
for (int tempIndex : indexes) {
if (tempIndex != previousIndex) {
singleIndexes.add(tempIndex);
previousIndex = tempIndex;
}
}
result.put(accession, singleIndexes);
}
}
}
for (String accession : termini.keySet()) {
ArrayList<Integer> indexes = result.get(accession);
if (indexes == null) {
indexes = new ArrayList<Integer>(1);
result.put(accession, indexes);
}
for (Integer index : termini.get(accession)) {
if (!indexes.contains(index)) {
indexes.add(index);
}
}
}
return result;
}
}
/**
* Matches a peptide sequence in a protein sequence based on a seedlist.
* Returns a map found sequence > indexes. Example: sequence TESTEIST
* seeds: 0, 3, 7 peptideSequence: TEI result: TEI > {3}
*
* @param protein the protein to inspect
* @param seeds the indexes where to start looking for
* @param peptideSequence the peptide sequence as an amino acid sequence
* @param sequenceMatchingPreferences the sequence matching preferences
*
* @return a list of indexes having the expected sequence
* @throws IOException
* @throws IllegalArgumentException
* @throws InterruptedException
*/
private HashMap<String, ArrayList<Integer>> matchInProtein(Protein protein, ArrayList<Integer> seeds,
AminoAcidSequence peptideSequence, SequenceMatchingPreferences sequenceMatchingPreferences)
throws IOException, IllegalArgumentException, InterruptedException, ClassNotFoundException {
String proteinSequence = protein.getSequence();
HashMap<String, ArrayList<Integer>> results = new HashMap<String, ArrayList<Integer>>();
int peptideLength = peptideSequence.length();
for (int i = 0; i < seeds.size(); i++) {
int startIndex = seeds.get(i);
int endIndex = startIndex + peptideLength;
if (endIndex <= proteinSequence.length()) {
String subSequence = proteinSequence.substring(startIndex, endIndex);
if (peptideSequence.matches(subSequence, sequenceMatchingPreferences)) {
ArrayList<Integer> indexes = results.get(subSequence);
if (indexes == null) {
indexes = new ArrayList<Integer>(1);
results.put(subSequence, indexes);
}
indexes.add(startIndex);
}
}
}
return results;
}
/**
* Returns a map of the amino acids found on the sequence: aa > indexes.
* If the termination of the protein is reached the terminal character is
* used (see static field)
*
* @param accession the accession of the protein of interest
* @param seeds the indexes where to start looking at
* @param offset the offset between the seed and the target
* @return a map of the amino acids found at seed + offset
* @throws IOException
* @throws IllegalArgumentException
* @throws InterruptedException
*/
private HashMap<Character, ArrayList<Integer>> getAA(String accession, ArrayList<Integer> seeds, int offset)
throws IOException, IllegalArgumentException, InterruptedException, ClassNotFoundException {
String proteinSequence = SequenceFactory.getInstance().getProtein(accession).getSequence();
HashMap<Character, ArrayList<Integer>> result = new HashMap<Character, ArrayList<Integer>>();
for (int startIndex : seeds) {
int tempIndex = startIndex + offset;
if (tempIndex < proteinSequence.length()) {
char aa = proteinSequence.charAt(tempIndex);
ArrayList<Integer> indexes = result.get(aa);
if (indexes == null) {
indexes = new ArrayList<Integer>(0);
result.put(aa, indexes);
}
if (!indexes.contains(startIndex)) {
indexes.add(startIndex);
}
} else if (tempIndex == proteinSequence.length()) {
ArrayList<Integer> indexes = termini.get(accession);
if (indexes == null) {
indexes = new ArrayList<Integer>(0);
termini.put(accession, indexes);
}
if (!indexes.contains(startIndex)) {
indexes.add(startIndex);
}
} else {
throw new IllegalArgumentException("Attempting to index after the protein termini.");
}
}
return result;
}
/**
* Returns the subnode associated to an amino acid sequence.
*
* @param sequence the amino acid sequence
*
* @return the corresponding subnode
*/
public Node getSubNode(String sequence) {
if (sequence.length() <= depth) {
throw new IllegalArgumentException(sequence + " is not subnode of the node (depth=" + depth + ").");
}
char aa = sequence.charAt(depth);
if (depth < sequence.length() - 1) {
return subtree.get(aa).getSubNode(sequence);
} else if (depth == sequence.length() - 1) {
return subtree.get(aa);
} else {
throw new IllegalArgumentException("depth " + depth + " longer than sequence " + sequence + ".");
}
}
}