/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure.cluster;
import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.alignment.template.GapPenalty;
import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.StructureAlignment;
import org.biojava.nbio.structure.align.StructureAlignmentFactory;
import org.biojava.nbio.structure.align.ce.CeMain;
import org.biojava.nbio.structure.align.model.AFPChain;
import org.biojava.nbio.structure.align.multiple.Block;
import org.biojava.nbio.structure.align.multiple.BlockImpl;
import org.biojava.nbio.structure.align.multiple.BlockSet;
import org.biojava.nbio.structure.align.multiple.BlockSetImpl;
import org.biojava.nbio.structure.align.multiple.MultipleAlignment;
import org.biojava.nbio.structure.align.multiple.MultipleAlignmentEnsembleImpl;
import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl;
import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer;
import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer;
import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits;
import org.biojava.nbio.structure.symmetry.internal.CESymmParameters;
import org.biojava.nbio.structure.symmetry.internal.CeSymm;
import org.biojava.nbio.structure.symmetry.internal.CeSymmResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.stream.Collectors;
/**
* A SubunitCluster contains a set of equivalent {@link QuatSymmetrySubunits},
* the set of equivalent residues (EQR) between {@link Subunit} and a
* {@link Subunit} representative. It also stores the method used for
* clustering.
* <p>
* This class allows the comparison and merging of SubunitClusters.
*
* @author Aleix Lafita
* @since 5.0.0
*
*/
public class SubunitCluster {
private static final Logger logger = LoggerFactory
.getLogger(SubunitCluster.class);
private List<Subunit> subunits = new ArrayList<Subunit>();
private List<List<Integer>> subunitEQR = new ArrayList<List<Integer>>();
private int representative = -1;
private SubunitClustererMethod method = SubunitClustererMethod.IDENTITY;
/**
* A SubunitCluster is always initialized with a single Subunit. To obtain a
* SubunitCluster with multiple Subunits, initialize different
* SubunitClusters and merge them.
*
* @param subunit
* initial Subunit
*/
public SubunitCluster(Subunit subunit) {
subunits.add(subunit);
List<Integer> identity = new ArrayList<Integer>();
for (int i = 0; i < subunit.size(); i++)
identity.add(i);
subunitEQR.add(identity);
representative = 0;
}
/**
* Subunits contained in the SubunitCluster.
*
* @return an unmodifiable view of the original List
*/
public List<Subunit> getSubunits() {
return Collections.unmodifiableList(subunits);
}
/**
* Tells whether the other SubunitCluster contains exactly the same Subunit.
* This is checked by String equality of their residue one-letter sequences.
*
* @param other
* SubunitCluster
* @return true if the SubunitClusters are identical, false otherwise
*/
public boolean isIdenticalTo(SubunitCluster other) {
String thisSequence = this.subunits.get(this.representative)
.getProteinSequenceString();
String otherSequence = other.subunits.get(other.representative)
.getProteinSequenceString();
return thisSequence.equals(otherSequence);
}
/**
* Merges the other SubunitCluster into this one if it contains exactly the
* same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}.
*
* @param other
* SubunitCluster
* @return true if the SubunitClusters were merged, false otherwise
*/
public boolean mergeIdentical(SubunitCluster other) {
if (!isIdenticalTo(other))
return false;
logger.info("SubunitClusters are identical");
this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);
return true;
}
/**
* Merges the other SubunitCluster into this one if their representatives
* sequences are similar (higher sequence identity and coverage than the
* thresholds).
* <p>
* The sequence alignment is performed using Smith Waterman, default linear
* {@link SimpleGapPenalty} and BLOSUM62 as scoring matrix.
*
* @param other
* SubunitCluster
* @param minSeqid
* sequence identity threshold. Value in [0,1]. Values lower than
* 0.7 are not recommended. Use {@link #mergeStructure} for lower
* values.
* @param minCoverage
* coverage (alignment fraction) threshold. Value in [0,1].
* @return true if the SubunitClusters were merged, false otherwise
* @throws CompoundNotFoundException
*/
public boolean mergeSequence(SubunitCluster other, double minSeqid,
double minCoverage) throws CompoundNotFoundException {
return mergeSequence(other, minSeqid, minCoverage,
PairwiseSequenceAlignerType.LOCAL, new SimpleGapPenalty(),
SubstitutionMatrixHelper.getBlosum62());
}
/**
* Merges the other SubunitCluster into this one if their representatives
* sequences are similar (higher sequence identity and coverage than the
* thresholds).
* <p>
* The sequence alignment is performed using Smith Waterman, default linear
* {@link SimpleGapPenalty} and BLOSUM62 as scoring matrix.
*
* @param other
* SubunitCluster
* @param minSeqid
* sequence identity threshold. Value in [0,1]. Values lower than
* 0.7 are not recommended. Use {@link #mergeStructure} for lower
* values.
* @param minCoverage
* coverage (alignment fraction) threshold. Value in [0,1].
* @param alignerType
* parameter for the sequence alignment algorithm
* @param gapPenalty
* parameter for the sequence alignment algorithm
* @param subsMatrix
* parameter for the sequence alignment algorithm
* @return true if the SubunitClusters were merged, false otherwise
* @throws CompoundNotFoundException
*/
public boolean mergeSequence(SubunitCluster other, double minSeqid,
double minCoverage, PairwiseSequenceAlignerType alignerType,
GapPenalty gapPenalty,
SubstitutionMatrix<AminoAcidCompound> subsMatrix)
throws CompoundNotFoundException {
// Extract the protein sequences as BioJava alignment objects
ProteinSequence thisSequence = this.subunits.get(this.representative)
.getProteinSequence();
ProteinSequence otherSequence = other.subunits
.get(other.representative).getProteinSequence();
// Perform the alignment with provided parameters
PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> aligner = Alignments
.getPairwiseAligner(thisSequence, otherSequence, alignerType,
gapPenalty, subsMatrix);
// Calculate real coverage (subtract gaps in both sequences)
double gaps1 = aligner.getPair().getAlignedSequence(1)
.getNumGapPositions();
double gaps2 = aligner.getPair().getAlignedSequence(2)
.getNumGapPositions();
double lengthAlignment = aligner.getPair().getLength();
double lengthThis = aligner.getQuery().getLength();
double lengthOther = aligner.getTarget().getLength();
double coverage = (lengthAlignment - gaps1 - gaps2)
/ Math.max(lengthThis, lengthOther);
if (coverage < minCoverage)
return false;
double seqid = aligner.getPair().getPercentageOfIdentity();
if (seqid < minSeqid)
return false;
logger.info(String.format("SubunitClusters are similar in sequence "
+ "with %.2f sequence identity and %.2f coverage", seqid,
coverage));
// If coverage and sequence identity sufficient, merge other and this
List<Integer> thisAligned = new ArrayList<Integer>();
List<Integer> otherAligned = new ArrayList<Integer>();
// Extract the aligned residues of both Subunit
for (int p = 1; p < aligner.getPair().getLength() + 1; p++) {
// Skip gaps in any of the two sequences
if (aligner.getPair().getAlignedSequence(1).isGap(p))
continue;
if (aligner.getPair().getAlignedSequence(2).isGap(p))
continue;
int thisIndex = aligner.getPair().getIndexInQueryAt(p) - 1;
int otherIndex = aligner.getPair().getIndexInTargetAt(p) - 1;
// Only consider residues that are part of the SubunitCluster
if (this.subunitEQR.get(this.representative).contains(thisIndex)
&& other.subunitEQR.get(other.representative).contains(
otherIndex)) {
thisAligned.add(thisIndex);
otherAligned.add(otherIndex);
}
}
// Do a List intersection to find out which EQR columns to remove
List<Integer> thisRemove = new ArrayList<Integer>();
List<Integer> otherRemove = new ArrayList<Integer>();
for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!thisAligned.contains(this.subunitEQR.get(this.representative)
.get(t)))
thisRemove.add(t);
}
for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!otherAligned.contains(other.subunitEQR.get(
other.representative).get(t)))
otherRemove.add(t);
}
// Now remove unaligned columns, from end to start
Collections.sort(thisRemove);
Collections.reverse(thisRemove);
Collections.sort(otherRemove);
Collections.reverse(otherRemove);
for (int t = 0; t < thisRemove.size(); t++) {
for (List<Integer> eqr : this.subunitEQR) {
int column = thisRemove.get(t);
eqr.remove(column);
}
}
for (int t = 0; t < otherRemove.size(); t++) {
for (List<Integer> eqr : other.subunitEQR) {
int column = otherRemove.get(t);
eqr.remove(column);
}
}
// The representative is the longest sequence
if (this.subunits.get(this.representative).size() < other.subunits.get(
other.representative).size())
this.representative = other.representative + subunits.size();
this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);
this.method = SubunitClustererMethod.SEQUENCE;
return true;
}
/**
* Merges the other SubunitCluster into this one if their representative
* Atoms are structurally similar (lower RMSD and higher coverage than the
* thresholds).
* <p>
* The structure alignment is performed using FatCatRigid, with default
* parameters.
*
* @param other
* SubunitCluster
* @param maxRmsd
* RMSD threshold.
* @param minCoverage
* coverage (alignment fraction) threshold. Value in [0,1].
* @return true if the SubunitClusters were merged, false otherwise
* @throws StructureException
*/
public boolean mergeStructure(SubunitCluster other, double maxRmsd,
double minCoverage) throws StructureException {
// Use a CE alignment with default parameters
StructureAlignment algorithm = StructureAlignmentFactory
.getAlgorithm(CeMain.algorithmName);
return mergeStructure(other, maxRmsd, minCoverage, algorithm);
}
/**
* Merges the other SubunitCluster into this one if their representative
* Atoms are structurally similar (lower RMSD and higher coverage than the
* thresholds).
* <p>
* The structure alignment is performed using FatCatRigid, with default
* parameters.
*
* @param other
* SubunitCluster
* @param maxRmsd
* RMSD threshold.
* @param minCoverage
* coverage (alignment fraction) threshold. Value in [0,1].
* @param aligner
* StructureAlignment algorithm
* @return true if the SubunitClusters were merged, false otherwise
* @throws StructureException
*/
public boolean mergeStructure(SubunitCluster other, double maxRmsd,
double minCoverage, StructureAlignment aligner)
throws StructureException {
AFPChain afp = aligner.align(this.subunits.get(this.representative)
.getRepresentativeAtoms(),
other.subunits.get(other.representative)
.getRepresentativeAtoms());
// Convert AFPChain to MultipleAlignment for convinience
MultipleAlignment msa = new MultipleAlignmentEnsembleImpl(
afp,
this.subunits.get(this.representative).getRepresentativeAtoms(),
other.subunits.get(other.representative)
.getRepresentativeAtoms(), false)
.getMultipleAlignment(0);
double coverage = Math.min(msa.getCoverages().get(0), msa
.getCoverages().get(1));
if (coverage < minCoverage)
return false;
double rmsd = afp.getTotalRmsdOpt();
if (rmsd > maxRmsd)
return false;
String.format("SubunitClusters are structurally similar with "
+ "%.2f RMSD %.2f coverage", rmsd, coverage);
// If RMSD is low and coverage sufficient merge clusters
List<List<Integer>> alignedRes = msa.getBlock(0).getAlignRes();
List<Integer> thisAligned = new ArrayList<Integer>();
List<Integer> otherAligned = new ArrayList<Integer>();
// Extract the aligned residues of both Subunit
for (int p = 0; p < msa.length(); p++) {
// Skip gaps in any of the two sequences
if (alignedRes.get(0).get(p) == null)
continue;
if (alignedRes.get(1).get(p) == null)
continue;
int thisIndex = alignedRes.get(0).get(p);
int otherIndex = alignedRes.get(1).get(p);
// Only consider residues that are part of the SubunitCluster
if (this.subunitEQR.get(this.representative).contains(thisIndex)
&& other.subunitEQR.get(other.representative).contains(
otherIndex)) {
thisAligned.add(thisIndex);
otherAligned.add(otherIndex);
}
}
// Do a List intersection to find out which EQR columns to remove
List<Integer> thisRemove = new ArrayList<Integer>();
List<Integer> otherRemove = new ArrayList<Integer>();
for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!thisAligned.contains(this.subunitEQR.get(this.representative)
.get(t)))
thisRemove.add(t);
}
for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) {
// If the index is aligned do nothing, otherwise mark as removing
if (!otherAligned.contains(other.subunitEQR.get(
other.representative).get(t)))
otherRemove.add(t);
}
// Now remove unaligned columns, from end to start
Collections.sort(thisRemove);
Collections.reverse(thisRemove);
Collections.sort(otherRemove);
Collections.reverse(otherRemove);
for (int t = 0; t < thisRemove.size(); t++) {
for (List<Integer> eqr : this.subunitEQR) {
int column = thisRemove.get(t);
eqr.remove(column);
}
}
for (int t = 0; t < otherRemove.size(); t++) {
for (List<Integer> eqr : other.subunitEQR) {
int column = otherRemove.get(t);
eqr.remove(column);
}
}
// The representative is the longest sequence
if (this.subunits.get(this.representative).size() < other.subunits.get(
other.representative).size())
this.representative = other.representative + subunits.size();
this.subunits.addAll(other.subunits);
this.subunitEQR.addAll(other.subunitEQR);
this.method = SubunitClustererMethod.STRUCTURE;
return true;
}
/**
* Analyze the internal symmetry of the SubunitCluster and divide its
* {@link Subunit} into the internal repeats (domains) if they are
* internally symmetric.
*
* @param coverageThreshold
* the minimum coverage of all repeats in the Subunit
* @param rmsdThreshold
* the maximum allowed RMSD between the repeats
* @param minSequenceLength
* the minimum length of the repeating units
* @return true if the cluster was internally symmetric, false otherwise
* @throws StructureException
*/
public boolean divideInternally(double coverageThreshold,
double rmsdThreshold, int minSequenceLength)
throws StructureException {
CESymmParameters params = new CESymmParameters();
params.setMinCoreLength(minSequenceLength);
params.setGaps(false); // We want no gaps between the repeats
// Analyze the internal symmetry of the representative subunit
CeSymmResult result = CeSymm.analyze(subunits.get(representative)
.getRepresentativeAtoms(), params);
if (!result.isSignificant())
return false;
double rmsd = result.getMultipleAlignment().getScore(
MultipleAlignmentScorer.RMSD);
if (rmsd > rmsdThreshold)
return false;
double coverage = result.getMultipleAlignment().getCoverages().get(0)
* result.getNumRepeats();
if (coverage < coverageThreshold)
return false;
logger.info("SubunitCluster is internally symmetric with {} repeats, "
+ "{} RMSD and {} coverage", result.getNumRepeats(), rmsd,
coverage);
// Divide if symmety was significant with RMSD and coverage sufficient
List<List<Integer>> alignedRes = result.getMultipleAlignment()
.getBlock(0).getAlignRes();
List<List<Integer>> columns = new ArrayList<List<Integer>>();
for (int s = 0; s < alignedRes.size(); s++)
columns.add(new ArrayList<Integer>(alignedRes.get(s).size()));
// Extract the aligned columns of each repeat in the Subunit
for (int col = 0; col < alignedRes.get(0).size(); col++) {
// Check that all aligned residues are part of the Cluster
boolean missing = false;
for (int s = 0; s < alignedRes.size(); s++) {
if (!subunitEQR.get(representative).contains(
alignedRes.get(s).get(col))) {
missing = true;
break;
}
}
// Skip the column if any residue was not part of the cluster
if (missing)
continue;
for (int s = 0; s < alignedRes.size(); s++) {
columns.get(s).add(
subunitEQR.get(representative).indexOf(
alignedRes.get(s).get(col)));
}
}
// Divide the Subunits in their repeats
List<Subunit> newSubunits = new ArrayList<Subunit>(subunits.size()
* columns.size());
List<List<Integer>> newSubunitEQR = new ArrayList<List<Integer>>(
subunits.size() * columns.size());
for (int s = 0; s < subunits.size(); s++) {
for (int r = 0; r < columns.size(); r++) {
// Calculate start and end residues of the new Subunit
int start = subunitEQR.get(s).get(columns.get(r).get(0));
int end = subunitEQR.get(s).get(
columns.get(r).get(columns.get(r).size() - 1));
Atom[] reprAtoms = Arrays.copyOfRange(subunits.get(s)
.getRepresentativeAtoms(), start, end + 1);
newSubunits.add(new Subunit(reprAtoms, subunits.get(s)
.getName(), subunits.get(s).getIdentifier(), subunits
.get(s).getStructure()));
// Recalculate equivalent residues
List<Integer> eqr = new ArrayList<Integer>();
for (int p = 0; p < columns.get(r).size(); p++) {
eqr.add(subunitEQR.get(s).get(columns.get(r).get(p))
- start);
}
newSubunitEQR.add(eqr);
}
}
subunits = newSubunits;
subunitEQR = newSubunitEQR;
// Update representative
for (int s = 0; s < subunits.size(); s++) {
if (subunits.get(s).size() > subunits.get(representative).size())
representative = s;
}
method = SubunitClustererMethod.STRUCTURE;
return true;
}
/**
* @return the number of Subunits in the cluster
*/
public int size() {
return subunits.size();
}
/**
* @return the number of aligned residues between Subunits of the cluster
*/
public int length() {
return subunitEQR.get(representative).size();
}
/**
* @return the {@link SubunitClustererMethod} used for clustering the
* Subunits
*/
public SubunitClustererMethod getClustererMethod() {
return method;
}
/**
* @return A List of size {@link #size()} of Atom arrays of length
* {@link #length()} with the aligned Atoms for each Subunit in the
* cluster
*/
public List<Atom[]> getAlignedAtomsSubunits() {
List<Atom[]> alignedAtoms = Collections.emptyList();
// Loop through all subunits and add the aligned positions
for (int s = 0; s < subunits.size(); s++)
alignedAtoms.add(getAlignedAtomsSubunit(s));
return alignedAtoms;
}
/**
* @param index
* Subunit index in the Cluster
* @return An Atom array of length {@link #length()} with the aligned Atoms
* from the selected Subunit in the Cluster
*/
public Atom[] getAlignedAtomsSubunit(int index) {
Atom[] aligned = new Atom[subunitEQR.get(index).size()];
// Add only the aligned positions of the Subunit in the Cluster
for (int p = 0; p < subunitEQR.get(index).size(); p++) {
aligned[p] = subunits.get(index).getRepresentativeAtoms()[subunitEQR
.get(index).get(p)];
}
return aligned;
}
/**
* The multiple alignment is calculated from the equivalent residues in the
* SubunitCluster. The alignment is recalculated every time the method is
* called (no caching).
*
* @return MultipleAlignment representation of the aligned residues in this
* Subunit Cluster
* @throws StructureException
*/
public MultipleAlignment getMultipleAlignment() throws StructureException {
// Create a multiple alignment with the atom arrays of the Subunits
MultipleAlignment msa = new MultipleAlignmentImpl();
msa.setEnsemble(new MultipleAlignmentEnsembleImpl());
msa.getEnsemble().setAtomArrays(
subunits.stream().map(s -> s.getRepresentativeAtoms())
.collect(Collectors.toList()));
// Fill in the alignment information
BlockSet bs = new BlockSetImpl(msa);
Block b = new BlockImpl(bs);
b.setAlignRes(subunitEQR);
// Fill in the transformation matrices
new ReferenceSuperimposer(representative).superimpose(msa);
// Calculate some scores
MultipleAlignmentScorer.calculateScores(msa);
return msa;
}
@Override
public String toString() {
return "SubunitCluster [Size=" + size() + ", Length=" + length()
+ ", Representative=" + representative + ", Method=" + method
+ "]";
}
}