/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.nbio.structure.cluster; import org.biojava.nbio.alignment.Alignments; import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; import org.biojava.nbio.alignment.SimpleGapPenalty; import org.biojava.nbio.alignment.template.GapPenalty; import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; import org.biojava.nbio.core.exceptions.CompoundNotFoundException; import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; import org.biojava.nbio.structure.Atom; import org.biojava.nbio.structure.StructureException; import org.biojava.nbio.structure.align.StructureAlignment; import org.biojava.nbio.structure.align.StructureAlignmentFactory; import org.biojava.nbio.structure.align.ce.CeMain; import org.biojava.nbio.structure.align.model.AFPChain; import org.biojava.nbio.structure.align.multiple.Block; import org.biojava.nbio.structure.align.multiple.BlockImpl; import org.biojava.nbio.structure.align.multiple.BlockSet; import org.biojava.nbio.structure.align.multiple.BlockSetImpl; import org.biojava.nbio.structure.align.multiple.MultipleAlignment; import org.biojava.nbio.structure.align.multiple.MultipleAlignmentEnsembleImpl; import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl; import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer; import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer; import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits; import org.biojava.nbio.structure.symmetry.internal.CESymmParameters; import org.biojava.nbio.structure.symmetry.internal.CeSymm; import org.biojava.nbio.structure.symmetry.internal.CeSymmResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; import java.util.stream.Collectors; /** * A SubunitCluster contains a set of equivalent {@link QuatSymmetrySubunits}, * the set of equivalent residues (EQR) between {@link Subunit} and a * {@link Subunit} representative. It also stores the method used for * clustering. * <p> * This class allows the comparison and merging of SubunitClusters. * * @author Aleix Lafita * @since 5.0.0 * */ public class SubunitCluster { private static final Logger logger = LoggerFactory .getLogger(SubunitCluster.class); private List<Subunit> subunits = new ArrayList<Subunit>(); private List<List<Integer>> subunitEQR = new ArrayList<List<Integer>>(); private int representative = -1; private SubunitClustererMethod method = SubunitClustererMethod.IDENTITY; /** * A SubunitCluster is always initialized with a single Subunit. To obtain a * SubunitCluster with multiple Subunits, initialize different * SubunitClusters and merge them. * * @param subunit * initial Subunit */ public SubunitCluster(Subunit subunit) { subunits.add(subunit); List<Integer> identity = new ArrayList<Integer>(); for (int i = 0; i < subunit.size(); i++) identity.add(i); subunitEQR.add(identity); representative = 0; } /** * Subunits contained in the SubunitCluster. * * @return an unmodifiable view of the original List */ public List<Subunit> getSubunits() { return Collections.unmodifiableList(subunits); } /** * Tells whether the other SubunitCluster contains exactly the same Subunit. * This is checked by String equality of their residue one-letter sequences. * * @param other * SubunitCluster * @return true if the SubunitClusters are identical, false otherwise */ public boolean isIdenticalTo(SubunitCluster other) { String thisSequence = this.subunits.get(this.representative) .getProteinSequenceString(); String otherSequence = other.subunits.get(other.representative) .getProteinSequenceString(); return thisSequence.equals(otherSequence); } /** * Merges the other SubunitCluster into this one if it contains exactly the * same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}. * * @param other * SubunitCluster * @return true if the SubunitClusters were merged, false otherwise */ public boolean mergeIdentical(SubunitCluster other) { if (!isIdenticalTo(other)) return false; logger.info("SubunitClusters are identical"); this.subunits.addAll(other.subunits); this.subunitEQR.addAll(other.subunitEQR); return true; } /** * Merges the other SubunitCluster into this one if their representatives * sequences are similar (higher sequence identity and coverage than the * thresholds). * <p> * The sequence alignment is performed using Smith Waterman, default linear * {@link SimpleGapPenalty} and BLOSUM62 as scoring matrix. * * @param other * SubunitCluster * @param minSeqid * sequence identity threshold. Value in [0,1]. Values lower than * 0.7 are not recommended. Use {@link #mergeStructure} for lower * values. * @param minCoverage * coverage (alignment fraction) threshold. Value in [0,1]. * @return true if the SubunitClusters were merged, false otherwise * @throws CompoundNotFoundException */ public boolean mergeSequence(SubunitCluster other, double minSeqid, double minCoverage) throws CompoundNotFoundException { return mergeSequence(other, minSeqid, minCoverage, PairwiseSequenceAlignerType.LOCAL, new SimpleGapPenalty(), SubstitutionMatrixHelper.getBlosum62()); } /** * Merges the other SubunitCluster into this one if their representatives * sequences are similar (higher sequence identity and coverage than the * thresholds). * <p> * The sequence alignment is performed using Smith Waterman, default linear * {@link SimpleGapPenalty} and BLOSUM62 as scoring matrix. * * @param other * SubunitCluster * @param minSeqid * sequence identity threshold. Value in [0,1]. Values lower than * 0.7 are not recommended. Use {@link #mergeStructure} for lower * values. * @param minCoverage * coverage (alignment fraction) threshold. Value in [0,1]. * @param alignerType * parameter for the sequence alignment algorithm * @param gapPenalty * parameter for the sequence alignment algorithm * @param subsMatrix * parameter for the sequence alignment algorithm * @return true if the SubunitClusters were merged, false otherwise * @throws CompoundNotFoundException */ public boolean mergeSequence(SubunitCluster other, double minSeqid, double minCoverage, PairwiseSequenceAlignerType alignerType, GapPenalty gapPenalty, SubstitutionMatrix<AminoAcidCompound> subsMatrix) throws CompoundNotFoundException { // Extract the protein sequences as BioJava alignment objects ProteinSequence thisSequence = this.subunits.get(this.representative) .getProteinSequence(); ProteinSequence otherSequence = other.subunits .get(other.representative).getProteinSequence(); // Perform the alignment with provided parameters PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> aligner = Alignments .getPairwiseAligner(thisSequence, otherSequence, alignerType, gapPenalty, subsMatrix); // Calculate real coverage (subtract gaps in both sequences) double gaps1 = aligner.getPair().getAlignedSequence(1) .getNumGapPositions(); double gaps2 = aligner.getPair().getAlignedSequence(2) .getNumGapPositions(); double lengthAlignment = aligner.getPair().getLength(); double lengthThis = aligner.getQuery().getLength(); double lengthOther = aligner.getTarget().getLength(); double coverage = (lengthAlignment - gaps1 - gaps2) / Math.max(lengthThis, lengthOther); if (coverage < minCoverage) return false; double seqid = aligner.getPair().getPercentageOfIdentity(); if (seqid < minSeqid) return false; logger.info(String.format("SubunitClusters are similar in sequence " + "with %.2f sequence identity and %.2f coverage", seqid, coverage)); // If coverage and sequence identity sufficient, merge other and this List<Integer> thisAligned = new ArrayList<Integer>(); List<Integer> otherAligned = new ArrayList<Integer>(); // Extract the aligned residues of both Subunit for (int p = 1; p < aligner.getPair().getLength() + 1; p++) { // Skip gaps in any of the two sequences if (aligner.getPair().getAlignedSequence(1).isGap(p)) continue; if (aligner.getPair().getAlignedSequence(2).isGap(p)) continue; int thisIndex = aligner.getPair().getIndexInQueryAt(p) - 1; int otherIndex = aligner.getPair().getIndexInTargetAt(p) - 1; // Only consider residues that are part of the SubunitCluster if (this.subunitEQR.get(this.representative).contains(thisIndex) && other.subunitEQR.get(other.representative).contains( otherIndex)) { thisAligned.add(thisIndex); otherAligned.add(otherIndex); } } // Do a List intersection to find out which EQR columns to remove List<Integer> thisRemove = new ArrayList<Integer>(); List<Integer> otherRemove = new ArrayList<Integer>(); for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) { // If the index is aligned do nothing, otherwise mark as removing if (!thisAligned.contains(this.subunitEQR.get(this.representative) .get(t))) thisRemove.add(t); } for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) { // If the index is aligned do nothing, otherwise mark as removing if (!otherAligned.contains(other.subunitEQR.get( other.representative).get(t))) otherRemove.add(t); } // Now remove unaligned columns, from end to start Collections.sort(thisRemove); Collections.reverse(thisRemove); Collections.sort(otherRemove); Collections.reverse(otherRemove); for (int t = 0; t < thisRemove.size(); t++) { for (List<Integer> eqr : this.subunitEQR) { int column = thisRemove.get(t); eqr.remove(column); } } for (int t = 0; t < otherRemove.size(); t++) { for (List<Integer> eqr : other.subunitEQR) { int column = otherRemove.get(t); eqr.remove(column); } } // The representative is the longest sequence if (this.subunits.get(this.representative).size() < other.subunits.get( other.representative).size()) this.representative = other.representative + subunits.size(); this.subunits.addAll(other.subunits); this.subunitEQR.addAll(other.subunitEQR); this.method = SubunitClustererMethod.SEQUENCE; return true; } /** * Merges the other SubunitCluster into this one if their representative * Atoms are structurally similar (lower RMSD and higher coverage than the * thresholds). * <p> * The structure alignment is performed using FatCatRigid, with default * parameters. * * @param other * SubunitCluster * @param maxRmsd * RMSD threshold. * @param minCoverage * coverage (alignment fraction) threshold. Value in [0,1]. * @return true if the SubunitClusters were merged, false otherwise * @throws StructureException */ public boolean mergeStructure(SubunitCluster other, double maxRmsd, double minCoverage) throws StructureException { // Use a CE alignment with default parameters StructureAlignment algorithm = StructureAlignmentFactory .getAlgorithm(CeMain.algorithmName); return mergeStructure(other, maxRmsd, minCoverage, algorithm); } /** * Merges the other SubunitCluster into this one if their representative * Atoms are structurally similar (lower RMSD and higher coverage than the * thresholds). * <p> * The structure alignment is performed using FatCatRigid, with default * parameters. * * @param other * SubunitCluster * @param maxRmsd * RMSD threshold. * @param minCoverage * coverage (alignment fraction) threshold. Value in [0,1]. * @param aligner * StructureAlignment algorithm * @return true if the SubunitClusters were merged, false otherwise * @throws StructureException */ public boolean mergeStructure(SubunitCluster other, double maxRmsd, double minCoverage, StructureAlignment aligner) throws StructureException { AFPChain afp = aligner.align(this.subunits.get(this.representative) .getRepresentativeAtoms(), other.subunits.get(other.representative) .getRepresentativeAtoms()); // Convert AFPChain to MultipleAlignment for convinience MultipleAlignment msa = new MultipleAlignmentEnsembleImpl( afp, this.subunits.get(this.representative).getRepresentativeAtoms(), other.subunits.get(other.representative) .getRepresentativeAtoms(), false) .getMultipleAlignment(0); double coverage = Math.min(msa.getCoverages().get(0), msa .getCoverages().get(1)); if (coverage < minCoverage) return false; double rmsd = afp.getTotalRmsdOpt(); if (rmsd > maxRmsd) return false; String.format("SubunitClusters are structurally similar with " + "%.2f RMSD %.2f coverage", rmsd, coverage); // If RMSD is low and coverage sufficient merge clusters List<List<Integer>> alignedRes = msa.getBlock(0).getAlignRes(); List<Integer> thisAligned = new ArrayList<Integer>(); List<Integer> otherAligned = new ArrayList<Integer>(); // Extract the aligned residues of both Subunit for (int p = 0; p < msa.length(); p++) { // Skip gaps in any of the two sequences if (alignedRes.get(0).get(p) == null) continue; if (alignedRes.get(1).get(p) == null) continue; int thisIndex = alignedRes.get(0).get(p); int otherIndex = alignedRes.get(1).get(p); // Only consider residues that are part of the SubunitCluster if (this.subunitEQR.get(this.representative).contains(thisIndex) && other.subunitEQR.get(other.representative).contains( otherIndex)) { thisAligned.add(thisIndex); otherAligned.add(otherIndex); } } // Do a List intersection to find out which EQR columns to remove List<Integer> thisRemove = new ArrayList<Integer>(); List<Integer> otherRemove = new ArrayList<Integer>(); for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) { // If the index is aligned do nothing, otherwise mark as removing if (!thisAligned.contains(this.subunitEQR.get(this.representative) .get(t))) thisRemove.add(t); } for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) { // If the index is aligned do nothing, otherwise mark as removing if (!otherAligned.contains(other.subunitEQR.get( other.representative).get(t))) otherRemove.add(t); } // Now remove unaligned columns, from end to start Collections.sort(thisRemove); Collections.reverse(thisRemove); Collections.sort(otherRemove); Collections.reverse(otherRemove); for (int t = 0; t < thisRemove.size(); t++) { for (List<Integer> eqr : this.subunitEQR) { int column = thisRemove.get(t); eqr.remove(column); } } for (int t = 0; t < otherRemove.size(); t++) { for (List<Integer> eqr : other.subunitEQR) { int column = otherRemove.get(t); eqr.remove(column); } } // The representative is the longest sequence if (this.subunits.get(this.representative).size() < other.subunits.get( other.representative).size()) this.representative = other.representative + subunits.size(); this.subunits.addAll(other.subunits); this.subunitEQR.addAll(other.subunitEQR); this.method = SubunitClustererMethod.STRUCTURE; return true; } /** * Analyze the internal symmetry of the SubunitCluster and divide its * {@link Subunit} into the internal repeats (domains) if they are * internally symmetric. * * @param coverageThreshold * the minimum coverage of all repeats in the Subunit * @param rmsdThreshold * the maximum allowed RMSD between the repeats * @param minSequenceLength * the minimum length of the repeating units * @return true if the cluster was internally symmetric, false otherwise * @throws StructureException */ public boolean divideInternally(double coverageThreshold, double rmsdThreshold, int minSequenceLength) throws StructureException { CESymmParameters params = new CESymmParameters(); params.setMinCoreLength(minSequenceLength); params.setGaps(false); // We want no gaps between the repeats // Analyze the internal symmetry of the representative subunit CeSymmResult result = CeSymm.analyze(subunits.get(representative) .getRepresentativeAtoms(), params); if (!result.isSignificant()) return false; double rmsd = result.getMultipleAlignment().getScore( MultipleAlignmentScorer.RMSD); if (rmsd > rmsdThreshold) return false; double coverage = result.getMultipleAlignment().getCoverages().get(0) * result.getNumRepeats(); if (coverage < coverageThreshold) return false; logger.info("SubunitCluster is internally symmetric with {} repeats, " + "{} RMSD and {} coverage", result.getNumRepeats(), rmsd, coverage); // Divide if symmety was significant with RMSD and coverage sufficient List<List<Integer>> alignedRes = result.getMultipleAlignment() .getBlock(0).getAlignRes(); List<List<Integer>> columns = new ArrayList<List<Integer>>(); for (int s = 0; s < alignedRes.size(); s++) columns.add(new ArrayList<Integer>(alignedRes.get(s).size())); // Extract the aligned columns of each repeat in the Subunit for (int col = 0; col < alignedRes.get(0).size(); col++) { // Check that all aligned residues are part of the Cluster boolean missing = false; for (int s = 0; s < alignedRes.size(); s++) { if (!subunitEQR.get(representative).contains( alignedRes.get(s).get(col))) { missing = true; break; } } // Skip the column if any residue was not part of the cluster if (missing) continue; for (int s = 0; s < alignedRes.size(); s++) { columns.get(s).add( subunitEQR.get(representative).indexOf( alignedRes.get(s).get(col))); } } // Divide the Subunits in their repeats List<Subunit> newSubunits = new ArrayList<Subunit>(subunits.size() * columns.size()); List<List<Integer>> newSubunitEQR = new ArrayList<List<Integer>>( subunits.size() * columns.size()); for (int s = 0; s < subunits.size(); s++) { for (int r = 0; r < columns.size(); r++) { // Calculate start and end residues of the new Subunit int start = subunitEQR.get(s).get(columns.get(r).get(0)); int end = subunitEQR.get(s).get( columns.get(r).get(columns.get(r).size() - 1)); Atom[] reprAtoms = Arrays.copyOfRange(subunits.get(s) .getRepresentativeAtoms(), start, end + 1); newSubunits.add(new Subunit(reprAtoms, subunits.get(s) .getName(), subunits.get(s).getIdentifier(), subunits .get(s).getStructure())); // Recalculate equivalent residues List<Integer> eqr = new ArrayList<Integer>(); for (int p = 0; p < columns.get(r).size(); p++) { eqr.add(subunitEQR.get(s).get(columns.get(r).get(p)) - start); } newSubunitEQR.add(eqr); } } subunits = newSubunits; subunitEQR = newSubunitEQR; // Update representative for (int s = 0; s < subunits.size(); s++) { if (subunits.get(s).size() > subunits.get(representative).size()) representative = s; } method = SubunitClustererMethod.STRUCTURE; return true; } /** * @return the number of Subunits in the cluster */ public int size() { return subunits.size(); } /** * @return the number of aligned residues between Subunits of the cluster */ public int length() { return subunitEQR.get(representative).size(); } /** * @return the {@link SubunitClustererMethod} used for clustering the * Subunits */ public SubunitClustererMethod getClustererMethod() { return method; } /** * @return A List of size {@link #size()} of Atom arrays of length * {@link #length()} with the aligned Atoms for each Subunit in the * cluster */ public List<Atom[]> getAlignedAtomsSubunits() { List<Atom[]> alignedAtoms = Collections.emptyList(); // Loop through all subunits and add the aligned positions for (int s = 0; s < subunits.size(); s++) alignedAtoms.add(getAlignedAtomsSubunit(s)); return alignedAtoms; } /** * @param index * Subunit index in the Cluster * @return An Atom array of length {@link #length()} with the aligned Atoms * from the selected Subunit in the Cluster */ public Atom[] getAlignedAtomsSubunit(int index) { Atom[] aligned = new Atom[subunitEQR.get(index).size()]; // Add only the aligned positions of the Subunit in the Cluster for (int p = 0; p < subunitEQR.get(index).size(); p++) { aligned[p] = subunits.get(index).getRepresentativeAtoms()[subunitEQR .get(index).get(p)]; } return aligned; } /** * The multiple alignment is calculated from the equivalent residues in the * SubunitCluster. The alignment is recalculated every time the method is * called (no caching). * * @return MultipleAlignment representation of the aligned residues in this * Subunit Cluster * @throws StructureException */ public MultipleAlignment getMultipleAlignment() throws StructureException { // Create a multiple alignment with the atom arrays of the Subunits MultipleAlignment msa = new MultipleAlignmentImpl(); msa.setEnsemble(new MultipleAlignmentEnsembleImpl()); msa.getEnsemble().setAtomArrays( subunits.stream().map(s -> s.getRepresentativeAtoms()) .collect(Collectors.toList())); // Fill in the alignment information BlockSet bs = new BlockSetImpl(msa); Block b = new BlockImpl(bs); b.setAlignRes(subunitEQR); // Fill in the transformation matrices new ReferenceSuperimposer(representative).superimpose(msa); // Calculate some scores MultipleAlignmentScorer.calculateScores(msa); return msa; } @Override public String toString() { return "SubunitCluster [Size=" + size() + ", Length=" + length() + ", Representative=" + representative + ", Method=" + method + "]"; } }