/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.nbio.structure.io; import org.biojava.nbio.alignment.Alignments; import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; import org.biojava.nbio.alignment.SimpleGapPenalty; import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; import org.biojava.nbio.alignment.template.GapPenalty; import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; import org.biojava.nbio.core.alignment.template.SequencePair; import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; import org.biojava.nbio.core.exceptions.CompoundNotFoundException; import org.biojava.nbio.core.sequence.DNASequence; import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.RNASequence; import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; import org.biojava.nbio.core.sequence.compound.NucleotideCompound; import org.biojava.nbio.structure.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; /** * Heuristical finding of Entities (called Compounds in legacy PDB format) * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains * in the Structure. * * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where * possibly no SEQRES is given. * * @author Jose Duarte */ public class EntityFinder { private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class); /** * Above this ratio of mismatching residue types for same residue numbers we * consider the 2 chains to have mismatching residue numbers and warn about it */ public static final double RATIO_GAPS_FOR_MISMATCH = 0.50; /** * Identity value for 2 chains to be considered part of same entity */ public static final double IDENTITY_THRESHOLD = 0.99999; /** * Gap coverage value (num gaps over length of sequence) for each chain of the match: * 2 chains with more gap coverage than this value will not be considered part of the same entity */ public static final double GAP_COVERAGE_THRESHOLD = 0.3; /** * Utility method that employs some heuristics to find the {@link EntityInfo}s * for the polymeric chains given in constructor. * To be used in case the information is missing in PDB/mmCIF file * @return */ public static List<EntityInfo> findPolyEntities(List<List<Chain>> polyModels) { TreeMap<String,EntityInfo> chainIds2entities = findEntitiesFromAlignment(polyModels); List<EntityInfo> entities = findUniqueEntities(chainIds2entities); return entities; } /** * Utility method to obtain a list of unique entities from the chainIds2entities map * @return */ private static List<EntityInfo> findUniqueEntities(TreeMap<String,EntityInfo> chainIds2entities) { List<EntityInfo> list = new ArrayList<EntityInfo>(); for (EntityInfo cluster:chainIds2entities.values()) { boolean present = false; for (EntityInfo cl:list) { if (cl==cluster) { present = true; break; } } if (!present) list.add(cluster); } return list; } /** * Given all chains of all models find entities for the nonpolymers and water chains within them, * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List. * @param nonPolyModels * @param waterModels * @param entities */ public static void createPurelyNonPolyEntities(List<List<Chain>> nonPolyModels, List<List<Chain>> waterModels, List<EntityInfo> entities) { if (nonPolyModels.isEmpty()) return; // let's find first the max entity id to assign entity ids to the newly found entities int maxMolId = 0; if (!entities.isEmpty()) { maxMolId = Collections.max(entities, new Comparator<EntityInfo>() { @Override public int compare(EntityInfo o1, EntityInfo o2) { return new Integer(o1.getMolId()).compareTo(o2.getMolId()); } }).getMolId(); } // we go one over the max int molId = maxMolId + 1; if (!nonPolyModels.get(0).isEmpty()) { List<EntityInfo> nonPolyEntities = new ArrayList<>(); for (List<Chain> model:nonPolyModels) { for (Chain c: model) { // we assume there's only 1 group per non-poly chain String molecPdbName = c.getAtomGroup(0).getPDBName(); EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities); if (nonPolyEntity == null) { nonPolyEntity = new EntityInfo(); nonPolyEntity.setDescription(molecPdbName); nonPolyEntity.setType(EntityType.NONPOLYMER); nonPolyEntity.setMolId(molId++); nonPolyEntities.add(nonPolyEntity); } nonPolyEntity.addChain(c); c.setEntityInfo(nonPolyEntity); } } entities.addAll(nonPolyEntities); } if (!waterModels.get(0).isEmpty()) { EntityInfo waterEntity = new EntityInfo(); waterEntity.setType(EntityType.WATER); waterEntity.setDescription("water"); waterEntity.setMolId(molId); for (List<Chain> model:waterModels) { for (Chain waterChain:model) { waterEntity.addChain(waterChain); waterChain.setEntityInfo(waterEntity); } } entities.add(waterEntity); } } private static EntityInfo findNonPolyEntityWithDescription(String description, List<EntityInfo> nonPolyEntities) { for (EntityInfo e:nonPolyEntities) { if (e.getDescription().equals(description)) return e; } return null; } private static boolean areResNumbersAligned(Chain c1, Chain c2) { boolean isC1prot = StructureTools.isProtein(c1); boolean isC2prot = StructureTools.isProtein(c2); // different kind of chain: we won't try to align them if (isC1prot != isC2prot ) return false; List<Group> c1AtomGroups = null; if (isC1prot) { c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID); } else { c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE); } int countNonExisting = 0; for (Group g1:c1AtomGroups) { try { Group g2 = c2.getGroupByPDB(g1.getResidueNumber()); if (!g2.getPDBName().equals(g1.getPDBName())) { logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}", c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName()); return false; } } catch (StructureException e) { // the group doesn't exist (no density) in the chain, go on countNonExisting++; continue; } } if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) { logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.", RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId()); return false; } return true; } private static TreeMap<String,EntityInfo> findEntitiesFromAlignment(List<List<Chain>> polyModels) { TreeMap<String, EntityInfo> chainIds2entities = new TreeMap<String,EntityInfo>(); if (polyModels.isEmpty()) return chainIds2entities; Set<Integer> polyChainIndices = new TreeSet<Integer>(); for (int i=0;i<polyModels.get(0).size();i++) { polyChainIndices.add(i); } int molId = 1; outer: for (int i:polyChainIndices) { for (int j:polyChainIndices) { if (j<=i) continue; Chain c1 = polyModels.get(0).get(i); Chain c2 = polyModels.get(0).get(j); Map<Integer,Integer> positionIndex1 = new HashMap<Integer, Integer>(); Map<Integer,Integer> positionIndex2 = new HashMap<Integer, Integer>(); // here we use false, which means that X will be used for unknown compounds String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false); String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false); int seq1Length = 0; int seq2Length = 0; SequencePair<?,?> pair = null; if (isProteinSequence(str1) && isProteinSequence(str2)) { ProteinSequence s1 = getProteinSequence(str1); ProteinSequence s2 = getProteinSequence(str2); seq1Length = s1.getLength(); seq2Length = s2.getLength(); pair = alignProtein(s1,s2); } else if (isDNASequence(str1) && isDNASequence(str2)) { DNASequence s1 = getDNASequence(str1); DNASequence s2 = getDNASequence(str2); seq1Length = s1.getLength(); seq2Length = s2.getLength(); pair = alignDNA(s1,s2); } else if (isRNASequence(str1) && isRNASequence(str2)) { RNASequence s1 = getRNASequence(str1); RNASequence s2 = getRNASequence(str2); seq1Length = s1.getLength(); seq2Length = s2.getLength(); pair = alignRNA(s1,s2); } else { logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers"); continue; } int numGaps = getNumGaps(pair); int numGaps1 = getNumGapsQuery(pair); int numGaps2 = getNumGapsTarget(pair); int nonGaps = pair.getLength() - numGaps; double identity = (double)pair.getNumIdenticals()/(double)nonGaps; double gapCov1 = (double) numGaps1 / (double) seq1Length; double gapCov2 = (double) numGaps2 / (double) seq2Length; logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}", c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2)); logger.debug("\n"+pair.toString(100)); if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) { if ( !chainIds2entities.containsKey(c1.getId()) && !chainIds2entities.containsKey(c2.getId())) { logger.debug("Creating Compound with chains {},{}",c1.getId(),c2.getId()); EntityInfo ent = new EntityInfo(); ent.addChain(c1); ent.addChain(c2); ent.setMolId(molId++); ent.setType(EntityType.POLYMER); c1.setEntityInfo(ent); c2.setEntityInfo(ent); chainIds2entities.put(c1.getId(), ent); chainIds2entities.put(c2.getId(), ent); } else { EntityInfo ent = chainIds2entities.get(c1.getId()); if (ent==null) { logger.debug("Adding chain {} to entity {}",c1.getId(),c2.getId()); ent = chainIds2entities.get(c2.getId()); ent.addChain(c1); c1.setEntityInfo(ent); chainIds2entities.put(c1.getId(), ent); } else { logger.debug("Adding chain {} to entity {}",c2.getId(),c1.getId()); ent.addChain(c2); c2.setEntityInfo(ent); chainIds2entities.put(c2.getId(), ent); } } if (!areResNumbersAligned(c1, c2)) { logger.warn("Including 100% identical chains {},{} in same Compound, although they have misaligned residue numbers", c1.getId(),c2.getId()); } } if (identity>1) { logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})", c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity); logger.warn("\n"+pair.toString(100)); } if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities break outer; } } // anything not in an Entity will be its own Entity for (int i:polyChainIndices) { Chain c = polyModels.get(0).get(i); if (!chainIds2entities.containsKey(c.getId())) { logger.debug("Creating a 1-member Compound for chain {}",c.getId()); EntityInfo ent = new EntityInfo(); ent.addChain(c); ent.setMolId(molId++); c.setEntityInfo(ent); chainIds2entities.put(c.getId(),ent); } } // now that we've found the entities for first model, let's do the same for the rest of the models for (int i=1;i<polyModels.size();i++) { for (Chain chain : polyModels.get(i)) { EntityInfo e = chainIds2entities.get(chain.getId()); chain.setEntityInfo(e); e.addChain(chain); } } return chainIds2entities; } private static SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) { SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity(); GapPenalty penalty = new SimpleGapPenalty(8, 1); PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw = Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); return nw.getPair(); } private static SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) { SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); GapPenalty penalty = new SimpleGapPenalty(8, 1); PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw = Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); return nw.getPair(); } private static SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) { SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); GapPenalty penalty = new SimpleGapPenalty(8, 1); PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw = Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); return nw.getPair(); } private static int getNumGaps(SequencePair<?, ?> pair) { int numGaps = 0; for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { if (pair.hasGap(alignmentIndex)) numGaps++; } return numGaps; } private static int getNumGapsQuery(SequencePair<?, ?> pair) { int numGaps = 0; for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) { numGaps++; } } return numGaps; } private static int getNumGapsTarget(SequencePair<?, ?> pair) { int numGaps = 0; for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) { numGaps++; } } return numGaps; } private static boolean isProteinSequence(String str) { try { new ProteinSequence(str); } catch (CompoundNotFoundException e) { return false; } return true; } private static boolean isDNASequence(String str) { try { new DNASequence(str); } catch (CompoundNotFoundException e) { return false; } return true; } private static boolean isRNASequence(String str) { try { new RNASequence(str); } catch (CompoundNotFoundException e) { return false; } return true; } /** * Returns the ProteinSequence or null if one can't be created * @param str * @return */ private static ProteinSequence getProteinSequence(String str) { try { ProteinSequence s = new ProteinSequence(str); return s; } catch (CompoundNotFoundException e) { logger.error("Unexpected error when creating ProteinSequence",e); } return null; } /** * Returns the DNASequence or null if one can't be created * @param str * @return */ private static DNASequence getDNASequence(String str) { try { DNASequence s = new DNASequence(str); return s; } catch (CompoundNotFoundException e) { logger.error("Unexpected error when creating DNASequence ",e); } return null; } /** * Returns the RNASequence or null if one can't be created * @param str * @return */ private static RNASequence getRNASequence(String str) { try { RNASequence s = new RNASequence(str); return s; } catch (CompoundNotFoundException e) { logger.error("Unexpected error when creating RNASequence ",e); } return null; } }