/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * * Created on 12.03.2004 * @author Andreas Prlic * */ package org.biojava.nbio.structure; import org.biojava.nbio.structure.io.FileConvert; import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory; import org.biojava.nbio.structure.io.mmcif.chem.PolymerType; import org.biojava.nbio.structure.io.mmcif.model.ChemComp; import org.biojava.nbio.core.exceptions.CompoundNotFoundException; import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; import org.biojava.nbio.core.sequence.template.Sequence; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Serializable; import java.util.*; /** * A Chain in a PDB file. It contains several groups which can be of * one of the types defined in the {@link GroupType} constants. * * @author Andreas Prlic * @author Jules Jacobsen * @since 1.4 */ public class ChainImpl implements Chain, Serializable { private final static Logger logger = LoggerFactory.getLogger(ChainImpl.class); private static final long serialVersionUID = 1990171805277911840L; /** * The default chain identifier used to be an empty space */ private static final String DEFAULT_CHAIN_ID = "A"; private String swissprot_id ; private String authId; // the 'public' chain identifier as assigned by authors in PDB files private List <Group> groups; private List<Group> seqResGroups; private EntityInfo entity; private Structure parent; private Map<String, Integer> pdbResnumMap; private String asymId; // the 'internal' chain identifier as used in mmCIF files private List<SeqMisMatch> seqMisMatches = null; /** * Constructs a ChainImpl object. */ public ChainImpl() { super(); authId = DEFAULT_CHAIN_ID; groups = new ArrayList<>() ; seqResGroups = new ArrayList<>(); pdbResnumMap = new HashMap<>(); asymId = null; } /** {@inheritDoc} * */ @Override public String getId() { return asymId; } /** {@inheritDoc} * */ @Override public void setId(String asymId) { this.asymId = asymId; } /** {@inheritDoc} * */ @Override public String getName() { return authId; } /** {@inheritDoc} * */ @Override public void setName(String authId) { this.authId = authId; } /** {@inheritDoc} * */ @Override @Deprecated public void setParent(Structure parent) { setStructure(parent); } /** {@inheritDoc} * */ @Override public void setStructure(Structure parent){ this.parent = parent; } /** Returns the parent Structure of this chain. * * @return the parent Structure object */ @Override public Structure getStructure() { return parent; } /** Returns the parent Structure of this chain. * * @return the parent Structure object * @deprecated use getStructure instead. */ @Override @Deprecated public Structure getParent() { return getStructure(); } /** Returns an identical copy of this Chain . * @return an identical copy of this Chain */ @Override public Object clone() { // go through all groups and add to new Chain. ChainImpl n = new ChainImpl(); // copy chain data: n.setId(getId()); n.setName(getName()); n.setSwissprotId ( getSwissprotId()); // NOTE the EntityInfo will be reset at the parent level (Structure) if cloning is happening from parent level // here we don't deep-copy it and just keep the same reference, in case the cloning is happening at the Chain level only n.setEntityInfo(this.entity); for (Group group : groups) { Group g = (Group) group.clone(); n.addGroup(g); g.setChain(n); } if (seqResGroups!=null){ List<Group> tmpSeqRes = new ArrayList<>(); // cloning seqres and atom groups is ugly, due to their // nested relationship (some of the atoms can be in the seqres, but not all) for (Group seqResGroup : seqResGroups) { int i = groups.indexOf(seqResGroup); Group g ; if (i!=-1) { // group found in atom groups, we get the equivalent reference from the newly cloned atom groups g = n.getAtomGroup(i); } else { // group not found in atom groups, we clone the seqres group g = (Group) seqResGroup.clone(); } g.setChain(n); tmpSeqRes.add(g); } n.setSeqResGroups(tmpSeqRes); } return n ; } /** {@inheritDoc} * */ @Override public void setEntityInfo(EntityInfo mol) { this.entity = mol; } /** {@inheritDoc} * */ @Override public EntityInfo getEntityInfo() { return this.entity; } /** set the Swissprot id of this chains . * @param sp_id a String specifying the swissprot id value * @see #getSwissprotId */ @Override public void setSwissprotId(String sp_id){ swissprot_id = sp_id ; } /** get the Swissprot id of this chains . * @return a String representing the swissprot id value * @see #setSwissprotId */ @Override public String getSwissprotId() { return swissprot_id ; } /** {@inheritDoc} * */ @Override public void addGroup(Group group) { group.setChain(this); // Set the altlocs chain as well for(Group g : group.getAltLocs()) { g.setChain(this); } groups.add(group); // store the position internally for quick access of this group String pdbResnum = null ; ResidueNumber resNum = group.getResidueNumber(); if ( resNum != null) pdbResnum = resNum.toString(); if ( pdbResnum != null) { Integer pos = groups.size() - 1; // ARGH sometimes numbering in PDB files is confusing. // e.g. PDB: 1sfe /* * ATOM 620 N GLY 93 -24.320 -6.591 4.210 1.00 46.82 N * ATOM 621 CA GLY 93 -24.960 -6.849 5.497 1.00 47.35 C * ATOM 622 C GLY 93 -26.076 -5.873 5.804 1.00 47.24 C * ATOM 623 O GLY 93 -26.382 -4.986 5.006 1.00 47.56 O * and ... * HETATM 1348 O HOH 92 -21.853 -16.886 19.138 1.00 66.92 O * HETATM 1349 O HOH 93 -26.126 1.226 29.069 1.00 71.69 O * HETATM 1350 O HOH 94 -22.250 -18.060 -6.401 1.00 61.97 O */ // this check is to give in this case the entry priority that is an AminoAcid / comes first... // a good example of same residue number for 2 residues is 3th3, chain T, residue 201 (a LYS and a sugar BGC covalently attached to it) - JD 2016-03-09 if ( pdbResnumMap.containsKey(pdbResnum)) { logger.warn("Adding residue {}({}) to chain {} but a residue with same residue number is already present: {}({}). Will add only the aminoacid residue (if any) to the lookup, lookups for that residue number won't work properly.", pdbResnum, group.getPDBName(), getChainID(), groups.get(pdbResnumMap.get(pdbResnum)).getResidueNumber(), groups.get(pdbResnumMap.get(pdbResnum)).getPDBName()); if ( group instanceof AminoAcid) pdbResnumMap.put(pdbResnum,pos); } else pdbResnumMap.put(pdbResnum,pos); } } /** * {@inheritDoc} */ @Override public Group getAtomGroup(int position) { return groups.get(position); } /** * {@inheritDoc} */ @Override public List<Group> getAtomGroups(GroupType type){ List<Group> tmp = new ArrayList<>() ; for (Group g : groups) { if (g.getType().equals(type)) { tmp.add(g); } } return tmp ; } /** {@inheritDoc} * */ @Override public List<Group> getAtomGroups(){ return groups ; } /** {@inheritDoc} * */ @Override public void setAtomGroups(List<Group> groups){ for (Group g:groups){ g.setChain(this); } this.groups = groups; } @Override public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end, boolean ignoreMissing) throws StructureException { // Short-circut for include all groups if(start == null && end == null) { return groups.toArray(new Group[groups.size()]); } List<Group> retlst = new ArrayList<>(); boolean adding, foundStart; if( start == null ) { // start with first group adding = true; foundStart = true; } else { adding = false; foundStart = false; } for (Group g: groups){ // Check for start if (!adding && start.equalsPositional(g.getResidueNumber())) { adding = true; foundStart = true; } // Check if past start if ( ignoreMissing && ! (foundStart && adding) ) { ResidueNumber pos = g.getResidueNumber(); if ( start != null && start.compareToPositional(pos) <= 0) { foundStart = true; adding = true; } } if ( adding) retlst.add(g); // check for end if ( end != null && end.equalsPositional(g.getResidueNumber())) { if ( ! adding) throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId); adding = false; break; } // check if past end if ( ignoreMissing && adding && end != null){ ResidueNumber pos = g.getResidueNumber(); if ( end.compareToPositional(pos) <= 0) { adding = false; break; } } } if ( ! foundStart){ throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId); } if ( end != null && adding && !ignoreMissing) { throw new StructureException("did not find end PDB residue number " + end + " in chain " + authId); } //not checking if the end has been found in this case... return retlst.toArray(new Group[retlst.size()] ); } /** * {@inheritDoc} * */ @Override public Group getGroupByPDB(ResidueNumber resNum) throws StructureException { String pdbresnum = resNum.toString(); if ( pdbResnumMap.containsKey(pdbresnum)) { Integer pos = pdbResnumMap.get(pdbresnum); return groups.get(pos); } else { throw new StructureException("unknown PDB residue number " + pdbresnum + " in chain " + authId); } } /** * {@inheritDoc} * */ @Override public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end) throws StructureException { return getGroupsByPDB(start, end, false); } /** * {@inheritDoc} */ @Override public int getSeqResLength() { //new method returns the length of the sequence defined in the SEQRES records return seqResGroups.size(); } /** * {@inheritDoc} */ @Override public void setChainID(String asymId) { this.asymId = asymId; } /** * {@inheritDoc} */ @Override public String getChainID() { return this.asymId; } /** String representation. * @return String representation of the Chain */ @Override public String toString(){ String newline = System.getProperty("line.separator"); StringBuilder str = new StringBuilder(); str.append("Chain asymId:").append(getChainID()).append(" authId:").append(getName()).append(newline); if ( entity != null ){ if ( entity.getDescription() != null){ str.append(entity.getDescription()).append(newline); } } str.append("total SEQRES length: ").append(getSeqResGroups().size()).append(" total ATOM length:") .append(getAtomLength()).append(" residues ").append(newline); return str.toString() ; } /** * {@inheritDoc} */ @Override public Sequence<?> getBJSequence() { String seq = getSeqResSequence(); Sequence<AminoAcidCompound> s = null; try { s = new ProteinSequence(seq); } catch (CompoundNotFoundException e) { logger.error("Could not create sequence object from seqres sequence. Some unknown compound: {}",e.getMessage()); } //TODO: return a DNA sequence if the content is DNA... return s; } /** * {@inheritDoc} */ @Override public String getAtomSequence(){ List<Group> groups = getAtomGroups(); StringBuilder sequence = new StringBuilder() ; for ( Group g: groups){ ChemComp cc = g.getChemComp(); if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ // an amino acid residue.. use for alignment String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); if ( oneLetter == null) oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); sequence.append(oneLetter); } } return sequence.toString(); } /** * {@inheritDoc} */ @Override public String getSeqResSequence(){ StringBuilder str = new StringBuilder(); for (Group g : seqResGroups) { ChemComp cc = g.getChemComp(); if ( cc == null) { logger.warn("Could not load ChemComp for group: ", g); str.append(StructureTools.UNKNOWN_GROUP_LABEL); } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ // an amino acid residue.. use for alignment String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible. if ( oneLetter == null || oneLetter.isEmpty() || oneLetter.equals("?")) { oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); } str.append(oneLetter); } else { str.append(StructureTools.UNKNOWN_GROUP_LABEL); } } return str.toString(); } /** * Get the one letter sequence so that Sequence is guaranteed to * be the same length as seqResGroups. * Method related to https://github.com/biojava/biojava/issues/457 * @return a string of the sequence guaranteed to be the same length * as seqResGroups. */ public String getSeqResOneLetterSeq(){ StringBuilder str = new StringBuilder(); for (Group g : seqResGroups) { ChemComp cc = g.getChemComp(); if ( cc == null) { logger.warn("Could not load ChemComp for group: ", g); str.append(StructureTools.UNKNOWN_GROUP_LABEL); } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ // an amino acid residue.. use for alignment String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible. if ( oneLetter == null || oneLetter.isEmpty() || oneLetter.equals("?") || oneLetter.length()!=1) { oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); } str.append(oneLetter); } else { str.append(StructureTools.UNKNOWN_GROUP_LABEL); } } return str.toString(); } /** * {@inheritDoc} */ @Override public Group getSeqResGroup(int position) { return seqResGroups.get(position); } /** * {@inheritDoc} */ @Override public List<Group> getSeqResGroups(GroupType type) { List<Group> tmp = new ArrayList<>() ; for (Group g : seqResGroups) { if (g.getType().equals(type)) { tmp.add(g); } } return tmp ; } /** {@inheritDoc} * */ @Override public List<Group> getSeqResGroups() { return seqResGroups; } /** {@inheritDoc} * */ @Override public void setSeqResGroups(List<Group> groups){ for (Group g: groups){ g.setChain(this); } this.seqResGroups = groups; } /** {@inheritDoc} * */ @Override public int getAtomLength() { return groups.size(); } /** {@inheritDoc} * */ @Override public List<Group> getAtomLigands(){ List<Group> ligands = new ArrayList<>(); for (Group g : groups) if (!seqResGroups.contains(g) && !g.isWater()) ligands.add(g); return ligands; } @Override public String getInternalChainID() { return asymId; } @Override public void setInternalChainID(String internalChainID) { this.asymId = internalChainID; } @Override public String toPDB() { return FileConvert.toPDB(this); } @Override public String toMMCIF() { return FileConvert.toMMCIF(this, true); } @Override public void setSeqMisMatches(List<SeqMisMatch> seqMisMatches) { this.seqMisMatches = seqMisMatches; } @Override public List<SeqMisMatch> getSeqMisMatches() { return seqMisMatches; } @Override public EntityType getEntityType() { if (getEntityInfo()==null) return null; return getEntityInfo().getType(); } @Override public boolean isWaterOnly() { for (Group g : getAtomGroups()) { if (!g.isWater()) return false; } return true; } @Override public boolean isPureNonPolymer() { for (Group g : getAtomGroups()) { ChemComp cc = g.getChemComp(); if ( g.isPolymeric() && !g.isHetAtomInFile() ) { // important: the aminoacid or nucleotide residue can be in Atom records return false; } } return true; } @Override public GroupType getPredominantGroupType(){ double ratioResiduesToTotal = StructureTools.RATIO_RESIDUES_TO_TOTAL; int sizeAminos = getAtomGroups(GroupType.AMINOACID).size(); int sizeNucleotides = getAtomGroups(GroupType.NUCLEOTIDE).size(); List<Group> hetAtoms = getAtomGroups(GroupType.HETATM); int sizeHetatoms = hetAtoms.size(); int sizeWaters = 0; for (Group g : hetAtoms) { if (g.isWater()) sizeWaters++; } int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters; int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater; if ((double) sizeAminos / (double) fullSize > ratioResiduesToTotal) return GroupType.AMINOACID; if ((double) sizeNucleotides / (double) fullSize > ratioResiduesToTotal) return GroupType.NUCLEOTIDE; if ((double) (sizeHetatomsWithoutWater) / (double) fullSize > ratioResiduesToTotal) return GroupType.HETATM; // finally if neither condition works, we try based on majority, but log // it GroupType max; if (sizeNucleotides > sizeAminos) { if (sizeNucleotides > sizeHetatomsWithoutWater) { max = GroupType.NUCLEOTIDE; } else { max = GroupType.HETATM; } } else { if (sizeAminos > sizeHetatomsWithoutWater) { max = GroupType.AMINOACID; } else { max = GroupType.HETATM; } } logger.debug( "Ratio of residues to total for chain with asym_id {} is below {}. Assuming it is a {} chain. " + "Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, " + "ratio aa/total: {}, ratio nuc/total: {}", getId(), ratioResiduesToTotal, max, sizeAminos, sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters, (double) sizeAminos / (double) fullSize, (double) sizeNucleotides / (double) fullSize); return max; } @Override public boolean isProtein() { return getPredominantGroupType() == GroupType.AMINOACID; } @Override public boolean isNucleicAcid() { return getPredominantGroupType() == GroupType.NUCLEOTIDE; } }