/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* created at Apr 26, 2008
*/
package org.biojava.nbio.structure.io.mmcif;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import javax.vecmath.Matrix4d;
import org.biojava.nbio.structure.AminoAcid;
import org.biojava.nbio.structure.AminoAcidImpl;
import org.biojava.nbio.structure.Atom;
import org.biojava.nbio.structure.AtomImpl;
import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.ChainImpl;
import org.biojava.nbio.structure.EntityInfo;
import org.biojava.nbio.structure.EntityType;
import org.biojava.nbio.structure.DBRef;
import org.biojava.nbio.structure.Element;
import org.biojava.nbio.structure.Group;
import org.biojava.nbio.structure.GroupType;
import org.biojava.nbio.structure.HetatomImpl;
import org.biojava.nbio.structure.NucleotideImpl;
import org.biojava.nbio.structure.PDBCrystallographicInfo;
import org.biojava.nbio.structure.PDBHeader;
import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.SeqMisMatch;
import org.biojava.nbio.structure.SeqMisMatchImpl;
import org.biojava.nbio.structure.Site;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureImpl;
import org.biojava.nbio.structure.StructureTools;
import org.biojava.nbio.structure.io.BondMaker;
import org.biojava.nbio.structure.io.ChargeAdder;
import org.biojava.nbio.structure.io.EntityFinder;
import org.biojava.nbio.structure.io.FileParsingParameters;
import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
import org.biojava.nbio.structure.io.mmcif.model.AtomSite;
import org.biojava.nbio.structure.io.mmcif.model.AtomSites;
import org.biojava.nbio.structure.io.mmcif.model.AuditAuthor;
import org.biojava.nbio.structure.io.mmcif.model.Cell;
import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom;
import org.biojava.nbio.structure.io.mmcif.model.ChemCompBond;
import org.biojava.nbio.structure.io.mmcif.model.ChemCompDescriptor;
import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBremark;
import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBrev;
import org.biojava.nbio.structure.io.mmcif.model.DatabasePdbrevRecord;
import org.biojava.nbio.structure.io.mmcif.model.Entity;
import org.biojava.nbio.structure.io.mmcif.model.EntityPoly;
import org.biojava.nbio.structure.io.mmcif.model.EntityPolySeq;
import org.biojava.nbio.structure.io.mmcif.model.EntitySrcGen;
import org.biojava.nbio.structure.io.mmcif.model.EntitySrcNat;
import org.biojava.nbio.structure.io.mmcif.model.EntitySrcSyn;
import org.biojava.nbio.structure.io.mmcif.model.Exptl;
import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompDescriptor;
import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompIdentifier;
import org.biojava.nbio.structure.io.mmcif.model.PdbxEntityNonPoly;
import org.biojava.nbio.structure.io.mmcif.model.PdbxNonPolyScheme;
import org.biojava.nbio.structure.io.mmcif.model.PdbxPolySeqScheme;
import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssembly;
import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssemblyGen;
import org.biojava.nbio.structure.io.mmcif.model.PdbxStructOperList;
import org.biojava.nbio.structure.io.mmcif.model.Refine;
import org.biojava.nbio.structure.io.mmcif.model.Struct;
import org.biojava.nbio.structure.io.mmcif.model.StructAsym;
import org.biojava.nbio.structure.io.mmcif.model.StructConn;
import org.biojava.nbio.structure.io.mmcif.model.StructKeywords;
import org.biojava.nbio.structure.io.mmcif.model.StructNcsOper;
import org.biojava.nbio.structure.io.mmcif.model.StructRef;
import org.biojava.nbio.structure.io.mmcif.model.StructRefSeq;
import org.biojava.nbio.structure.io.mmcif.model.StructRefSeqDif;
import org.biojava.nbio.structure.io.mmcif.model.StructSite;
import org.biojava.nbio.structure.io.mmcif.model.StructSiteGen;
import org.biojava.nbio.structure.io.mmcif.model.Symmetry;
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
import org.biojava.nbio.structure.xtal.CrystalCell;
import org.biojava.nbio.structure.xtal.SpaceGroup;
import org.biojava.nbio.structure.xtal.SymoplibParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A MMcifConsumer implementation that builds an in-memory representation of the
* content of a mmcif file as a BioJava Structure object.
*
* @author Andreas Prlic
* @since 1.7
*/
public class SimpleMMcifConsumer implements MMcifConsumer {
// Shared SLF4J logger for this consumer.
private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifConsumer.class);
/**
* The structure being assembled; a fresh instance is created in documentStart().
*/
private Structure structure;
/**
* The chain that atoms are currently being added to. Reset to null in documentStart()
* and whenever a new model starts.
*/
private Chain currentChain;
/**
* The group (residue) that atoms are currently being added to. Reset together with currentChain.
*/
private Group currentGroup;
/**
* A temporary data structure to hold all parsed chains
*/
private ArrayList<List<Chain>> allModels;
/**
* The current set of chains per model
*/
private List<Chain> currentModel;
// all _entity records received via newEntity()
private List<Entity> entities;
/**
* Needed in header only mode to get mapping between asym ids and author ids
*/
private List<EntityPoly> entityPolys;
// NOTE(review): the lists below are created in documentStart(); where no usage is mentioned,
// the code that fills or reads them is outside the part of the file reviewed here.
private List<StructRef> strucRefs;
// SEQRES chains built in documentEnd(), one per polymeric (or untyped) _struct_asym record
private List<Chain> seqResChains;
private List<Chain> entityChains; // needed to link entities, chains and compounds...
private List<StructAsym> structAsyms; // needed to link entities, chains and compounds...
private List<PdbxStructOperList> structOpers ; // operators handed to BiologicalAssemblyBuilder in documentEnd()
private List<PdbxStructAssembly> strucAssemblies;
private List<PdbxStructAssemblyGen> strucAssemblyGens;
private List<EntitySrcGen> entitySrcGens;
private List<EntitySrcNat> entitySrcNats;
private List<EntitySrcSyn> entitySrcSyns;
// passed to BondMaker.formBondsFromStructConn() in addBonds()
private List<StructConn> structConn;
private List<StructNcsOper> structNcsOper;
// _struct_ref_seq_dif records, converted to SeqMisMatch objects in documentEnd()
private List<StructRefSeqDif> sequenceDifs;
private List<StructSiteGen> structSiteGens;
// NOTE(review): not assigned or read in the reviewed part of the file
private Matrix4d parsedScaleMatrix;
/**
* A map of asym ids (internal chain ids) to entity ids extracted from
* the _struct_asym category
*/
private Map<String,String> asymId2entityId;
/**
* A map of asym ids (internal chain ids) to author ids extracted from
* the _entity_poly category. Used in header only parsing.
*/
private Map<String,String> asymId2authorId;
// model number (_atom_site.pdbx_PDB_model_num) of the atoms currently being read; null before the first atom
private String currentNmrModelNumber ;
// parsing options; initialised with defaults in the constructor
private FileParsingParameters params;
/**
 * Creates a consumer that uses default {@link FileParsingParameters} and
 * initialises all internal parsing state by calling {@link #documentStart()}.
 */
public SimpleMMcifConsumer() {
    this.params = new FileParsingParameters();
    this.documentStart();
}
/**
 * Records an _entity record so that it can later be looked up by its id.
 *
 * @param entity the parsed entity record
 */
@Override
public void newEntity(Entity entity) {
    final String entityText = entity.toString();
    logger.debug("New entity: {}", entityText);
    this.entities.add(entity);
}
/**
 * Collects an _entity_poly record for later processing.
 *
 * @param entityPoly the parsed record
 */
@Override
public void newEntityPoly(EntityPoly entityPoly) {
    this.entityPolys.add(entityPoly);
}
/**
 * Collects a _pdbx_struct_oper_list record; the collected operators are used in
 * {@link #documentEnd()} when the biological assemblies are built.
 *
 * @param structOper the parsed record
 */
@Override
public void newPdbxStructOperList(PdbxStructOperList structOper) {
    this.structOpers.add(structOper);
}
/**
 * Collects a _struct_asym record; the collected records are iterated in
 * {@link #documentEnd()} to create the SEQRES chains and the entities.
 *
 * @param sasym the parsed record
 */
@Override
public void newStructAsym(StructAsym sasym) {
    this.structAsyms.add(sasym);
}
/**
 * Finds the _entity record with the given numerical id.
 * <p>
 * Entities whose id is not an integer are reported with a warning and skipped, so a
 * single malformed id does not prevent later entities from being found.
 *
 * @param entity_id the numerical entity id to look for
 * @return the matching entity, or null if no collected entity has that id
 */
private Entity getEntity(int entity_id){
    for (Entity candidate : entities) {
        try {
            if (Integer.parseInt(candidate.getId()) == entity_id) {
                return candidate;
            }
        } catch (NumberFormatException ex) {
            // the placeholder is required, otherwise SLF4J silently drops the argument
            logger.warn("Entity id does not look like a number: {}", ex.getMessage());
        }
    }
    return null;
}
/**
 * Copies the pdbx_keywords value of the _struct_keywords record into both the
 * description and the classification of the structure's header.
 * <p>
 * If the structure has no header yet, a new one is created and attached to the
 * structure, so that the values are not lost.
 *
 * @param kw the parsed _struct_keywords record
 */
@Override
public void newStructKeywords(StructKeywords kw){
    PDBHeader header = structure.getPDBHeader();
    if (header == null) {
        header = new PDBHeader();
    }
    header.setDescription(kw.getPdbx_keywords());
    header.setClassification(kw.getPdbx_keywords());
    // same as setStruct(): make sure a freshly created header is actually stored
    structure.setPDBHeader(header);
}
/**
 * Transfers title and entry id of the _struct record to the structure: both go into
 * the header (created if the structure has none yet), and the entry id is also set
 * as the PDB code of the structure.
 *
 * @param struct the parsed _struct record
 */
@Override
public void setStruct(Struct struct) {
    PDBHeader pdbHeader = structure.getPDBHeader();
    if (pdbHeader == null) {
        pdbHeader = new PDBHeader();
    }

    pdbHeader.setTitle(struct.getTitle());
    pdbHeader.setIdCode(struct.getEntry_id());
    structure.setPDBHeader(pdbHeader);

    structure.setPDBCode(struct.getEntry_id());
}
/**
 * Initiates a new group, either Hetatom, Nucleotide, or AminoAcid.
 * <p>
 * The group is taken from the chemical component dictionary when that yields a group
 * with a non-empty chemical component. Otherwise the type is decided from the
 * arguments: nucleotides are recognised by their 3-letter code; for "ATOM" records a
 * missing or unknown one-letter code gives a Hetatom, for all other records only a
 * missing one-letter code does; everything else becomes an AminoAcid.
 *
 * @param recordName the record type of the atom line ("ATOM" or anything else)
 * @param aminoCode1 the one-letter amino acid code, may be null
 * @param seq_id the internal sequence id stored as the id of the new group
 * @param groupCode3 the 3-letter code of the group
 * @return the new group with its id set to seq_id
 */
private Group getNewGroup(String recordName,Character aminoCode1, long seq_id,String groupCode3) {
    Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(groupCode3);
    if ( g != null && !g.getChemComp().isEmpty()) {
        if ( g instanceof AminoAcidImpl) {
            ((AminoAcidImpl) g).setId(seq_id);
        } else if ( g instanceof NucleotideImpl) {
            ((NucleotideImpl) g).setId(seq_id);
        } else if ( g instanceof HetatomImpl) {
            ((HetatomImpl) g).setId(seq_id);
        }
        return g;
    }

    // no usable dictionary entry: fall back to a decision based on the record itself
    boolean isAtomRecord = recordName.equals("ATOM");

    if (StructureTools.isNucleotide(groupCode3)) {
        // it is a nucleotide
        NucleotideImpl nu = new NucleotideImpl();
        nu.setId(seq_id);
        return nu;
    }

    boolean isAminoAcid;
    if (isAtomRecord) {
        // equals() instead of == so that the check does not depend on object identity
        // of a boxed Character
        isAminoAcid = aminoCode1 != null && !aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL);
    } else {
        isAminoAcid = aminoCode1 != null;
    }

    if (isAminoAcid) {
        AminoAcidImpl aa = new AminoAcidImpl();
        aa.setAminoType(aminoCode1);
        aa.setId(seq_id);
        return aa;
    }

    HetatomImpl h = new HetatomImpl();
    h.setId(seq_id);
    return h;
}
/**
 * Looks up a chain by its asym id in the given list.
 *
 * @param asymId the asym id to search for
 * @param chains the chains to search in
 * @return the first chain whose id equals asymId, or null if there is none
 */
private static Chain isKnownChain(String asymId, List<Chain> chains){
    for (Chain candidate : chains) {
        if (asymId.equals(candidate.getId())) {
            return candidate;
        }
    }
    return null;
}
/**
* Processes one _atom_site record: keeps track of the current model, chain and group,
* creating new ones when the model number, the asym id or the residue number changes,
* and adds the converted atom to the current group or to an alternate location group.
* <p>
* Does nothing in header-only mode. Note that the last group and chain being built are
* only flushed when the next change is seen or in documentEnd().
*
* @param atom the parsed _atom_site record
*/
@Override
public void newAtomSite(AtomSite atom) {
// atoms are not read at all in header-only mode
if (params.isHeaderOnly()) return;
// Warning: getLabel_asym_id is not the "chain id" in the PDB file
// it is the internally used chain id.
// later on we will fix this...
// later one needs to map the asym id to the pdb_strand_id
//TODO: add support for FileParsingParams.getMaxAtoms()
boolean startOfNewChain = false;
String asymId = atom.getLabel_asym_id();
String authId = atom.getAuth_asym_id();
String recordName = atom.getGroup_PDB();
String residueNumberS = atom.getAuth_seq_id();
// NOTE(review): not guarded, a non-integer auth_seq_id makes parseInt throw NumberFormatException
Integer residueNrInt = Integer.parseInt(residueNumberS);
// the 3-letter name of the group:
String groupCode3 = atom.getLabel_comp_id();
boolean isHetAtomInFile = false;
Character aminoCode1 = null;
if ( recordName.equals("ATOM") )
aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
else {
// any record that is not "ATOM" is flagged as a het atom in the file
aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
// for nucleotides this will be null..
if (aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
aminoCode1 = null;
isHetAtomInFile = true;
}
// "?" means there is no insertion code; otherwise the first character is used
String insCodeS = atom.getPdbx_PDB_ins_code();
Character insCode = null;
if (! insCodeS.equals("?")) {
insCode = insCodeS.charAt(0);
}
// we store the internal seq id in the Atom._id field
// this is not a PDB file field but we need this to internally assign the insertion codes later
// from the pdbx_poly_seq entries..
long seq_id = -1;
try {
seq_id = Long.parseLong(atom.getLabel_seq_id());
} catch (NumberFormatException e){
// non polymer chains (ligands and small molecules) will have a label_seq_id set to '.', thus it is ok to
// silently ignore this
//logger.debug("Could not parse number for _atom_site.label_seq_id: "+e.getMessage());
}
String nmrModelNumber = atom.getPdbx_PDB_model_num();
// the very first atom defines the first model number
if ( currentNmrModelNumber == null) {
currentNmrModelNumber = nmrModelNumber;
}
if (! currentNmrModelNumber.equals(nmrModelNumber)){
currentNmrModelNumber = nmrModelNumber;
// add previous data
if ( currentChain != null ) {
currentChain.addGroup(currentGroup);
currentGroup.trimToSize();
}
// we came to the beginning of a new NMR model
allModels.add(currentModel);
currentModel = new ArrayList<Chain>();
// forces creation of a new chain and a new group below
currentChain = null;
currentGroup = null;
}
// first atom of the file or of a new model: open the first chain of the model
if (currentChain == null) {
currentChain = new ChainImpl();
currentChain.setName(authId);
currentChain.setId(asymId);
currentModel.add(currentChain);
startOfNewChain = true;
}
//System.out.println("BEFORE: " + chain_id + " " + current_chain.getName());
// the asym id changed: close the group in progress and switch to another chain
if ( ! asymId.equals(currentChain.getId()) ) {
//logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId);
startOfNewChain = true;
// end up old chain...
// NOTE(review): unlike the other places where a group is closed, trimToSize() is not called here
currentChain.addGroup(currentGroup);
// see if the chain with this asym id was already seen in the current model ...
Chain testchain = isKnownChain(asymId,currentModel);
if ( testchain == null) {
//logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId);
currentChain = new ChainImpl();
currentChain.setName(authId);
currentChain.setId(asymId);
} else {
// continue filling the chain that already exists
currentChain = testchain;
}
if ( ! currentModel.contains(currentChain))
currentModel.add(currentChain);
}
ResidueNumber residueNumber = new ResidueNumber(authId,residueNrInt, insCode);
if (currentGroup == null) {
currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
currentGroup.setResidueNumber(residueNumber);
currentGroup.setPDBName(groupCode3);
currentGroup.setHetAtomInFile(isHetAtomInFile);
}
// SET UP THE ALT LOC GROUP
Group altGroup = null;
String altLocS = atom.getLabel_alt_id();
// a blank altLoc means "no alternate location"; '.' in the file is mapped to blank
Character altLoc = ' ';
if ( altLocS.length()>0) {
altLoc = altLocS.charAt(0);
if ( altLoc.equals('.') )
altLoc = ' ';
}
// If it's the start of the new chain
// (this replaces a group that may just have been created above when currentGroup was null)
if ( startOfNewChain){
currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
currentGroup.setResidueNumber(residueNumber);
currentGroup.setPDBName(groupCode3);
currentGroup.setHetAtomInFile(isHetAtomInFile);
}
// ANTHONY BRADLEY ADDED THIS -> WE ONLY WANT TO CHECK FOR ALT LOCS WHEN IT'S NOT THE FIRST GROUP IN CHAIN
else{
// check if residue number is the same ...
// insertion code is part of residue number
if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
//System.out.println("end of residue: "+current_group.getPDBCode()+" "+residueNrInt);
// a new residue starts: store the finished group and open a new one
currentChain.addGroup(currentGroup);
currentGroup.trimToSize();
currentGroup = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
currentGroup.setPDBName(groupCode3);
currentGroup.setResidueNumber(residueNumber);
currentGroup.setHetAtomInFile(isHetAtomInFile);
} else {
// same residueNumber, but altLocs...
// test altLoc
// (the '.' test is redundant here, '.' was already mapped to blank above)
if ( ! altLoc.equals(' ') && ( ! altLoc.equals('.'))) {
logger.debug("found altLoc! " + altLoc + " " + currentGroup + " " + altGroup);
altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3, seq_id);
if (altGroup.getChain()==null) {
altGroup.setChain(currentChain);
}
}
}
}
//atomCount++;
//System.out.println("fixing atom name for >" + atom.getLabel_atom_id() + "< >" + fullname + "<");
// note that this filter runs after the chain/group bookkeeping above, so groups are
// still created for residues whose atoms are all skipped here
if ( params.isParseCAOnly() ){
// yes , user wants to get CA only
// only parse CA atoms...
// the element check requires the type symbol to be "C" in addition to the CA atom name
if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) {
//System.out.println("ignoring " + line);
//atomCount--;
return;
}
}
Atom a = convertAtom(atom);
// the atom goes into the alt loc group if one was selected above, otherwise into the current group
if ( altGroup != null) {
altGroup.addAtom(a);
altGroup = null;
}
else {
currentGroup.addAtom(a);
}
String atomName = a.getName();
// make sure that main group has all atoms
// GitHub issue: #76
if ( ! currentGroup.hasAtom(atomName)) {
// Unless it's microheterogenity https://github.com/rcsb/codec-devel/issues/81
// NOTE(review): relies on a.getGroup() being non-null here, presumably set by addAtom() - confirm
if (currentGroup.getPDBName().equals(a.getGroup().getPDBName())) {
if(!StructureTools.hasNonDeuteratedEquiv(a,currentGroup)){
currentGroup.addAtom(a);
}
}
}
}
/**
 * Convert a mmCIF AtomSite object to a BioJava Atom object.
 * <p>
 * Serial number, name, coordinates, occupancy and temperature factor are parsed from
 * the record. A missing, empty or "." alternate location id is stored as a blank
 * character. An element symbol that BioJava does not know is represented by the
 * generic element {@link Element#R}.
 *
 * @param atom the mmCIF AtomSite record
 * @return an Atom
 * @throws NumberFormatException if one of the numerical fields cannot be parsed
 */
private Atom convertAtom(AtomSite atom){
    Atom a = new AtomImpl();

    a.setPDBserial(Integer.parseInt(atom.getId()));
    a.setName(atom.getLabel_atom_id());

    a.setX(Double.parseDouble(atom.getCartn_x()));
    a.setY(Double.parseDouble(atom.getCartn_y()));
    a.setZ(Double.parseDouble(atom.getCartn_z()));

    a.setOccupancy(Float.parseFloat(atom.getOccupancy()));
    a.setTempFactor(Float.parseFloat(atom.getB_iso_or_equiv()));

    String alt = atom.getLabel_alt_id();
    boolean hasAltLoc = alt != null && alt.length() > 0 && !alt.equals(".");
    // Character.valueOf replaces the deprecated boxing constructor
    a.setAltLoc(Character.valueOf(hasAltLoc ? alt.charAt(0) : ' '));

    Element element = Element.R;
    try {
        element = Element.valueOfIgnoreCase(atom.getType_symbol());
    } catch (IllegalArgumentException e) {
        logger.info("Element {} was not recognised as a BioJava-known element, the element will be represented as the generic element {}", atom.getType_symbol(), Element.R.name());
    }
    a.setElement(element);

    return a;
}
/**
 * Finds, or creates, the group that atoms with the given alternate location
 * indicator belong to, relative to the current group.
 * <ol>
 * <li>If the first atom of the current group has this altLoc, the current group is returned.</li>
 * <li>If one of the alt loc groups of the current group contains an atom with this altLoc,
 * that group is returned.</li>
 * <li>Otherwise, if the compound is the same as that of the current group, the current group
 * itself is returned when it has no atoms yet, else an atom-less clone of it is registered
 * as a new alt loc group.</li>
 * <li>Otherwise a new group is created for the different compound, given the residue number
 * of the current group and registered as a new alt loc group.</li>
 * </ol>
 *
 * @param altLoc the alternate location indicator of the atom
 * @param recordName the record type of the atom line
 * @param aminoCode1 the one-letter amino acid code, may be null
 * @param groupCode3 the 3-letter code of the atom's group
 * @param seq_id the internal sequence id
 * @return the group the atom should be added to
 */
private Group getCorrectAltLocGroup( Character altLoc,
        String recordName,
        Character aminoCode1,
        String groupCode3,
        long seq_id) {

    // do we know this altLoc already?
    List<Atom> mainAtoms = currentGroup.getAtoms();
    if (!mainAtoms.isEmpty() && mainAtoms.get(0).getAltLoc().equals(altLoc)) {
        // we are just adding atoms to the current group,
        // probably there is a second group following later...
        return currentGroup;
    }

    for (Group knownAltLocGroup : currentGroup.getAltLocs()) {
        for (Atom member : knownAltLocGroup.getAtoms()) {
            if (member.getAltLoc().equals(altLoc)) {
                return knownAltLocGroup;
            }
        }
    }

    // no matching altLoc group found, build it up
    final Group created;
    if (groupCode3.equals(currentGroup.getPDBName())) {
        if (currentGroup.getAtoms().size() == 0) {
            return currentGroup;
        }
        created = (Group) currentGroup.clone();
        // drop atoms from cloned group...
        // https://redmine.open-bio.org/issues/3307
        created.setAtoms(new ArrayList<Atom>());
        created.getAltLocs().clear();
    } else {
        // a different compound at the same position
        created = getNewGroup(recordName, aminoCode1, seq_id, groupCode3);
        created.setPDBName(groupCode3);
        created.setResidueNumber(currentGroup.getResidueNumber());
    }

    currentGroup.addAltLoc(created);
    return created;
}
/**
 * Starts the parsing: discards everything collected so far and re-creates the
 * structure, the model/chain/group cursors and all record collections.
 */
@Override
public void documentStart() {
    // the object being built and the parsing cursors
    structure = new StructureImpl();
    currentChain = null;
    currentGroup = null;
    currentNmrModelNumber = null;

    // models and chains
    allModels = new ArrayList<>();
    currentModel = new ArrayList<>();
    seqResChains = new ArrayList<>();
    entityChains = new ArrayList<>();

    // chain identifier mappings
    asymId2entityId = new HashMap<>();
    asymId2authorId = new HashMap<>();

    // raw records collected while parsing
    entities = new ArrayList<>();
    entityPolys = new ArrayList<>();
    strucRefs = new ArrayList<>();
    structAsyms = new ArrayList<>();
    structOpers = new ArrayList<>();
    strucAssemblies = new ArrayList<>();
    strucAssemblyGens = new ArrayList<>();
    entitySrcGens = new ArrayList<>();
    entitySrcNats = new ArrayList<>();
    entitySrcSyns = new ArrayList<>();
    structConn = new ArrayList<>();
    structNcsOper = new ArrayList<>();
    sequenceDifs = new ArrayList<>();
    structSiteGens = new ArrayList<>();
}
/**
 * Finishes the parsing and assembles the structure from everything collected:
 * <ol>
 * <li>flushes the chain and group still in progress and stores the last model,</li>
 * <li>creates one SEQRES chain per _struct_asym record (kept only for polymeric entities
 * or entities of unknown type) and adds the entities,</li>
 * <li>links entities and chains and adds all models to the structure,</li>
 * <li>aligns SEQRES to ATOM groups, or stores the SEQRES groups unaligned,</li>
 * <li>cleans up alt locs and, unless in header-only mode, adds bonds, charges and sites,</li>
 * <li>adds the biological assembly information to the header if requested,</li>
 * <li>sets NCS operators, crystallographic metadata and the sequence mismatches.</li>
 * </ol>
 */
@Override
public void documentEnd() {
    // Expected that there is one current_chain that needs to be added to the model
    // When in headerOnly mode, no Atoms are read, and there will not be an active
    // current_chain.
    if ( currentChain != null ) {
        currentChain.addGroup(currentGroup);
        if (isKnownChain(currentChain.getId(),currentModel) == null) {
            currentModel.add(currentChain);
        }
    } else if (!params.isHeaderOnly()){
        logger.warn("current chain is null at end of document.");
    }

    allModels.add(currentModel);

    // this populates the asymId2authorId and asymId2entityId maps, needed in header only mode to get the mapping
    // between the 2 chain identifiers.
    initMaps();

    for (StructAsym asym : structAsyms) {

        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );

        Chain s = getEntityChain(asym.getEntity_id());
        Chain seqres = (Chain)s.clone();
        // to solve issue #160 (e.g. 3u7t)
        seqres = removeSeqResHeterogeneity(seqres);
        seqres.setId(asym.getId());

        // use the author id as chain name when known, the asym id otherwise
        String authorId = asymId2authorId.get(asym.getId());
        if (authorId != null) {
            seqres.setName(authorId);
        } else {
            seqres.setName(asym.getId());
        }

        EntityType type = null;
        try {
            Entity ent = getEntity(Integer.parseInt(asym.getEntity_id()));
            if (ent != null) {
                type = EntityType.entityTypeFromString(ent.getType());
            } else {
                // getEntity() returns null when there is no such _entity record: the type stays unknown
                logger.debug("No entity found for entity id field {}", asym.getEntity_id());
            }
        } catch (NumberFormatException e) {
            logger.debug("Could not parse integer from entity id field {}", asym.getEntity_id());
        }

        // we'll only add seqres chains that are polymeric or unknown
        if (type==null || type==EntityType.POLYMER ) {
            seqResChains.add(seqres);
        }

        logger.debug(" seqres: {} {}<", asym.getId(), seqres);

        // adding the entities to structure
        addEntities(asym);
    }

    if (structAsyms.isEmpty()) {
        logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
    }

    // entities
    // In addEntities above we created the entities if they were present in the file
    // Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now
    linkEntities();

    // now that we know the entities, we can add all chains to structure so that they are stored
    // properly as polymer/nonpolymer/water chains inside structure
    for (List<Chain> model:allModels) {
        structure.addModel(model);
    }

    // Only align if requested (default) and not when headerOnly mode with no Atoms.
    // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
    if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){
        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
        alignSeqRes();
    } else {
        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
    }

    // Now make sure all altlocgroups have all the atoms in all the groups
    StructureTools.cleanUpAltLocs(structure);

    // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
    if (!params.isHeaderOnly()) {
        if ( params.shouldCreateAtomBonds()) {
            addBonds();
        }
        if ( params.shouldCreateAtomCharges()) {
            addCharges();
        }
        // Do structure.setSites(sites) after any chain renaming to be like PDB.
        addSites();
    }

    // set the oligomeric state info in the header...
    if (params.isParseBioAssembly()) {

        // the more detailed mapping of chains to rotation operations happens in StructureIO...
        Map<Integer,BioAssemblyInfo> bioAssemblies = new HashMap<Integer, BioAssemblyInfo>();

        for ( PdbxStructAssembly psa : strucAssemblies){

            // the generator records that belong to this assembly
            List<PdbxStructAssemblyGen> psags = new ArrayList<PdbxStructAssemblyGen>(1);
            for ( PdbxStructAssemblyGen psag: strucAssemblyGens ) {
                if ( psag.getAssembly_id().equals(psa.getId())) {
                    psags.add(psag);
                }
            }

            BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();

            // these are the transformations that need to be applied to our model
            List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers);

            int bioAssemblyId = -1;
            try {
                bioAssemblyId = Integer.parseInt(psa.getId());
            } catch (NumberFormatException e) {
                logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId());
            }

            // if bioassembly id is not numerical we throw it away
            // this happens usually for viral capsid entries, like 1ei7
            // see issue #230 in github
            if (bioAssemblyId!=-1) {
                int mmSize = 0;
                // note that the transforms contain asym ids of both polymers and non-polymers
                // For the mmsize, we are only interested in the polymers
                for (BiologicalAssemblyTransformation transf:transformations) {
                    Chain c = structure.getChain(transf.getChainId());
                    if (c==null) {
                        logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
                        continue;
                    }
                    if (c.getEntityType() == EntityType.POLYMER &&
                            // for entries like 4kro, sugars are annotated as polymers but we
                            // don't want them in the macromolecularSize count
                            !c.getEntityInfo().getDescription().contains("SUGAR") ) {

                        mmSize++;
                    }
                }

                BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
                bioAssembly.setId(bioAssemblyId);
                bioAssembly.setMacromolecularSize(mmSize);
                bioAssembly.setTransforms(transformations);
                bioAssemblies.put(bioAssemblyId,bioAssembly);
            }
        }
        structure.getPDBHeader().setBioAssemblies(bioAssemblies);
    }

    setStructNcsOps();

    setCrystallographicInfoMetadata();

    // group the sequence mismatches by author chain id
    Map<String,List<SeqMisMatch>> misMatchMap = new HashMap<String, List<SeqMisMatch>>();
    for (StructRefSeqDif sdif : sequenceDifs) {
        SeqMisMatch misMatch = new SeqMisMatchImpl();
        misMatch.setDetails(sdif.getDetails());

        // "?" means there is no insertion code
        String insCode = sdif.getPdbx_pdb_ins_code();
        if ( insCode != null && insCode.equals("?"))
            insCode = null;
        misMatch.setInsCode(insCode);
        misMatch.setOrigGroup(sdif.getDb_mon_id());
        misMatch.setPdbGroup(sdif.getMon_id());
        misMatch.setPdbResNum(sdif.getPdbx_auth_seq_num());
        misMatch.setUniProtId(sdif.getPdbx_seq_db_accession_code());
        misMatch.setSeqNum(sdif.getSeq_num());

        List<SeqMisMatch> mms = misMatchMap.get(sdif.getPdbx_pdb_strand_id());
        if ( mms == null) {
            mms = new ArrayList<SeqMisMatch>();
            misMatchMap.put(sdif.getPdbx_pdb_strand_id(),mms);
        }
        mms.add(misMatch);
    }

    // attach the mismatches to the polymer chains
    for (Map.Entry<String,List<SeqMisMatch>> entry : misMatchMap.entrySet()){
        String chainId = entry.getKey();
        Chain chain = structure.getPolyChainByPDB(chainId);
        if ( chain == null) {
            logger.warn("Could not set mismatches for chain with author id {}", chainId);
            continue;
        }
        chain.setSeqMisMatches(entry.getValue());
    }
}
/**
 * Links every parsed chain to its entity.
 * <p>
 * The entity id of a chain is looked up through its asym id. When the structure has no
 * entity with that id, a new one of type water or non-polymer is created for the chain.
 * If the structure ends up without any entity at all, the entities are found
 * heuristically, see {@link org.biojava.nbio.structure.io.EntityFinder}. Finally,
 * entities that have no chains are reported in the log.
 */
private void linkEntities() {

    for (List<Chain> model : allModels) {
        for (Chain chain : model) {

            String entityId = asymId2entityId.get(chain.getId());
            if (entityId == null) {
                // this can happen for instance if the cif file didn't have _struct_asym category at all
                // and thus we have no asymId2entityId mapping at all
                logger.info("No entity id could be found for chain {}", chain.getId());
                continue;
            }

            int eId = Integer.parseInt(entityId);

            // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
            // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
            // asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the
            // mmCIF internal data structures but is compatible with Structure interface.
            // Some examples of PDB entries with this kind of problem:
            // - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
            // - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
            // - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
            EntityInfo known = structure.getEntityById(eId);

            if (known != null) {
                logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
                        chain.getId(), chain.getName(), eId);
                known.addChain(chain);
                chain.setEntityInfo(known);
                continue;
            }

            // Supports the case where the only chain members were from non-polymeric entity that is missing.
            // Solved by creating a new Compound(entity) to which this chain will belong.
            logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
                    eId, chain.getId());
            EntityInfo fresh = new EntityInfo();
            fresh.setMolId(eId);
            fresh.addChain(chain);
            fresh.setType(StructureTools.isChainWaterOnly(chain) ? EntityType.WATER : EntityType.NONPOLYMER);
            chain.setEntityInfo(fresh);
            structure.addEntityInfo(fresh);
        }
    }

    // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
    List<EntityInfo> entityInfos = structure.getEntityInfos();
    if (entityInfos == null || entityInfos.isEmpty()) {

        List<List<Chain>> polyModels = new ArrayList<>();
        List<List<Chain>> nonPolyModels = new ArrayList<>();
        List<List<Chain>> waterModels = new ArrayList<>();

        for (List<Chain> model : allModels) {

            List<Chain> polyChains = new ArrayList<>();
            List<Chain> nonPolyChains = new ArrayList<>();
            List<Chain> waterChains = new ArrayList<>();

            // sort the chains of this model into water, pure non-polymer and everything else
            for (Chain c : model) {
                if (StructureTools.isChainWaterOnly(c)) {
                    waterChains.add(c);
                } else if (StructureTools.isChainPureNonPolymer(c)) {
                    nonPolyChains.add(c);
                } else {
                    polyChains.add(c);
                }
            }

            polyModels.add(polyChains);
            nonPolyModels.add(nonPolyChains);
            waterModels.add(waterChains);
        }

        entityInfos = EntityFinder.findPolyEntities(polyModels);
        EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);

        structure.setEntityInfos(entityInfos);
    }

    // final sanity check: it can happen that from the annotated entities some are not linked to any chains
    // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
    // we simply log it, this can sign some other problems if the entities are used down the line
    for (EntityInfo e : entityInfos) {
        if (e.getChains().isEmpty()) {
            logger.info("Entity {} '{}' has no chains associated to it",
                    e.getMolId()<0?"with no entity id":e.getMolId(), e.getDescription());
        }
    }
}
/**
 * Delegates to {@link ChargeAdder} to add charges to the structure built so far.
 */
private void addCharges() {
    ChargeAdder.addCharges(this.structure);
}
/**
 * Returns a new Chain that contains the atom groups of the given chain, except for
 * groups whose residue number equals that of the directly preceding group.
 * This is necessary to solve the microheterogeneity issue in entries like 3u7t
 * (see github issue #160).
 *
 * @param c the chain to filter; it is only read here
 * @return a new chain without consecutive groups of equal residue number
 */
private static Chain removeSeqResHeterogeneity(Chain c) {

    Chain filtered = new ChainImpl();
    ResidueNumber previous = null;

    for (Group group : c.getAtomGroups()) {

        // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
        ResidueNumber source = group.getResidueNumber();
        ResidueNumber snapshot = new ResidueNumber(
                source.getChainName(),
                source.getSeqNum(),
                source.getInsCode());

        boolean repeated = previous != null && previous.equals(snapshot);
        if (repeated) {
            logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': "+group);
        } else {
            filtered.addGroup(group);
        }

        previous = snapshot;
    }

    return filtered;
}
/**
 * Creates the bonds of the structure with a {@link BondMaker}: first the bonds made by
 * {@code makeBonds()}, then those formed from the collected _struct_conn records.
 */
private void addBonds() {
    final BondMaker bondMaker = new BondMaker(this.structure, this.params);
    bondMaker.makeBonds();
    bondMaker.formBondsFromStructConn(this.structConn);
}
/**
 * Maps the SEQRES groups to the observed (ATOM) groups, for every model of the structure.
 * <p>
 * For each SEQRES chain the matching ATOM chain of the model is looked up. The SEQRES
 * groups are cloned, and every clone whose residue sequence number equals the internal
 * id of an ATOM group is replaced by that ATOM group. Clones without a match get their
 * residue number removed. The resulting list is stored as the SEQRES groups of the
 * ATOM chain.
 */
private void alignSeqRes() {

    logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");

    // fix SEQRES residue numbering for all models
    for (int modelIdx = 0; modelIdx < structure.nrModels(); modelIdx++) {

        List<Chain> atomChains = structure.getModel(modelIdx);

        for (Chain seqResChain : seqResChains) {

            // this extracts the matching atom chain from the chains of the model
            Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomChains, true);

            if (atomChain == null) {
                // most likely there's no observed residues at all for the seqres chain: can't map
                // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
                logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's no observed residues in the chain.",
                        seqResChain.getId());
                continue;
            }

            // we need to first clone the seqres so that they stay independent for different models
            List<Group> seqResGroups = new ArrayList<Group>();
            for (Group template : seqResChain.getAtomGroups()) {
                seqResGroups.add((Group) template.clone());
            }

            // map the atoms to the seqres...
            for (int pos = 0; pos < seqResGroups.size(); pos++) {

                Group seqresG = seqResGroups.get(pos);
                Group observed = null;

                for (Group atomG : atomChain.getAtomGroups()) {
                    int internalNr = getInternalNr(atomG);
                    if (seqresG.getResidueNumber().getSeqNum() == internalNr) {
                        observed = atomG;
                        break;
                    }
                }

                if (observed != null) {
                    seqResGroups.set(pos, observed);
                } else {
                    // so far the residue number has tracked internal numbering.
                    // however there are no atom records, as such this can't be a PDB residue number...
                    seqresG.setResidueNumber(null);
                }
            }

            atomChain.setSeqResGroups(seqResGroups);
        }
    }
}
/**
 * Returns the id of the given atom group, narrowed to an {@code int}.
 * <p>
 * The group is cast to the implementation class that corresponds to its
 * {@link GroupType}: {@link AminoAcidImpl} for amino acids, {@link NucleotideImpl}
 * for nucleotides and {@link HetatomImpl} for everything else.
 *
 * @param atomG the atom group to read the id from
 * @return the group's id as an int
 */
private int getInternalNr(Group atomG) {
    // Long.valueOf replaces the deprecated Long constructor; intValue() narrows exactly as before
    if (atomG.getType().equals(GroupType.AMINOACID)) {
        AminoAcidImpl aa = (AminoAcidImpl) atomG;
        return Long.valueOf(aa.getId()).intValue();
    } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) {
        NucleotideImpl nu = (NucleotideImpl) atomG;
        return Long.valueOf(nu.getId()).intValue();
    } else {
        HetatomImpl he = (HetatomImpl) atomG;
        return Long.valueOf(he.getId()).intValue();
    }
}
/**
 * Makes sure the structure has an {@link EntityInfo} for the entity referenced by the
 * given struct_asym record.
 * <p>
 * Nothing is done when the structure already has an EntityInfo with that entity id, or
 * when no Entity record is known for it. Otherwise a new EntityInfo is created from the
 * Entity record (description, type and source data) and added to the structure. This
 * covers mmCIF files like 1yrm where all 3 of _entity_src_gen, _entity_src_nat and
 * _pdbx_entity_src_syn are missing.
 *
 * @param asym the struct_asym record whose entity id is used; an unparseable id is
 *             logged and treated as 0
 */
private void addEntities(StructAsym asym) {
    int entityId = 0;
    try {
        entityId = Integer.parseInt(asym.getEntity_id());
    } catch (NumberFormatException ex) {
        logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity",asym.getEntity_id());
    }
    Entity entity = getEntity(entityId);
    if (structure.getEntityById(entityId) != null) {
        // already registered, nothing to fill in
        return;
    }
    if (entity == null) {
        // without an Entity record there is no data to build the EntityInfo from
        return;
    }
    EntityInfo info = new EntityInfo();
    info.setMolId(entityId);
    info.setDescription(entity.getPdbx_description());
    EntityType type = EntityType.entityTypeFromString(entity.getType());
    if (type == null) {
        logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", entity.getType(), entityId);
    } else {
        info.setType(type);
    }
    addAncilliaryEntityData(asym, entityId, entity, info);
    structure.addEntityInfo(info);
    logger.debug("Adding Entity with entity id {} from _entity, with name: {}",entityId, info.getDescription());
}
/**
 * Copies source information onto the entity info from every collected
 * EntitySrcGen, EntitySrcNat and EntitySrcSyn record whose entity id equals the
 * entity id of the given struct_asym record.
 * <p>
 * If data is duplicated between the sources it is assumed to be consistent;
 * later records simply overwrite earlier ones. This is a potentially huge assumption.
 *
 * @param asym       the struct_asym record providing the entity id to match against
 * @param entityId   the numeric entity id, handed on to the per-source methods
 * @param entity     the Entity record (not used by this method)
 * @param entityInfo the entity info to fill in
 */
private void addAncilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) {
    for (EntitySrcGen genSource : entitySrcGens) {
        if (genSource.getEntity_id().equals(asym.getEntity_id())) {
            addInformationFromESG(genSource, entityId, entityInfo);
        }
    }
    for (EntitySrcNat natSource : entitySrcNats) {
        if (natSource.getEntity_id().equals(asym.getEntity_id())) {
            addInformationFromESN(natSource, entityId, entityInfo);
        }
    }
    for (EntitySrcSyn synSource : entitySrcSyns) {
        if (synSource.getEntity_id().equals(asym.getEntity_id())) {
            addInfoFromESS(synSource, entityId, entityInfo);
        }
    }
}
/**
 * Copies the organism, cell, ATCC and expression system fields of an EntitySrcGen
 * record onto an entity info.
 *
 * @param entitySrcInfo the EntitySrcGen record to read from
 * @param entityId      the entity id (not used by this method)
 * @param c             the entity info to write to
 */
private void addInformationFromESG(EntitySrcGen entitySrcInfo, int entityId, EntityInfo c) {
    // source organism
    c.setOrganismScientific(entitySrcInfo.getPdbx_gene_src_scientific_name());
    c.setOrganismCommon(entitySrcInfo.getGene_src_common_name());
    c.setOrganismTaxId(entitySrcInfo.getPdbx_gene_src_ncbi_taxonomy_id());
    c.setCell(entitySrcInfo.getPdbx_gene_src_cell());
    c.setAtcc(entitySrcInfo.getPdbx_gene_src_atcc());
    // expression host
    c.setExpressionSystem(entitySrcInfo.getPdbx_host_org_scientific_name());
    c.setExpressionSystemTaxId(entitySrcInfo.getPdbx_host_org_ncbi_taxonomy_id());
}
/**
 * Copies the organism, cell and ATCC fields of an EntitySrcNat record onto an entity info.
 *
 * @param esn the EntitySrcNat record to read from
 * @param eId the entity id (not used by this method)
 * @param c   the entity info to write to
 */
private void addInformationFromESN(EntitySrcNat esn, int eId, EntityInfo c) {
    c.setOrganismScientific(esn.getPdbx_organism_scientific());
    c.setOrganismCommon(esn.getCommon_name());
    c.setOrganismTaxId(esn.getPdbx_ncbi_taxonomy_id());
    c.setCell(esn.getPdbx_cell());
    c.setAtcc(esn.getPdbx_atcc());
}
/**
 * Copies the organism fields of an EntitySrcSyn record onto an entity info.
 *
 * @param ess the EntitySrcSyn record to read from
 * @param eId the entity id (not used by this method)
 * @param c   the entity info to write to
 */
private void addInfoFromESS(EntitySrcSyn ess, int eId, EntityInfo c) {
    c.setOrganismScientific(ess.getOrganism_scientific());
    c.setOrganismCommon(ess.getOrganism_common_name());
    c.setOrganismTaxId(ess.getNcbi_taxonomy_id());
}
/**
 * Fills the asym id to entity id map from the collected _struct_asym records and the
 * asym id to author chain id map from the collected _entity_poly records.
 * <p>
 * The author chain ids of an entity come from the comma-separated
 * {@code _entity_poly.pdbx_strand_id} value and are paired by position with the asym ids
 * that _struct_asym lists for the same entity. An entity is skipped (and logged) when its
 * strand id is missing, when _struct_asym has no asym ids for it, or when the two lists
 * have different lengths.
 */
private void initMaps() {
    if (structAsyms == null || structAsyms.isEmpty()) {
        logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
        return;
    }
    Map<String, List<String>> entityId2asymId = new HashMap<>();
    for (StructAsym asym : structAsyms) {
        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );
        asymId2entityId.put(asym.getId(), asym.getEntity_id());
        List<String> asymIds = entityId2asymId.get(asym.getEntity_id());
        if (asymIds == null) {
            asymIds = new ArrayList<>();
            entityId2asymId.put(asym.getEntity_id(), asymIds);
        }
        asymIds.add(asym.getId());
    }
    if (entityPolys==null || entityPolys.isEmpty()) {
        logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available for header only parsing");
        return;
    }
    for (EntityPoly ep : entityPolys) {
        if (ep.getPdbx_strand_id()==null) {
            logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to author ids for this entity.", ep.getEntity_id());
            continue;
        }
        String[] chainNames = ep.getPdbx_strand_id().split(",");
        List<String> asymIds = entityId2asymId.get(ep.getEntity_id());
        if (asymIds == null) {
            // the entity appears in _entity_poly but in no _struct_asym row: without this
            // guard the size comparison below would fail with a NullPointerException
            logger.warn("No asym ids (from _struct_asym) found for entity {} listed in _entity_poly. Can't provide a mapping from asym ids to author chain ids", ep.getEntity_id());
            continue;
        }
        if (chainNames.length!=asymIds.size()) {
            logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) for entity {} have different lengths! Can't provide a mapping from asym ids to author chain ids", ep.getEntity_id());
            continue;
        }
        for (int i=0; i<chainNames.length; i++) {
            asymId2authorId.put(asymIds.get(i), chainNames[i]);
        }
    }
}
/**
 * Converts the collected struct_ncs_oper records whose code is "generate" into 4x4
 * matrices and stores them as NCS operators in the crystallographic info of the structure.
 * <p>
 * Records with any other code, or with no code at all, are skipped. Records containing
 * values that can't be parsed as doubles are logged and skipped. If no operator could be
 * built, nothing is set.
 */
private void setStructNcsOps() {
    ArrayList<Matrix4d> ncsOperators = new ArrayList<Matrix4d>();
    for (StructNcsOper sNcsOper : structNcsOper) {
        // constant-first comparison: a record without a code must be skipped,
        // not abort parsing with a NullPointerException
        if (!"generate".equals(sNcsOper.getCode())) continue;
        try {
            Matrix4d op = new Matrix4d();
            // last row of the homogeneous transformation matrix
            op.setElement(3, 0, 0.0);
            op.setElement(3, 1, 0.0);
            op.setElement(3, 2, 0.0);
            op.setElement(3, 3, 1.0);
            // upper-left 3x3 block from the matrix items
            op.setElement(0, 0, Double.parseDouble(sNcsOper.getMatrix11()));
            op.setElement(0, 1, Double.parseDouble(sNcsOper.getMatrix12()));
            op.setElement(0, 2, Double.parseDouble(sNcsOper.getMatrix13()));
            op.setElement(1, 0, Double.parseDouble(sNcsOper.getMatrix21()));
            op.setElement(1, 1, Double.parseDouble(sNcsOper.getMatrix22()));
            op.setElement(1, 2, Double.parseDouble(sNcsOper.getMatrix23()));
            op.setElement(2, 0, Double.parseDouble(sNcsOper.getMatrix31()));
            op.setElement(2, 1, Double.parseDouble(sNcsOper.getMatrix32()));
            op.setElement(2, 2, Double.parseDouble(sNcsOper.getMatrix33()));
            // last column from the vector items
            op.setElement(0, 3, Double.parseDouble(sNcsOper.getVector1()));
            op.setElement(1, 3, Double.parseDouble(sNcsOper.getVector2()));
            op.setElement(2, 3, Double.parseDouble(sNcsOper.getVector3()));
            ncsOperators.add(op);
        } catch (NumberFormatException e) {
            logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", structNcsOper.indexOf(sNcsOper)+1);
        }
    }
    // we only set it if not empty, otherwise remains null
    if (ncsOperators.size()>0) {
        structure.getCrystallographicInfo().setNcsOperators(
                ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
    }
}
/**
 * Flags the crystallographic info of the structure as using a non-standard coordinate
 * frame convention when a crystal cell is present and its {@code checkScaleMatrix} call
 * rejects the parsed scale matrix; otherwise clears the flag.
 * Does nothing when no scale matrix was parsed.
 */
private void setCrystallographicInfoMetadata() {
    if (parsedScaleMatrix == null) {
        return;
    }
    PDBCrystallographicInfo xtalInfo = structure.getCrystallographicInfo();
    boolean nonStandardFrame = xtalInfo.getCrystalCell() != null
            && !xtalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix);
    xtalInfo.setNonStandardCoordFrameConvention(nonStandardFrame);
}
/**
 * Returns the structure object that this consumer fills in. It should be requested
 * once the parsing has been finished.
 *
 * @return a BioJava protein structure object
 */
public Structure getStructure() {
return structure;
}
/**
 * Appends a revision record to the PDB header of the structure. The header and its
 * list of revision records are created first when they don't exist yet.
 *
 * @param record the revision record to store
 */
@Override
public void newDatabasePDBrevRecord(DatabasePdbrevRecord record) {
    PDBHeader pdbHeader = structure.getPDBHeader();
    if (pdbHeader == null) {
        pdbHeader = new PDBHeader();
        structure.setPDBHeader(pdbHeader);
    }
    List<DatabasePdbrevRecord> revisions = pdbHeader.getRevisionRecords();
    if (revisions == null) {
        revisions = new ArrayList<DatabasePdbrevRecord>();
        pdbHeader.setRevisionRecords(revisions);
    }
    revisions.add(record);
}
/**
 * Sets dates on the PDB header from a database revision.
 * <p>
 * The modification date is taken from the date of every revision. For the revision
 * numbered "1" the deposition date is additionally taken from its original date.
 * Dates are expected in {@code yyyy-MM-dd} format; a date that can't be parsed is
 * logged and left unset. The header is created if the structure has none.
 *
 * @param dbrev the database revision
 */
@Override
public void newDatabasePDBrev(DatabasePDBrev dbrev) {
    SimpleDateFormat isoDay = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
    PDBHeader pdbHeader = structure.getPDBHeader();
    if (pdbHeader == null) {
        pdbHeader = new PDBHeader();
    }
    // only the first revision carries the deposition date
    if (dbrev.getNum().equals("1")) {
        try {
            Date deposited = isoDay.parse(dbrev.getDate_original());
            pdbHeader.setDepDate(deposited);
        } catch (ParseException ex) {
            logger.warn("Could not parse date string '{}', deposition date will be unavailable", dbrev.getDate_original());
        }
    }
    // every revision updates the modification date
    try {
        Date modified = isoDay.parse(dbrev.getDate());
        pdbHeader.setModDate(modified);
    } catch (ParseException ex) {
        logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
    }
    structure.setPDBHeader(pdbHeader);
}
/**
 * Extracts the resolution from the remark with id "2" (support for old style headers).
 * <p>
 * The five characters in front of the first occurrence of "ANGSTROM" in the remark text
 * are trimmed and parsed as a float, which is then set as the resolution of the PDB
 * header. Other remarks, and remarks where "ANGSTROM" is not found at an index greater
 * than 5, are ignored. A value that can't be parsed is logged and ignored.
 *
 * @param remark the remark record
 */
@Override
public void newDatabasePDBremark(DatabasePDBremark remark) {
    if (!remark.getId().equals("2")) {
        return;
    }
    // this remark field contains the resolution information
    String text = remark.getText();
    int unitPos = text.indexOf("ANGSTROM");
    if (unitPos <= 5) {
        return;
    }
    String candidate = text.substring(unitPos - 5, unitPos).trim();
    float parsed;
    try {
        parsed = Float.parseFloat(candidate);
    } catch (NumberFormatException ex) {
        logger.info("could not parse resolution from line and ignoring it " + text);
        return;
    }
    structure.getPDBHeader().setResolution(parsed);
}
/**
 * Sets resolution, Rfree and Rwork on the PDB header from a refine record.
 * <p>
 * In very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins,
 * 4n9m) there are several refine records. The last value found is used, so that behaviour
 * is like in PDB file parsing, and a warning is logged whenever the header already holds
 * a non-default value. A value that is absent or can't be parsed is logged and leaves the
 * header value unchanged.
 *
 * @param r the refine record
 */
@Override
public void newRefine(Refine r){
    PDBHeader pdbHeader = structure.getPDBHeader();
    // RESOLUTION
    if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
        logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
                ,r.getLs_d_res_high(), String.format("%4.2f",pdbHeader.getResolution()));
    }
    if (r.getLs_d_res_high()==null) {
        // Float.parseFloat(null) throws a NullPointerException, which the
        // NumberFormatException handler below would not catch
        logger.info("_refine.ls_d_res_high not present, not parsing resolution value");
    } else {
        try {
            pdbHeader.setResolution(Float.parseFloat(r.getLs_d_res_high()));
        } catch (NumberFormatException e){
            logger.info("Could not parse resolution from " + r.getLs_d_res_high() + " " + e.getMessage());
        }
    }
    // RFREE
    if (pdbHeader.getRfree()!=PDBHeader.DEFAULT_RFREE) {
        logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {} ",
                r.getLs_R_factor_R_free(), String.format("%4.2f",pdbHeader.getRfree()));
    }
    if (r.getLs_R_factor_R_free()==null) {
        // some entries like 2ifo haven't got this field at all
        logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
    } else {
        try {
            pdbHeader.setRfree(Float.parseFloat(r.getLs_R_factor_R_free()));
        } catch (NumberFormatException e){
            // no rfree present ('?') is very usual, that's why we set it to debug
            logger.debug("Could not parse Rfree from string '{}'", r.getLs_R_factor_R_free());
        }
    }
    // RWORK
    // the header's Rwork is compared against DEFAULT_RFREE, exactly as before
    if (pdbHeader.getRwork()!=PDBHeader.DEFAULT_RFREE) {
        logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
                r.getLs_R_factor_R_work(), String.format("%4.2f",pdbHeader.getRwork()));
    }
    if (r.getLs_R_factor_R_work()==null) {
        logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
    } else {
        try {
            pdbHeader.setRwork(Float.parseFloat(r.getLs_R_factor_R_work()));
        } catch (NumberFormatException e){
            logger.debug("Could not parse R-work from string '{}'", r.getLs_R_factor_R_work());
        }
    }
}
/**
 * Appends an author to the comma-separated author list of the PDB header.
 * <p>
 * The author name is reformatted first: all spaces are removed, and the text after the
 * first comma (the initials) is put in front of the text before it (the family name).
 * Any further commas are dropped. For example "Smith, J.A." becomes "J.A.Smith".
 *
 * @param aa the audit author record
 */
@Override
public void newAuditAuthor(AuditAuthor aa){
    String name = aa.getName();
    // purely local buffers: StringBuilder avoids the needless synchronisation of StringBuffer
    StringBuilder famName = new StringBuilder();
    StringBuilder initials = new StringBuilder();
    boolean afterComma = false;
    for (char c : name.toCharArray()) {
        if (c == ' ') {
            continue;
        }
        if (c == ',') {
            afterComma = true;
            continue;
        }
        if (afterComma) {
            initials.append(c);
        } else {
            famName.append(c);
        }
    }
    String newaa = initials.toString() + famName;
    PDBHeader header = structure.getPDBHeader();
    String auth = header.getAuthors();
    if (auth == null) {
        header.setAuthors(newaa);
    } else {
        header.setAuthors(auth + "," + newaa);
    }
}
/**
 * Sets the experimental technique of the PDB header to the method of the given record.
 *
 * @param exptl the exptl record
 */
@Override
public void newExptl(Exptl exptl) {
    structure.getPDBHeader().setExperimentalTechnique(exptl.getMethod());
}
/**
 * Builds a {@code CrystalCell} from the lengths and angles of a cell record and stores
 * it in the crystallographic info of the PDB header.
 * <p>
 * A cell that {@code isCellReasonable()} rejects is discarded without touching the
 * header. If any parameter can't be parsed as a float, the header's crystal cell is set
 * to null and the problem is logged.
 *
 * @param cell the cell record
 */
@Override
public void newCell(Cell cell) {
    try {
        // parse everything first, so that nothing is built from partially valid input
        float lenA = Float.parseFloat(cell.getLength_a());
        float lenB = Float.parseFloat(cell.getLength_b());
        float lenC = Float.parseFloat(cell.getLength_c());
        float angAlpha = Float.parseFloat(cell.getAngle_alpha());
        float angBeta = Float.parseFloat(cell.getAngle_beta());
        float angGamma = Float.parseFloat(cell.getAngle_gamma());

        CrystalCell parsedCell = new CrystalCell();
        parsedCell.setA(lenA);
        parsedCell.setB(lenB);
        parsedCell.setC(lenC);
        parsedCell.setAlpha(angAlpha);
        parsedCell.setBeta(angBeta);
        parsedCell.setGamma(angGamma);

        if (parsedCell.isCellReasonable()) {
            structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(parsedCell);
        } else {
            // If the entry describes a structure determined by a technique other than X-ray crystallography,
            // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
            // if so we don't add and CrystalCell will be null
            logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
                    CrystalCell.MIN_VALID_CELL_SIZE);
        }
    } catch (NumberFormatException ex) {
        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(null);
        logger.info("could not parse some cell parameters ("+ex.getMessage()+"), ignoring _cell ");
    }
}
/**
 * Looks up the space group named by the symmetry record (its Hermann-Mauguin name) and
 * records the result in the crystallographic info of the PDB header.
 * <p>
 * If the lookup finds a space group it is set and the non-standard flag is cleared.
 * Otherwise only the non-standard flag is set and a warning is logged.
 *
 * @param symmetry the symmetry record
 */
@Override
public void newSymmetry(Symmetry symmetry) {
    String hmName = symmetry.getSpace_group_name_H_M();
    SpaceGroup found = SymoplibParser.getSpaceGroup(hmName);
    boolean nonStandard = (found == null);
    if (nonStandard) {
        logger.warn("Space group '"+hmName+"' not recognised as a standard space group");
    } else {
        structure.getPDBHeader().getCrystallographicInfo().setSpaceGroup(found);
    }
    structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(nonStandard);
}
/**
 * Stores the record in the internal list of NCS operator records.
 *
 * @param sNcsOper the struct_ncs_oper record
 */
@Override
public void newStructNcsOper(StructNcsOper sNcsOper) {
structNcsOper.add(sNcsOper);
}
/**
 * Parses the fractional transformation matrix and vector of an atom_sites record into
 * a 4x4 matrix and keeps it as the parsed scale matrix.
 * <p>
 * If any value can't be parsed as a double, a warning is logged, the non-standard
 * coordinate frame convention flag of the crystallographic info is set to false and the
 * parsed scale matrix is left untouched.
 *
 * @param atomSites the atom_sites record
 */
public void newAtomSites(AtomSites atomSites) {
    try {
        // values are parsed row by row, left to right, so that the first unparseable
        // value determines the logged error message
        double m11 = Double.parseDouble(atomSites.getFract_transf_matrix11());
        double m12 = Double.parseDouble(atomSites.getFract_transf_matrix12());
        double m13 = Double.parseDouble(atomSites.getFract_transf_matrix13());
        double v1 = Double.parseDouble(atomSites.getFract_transf_vector1());

        double m21 = Double.parseDouble(atomSites.getFract_transf_matrix21());
        double m22 = Double.parseDouble(atomSites.getFract_transf_matrix22());
        double m23 = Double.parseDouble(atomSites.getFract_transf_matrix23());
        double v2 = Double.parseDouble(atomSites.getFract_transf_vector2());

        double m31 = Double.parseDouble(atomSites.getFract_transf_matrix31());
        double m32 = Double.parseDouble(atomSites.getFract_transf_matrix32());
        double m33 = Double.parseDouble(atomSites.getFract_transf_matrix33());
        double v3 = Double.parseDouble(atomSites.getFract_transf_vector3());

        parsedScaleMatrix = new Matrix4d(
                m11, m12, m13, v1,
                m21, m22, m23, v2,
                m31, m32, m33, v3,
                0, 0, 0, 1);
    } catch (NumberFormatException ex) {
        logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", ex.getMessage());
        structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
        // in this case parsedScaleMatrix stays null and can't be used in documentEnd()
    }
}
/**
 * Logs the record at debug level and stores it in the internal list of StructRef
 * records, where {@code getStructRef} looks it up by id.
 *
 * @param sref the struct_ref record
 */
@Override
public void newStructRef(StructRef sref) {
logger.debug(sref.toString());
strucRefs.add(sref);
}
/**
 * Finds the first stored StructRef record with the given id.
 *
 * @param ref_id the id to look for
 * @return the matching record, or null if there is none
 */
private StructRef getStructRef(String ref_id){
    StructRef match = null;
    for (StructRef candidate : strucRefs) {
        if (candidate.getId().equals(ref_id)) {
            match = candidate;
            break;
        }
    }
    return match;
}
/**
 * Creates a DBRef record from the StructRefSeq record and adds it to the DBRefs of the
 * structure:
 * <pre>
 * PDB record DBREF
 * Field Name                      mmCIF Data Item
 * Section                         n.a.
 * PDB_ID_Code                     _struct_ref_seq.pdbx_PDB_id_code
 * Strand_ID                       _struct_ref_seq.pdbx_strand_id
 * Begin_Residue_Number            _struct_ref_seq.pdbx_auth_seq_align_beg
 * Begin_Ins_Code                  _struct_ref_seq.pdbx_seq_align_beg_ins_code
 * End_Residue_Number              _struct_ref_seq.pdbx_auth_seq_align_end
 * End_Ins_Code                    _struct_ref_seq.pdbx_seq_align_end_ins_code
 * Database                        _struct_ref.db_name
 * Database_Accession_No           _struct_ref_seq.pdbx_db_accession
 * Database_ID_Code                _struct_ref.db_code
 * Database_Begin_Residue_Number   _struct_ref_seq.db_align_beg
 * Database_Begin_Ins_Code         _struct_ref_seq.pdbx_db_align_beg_ins_code
 * Database_End_Residue_Number     _struct_ref_seq.db_align_end
 * Database_End_Ins_Code           _struct_ref_seq.pdbx_db_align_end_ins_code
 * </pre>
 * Records whose alignment positions (author or database) can't be parsed as integers
 * are logged and skipped. Insertion codes that are '?', empty or absent are stored as a
 * blank.
 *
 * @param sref the struct_ref_seq record
 */
@Override
public void newStructRefSeq(StructRefSeq sref) {
    DBRef r = new DBRef();
    r.setIdCode(sref.getPdbx_PDB_id_code());
    r.setDbAccession(sref.getPdbx_db_accession());
    // fallback value, replaced by _struct_ref.db_code below when the StructRef is known
    r.setDbIdCode(sref.getPdbx_db_accession());
    r.setChainId(sref.getPdbx_strand_id());
    StructRef structRef = getStructRef(sref.getRef_id());
    if (structRef == null){
        logger.info("could not find StructRef " + sref.getRef_id() + " for StructRefSeq " + sref);
    } else {
        r.setDatabase(structRef.getDb_name());
        r.setDbIdCode(structRef.getDb_code());
    }

    int seqbegin;
    int seqend;
    try {
        seqbegin = Integer.parseInt(sref.getPdbx_auth_seq_align_beg());
        seqend = Integer.parseInt(sref.getPdbx_auth_seq_align_end());
    } catch (NumberFormatException e) {
        logger.info("Couldn't parse sequence alignment positions.");
        logger.debug(e.toString());
        return;
    }
    r.setSeqBegin(seqbegin);
    r.setInsertBegin(insCodeOrBlank(sref.getPdbx_seq_align_beg_ins_code()));
    r.setSeqEnd(seqend);
    r.setInsertEnd(insCodeOrBlank(sref.getPdbx_seq_align_end_ins_code()));

    int dbseqbegin;
    int dbseqend;
    try {
        // guarded like the author positions above: an unparseable value must not
        // abort the whole parsing run
        dbseqbegin = Integer.parseInt(sref.getDb_align_beg());
        dbseqend = Integer.parseInt(sref.getDb_align_end());
    } catch (NumberFormatException e) {
        logger.info("Couldn't parse database sequence alignment positions.");
        logger.debug(e.toString());
        return;
    }
    r.setDbSeqBegin(dbseqbegin);
    r.setIdbnsBegin(insCodeOrBlank(sref.getPdbx_db_align_beg_ins_code()));
    r.setDbSeqEnd(dbseqend);
    r.setIdbnsEnd(insCodeOrBlank(sref.getPdbx_db_align_end_ins_code()));

    List<DBRef> dbrefs = structure.getDBRefs();
    if (dbrefs == null) {
        dbrefs = new ArrayList<DBRef>();
    }
    dbrefs.add(r);
    logger.debug(r.toPDB());
    structure.setDBRefs(dbrefs);
}

/**
 * Converts an insertion code string into a single character: the first character of
 * the string, or a blank if the string is null, empty or starts with '?'.
 */
private static char insCodeOrBlank(String insCode) {
    if (insCode == null || insCode.isEmpty()) {
        return ' ';
    }
    char code = insCode.charAt(0);
    return code == '?' ? ' ' : code;
}
/**
 * Stores the record in the internal list of sequence differences.
 *
 * @param sref the struct_ref_seq_dif record
 */
@Override
public void newStructRefSeqDif(StructRefSeqDif sref) {
sequenceDifs.add(sref);
}
/**
 * Returns the entity chain whose id equals the given entity id. If no such chain exists
 * yet, a new one is created, given that id, added to the list of entity chains and returned.
 *
 * @param entity_id the entity id used as chain id
 * @return the existing or newly created chain
 */
private Chain getEntityChain(String entity_id){
    Chain result = null;
    for (Chain candidate : entityChains) {
        if (candidate.getId().equals(entity_id)) {
            result = candidate;
            break;
        }
    }
    if (result == null) {
        // does not exist yet, so create...
        result = new ChainImpl();
        result.setId(entity_id);
        entityChains.add(result);
    }
    return result;
}
//private Chain getSeqResChain(String chainID){
// return getChainFromList(seqResChains, chainID);
//}
/**
 * Stores the record in the internal list of EntitySrcGen records; the records are
 * mapped to entity information later on.
 * <p>
 * Data items in the ENTITY_SRC_GEN category record details of
 * the source from which the entity was obtained in cases
 * where the source was genetically manipulated. The
 * following are treated separately: items pertaining to the tissue
 * from which the gene was obtained, items pertaining to the host
 * organism for gene expression and items pertaining to the actual
 * producing organism (plasmid).
 *
 * @param entitySrcGen the entity_src_gen record
 */
@Override
public void newEntitySrcGen(EntitySrcGen entitySrcGen){
// add to internal list. Map to Compound object later on...
entitySrcGens.add(entitySrcGen);
}
/**
 * Stores the record in the internal list of EntitySrcNat records; the records are
 * mapped to entity information later on.
 *
 * @param entitySrcNat the entity_src_nat record
 */
@Override
public void newEntitySrcNat(EntitySrcNat entitySrcNat){
// add to internal list. Map to Compound object later on...
entitySrcNats.add(entitySrcNat);
}
/**
 * Stores the record in the internal list of EntitySrcSyn records; the records are
 * mapped to entity information later on.
 *
 * @param entitySrcSyn the pdbx_entity_src_syn record
 */
@Override
public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){
// add to internal list. Map to Compound object later on...
entitySrcSyns.add(entitySrcSyn);
}
/**
 * Adds one sequence position of an entity to the corresponding entity chain.
 * <p>
 * The EntityPolySeq records provide the sequence groups for the entities. Later on the
 * entities are mapped to the BioJava {@link Chain} and {@link EntityInfo} objects.
 * The group is taken from the chemical component dictionary when that yields a non-empty
 * definition. Otherwise the type is guessed: a three-letter monomer id with a known
 * one-letter amino acid code becomes an amino acid, a nucleotide id becomes a nucleotide,
 * and anything else becomes a het group. Records whose entity can't be found are logged
 * and ignored.
 *
 * @param epolseq the EntityPolySeq record for one sequence position
 */
@Override
public void newEntityPolySeq(EntityPolySeq epolseq) {
    logger.debug("NEW entity poly seq " + epolseq);
    int entityId = -1;
    try {
        entityId = Integer.parseInt(epolseq.getEntity_id());
    } catch (NumberFormatException ex) {
        logger.warn("Could not parse entity id from EntityPolySeq: "+ex.getMessage());
    }
    if (getEntity(entityId) == null) {
        logger.info("Could not find entity "+ epolseq.getEntity_id()+". Can not match sequence to it.");
        return;
    }
    Chain entityChain = getEntityChain(epolseq.getEntity_id());
    String monId = epolseq.getMon_id();

    // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
    // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
    Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(monId);
    boolean knownToDictionary = group != null && !group.getChemComp().isEmpty();
    if (knownToDictionary) {
        if (group instanceof AminoAcidImpl) {
            ((AminoAcidImpl) group).setRecordType(AminoAcid.SEQRESRECORD);
        }
    } else if (monId.length() == 3 && StructureTools.get1LetterCodeAmino(monId) != null) {
        AminoAcidImpl aminoAcid = new AminoAcidImpl();
        aminoAcid.setRecordType(AminoAcid.SEQRESRECORD);
        aminoAcid.setAminoType(StructureTools.get1LetterCodeAmino(monId));
        group = aminoAcid;
    } else if (StructureTools.isNucleotide(monId)) {
        // the group is actually a nucleotide group...
        group = new NucleotideImpl();
    } else {
        logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id());
        group = new HetatomImpl();
    }

    // at this stage we don't know about author residue numbers (insertion codes)
    // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly sequential and follow the seqres sequence 1 to n)
    // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
    group.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
    group.setPDBName(monId);
    entityChain.addGroup(group);
}
/**
 * Receives a PdbxPolySeqScheme record. This implementation ignores it: the code that
 * used it to replace group sequence positions is disabled (see the comment in the body).
 *
 * @param ppss the pdbx_poly_seq_scheme record (unused)
 */
@Override
public void newPdbxPolySeqScheme(PdbxPolySeqScheme ppss) {
//if ( headerOnly)
// return;
// replace the group asym ids with the real PDB ids!
// replaceGroupSeqPos(ppss); // This might be incorrect in some pdb, to use auth_seq_id of the pdbx_poly_seq_scheme.
}
/**
 * Receives a PdbxNonPolyScheme record. This implementation ignores it.
 *
 * @param ppss the pdbx_nonpoly_scheme record (unused)
 */
@Override
public void newPdbxNonPolyScheme(PdbxNonPolyScheme ppss) {
//if (headerOnly)
// return;
// merge the EntityPolySeq info and the AtomSite chains into one...
//already known ignore:
}
/**
 * Receives a PdbxEntityNonPoly record. Not implemented yet: the record's entity id,
 * name and component id are only logged at debug level.
 *
 * @param pen the pdbx_entity_nonpoly record
 */
@Override
public void newPdbxEntityNonPoly(PdbxEntityNonPoly pen){
// TODO: do something with them...
// not implemented yet...
logger.debug(pen.getEntity_id() + " " + pen.getName() + " " + pen.getComp_id());
}
/**
 * Receives a ChemComp record. This implementation ignores it.
 *
 * @param c the chem_comp record (unused)
 */
@Override
public void newChemComp(ChemComp c) {
// TODO: do something with them...
}
/**
 * Receives the data of a category that has no dedicated callback.
 * This implementation ignores it.
 *
 * @param category   the category name (unused)
 * @param loopFields the field names (unused)
 * @param lineData   the values (unused)
 */
@Override
public void newGenericData(String category, List<String> loopFields,
List<String> lineData) {
//logger.debug("unhandled category so far: " + category);
}
/**
 * Returns the file parsing parameters used by this consumer.
 *
 * @return the current parameters
 */
@Override
public FileParsingParameters getFileParsingParameters()
{
return params;
}
/**
 * Sets the file parsing parameters used by this consumer.
 *
 * @param params the parameters to use from now on
 */
@Override
public void setFileParsingParameters(FileParsingParameters params)
{
this.params = params;
}
/**
 * Receives a ChemCompDescriptor record. This implementation ignores it.
 *
 * @param ccd the chem_comp_descriptor record (unused)
 */
@Override
public void newChemCompDescriptor(ChemCompDescriptor ccd) {
// TODO nothing happening here yet.
}
/**
 * Returns the internal list of PdbxStructOperList records (the list itself, not a copy).
 *
 * @return the list of struct oper records
 */
public List<PdbxStructOperList> getStructOpers() {
return structOpers;
}
/**
 * Stores the record in the internal list returned by {@link #getStructAssemblies()}.
 *
 * @param strucAssembly the pdbx_struct_assembly record
 */
@Override
public void newPdbxStrucAssembly(PdbxStructAssembly strucAssembly) {
strucAssemblies.add(strucAssembly);
}
/**
 * Returns the internal list of PdbxStructAssembly records received so far
 * (the list itself, not a copy).
 *
 * @return the list of struct assembly records
 */
public List<PdbxStructAssembly> getStructAssemblies(){
return strucAssemblies;
}
/**
 * Stores the record in the internal list returned by {@link #getStructAssemblyGens()}.
 *
 * @param strucAssembly the pdbx_struct_assembly_gen record
 */
@Override
public void newPdbxStrucAssemblyGen(PdbxStructAssemblyGen strucAssembly) {
strucAssemblyGens.add(strucAssembly);
}
/**
 * Returns the internal list of PdbxStructAssemblyGen records received so far
 * (the list itself, not a copy).
 *
 * @return the list of struct assembly gen records
 */
public List<PdbxStructAssemblyGen> getStructAssemblyGens(){
return strucAssemblyGens;
}
/**
 * Receives a ChemCompAtom record. This implementation ignores it.
 *
 * @param atom the chem_comp_atom record (unused)
 */
@Override
public void newChemCompAtom(ChemCompAtom atom) {
}
/**
 * Receives a PdbxChemCompIdentifier record. This implementation ignores it.
 *
 * @param id the pdbx_chem_comp_identifier record (unused)
 */
@Override
public void newPdbxChemCompIndentifier(PdbxChemCompIdentifier id) {
}
/**
 * Receives a ChemCompBond record. This implementation ignores it.
 *
 * @param bond the chem_comp_bond record (unused)
 */
@Override
public void newChemCompBond(ChemCompBond bond) {
}
/**
 * Receives a PdbxChemCompDescriptor record. This implementation ignores it.
 *
 * @param desc the pdbx_chem_comp_descriptor record (unused)
 */
@Override
public void newPdbxChemCompDescriptor(PdbxChemCompDescriptor desc) {
}
/**
 * Stores the record in the internal list of struct_conn records, which
 * {@code addBonds()} hands to the bond maker.
 *
 * @param structConn the struct_conn record
 */
@Override
public void newStructConn(StructConn structConn) {
this.structConn.add(structConn);
}
/**
 * Stores the record in the internal list of struct_site_gen records, from which
 * {@code addSites()} builds the sites.
 *
 * @param siteGen the struct_site_gen record
 */
@Override
public void newStructSiteGen(StructSiteGen siteGen) { this.structSiteGens.add(siteGen); }
/**
 * Registers a site in the structure, or updates the already registered site that has the
 * same site id, setting its id and description from the given record.
 * Does nothing when only the header is being parsed.
 *
 * @param structSite the struct_site record
 */
@Override
public void newStructSite(StructSite structSite) {
    if (params.isHeaderOnly()) {
        return;
    }
    List<Site> knownSites = structure.getSites();
    if (knownSites == null) {
        knownSites = new ArrayList<Site>();
    }
    // reuse an existing site with the same id to prevent duplicate site ids
    // (no early exit: if several match, the last one is used)
    Site target = null;
    for (Site candidate : knownSites) {
        if (candidate.getSiteID().equals(structSite.getId())) {
            target = candidate;
        }
    }
    boolean isNew = (target == null);
    if (isNew) {
        target = new Site();
    }
    target.setSiteID(structSite.getId());
    target.setDescription(structSite.getDetails());
    // site.setPdbxEvidenceCode(structSite.getPdbxEvidenceCode()); // TODO - add addition fields in Sites
    if (isNew) {
        knownSites.add(target);
    }
    structure.setSites(knownSites);
}
/**
 * Builds sites in the BioJava Structure from the struct_site_gen records that have been parsed.
 * <p>
 * For every record the chain is looked up by its label_asym_id and the residue by its
 * author sequence number and insertion code (a '?' insertion code counts as none).
 * If a residue is found, it is added to the site with the record's site id. A new site
 * is created and registered when none with that id exists yet. Residues whose name
 * doesn't match the record's label_comp_id (or whose record has no label_comp_id) are
 * logged and not added to the site. Records whose residue can't be found are skipped.
 */
private void addSites() {
    List<Site> sites = structure.getSites();
    if (sites == null) sites = new ArrayList<Site>();
    for (StructSiteGen siteGen : structSiteGens) {
        // For each StructSiteGen, find the residues involved, if they exist then
        String site_id = siteGen.getSite_id(); // multiple could be in same site.
        if (site_id == null) site_id = "";
        String comp_id = siteGen.getLabel_comp_id(); // PDBName
        // Assumption: the author chain ID and residue number for the site is consistent with the original
        // author chain id and residue numbers.
        String asymId = siteGen.getLabel_asym_id(); // chain name
        String authId = siteGen.getAuth_asym_id(); // chain Id
        String auth_seq_id = siteGen.getAuth_seq_id(); // Res num
        String insCode = siteGen.getPdbx_auth_ins_code();
        if (insCode != null && insCode.equals("?")) {
            insCode = null;
        }
        // Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname.
        Group g = null;
        try {
            Chain chain = structure.getChain(asymId);
            if (null != chain) {
                try {
                    Character insChar = null;
                    if (null != insCode && insCode.length() > 0) insChar = insCode.charAt(0);
                    g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
                } catch (NumberFormatException e) {
                    logger.warn("Could not lookup residue : " + authId + auth_seq_id);
                }
            }
        } catch (StructureException e) {
            // parameterized message; the previous call passed an extra argument that had no placeholder
            logger.warn("Problem finding residue in site entry {} - {}", siteGen.getSite_id(), e.getMessage());
        }
        if (g == null) {
            continue;
        }
        // 2. find the site_id, if not existing, create anew.
        Site site = null;
        for (Site asite : sites) {
            if (site_id.equals(asite.getSiteID())) site = asite;
        }
        boolean addSite = false;
        // 3. add this residue to the site.
        if (site == null) {
            addSite = true;
            site = new Site();
            site.setSiteID(site_id);
        }
        List<Group> groups = site.getGroups();
        if (groups == null) groups = new ArrayList<Group>();
        // Check the self-consistency of the residue reference from auth_seq_id and chain_id.
        // A missing comp_id can't be checked and is treated as a mismatch instead of
        // failing with a NullPointerException.
        if (comp_id == null || !comp_id.equals(g.getPDBName())) {
            logger.warn("comp_id doesn't match the residue at " + authId + " " + auth_seq_id + " - skipping");
        } else {
            groups.add(g);
            site.setGroups(groups);
        }
        if (addSite) sites.add(site);
    }
    structure.setSites(sites);
}
}