/* $Revision$ $Author$ $Date$ * * Copyright (C) 1997-2007 The Chemistry Development Kit (CDK) project * * Contact: cdk-devel@lists.sourceforge.net * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * All we ask is that proper credit is given for our work, which includes * - but is not limited to - adding the above copyright notice to the beginning * of your source code files, and to any copyright notice that you may distribute * with programs based on this work. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * */ package org.openscience.cdk.io; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Hashtable; import java.util.Map; import javax.vecmath.Point3d; import org.openscience.cdk.CDKConstants; import org.openscience.cdk.annotations.TestClass; import org.openscience.cdk.annotations.TestMethod; import org.openscience.cdk.config.AtomTypeFactory; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.exception.NoSuchAtomTypeException; import org.openscience.cdk.graph.rebond.RebondTool; import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomType; import org.openscience.cdk.interfaces.IBioPolymer; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.interfaces.IChemFile; import org.openscience.cdk.interfaces.IChemModel; import org.openscience.cdk.interfaces.IChemObject; import org.openscience.cdk.interfaces.IChemSequence; import org.openscience.cdk.interfaces.IMolecule; import org.openscience.cdk.interfaces.IMoleculeSet; import org.openscience.cdk.interfaces.IMonomer; import org.openscience.cdk.interfaces.IStrand; import org.openscience.cdk.io.formats.IResourceFormat; import org.openscience.cdk.io.formats.PDBFormat; import org.openscience.cdk.io.setting.BooleanIOSetting; import org.openscience.cdk.io.setting.IOSetting; import org.openscience.cdk.protein.data.PDBAtom; import org.openscience.cdk.protein.data.PDBMonomer; import org.openscience.cdk.protein.data.PDBPolymer; import org.openscience.cdk.protein.data.PDBStrand; import org.openscience.cdk.protein.data.PDBStructure; import org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; /** * Reads the contents of a PDBFile. * * <p>A description can be found at <a href="http://www.rcsb.org/pdb/static.do?p=file_formats/pdb/index.html"> * http://www.rcsb.org/pdb/static.do?p=file_formats/pdb/index.html</a>. * * @cdk.module pdb * @cdk.githash * * @author Edgar Luttmann * @author Bradley Smith <bradley@baysmith.com> * @author Martin Eklund <martin.eklund@farmbio.uu.se> * @author Ola Spjuth <ola.spjuth@farmbio.uu.se> * @cdk.created 2001-08-06 * @cdk.keyword file format, PDB * @cdk.bug 1714141 * @cdk.bug 1794439 */ @TestClass("org.openscience.cdk.io.PDBReaderTest") public class PDBReader extends DefaultChemObjectReader { private static ILoggingTool logger = LoggingToolFactory.createLoggingTool(PDBReader.class); private BufferedReader _oInput; // The internal used BufferedReader private BooleanIOSetting useRebondTool; private BooleanIOSetting readConnect; private Map atomNumberMap; /* * This is a temporary store for bonds from CONNECT records. * As CONNECT is deliberately fully redundant (a->b and b->a) * we need to use this to weed out the duplicates. */ private ArrayList bondsFromConnectRecords; private static AtomTypeFactory pdbFactory; /** * * Contructs a new PDBReader that can read Molecules from a given * InputStream. * * @param oIn The InputStream to read from * */ public PDBReader(InputStream oIn) { this(new InputStreamReader(oIn)); } /** * * Contructs a new PDBReader that can read Molecules from a given * Reader. * * @param oIn The Reader to read from * */ public PDBReader(Reader oIn) { _oInput = new BufferedReader(oIn); initIOSettings(); pdbFactory = null; } public PDBReader() { this(new StringReader("")); } @TestMethod("testGetFormat") public IResourceFormat getFormat() { return PDBFormat.getInstance(); } @TestMethod("testSetReader_Reader") public void setReader(Reader input) throws CDKException { if (input instanceof BufferedReader) { this._oInput = (BufferedReader)input; } else { this._oInput = new BufferedReader(input); } } @TestMethod("testSetReader_InputStream") public void setReader(InputStream input) throws CDKException { setReader(new InputStreamReader(input)); } @TestMethod("testAccepts") public boolean accepts(Class classObject) { Class[] interfaces = classObject.getInterfaces(); for (int i=0; i<interfaces.length; i++) { if (IChemFile.class.equals(interfaces[i])) return true; } Class superClass = classObject.getSuperclass(); if (superClass != null) return this.accepts(superClass); return false; } /** * * Takes an object which subclasses IChemObject, e.g. Molecule, and will * read this (from file, database, internet etc). If the specific * implementation does not support a specific IChemObject it will throw * an Exception. * * @param oObj The object that subclasses IChemObject * @return The IChemObject read * @exception CDKException * */ public IChemObject read(IChemObject oObj) throws CDKException { if (oObj instanceof IChemFile) { if (pdbFactory == null) { try { pdbFactory = AtomTypeFactory.getInstance("org/openscience/cdk/config/data/pdb_atomtypes.xml", ((IChemFile)oObj).getBuilder()); } catch (Exception exception) { logger.debug(exception); throw new CDKException("Could not setup list of PDB atom types! " + exception.getMessage(), exception); } } return readChemFile((IChemFile)oObj); } else { throw new CDKException("Only supported is reading of ChemFile objects."); } } /** * Read a <code>ChemFile</code> from a file in PDB format. The molecules * in the file are stored as <code>BioPolymer</code>s in the * <code>ChemFile</code>. The residues are the monomers of the * <code>BioPolymer</code>, and their names are the concatenation of the * residue, chain id, and the sequence number. Separate chains (denoted by * TER records) are stored as separate <code>BioPolymer</code> molecules. * * <p>Connectivity information is not currently read. * * @return The ChemFile that was read from the PDB file. */ private IChemFile readChemFile(IChemFile oFile) { // initialize all containers IChemSequence oSeq = oFile.getBuilder().newChemSequence(); IChemModel oModel = oFile.getBuilder().newChemModel(); IMoleculeSet oSet = oFile.getBuilder().newMoleculeSet(); // some variables needed String cCol; PDBAtom oAtom; PDBPolymer oBP = new PDBPolymer(); IMolecule molecularStructure = oFile.getBuilder().newMolecule(); StringBuffer cResidue; String oObj; IMonomer oMonomer; String cRead = ""; char chain = 'A'; // To ensure stringent name giving of monomers IStrand oStrand; int lineLength = 0; boolean isProteinStructure = false; atomNumberMap = new Hashtable(); if (readConnect.isSet()) { bondsFromConnectRecords = new ArrayList(); } // do the reading of the Input try { do { cRead = _oInput.readLine(); logger.debug("Read line: ", cRead); if (cRead != null) { lineLength = cRead.length(); if (lineLength < 80) { logger.warn("Line is not of the expected length 80!"); } // make sure the record name is 6 characters long if (lineLength < 6) { cRead = cRead + " "; } // check the first column to decide what to do cCol = cRead.substring(0,6); if ("SEQRES".equalsIgnoreCase(cCol)) { isProteinStructure = true; } else if ("ATOM ".equalsIgnoreCase(cCol)) { // read an atom record oAtom = readAtom(cRead, lineLength); if (isProteinStructure) { // construct a string describing the residue cResidue = new StringBuffer(8); oObj = oAtom.getResName(); if (oObj != null) { cResidue = cResidue.append(oObj.trim()); } oObj = oAtom.getChainID(); if (oObj != null) { // cResidue = cResidue.append(((String)oObj).trim()); cResidue = cResidue.append(String.valueOf(chain)); } oObj = oAtom.getResSeq(); if (oObj != null) { cResidue = cResidue.append(oObj.trim()); } // search for an existing strand or create a new one. String strandName = oAtom.getChainID(); if (strandName == null || strandName.length() == 0) { strandName = String.valueOf(chain); } oStrand = oBP.getStrand(strandName); if (oStrand == null) { oStrand = new PDBStrand(); oStrand.setStrandName(strandName); oStrand.setID(String.valueOf(chain)); } // search for an existing monomer or create a new one. oMonomer = oBP.getMonomer(cResidue.toString(), String.valueOf(chain)); if (oMonomer == null) { PDBMonomer monomer = new PDBMonomer(); monomer.setMonomerName(cResidue.toString()); monomer.setMonomerType(oAtom.getResName()); monomer.setChainID(oAtom.getChainID()); monomer.setICode(oAtom.getICode()); monomer.setResSeq(oAtom.getResSeq()); oMonomer = monomer; } // add the atom oBP.addAtom(oAtom, oMonomer, oStrand); if (readConnect.isSet() && atomNumberMap.put(new Integer(oAtom.getSerial()), oAtom) != null) { logger.warn("Duplicate serial ID found for atom: ", oAtom); } } else { molecularStructure.addAtom(oAtom); } logger.debug("Added ATOM: ", oAtom); /** As HETATMs cannot be considered to either belong to a certain monomer or strand, * they are dealt with seperately.*/ } else if("HETATM".equalsIgnoreCase(cCol)) { // read an atom record oAtom = readAtom(cRead, lineLength); oAtom.setHetAtom(true); oBP.addAtom(oAtom); if (atomNumberMap.put(new Integer(oAtom.getSerial()), oAtom) != null) { logger.warn("Duplicate serial ID found for atom: ", oAtom); } logger.debug("Added HETATM: ", oAtom); } else if ("TER ".equalsIgnoreCase(cCol)) { // start new strand chain++; oStrand = new PDBStrand(); oStrand.setStrandName(String.valueOf(chain)); logger.debug("Added new STRAND"); } else if ("END ".equalsIgnoreCase(cCol)) { atomNumberMap.clear(); if (isProteinStructure) { // create bonds and finish the molecule if (useRebondTool.isSet()) { try { if(!createBondsWithRebondTool(oBP)) { // Get rid of all potentially created bonds. logger.info("Bonds could not be created using the RebondTool when PDB file was read."); oBP.removeAllBonds(); } } catch (Exception exception) { logger.info("Bonds could not be created when PDB file was read."); logger.debug(exception); } } oSet.addMolecule(oBP); } else { oSet.addMolecule(molecularStructure); } } else if (cCol.equals("MODEL ")) { // OK, start a new model and save the current one first *if* it contains atoms if (isProteinStructure) { if (oBP.getAtomCount() > 0) { // save the model oSet.addAtomContainer(oBP); oModel.setMoleculeSet(oSet); oSeq.addChemModel(oModel); // setup a new one oBP = new PDBPolymer(); oModel = oFile.getBuilder().newChemModel(); oSet = oFile.getBuilder().newMoleculeSet(); } } else { if (molecularStructure.getAtomCount() > 0) { // save the model oSet.addAtomContainer(molecularStructure); oModel.setMoleculeSet(oSet); oSeq.addChemModel(oModel); // setup a new one molecularStructure = oFile.getBuilder().newMolecule(); oModel = oFile.getBuilder().newChemModel(); oSet = oFile.getBuilder().newMoleculeSet(); } } } else if ("REMARK".equalsIgnoreCase(cCol)) { Object comment = oFile.getProperty(CDKConstants.COMMENT); if (comment == null) { comment = ""; } if (lineLength >12) { comment = comment.toString() + cRead.substring(11).trim() + System.getProperty("line.separator"); oFile.setProperty(CDKConstants.COMMENT, comment); } else { logger.warn("REMARK line found without any comment!"); } } else if ("COMPND".equalsIgnoreCase(cCol)) { String title = cRead.substring(10).trim(); oFile.setProperty(CDKConstants.TITLE, title); } /************************************************************* * Read connectivity information from CONECT records. * Only covalent bonds are dealt with. Perhaps salt bridges * should be dealt with in the same way..? */ else if (readConnect.isSet() && "CONECT".equalsIgnoreCase(cCol)) { cRead.trim(); if (cRead.length() < 16) { logger.debug("Skipping unexpected empty CONECT line! : ", cRead); } else { String bondAtom = cRead.substring(7, 11).trim(); int bondAtomNo = Integer.parseInt(bondAtom); String bondedAtom = cRead.substring(12, 16).trim(); int bondedAtomNo = -1; try {bondedAtomNo = Integer.parseInt(bondedAtom);} catch(Exception e) {bondedAtomNo = -1;} if(bondedAtomNo != -1) { addBond(oBP, bondAtomNo, bondedAtomNo); logger.warn("Bonded " + bondAtomNo + " with " + bondedAtomNo); } if(cRead.length() > 17) { bondedAtom = cRead.substring(17, 21); bondedAtom = bondedAtom.trim(); try {bondedAtomNo = Integer.parseInt(bondedAtom);} catch(Exception e) {bondedAtomNo = -1;} if(bondedAtomNo != -1) { addBond(oBP, bondAtomNo, bondedAtomNo); logger.warn("Bonded " + bondAtomNo + " with " + bondedAtomNo); } } if(cRead.length() > 22) { bondedAtom = cRead.substring(22, 26); bondedAtom = bondedAtom.trim(); try {bondedAtomNo = Integer.parseInt(bondedAtom);} catch(Exception e) {bondedAtomNo = -1;} if(bondedAtomNo != -1) { addBond(oBP, bondAtomNo, bondedAtomNo); logger.warn("Bonded " + bondAtomNo + " with " + bondedAtomNo); } } if(cRead.length() > 27) { bondedAtom = cRead.substring(27, 31); bondedAtom = bondedAtom.trim(); try {bondedAtomNo = Integer.parseInt(bondedAtom);} catch(Exception e) {bondedAtomNo = -1;} if(bondedAtomNo != -1) { addBond(oBP, bondAtomNo, bondedAtomNo); logger.warn("Bonded " + bondAtomNo + " with " + bondedAtomNo); } } } } /*************************************************************/ else if ("HELIX ".equalsIgnoreCase(cCol)) { // HELIX 1 H1A CYS A 11 LYS A 18 1 RESIDUE 18 HAS POSITIVE PHI 1D66 72 // 1 2 3 4 5 6 7 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 PDBStructure structure = new PDBStructure(); structure.setStructureType(PDBStructure.HELIX); structure.setStartChainID(cRead.charAt(19)); structure.setStartSequenceNumber(Integer.parseInt(cRead.substring(21, 25).trim())); structure.setStartInsertionCode(cRead.charAt(25)); structure.setEndChainID(cRead.charAt(31)); structure.setEndSequenceNumber(Integer.parseInt(cRead.substring(33, 37).trim())); structure.setEndInsertionCode(cRead.charAt(37)); oBP.addStructure(structure); } else if ("SHEET ".equalsIgnoreCase(cCol)) { PDBStructure structure = new PDBStructure(); structure.setStructureType(PDBStructure.SHEET); structure.setStartChainID(cRead.charAt(21)); structure.setStartSequenceNumber(Integer.parseInt(cRead.substring(22, 26).trim())); structure.setStartInsertionCode(cRead.charAt(26)); structure.setEndChainID(cRead.charAt(32)); structure.setEndSequenceNumber(Integer.parseInt(cRead.substring(33, 37).trim())); structure.setEndInsertionCode(cRead.charAt(37)); oBP.addStructure(structure); } else if ("TURN ".equalsIgnoreCase(cCol)) { PDBStructure structure = new PDBStructure(); structure.setStructureType(PDBStructure.TURN); structure.setStartChainID(cRead.charAt(19)); structure.setStartSequenceNumber(Integer.parseInt(cRead.substring(20, 24).trim())); structure.setStartInsertionCode(cRead.charAt(24)); structure.setEndChainID(cRead.charAt(30)); structure.setEndSequenceNumber(Integer.parseInt(cRead.substring(31, 35).trim())); structure.setEndInsertionCode(cRead.charAt(35)); oBP.addStructure(structure); } // ignore all other commands } } while (_oInput.ready() && (cRead != null)); } catch (Exception e) { logger.error("Found a problem at line:"); logger.error(cRead); logger.error("01234567890123456789012345678901234567890123456789012345678901234567890123456789"); logger.error(" 1 2 3 4 5 6 7 "); logger.error(" error: " + e.getMessage()); logger.debug(e); } // try to close the Input try { _oInput.close(); } catch (Exception e) { logger.debug(e); } // Set all the dependencies oModel.setMoleculeSet(oSet); oSeq.addChemModel(oModel); oFile.addChemSequence(oSeq); return oFile; } private void addBond(PDBPolymer obp, int bondAtomNo, int bondedAtomNo) { IAtom firstAtom = (PDBAtom)atomNumberMap.get(new Integer(bondAtomNo)); IAtom secondAtom = (PDBAtom)atomNumberMap.get(new Integer(bondedAtomNo)); if (firstAtom == null) { logger.error("Could not find bond start atom in map with serial id: ", bondAtomNo); } if (secondAtom == null) { logger.error("Could not find bond target atom in map with serial id: ", bondAtomNo); } IBond bond = firstAtom.getBuilder().newBond( firstAtom, secondAtom, IBond.Order.SINGLE); for (int i = 0; i < bondsFromConnectRecords.size(); i++) { IBond existingBond = (IBond) bondsFromConnectRecords.get(i); IAtom a = existingBond.getAtom(0); IAtom b = existingBond.getAtom(1); if ((a == firstAtom && b == secondAtom) || (b == firstAtom && a == secondAtom)) { // already stored return; } } bondsFromConnectRecords.add(bond); obp.addBond(bond); } private boolean createBondsWithRebondTool(IBioPolymer pol){ RebondTool tool = new RebondTool(2.0, 0.5, 0.5); try { // configure atoms AtomTypeFactory factory = AtomTypeFactory.getInstance("org/openscience/cdk/config/data/jmol_atomtypes.txt", pol.getBuilder()); java.util.Iterator atoms = pol.atoms().iterator(); while (atoms.hasNext()) { IAtom atom = (IAtom)atoms.next(); try { IAtomType[] types = factory.getAtomTypes(atom.getSymbol()); if (types.length > 0) { // just pick the first one AtomTypeManipulator.configure(atom, types[0]); } else { logger.warn("Could not configure atom with symbol: "+ atom.getSymbol()); } } catch (Exception e) { logger.warn("Could not configure atom (but don't care): " + e.getMessage()); logger.debug(e); } } tool.rebond(pol); } catch (Exception e) { logger.error("Could not rebond the polymer: " + e.getMessage()); logger.debug(e); } return true; } /** * Creates an <code>Atom</code> and sets properties to their values from * the ATOM record. If the line is shorter than 80 characters, the information * past 59 characters is treated as optional. If the line is shorter than 59 * characters, a <code>RuntimeException</code> is thrown. * * @param cLine the PDB ATOM record. * @return the <code>Atom</code> created from the record. * @throws RuntimeException if the line is too short (less than 59 characters). */ private PDBAtom readAtom(String cLine, int lineLength) { // a line looks like: // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 // ATOM 1 O5* C A 1 20.662 36.632 23.475 1.00 10.00 114D 45 // ATOM 1186 1H ALA 1 10.105 5.945 -6.630 1.00 0.00 1ALE1288 if (lineLength < 59) { throw new RuntimeException("PDBReader error during readAtom(): line too short"); } String elementSymbol = cLine.substring(12, 14).trim(); if (elementSymbol.length() == 2) { // ensure that the second char is lower case elementSymbol = elementSymbol.charAt(0) + elementSymbol.substring(1).toLowerCase(); } String rawAtomName = cLine.substring(12, 16).trim(); String resName = cLine.substring(17, 20).trim(); try { IAtomType type = pdbFactory.getAtomType(resName+"."+rawAtomName); elementSymbol = type.getSymbol(); } catch (NoSuchAtomTypeException e) { logger.error("Did not recognize PDB atom type: " + resName+"."+rawAtomName); } PDBAtom oAtom = new PDBAtom(elementSymbol, new Point3d(Double.parseDouble(cLine.substring(30, 38)), Double.parseDouble(cLine.substring(38, 46)), Double.parseDouble(cLine.substring(46, 54)) ) ); oAtom.setRecord(cLine); oAtom.setSerial(Integer.parseInt(cLine.substring(6, 11).trim())); oAtom.setName(rawAtomName.trim()); oAtom.setAltLoc(cLine.substring(16, 17).trim()); oAtom.setResName(resName); oAtom.setChainID(cLine.substring(21, 22).trim()); oAtom.setResSeq(cLine.substring(22, 26).trim()); oAtom.setICode(cLine.substring(26, 27).trim()); oAtom.setAtomTypeName(oAtom.getResName()+"."+rawAtomName); if (lineLength >= 59) { String frag = cLine.substring(54, 60).trim(); if (frag.length() > 0) { oAtom.setOccupancy(Double.parseDouble(frag)); } } if (lineLength >= 65) { String frag = cLine.substring(60, 66).trim(); if (frag.length() > 0) { oAtom.setTempFactor(Double.parseDouble(frag)); } } if (lineLength >= 75) { oAtom.setSegID(cLine.substring(72, 76).trim()); } // if (lineLength >= 78) { // oAtom.setSymbol((new String(cLine.substring(76, 78))).trim()); // } if (lineLength >= 79) { String frag = cLine.substring(78, 80).trim(); if (frag.length() > 0) { oAtom.setCharge(Double.parseDouble(frag)); } } /************************************************************************************* * It sets a flag in the property content of an atom, * which is used when bonds are created to check if the atom is an OXT-record => needs * special treatment. */ String oxt = cLine.substring(13, 16).trim(); if(oxt.equals("OXT")) { oAtom.setOxt(true); } else { oAtom.setOxt(false); } /*************************************************************************************/ return oAtom; } @TestMethod("testClose") public void close() throws IOException { _oInput.close(); } private void initIOSettings() { useRebondTool = new BooleanIOSetting("UseRebondTool", IOSetting.LOW, "Should the PDBReader deduce bonding patterns?", "false"); readConnect = new BooleanIOSetting("ReadConnectSection", IOSetting.LOW, "Should the CONECT be read?", "true"); } public void customizeJob() { fireIOSettingQuestion(useRebondTool); fireIOSettingQuestion(readConnect); } public IOSetting[] getIOSettings() { IOSetting[] settings = new IOSetting[2]; settings[0] = useRebondTool; settings[1] = readConnect; return settings; } }