/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on Jun 8, 2010
 * Author: Jianjiong Gao
 * Author: Peter W. Rose
 *
 */
package org.biojava.nbio.protmod.structure;

import junit.framework.TestCase;
import org.biojava.nbio.protmod.ProteinModification;
import org.biojava.nbio.protmod.ProteinModificationRegistry;
import org.biojava.nbio.structure.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/**
 * Runs {@link ProteinModificationIdentifier} against a table of PDB entries and
 * checks that at least one modified compound is identified in each of them.
 *
 * <p>Every table row is a pair {@code {pdbId, residId}}. {@code residId} may be
 * {@code null}, in which case the structure is only checked against the full
 * set of registered modifications.
 *
 * @author Jianjiong Gao
 * @since 3.0
 */
public class ProteinModificationParserTest extends TestCase {

    private static final Logger logger = LoggerFactory.getLogger(ProteinModificationParserTest.class);

    /** Debugging switch: set to {@code true} to log every identified compound per structure. */
    private static final boolean PRINT_RESULTS = false;

    /** Table of {@code {pdbId, residId}} rows exercised by {@link #multiTest()}. */
    private String[][] strucs;

    /**
     * Selects the table of test cases. The short table is used; the long one is
     * kept as a commented-out alternative.
     */
    @Override
    public void setUp() {
        strucs = setUpShortTest();
        // strucs = setUpLongTest();
    }

    /**
     * Builds the reduced table of test cases.
     *
     * @return rows of {@code {pdbId, residId}}; {@code residId} is {@code null}
     *         when no specific RESID entry is checked for that structure
     */
    public static String[][] setUpShortTest() {
        String[][] strucs = new String[][] {
            // {"1cdg", null},
            //
            // // Attachments
            // {"3HN3", "AA0151"}, // NAG
            // {"1ZNF", "AA0053"}, // ACE on THR
            {"1MCC", "AA0045"}, // ACE on GLU
            {"1SCY", "AA0089"}, // NH2 on HIS
            {"5HNE", "AA0119"}, // PLP on LYS

            // Modified residues
            {"1UIS", "AA0183"}, // NRQ
            {"3MVJ", "AA0037"}, // SEP
            {"5EUN", "AA0119"}, // LLP

            // remediation changed 1DOJ...
            //{"1DOJ", "AA0065"}, // MEA
            {"1DOJ", "AA0172"}, // TYS
            //{"3H5R", "AA0302"}, // SNN, note: SNN is not at C-terminal in some structures, e.g. 3I4W

            // Cross link
            {"1UIS", "AA0379"}, // NRQ
            {"3M6S", "AA0025"}, // Disulfide bond
            {"1A6L", "AA0139"}, // F3S
            {"1A70", "AA0137"}, // FES
            {"1RPB", "AA0216"}, // Isopeptide (Cys - ASP)
            {"1FP4", "AA0141"}, // CFM, HCA, CYS, HIS
            {"1EMA", "AA0183"}, // CRO, cross-link1
            {"2IWK", "AA0298"}, // CU4
            {"1SU6", "AA0310"}, // NFS, 5 CYS, HIS
            {"2AXR", "AA0436"}, // CYS-FAD-HIS
            {"3H8L", "AA0513"}, // CYS-S3H-CYS
            {"1CAD", null}, // FE and 4 Cys, cross-link4

            // Terbium cases
            {"1NCZ", null},
            {"3LTQ", null}, // has metalc,
            {"4ESQ", null},
            {"1TJB", null},
            {"2V15", null},
            {"2K61", null},

            // iron bond to CYS
            //{"1G20","AA0300"},
            {"3CM6", null},
            {"1W6Z", null},
            //{"1Z2M",null}, distances are too big for the new cutoffs
            {"2O6N", null},
            {"1GA7", null},
            {"1ACD", "AA0262"}, // test for CSD
            {"1AA6", "AA0022"}, // test for SEC
            {"1WCT", "AA0179"},
            {"2VH3", "AA0459"},

            // Chromophores
            {"2HGD", null}, // X9Q
            {"3LF4", null}, // 0YG
            // {"2BF9","AA0099"}, // TYC this one needs a fix in the CCD before it can work
        };
        return strucs;
    }

    /**
     * Builds the extended table of test cases. It is not selected by
     * {@link #setUp()} unless the corresponding line there is uncommented.
     *
     * @return rows of {@code {pdbId, residId}}; {@code residId} is {@code null}
     *         when no specific RESID entry is checked for that structure
     */
    public static String[][] setUpLongTest() {
        String[][] strucs = new String[][] {
            // Attachments
            {"3HN3", "AA0151"}, // NAG
            {"1CPO", "AA0406"}, // XYS
            {"1AL2", "AA0059"}, // MYR
            {"1L9H", "AA0106"}, // PLM
            {"1BDO", "AA0117"}, // BTN
            //{"2IQD", "AA0118"}, // no successful test case for LPA
            {"1AT9", "AA0120"}, // RET
            //{"1DJP", "AA0121"}, // DO2, (bond length error 3.0)
            {"1ALL", "AA0131"}, // CYC
            {"1B8D", "AA0132"}, // PEB
            {"1OGP", "AA0142"}, // MTQ
            {"1EL5", "AA0143"}, // FAD on CYS
            {"1W1O", "AA0144"}, // FAD on HIS
            {"1DII", "AA0145"}, // FAD on TYR
            {"2LML", "AA0150"}, // PNS
            {"1D7E", "AA0207"}, // HC4
            {"2TMD", "AA0220"}, // FMN
            {"1VAO", "AA0221"}, // FAD on HIS
            {"1PDA", "AA0252"}, // DPM
            {"2J96", "AA0258"}, // PVN
            {"2HIL", "AA0264"}, // OPE
            //{"1RTX", "AA0329"}, // HEM, (bond length error 3.0, much closer to FE)
            {"1FEH", "AA0334"}, // HC1
            //{"2Z6D", "AA0351"}, // FMN, (bond length error 2.0)
            {"1N63", "AA0355"}, // CYS-CUN-MCN
            {"1HXQ", "AA0372"}, // U5P on HIS
            {"1QI9", "AA0395"}, // VO4 on HIS
            {"1XG0", "AA0428"}, // DBV on CYS
            //{"1E9W", "AA0447"}, // TSI on ILE, error when reading
            {"2HIL", "AA0497"}, // OPE on SER
            {"3I3L", "AA0522"}, // FAD on ASP
            {"1MCC", "AA0045"}, // ACE on GLU
            {"5CPV", "AA0041"}, // ACE on ALA
            {"1BBR", "AA0042"}, // ACE on ASP
            {"1AL1", "AA0044"}, // ACE on GLU
            {"1SEM", "AA0050"}, // ACE on PRO
            {"1PVB", "AA0051"}, // ACE on SER
            {"1ZNF", "AA0053"}, // ACE on THR
            {"1SCY", "AA0089"}, // NH2 on HIS

            // Modified residues
            {"3MVJ", "AA0037"}, // SEP
            {"3MVJ", "AA0038"}, // TPO
            {"1KZU", "AA0021"}, // FME
            {"1AA6", "AA0022"}, // CSE
            {"1NT0", "AA0026"}, // AHB
            {"1ERM", "AA0027"}, // BHD
            {"1QGW", "AA0028"}, // LYZ
            {"2G66", "AA0029"}, // HY3
            {"2G66", "AA0030"}, // HYP
            {"1A39", "AA0031"}, // PCA
            {"1AG7", "AA0032"}, // CGU
            {"1D5W", "AA0033"}, // PHD
            {"1H9C", "AA0034"}, // CSP
            {"1EUD", "AA0035"}, // NEP
            {"1NSQ", "AA0036"}, // HIP
            {"3LXN", "AA0039"}, // PTR
            {"1ZM2", "AA0040"}, // DDE
            {"1E0Z", "AA0055"}, // ALY
            {"1DM3", "AA0056"}, // SCY
            // {"2NPP", "AA0061"}, // MAA
            {"1GK8", "AA0064"}, // MME
            {"2PIL", "AA0065"}, // MEA
            {"1DOJ", "AA0172"}, // TYS
            {"1G42", "AA0067"}, // 2MR
            {"2B2U", "AA0068"}, // DA2
            {"2B2U", "AA0074"}, // M3L
            {"1ALL", "AA0070"}, // MEN
            {"3FMY", "AA0071"}, // MEQ
            {"1E6Y", "AA0073"}, // MHS
            {"1E6Y", "AA0272"}, // AGM
            {"1IV8", "AA0075"}, // MLY
            {"1IV8", "AA0076"}, // MLZ
            {"1ZTO", "AA0082"}, // AAR
            {"2V1S", "AA0085"}, // CY3
            {"1XXP", "AA0091"}, // CLE
            // {"1XAE", "AA0094"}, // NFA, C-terminal modification, but occurs in non-terminal residue
            // {"2H9E", "AA0095"}, // LPD
            // {"2BF9", "AA0099"}, // TYC, error reading PDB file
            // {"1YYL", "AA0100"}, // VLM
            {"1AEX", "AA0101"}, // SCH
            {"1OMW", "AA0105"}, // CMT
            {"2C0J", "AA0106"}, // P1L
            {"1AA1", "AA0114"}, // KCX
            {"1O5K", "AA0115"}, // MCL
            {"1A8I", "AA0119"}, // LLP
            {"2J4Y", "AA0120"}, // LYR
            //PVL not exist in PDB
            {"1A2V", "AA0147"}, // TPQ
            {"1JJU", "AA0148"}, // TRQ
            {"1WCT", "AA0155"}, // GTH
            {"1A2C", "AA0172"}, // TYS
            {"1WCT", "AA0179"}, // BTR
            {"1AUK", "AA0185"}, // FGL
            {"148L", "AA0191"}, // DAL
            // {"1C4B", "AA0192"}, // DIL
            {"1T5M", "AA0196"}, // DSG
            // {"1CZQ", "AA0198"}, // DTR
            {"2JUE", "AA0199"}, // DTH
            {"1A7Y", "AA0200"}, // DVA
            {"1CXP", "AA0205"}, // CSO
            {"1F8W", "AA0205"}, // CSX
            {"1FFV", "AA0215"}, // ARO
            {"1CKN", "AA0228"}, // GPL
            {"1BUW", "AA0230"}, // SNC
            {"1CZI", "AA0234"}, // SMC
            {"1E93", "AA0251"}, // OMT
            {"1ACD", "AA0262"}, // CSD
            {"1C0T", "AA0262"}, // CSW
            {"1E6Y", "AA0265"}, // GL3
            {"1BI0", "AA0269"}, // CSS
            {"1E6Y", "AA0272"}, // AGM
            {"1HBM", "AA0273"}, // MGN
            {"1FFU", "AA0277"}, // CSZ
            {"3H5R", "AA0302"}, // SNN, note: SNN is not at C-terminal in some structures, e.g. 3I4W
            {"1NKK", "AA0311"}, // DMH
            {"1J6Z", "AA0317"}, // HIC
            {"1B80", "AA0322"}, // HTR
            {"1CWM", "AA0336"}, // IML
            {"1BCK", "AA0337"}, // MLE
            {"1EA7", "AA0361"}, // OSE
            {"1TYS", "AA0363"}, // CXM
            {"1EBV", "AA0364"}, // OAS
            {"2VZK", "AA0423"}, // TH5
            {"2IU4", "AA0431"}, // HIQ
            {"1Y9A", "AA0432"}, // OHS
            {"2IUW", "AA0444"}, // LED
            {"1K83", "AA0449"}, // ILX
            {"2VH3", "AA0458"}, // FGL
            {"1DSR", "AA0478"}, // AHB
            {"1AIQ", "AA0493"}, // CXM
            {"1CF0", "AA0509"}, // IYR
            {"1CTP", "AA0510"}, // TYI
            {"3L4M", "AA0520"}, // 0AF
            {"4ECA", "AA0525"}, // AEI

            // Cross link
            {"3M6S", "AA0025"}, // Disulfide bond
            {"1A6L", "AA0139"}, // F3S
            {"1A70", "AA0137"}, // FES
            {"1RPB", "AA0216"}, // Isopeptide (Cys - ASP)
            {"3B2M", "AA0294"}, // isopeptide (Lys - Asn)
            {"1FP4", "AA0141"}, // CFM, HCA, CYS, HIS
            {"1M1N", "AA0141"}, // CFN, HCA, CYS, HIS
            //{"1G21", "AA0141"}, // CFM, HCA, CYS, HIS, (bond length error 0.5)
            //{"1M34", "AA0141"}, // CFM, HCA, CYS, HIS, (bond length error 1.0)
            {"1G7K", "AA0183"}, // CRQ, cross-link1
            {"1EMA", "AA0183"}, // CRO, cross-link1
            //{"1GGE", "AA0250"}, // HIS-TYR, cross-link2, (bond length error 0.6)
            {"2JE3", "AA0271"}, // HEC, CYS, CYS, LYS
            //{"1MHL", "AA0280"}, // not work for HEM
            //{"1MYP", "AA0280"}, // not work for HEM
            //{"3HML", "AA0283"}, // PQQ, GLU, TYR, (bond length error 2)
            {"1FWX", "AA0298"}, // CU4
            {"1QNI", "AA0298"}, // CU4
            {"2IWF", "AA0298"}, // CU4
            {"2IWK", "AA0298"}, // CU4
            //{"1G20", "AA0300"}, // CLF (bond length error 20)
            {"1SU6", "AA0310"}, // NFS, 5 CYS, HIS
            // {"1SU7", "AA0310"}, // NFS, 5 CYS, HIS (looks like 6 CYS are linked)
            //{"1JJU", "AA0313"}, // CYS-TRP, (bond length error 3)
            {"1JJU", "AA0314"}, // CYS-ASP
            {"1JJU", "AA0315"}, // CYS-GLU
            //{"1AJ1", "AA0330"}, // CYS-THR, could not find.
            {"1PXQ", "AA0340"}, // CYS-PHE
            {"1PXQ", "AA0342"}, // CYS-THR
            {"1ITK", "AA0348"}, // MET-TYR-TRP
            //{"1R30", "AA0356"}, // 3 CYS-SF4-SAM (bond length error 0.6)
            {"1R30", "AA0357"}, // 3 CYS-FES-ARG
            // {"1S5L", "AA0366"}, // 2 ASP-3 GLU-HIT-OEC (bond length error 6)
            {"1NGK", "AA0368"}, //TYR-TYR
            {"1YZW", "AA0378"}, // CRU
            {"1XQM", "AA0379"}, // CH6
            {"1UIS", "AA0379"}, // NRQ
            {"2OJK", "AA0380"}, // NYG
            {"2A46", "AA0381"}, // CR7
            {"1YZW", "AA0183"}, // CRU
            {"1XQM", "AA0183"}, // CH6
            {"1UIS", "AA0183"}, // NRQ
            {"2OJK", "AA0183"}, // NYG
            {"2A46", "AA0183"}, // CR7
            {"2AXR", "AA0436"}, // CYS-FAD-HIS
            {"2QH7", "AA0438"}, // 3 CYS-FES-HIS
            //{"2VUM", "AA0451"}, // CYS-TRP (bond length error 2)
            {"3EE4", "AA0490"}, // VAL-TYR
            {"3H8L", "AA0513"}, // CYS-S3H-CYS
            {"1CAD", null}, // FE and 4 Cys, cross-link4

            // Terbium
            {"1NCZ", null},
            {"3LTQ", null},
            {"4ESQ", null},
            {"1TJB", null},
            {"2V15", null},
            {"2K61", null},

            // Chromophores
            {"2HGD", null}, // X9Q
            {"3LF4", null}, // 0YG
        };
        return strucs;
    }

    /**
     * JUnit entry point: runs {@link #multiTest()} over the table chosen in
     * {@link #setUp()}.
     */
    public void testParser() throws IOException, StructureException {
        multiTest();
    }

    /**
     * Checks every row of the table twice at most: first against all registered
     * modifications, then, if the row names a RESID ID, against the
     * modifications registered under that ID only.
     */
    public void multiTest() throws IOException, StructureException {
        for (String[] name : strucs) {
            // The cast selects the (String, String) overload for a null RESID ID.
            parserTest(name[0], (String) null);
            if (name[1] != null) {
                parserTest(name[0], name[1]);
            }
        }
    }

    /**
     * Resolves the set of modifications to look for and delegates to
     * {@link #parserTest(String, Set)}.
     *
     * @param pdbId   PDB ID of the structure to load
     * @param residId RESID ID to restrict the search to, or {@code null} to
     *                search for all registered modifications
     */
    private void parserTest(String pdbId, String residId) throws IOException, StructureException {
        Set<ProteinModification> mods;
        if (residId == null) {
            mods = ProteinModificationRegistry.allModifications();
        } else {
            mods = ProteinModificationRegistry.getByResidId(residId);
            // Guard in case the registry hands back null for an ID it does not
            // know; without it the failure would be an anonymous
            // NullPointerException further down.
            assertNotNull("No modification registered for RESID " + residId
                    + " (structure " + pdbId + ")", mods);
        }
        parserTest(pdbId, mods);
    }

    /**
     * Loads a structure, runs the identifier on it with the given modifications
     * and fails if no modified compound is identified.
     *
     * @param pdbId PDB ID of the structure to load
     * @param mods  modifications to look for; must not be empty
     */
    private void parserTest(String pdbId, Set<ProteinModification> mods) throws IOException, StructureException {
        // Check the precondition before fetching the structure so that a bad
        // table row is reported against the structure it belongs to.
        assertFalse("No modifications to look for in " + pdbId, mods.isEmpty());

        Structure struc = TmpAtomCache.cache.getStructure(pdbId);

        /*
        //needed for testing 1G20
        if ( pdbId.equalsIgnoreCase("1G20")) {
            Structure n = new StructureImpl();
            n.addChain(struc.getPolyChainByPDB("A"));
            n.addChain(struc.getPolyChainByPDB("B"));
            for (Chain c : struc.getNonPolyChainsByPDB("A")) n.addChain(c);
            for (Chain c : struc.getNonPolyChainsByPDB("B")) n.addChain(c);
            struc = n;
        }
        */

        ProteinModificationIdentifier parser = new ProteinModificationIdentifier();
        boolean recordUnidentifiable = false;
        parser.setRecordUnidentifiableCompounds(recordUnidentifiable);
        //parser.setbondLengthTolerance(2);

        parser.identify(struc, mods);

        //System.out.println(parser.getUnidentifiableModifiedResidues());

        Set<ModifiedCompound> identified = parser.getIdentifiedModifiedCompound();
        if (identified.isEmpty()) {
            String msg = "Did not identify any modified compounds for " + pdbId;
            logger.warn(msg);
            fail(msg);
        }

        if (PRINT_RESULTS) {
            printResult(pdbId, parser, recordUnidentifiable);
        }
    }

    /**
     * Logs, at info level, the compounds identified by {@code parser} and,
     * optionally, the residues and linkages it could not identify.
     *
     * @param pdbId                PDB ID used as the heading of the report
     * @param parser               identifier that has already been run
     * @param recordUnidentifiable whether to include the unidentifiable
     *                             residues and linkages in the report
     */
    private void printResult(String pdbId, ProteinModificationIdentifier parser, boolean recordUnidentifiable) {
        StringBuilder sb = new StringBuilder();
        sb.append("===").append(pdbId).append("===\n");

        int i = 0;
        for (ModifiedCompound mc : parser.getIdentifiedModifiedCompound()) {
            sb.append("Modification #").append(++i);
            sb.append(" ").append(mc.getDescription());
            sb.append(" ").append(mc.getModification().getId());
            sb.append(":\n");
            sb.append(mc.getAtomLinkages());
            sb.append('\n');
        }

        if (recordUnidentifiable) {
            i = 0;
            for (StructureGroup group : parser.getUnidentifiableModifiedResidues()) {
                sb.append("Unidenfied modified residue #").append(++i);
                sb.append(":\n");
                sb.append(group);
                sb.append('\n');
            }

            i = 0;
            for (StructureAtomLinkage link : parser.getUnidentifiableAtomLinkages()) {
                sb.append("Unidenfied linkage #").append(++i);
                sb.append(":\n");
                sb.append(link);
                sb.append('\n');
            }
        }

        logger.info(sb.toString());
    }

    /**
     * Exercises the two cook book snippets below and checks that each yields a
     * non-empty result.
     *
     * Note: if you change this unit test, also change the cook book:
     * http://www.biojava.org/wiki/BioJava:CookBook3:ProtMod
     */
    public void testCookBookTestCases() throws StructureException, IOException {
        // identify all modifications from PDB:1CAD and print them
        String pdbId = "1CAD";
        Structure struc = TmpAtomCache.cache.getStructure(pdbId);
        Set<ModifiedCompound> mcs = identifyAllModfications(struc);
        assertFalse(mcs.isEmpty());

        // identify all phosphosites from PDB:3MVJ and print them
        pdbId = "3MVJ";
        struc = TmpAtomCache.cache.getStructure(pdbId);
        List<ResidueNumber> psites = identifyPhosphosites(struc);
        assertFalse(psites.isEmpty());
    }

    /**
     * Runs a fresh identifier on the structure without restricting the
     * modifications and returns the compounds it identified.
     *
     * Note: if you change this unit test, also change the cook book:
     * http://www.biojava.org/wiki/BioJava:CookBook3:ProtMod
     *
     * @param struc structure to analyse
     * @return the identified modified compounds
     */
    private Set<ModifiedCompound> identifyAllModfications(Structure struc) {
        ProteinModificationIdentifier parser = new ProteinModificationIdentifier();
        parser.identify(struc);
        Set<ModifiedCompound> mcs = parser.getIdentifiedModifiedCompound();
        return mcs;
    }

    /**
     * Identifies the modifications registered under the keyword
     * "phosphoprotein" and collects the residue numbers of the groups returned
     * by {@code getGroups(true)} for each identified compound.
     *
     * Note: if you change this unit test, also change the cook book:
     * http://www.biojava.org/wiki/BioJava:CookBook3:ProtMod
     *
     * @param struc structure to analyse
     * @return residue numbers of the collected groups, in iteration order
     */
    private List<ResidueNumber> identifyPhosphosites(Structure struc) {
        List<ResidueNumber> phosphosites = new ArrayList<ResidueNumber>();
        ProteinModificationIdentifier parser = new ProteinModificationIdentifier();
        parser.identify(struc, ProteinModificationRegistry.getByKeyword("phosphoprotein"));
        Set<ModifiedCompound> mcs = parser.getIdentifiedModifiedCompound();
        for (ModifiedCompound mc : mcs) {
            Set<StructureGroup> groups = mc.getGroups(true);
            for (StructureGroup group : groups) {
                phosphosites.add(group.getPDBResidueNumber());
            }
        }
        return phosphosites;
    }
}