/**
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created by Spencer Bliven
 *
 */
package org.biojava.nbio.structure.io;

import junit.framework.TestCase;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.junit.Test;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Tests for {@link StructureSequenceMatcher}, using PDB entry 2PTC as the
 * reference structure.
 *
 * <p>NOTE(review): this class extends the JUnit 3 {@code TestCase} while also
 * carrying JUnit 4 {@code @Test} annotations. Presumably the tests are picked
 * up through their {@code test} name prefix; confirm against the runner in use
 * before relying on any JUnit 4-only feature here.
 *
 * @author Spencer Bliven
 *
 */
public class StructureSequenceMatcherTest extends TestCase {

    /** Structure loaded for "2PTC" in {@link #setUp()}. */
    private Structure struct1;
    /** Expected residue number (as a string) for each position of {@link #seq1}. */
    private String[] pdbNum1;
    /** Expected one-letter sequence: chain E followed by chain I of 2PTC. */
    private String seq1;

    /**
     * Loads structure 2PTC through an {@link AtomCache} and initialises the
     * expected sequence together with the per-position residue numbers.
     *
     * @throws IOException if the structure cannot be read
     * @throws StructureException if the structure cannot be loaded
     */
    @Override
    public void setUp() throws IOException, StructureException {
        String name1 = "2PTC";

        AtomCache cache = new AtomCache();
        struct1 = cache.getStructure(name1);

        pdbNum1 = new String[] {
                // first block: pairs with the 2PTC:E part of seq1
                "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
                "26", "27", "28", "29", "30", "31", "32", "33", "34", "37",
                "38", "39", "40", "41", "42", "43", "44", "45", "46", "47",
                "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
                "58", "59", "60", "61", "62", "63", "64", "65", "66", "67",
                "69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
                "79", "80", "81", "82", "83", "84", "85", "86", "87", "88",
                "89", "90", "91", "92", "93", "94", "95", "96", "97", "98",
                "99", "100", "101", "102", "103", "104", "105", "106", "107", "108",
                "109", "110", "111", "112", "113", "114", "115", "116", "117", "118",
                "119", "120", "121", "122", "123", "124", "125", "127", "128", "129",
                "130", "132", "133", "134", "135", "136", "137", "138", "139", "140",
                "141", "142", "143", "144", "145", "146", "147", "148", "149", "150",
                "151", "152", "153", "154", "155", "156", "157", "158", "159", "160",
                "161", "162", "163", "164", "165", "166", "167", "168", "169", "170",
                "171", "172", "173", "174", "175", "176", "177", "178", "179", "180",
                "181", "182", "183", "184A", "184", "185", "186", "187", "188A", "188",
                "189", "190", "191", "192", "193", "194", "195", "196", "197", "198",
                "199", "200", "201", "202", "203", "204", "209", "210", "211", "212",
                "213", "214", "215", "216", "217", "219", "220", "221A", "221", "222",
                "223", "224", "225", "226", "227", "228", "229", "230", "231", "232",
                "233", "234", "235", "236", "237", "238", "239", "240", "241", "242",
                "243", "244", "245",
                // second block: pairs with the 2PTC:I part of seq1
                "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
                "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
                "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
                "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
                "51", "52", "53", "54", "55", "56", "57", "58"
        };

        seq1 =
                //>2PTC:E|PDBID|CHAIN|SEQUENCE
                "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNT"+
                "LNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNM"+
                "FCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"+
                //>2PTC:I|PDBID|CHAIN|SEQUENCE
                "RPDFCLEPPYTGPCKARIIRYFYNAKAGLCQTFVYGGCRAKRNNFKSAEDCMRTCGGA";

        // Fixture sanity check: every sequence position needs exactly one
        // expected residue number. assertEquals reports both lengths on failure.
        assertEquals("Fixture mismatch between seq1 and pdbNum1 lengths",
                pdbNum1.length, seq1.length());

        /*// report some stats
        System.out.println("The SEQRES and ATOM information is available via the chains:");
        int modelnr = 0 ; // also is 0 if structure is an XRAY structure.
        List<Chain> chains = struct1.getChains(modelnr);
        for (Chain cha:chains){
            List<Group> agr = cha.getAtomGroups(GroupType.AMINOACID);
            List<Group> hgr = cha.getAtomGroups(GroupType.HETATM);
            List<Group> ngr = cha.getAtomGroups(GroupType.NUCLEOTIDE);

            System.out.print("chain: >"+cha.getChainID()+"<");
            System.out.print(" length SEQRES: " +cha.getSeqResLength());
            System.out.print(" length ATOM: " +cha.getAtomLength());
            System.out.print(" aminos: " +agr.size());
            System.out.print(" hetatms: "+hgr.size());
            System.out.println(" nucleotides: "+ngr.size());
        }
        System.out.println(prot.toString());
        */
    }

    /**
     * Takes positions 30 (inclusive) to 40 (exclusive) of the expected
     * sequence and checks that the matching substructure consists of a single
     * chain with ten amino-acid groups spelling out that subsequence.
     *
     * @throws CompoundNotFoundException if the subsequence cannot be turned
     *         into a {@link ProteinSequence}
     */
    @Test
    public void testSubstructureMatchingProteinSequence() throws CompoundNotFoundException {
        ProteinSequence seq = new ProteinSequence(seq1.substring(30, 40));
        Structure result = StructureSequenceMatcher.getSubstructureMatchingProteinSequence(seq, struct1);

        assertEquals("Wrong number of groups", 10, StructureTools.getNrGroups(result));
        assertEquals("Wrong number of chains", 1, result.getChains().size());

        // The query string does not change inside the loop, so fetch it once.
        String expected = seq.getSequenceAsString();
        int i = 0;
        for (Group group : result.getChainByIndex(0).getAtomGroups()) {
            assertTrue("Contains non-amino acid group", group instanceof AminoAcid);
            AminoAcid aa = (AminoAcid) group;
            char c = StructureTools.get1LetterCodeAmino(aa.getPDBName());
            assertEquals("Wrong amino acid", expected.charAt(i), c);
            i++;
        }
    }

    /**
     * Checks that the protein sequence extracted from the structure equals
     * {@link #seq1}, and that the returned index-to-group mapping has one
     * entry per position carrying the expected residue number and amino acid.
     */
    @Test
    public void testGetProteinSequenceForStructure() {
        Map<Integer,Group> groupIndexPos = new HashMap<Integer,Group>();
        ProteinSequence prot = StructureSequenceMatcher.getProteinSequenceForStructure(struct1, groupIndexPos);

        // Test returned sequence
        assertEquals("Unreported residues", seq1.length(), prot.getLength() );
        assertEquals("Modified residues",seq1, prot.toString());

        // Test mapping
        assertEquals("Missing residues in mapping",seq1.length(),groupIndexPos.size());
        for (int res = 0; res < seq1.length(); res++) {
            assertTrue("no mapping for group "+res,groupIndexPos.containsKey(res));
            Group g = groupIndexPos.get(res);

            ResidueNumber resnum = g.getResidueNumber();
            Character aa = StructureTools.get1LetterCodeAmino(g.getPDBName());
            assertEquals("Wrong PDB number at pos "+res,pdbNum1[res],resnum.toString());
            assertEquals("Wrong Amino acid at pos "+res,
                    Character.valueOf(seq1.charAt(res)),aa);
            //System.out.format("%4d %.5s %s\n", res,resnum.toString(),aa.toString());
        }
    }

    /**
     * Matches a modified copy of the first chain's sequence (three {@code X}
     * residues and one gap inserted, ten residues removed) against the
     * structure. Every non-null match must point at the expected residue
     * number and amino acid; a null match is only accepted where the query
     * holds {@code '-'} or {@code 'X'}.
     *
     * @throws StructureException if a matched group cannot be looked up
     * @throws CompoundNotFoundException if the query cannot be turned into a
     *         {@link ProteinSequence}
     */
    @Test
    public void testMatchSequenceToStructure() throws StructureException, CompoundNotFoundException {
        // create modified sequence by removing 10 residues and adding 3
        String sequenceStr =
                //>2PTC:E|PDBID|CHAIN|SEQUENCE
                "IVGGYTCGAN" +
                "XXX"+ //added
                "TVPYQVSLNS" +
                //"GYHFCGGSLI" +
                "NSQWVVSAAH" +
                "-CYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNT"+
                "LNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNM"+
                "FCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN";

        // Expected residue number per query position; null where the query
        // holds an inserted 'X' or a gap.
        String[] correctResidues = new String[] {
                "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
                null, null, null,
                "26", "27", "28", "29", "30", "31", "32", "33", "34", "37",
                //"38", "39", "40", "41", "42", "43", "44", "45", "46", "47",
                "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
                null,"58", "59", "60", "61", "62", "63", "64", "65", "66", "67",
                "69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
                "79", "80", "81", "82", "83", "84", "85", "86", "87", "88",
                "89", "90", "91", "92", "93", "94", "95", "96", "97", "98",
                "99", "100", "101", "102", "103", "104", "105", "106", "107", "108",
                "109", "110", "111", "112", "113", "114", "115", "116", "117", "118",
                "119", "120", "121", "122", "123", "124", "125", "127", "128", "129",
                "130", "132", "133", "134", "135", "136", "137", "138", "139", "140",
                "141", "142", "143", "144", "145", "146", "147", "148", "149", "150",
                "151", "152", "153", "154", "155", "156", "157", "158", "159", "160",
                "161", "162", "163", "164", "165", "166", "167", "168", "169", "170",
                "171", "172", "173", "174", "175", "176", "177", "178", "179", "180",
                "181", "182", "183", "184A", "184", "185", "186", "187", "188A", "188",
                "189", "190", "191", "192", "193", "194", "195", "196", "197", "198",
                "199", "200", "201", "202", "203", "204", "209", "210", "211", "212",
                "213", "214", "215", "216", "217", "219", "220", "221A", "221", "222",
                "223", "224", "225", "226", "227", "228", "229", "230", "231", "232",
                "233", "234", "235", "236", "237", "238", "239", "240", "241", "242",
                "243", "244", "245"
        };

        System.err.println("Note: the following 10 warnings about missing residues are expected.");
        ProteinSequence seq = new ProteinSequence(sequenceStr);
        ResidueNumber[] match = StructureSequenceMatcher.matchSequenceToStructure(seq, struct1);

        assertEquals("Wrong length!",sequenceStr.length(),match.length);
        for (int i = 0; i < sequenceStr.length(); i++) {
            ResidueNumber res = match[i];
            char queryChar = sequenceStr.charAt(i);

            if (res == null) {
                // An unmatched position is only legitimate for a gap or an
                // inserted unknown residue.
                if (queryChar != '-' && queryChar != 'X') {
                    fail("Incorrectly marked as missing residue at pos "+i+" aa "+queryChar);
                }
                continue;
            }

            Group g = struct1.findGroup(res.getChainName(), res.toString());
            assertNotNull(g);
            String aa3 = g.getPDBName();
            assertNotNull(aa3);
            Character aa = StructureTools.get1LetterCodeAmino(aa3);
            assertEquals("Wrong PDB number at position "+i,
                    correctResidues[i] ,g.getResidueNumber().toString());
            assertEquals("Wrong amino acid at position "+i,
                    Character.valueOf(queryChar),aa);
        }
    }

    /**
     * Checks that {@code removeGaps} strips both {@code '-'} and {@code '.'}
     * characters from a protein sequence.
     *
     * @throws CompoundNotFoundException if the gapped string cannot be turned
     *         into a {@link ProteinSequence}
     */
    @Test
    public void testRemoveGaps1() throws CompoundNotFoundException {
        String ungapped = "ACDEFGHIKLMNPQRSTVWY";
        String gapped = "--ACDE-F-GHI..KLM-NPQRSTVWY--";

        ProteinSequence gappedProt = new ProteinSequence(gapped);
        ProteinSequence ungappedProt = StructureSequenceMatcher.removeGaps(gappedProt);

        assertEquals(ungapped,ungappedProt.getSequenceAsString());
    }
}