/**
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created by Spencer Bliven
*
*/
package org.biojava.nbio.structure.io;
import junit.framework.TestCase;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.junit.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* @author Spencer Bliven
*
*/
public class StructureSequenceMatcherTest extends TestCase {
private Structure struct1;
private String[] pdbNum1;
private String seq1;
/**
 * Builds the fixtures shared by every test in this class.
 *
 * <ul>
 * <li>{@code struct1}: the structure returned by {@link AtomCache#getStructure(String)}
 *     for the identifier "2PTC" (presumably fetched or read from a local cache;
 *     verify against AtomCache).</li>
 * <li>{@code pdbNum1}: the expected residue number string for every position of
 *     {@code seq1}, chain E first and chain I second. The numbering has gaps and
 *     insertion codes (e.g. "184A"), and restarts at "1" for chain I.</li>
 * <li>{@code seq1}: the chain E sequence followed directly by the chain I sequence.</li>
 * </ul>
 *
 * @throws IOException if the structure cannot be loaded
 * @throws StructureException if the structure cannot be loaded
 */
@Override
public void setUp() throws IOException, StructureException {
    String name1 = "2PTC";
    AtomCache cache = new AtomCache();
    struct1 = cache.getStructure(name1);

    pdbNum1 = new String[] {
            // chain E
            "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
            "26", "27", "28", "29", "30", "31", "32", "33", "34", "37",
            "38", "39", "40", "41", "42", "43", "44", "45", "46", "47",
            "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
            "58", "59", "60", "61", "62", "63", "64", "65", "66", "67",
            "69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
            "79", "80", "81", "82", "83", "84", "85", "86", "87", "88",
            "89", "90", "91", "92", "93", "94", "95", "96", "97", "98",
            "99", "100", "101", "102", "103", "104", "105", "106", "107",
            "108", "109", "110", "111", "112", "113", "114", "115", "116",
            "117", "118", "119", "120", "121", "122", "123", "124", "125",
            "127", "128", "129", "130", "132", "133", "134", "135", "136",
            "137", "138", "139", "140", "141", "142", "143", "144", "145",
            "146", "147", "148", "149", "150", "151", "152", "153", "154",
            "155", "156", "157", "158", "159", "160", "161", "162", "163",
            "164", "165", "166", "167", "168", "169", "170", "171", "172",
            "173", "174", "175", "176", "177", "178", "179", "180", "181",
            "182", "183", "184A", "184", "185", "186", "187", "188A", "188",
            "189", "190", "191", "192", "193", "194", "195", "196", "197",
            "198", "199", "200", "201", "202", "203", "204", "209", "210",
            "211", "212", "213", "214", "215", "216", "217", "219", "220",
            "221A", "221", "222", "223", "224", "225", "226", "227", "228",
            "229", "230", "231", "232", "233", "234", "235", "236", "237",
            "238", "239", "240", "241", "242", "243", "244", "245",
            // chain I
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
            "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
            "23", "24", "25", "26", "27", "28", "29", "30", "31", "32",
            "33", "34", "35", "36", "37", "38", "39", "40", "41", "42",
            "43", "44", "45", "46", "47", "48", "49", "50", "51", "52",
            "53", "54", "55", "56", "57", "58"
    };

    seq1 =
            //>2PTC:E|PDBID|CHAIN|SEQUENCE
            "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNT"+
            "LNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNM"+
            "FCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"+
            //>2PTC:I|PDBID|CHAIN|SEQUENCE
            "RPDFCLEPPYTGPCKARIIRYFYNAKAGLCQTFVYGGCRAKRNNFKSAEDCMRTCGGA";

    // Fixture sanity check: the tests index pdbNum1 by sequence position, so
    // both tables must have one entry per residue. assertEquals reports both
    // lengths on failure, which a bare assertTrue(a == b) does not.
    assertEquals("Fixture mismatch: seq1 and pdbNum1 must have the same length",
            pdbNum1.length, seq1.length());
}
/**
 * Extracts the substructure matching a ten-residue fragment of the reference
 * sequence (positions 30 to 39 of {@code seq1}) and checks that it consists of
 * a single chain holding exactly those ten amino acids, in sequence order.
 *
 * @throws CompoundNotFoundException if the fragment cannot be turned into a
 *         {@link ProteinSequence}
 */
@Test
public void testSubstructureMatchingProteinSequence() throws CompoundNotFoundException {
    ProteinSequence seq = new ProteinSequence(seq1.substring(30, 40));
    Structure result = StructureSequenceMatcher.getSubstructureMatchingProteinSequence(seq, struct1);

    assertEquals("Wrong number of groups", 10, StructureTools.getNrGroups(result));
    assertEquals("Wrong number of chains", 1, result.getChains().size());

    // The expected residues do not change while iterating, so fetch them once.
    String expected = seq.getSequenceAsString();
    int i = 0;
    for (Group group : result.getChainByIndex(0).getAtomGroups()) {
        assertTrue("Contains non-amino acid group", group instanceof AminoAcid);
        AminoAcid aa = (AminoAcid) group;
        // Keep the result boxed, as the other tests in this class do: assigning
        // it to a primitive char would throw a NullPointerException on a null
        // result (presumably returned for unrecognised residue names) instead
        // of producing the assertion failure below.
        Character c = StructureTools.get1LetterCodeAmino(aa.getPDBName());
        assertEquals("Wrong amino acid", Character.valueOf(expected.charAt(i)), c);
        i++;
    }
}
/**
 * Extracts the protein sequence of the whole structure and verifies both the
 * returned sequence and the position-to-group mapping filled in by the call
 * against the {@code seq1} and {@code pdbNum1} fixtures.
 */
@Test
public void testGetProteinSequenceForStructure() {
    Map<Integer,Group> positionToGroup = new HashMap<Integer,Group>();
    ProteinSequence extracted =
            StructureSequenceMatcher.getProteinSequenceForStructure(struct1, positionToGroup);
    final int expectedLength = seq1.length();

    // The extracted sequence has to reproduce the reference exactly.
    assertEquals("Unreported residues", expectedLength, extracted.getLength());
    assertEquals("Modified residues", seq1, extracted.toString());

    // Every sequence position needs a group carrying the expected
    // residue number and amino acid.
    assertEquals("Missing residues in mapping", expectedLength, positionToGroup.size());
    for (int pos = 0; pos < expectedLength; pos++) {
        assertTrue("no mapping for group "+pos, positionToGroup.containsKey(pos));

        Group mapped = positionToGroup.get(pos);
        ResidueNumber number = mapped.getResidueNumber();
        Character oneLetter = StructureTools.get1LetterCodeAmino(mapped.getPDBName());

        assertEquals("Wrong PDB number at pos "+pos, pdbNum1[pos], number.toString());
        assertEquals("Wrong Amino acid at pos "+pos,
                Character.valueOf(seq1.charAt(pos)), oneLetter);
    }
}
/**
 * Matches an edited copy of the chain E sequence against the structure and
 * checks each matched position for the expected PDB residue number and amino
 * acid. Positions without a match are accepted only where the edited sequence
 * holds a gap ('-') or an unknown residue ('X').
 *
 * @throws StructureException if a matched residue cannot be looked up in the structure
 * @throws CompoundNotFoundException if the edited sequence cannot be turned
 *         into a {@link ProteinSequence}
 */
@Test
public void testMatchSequenceToStructure() throws StructureException, CompoundNotFoundException {
    // Edited version of >2PTC:E|PDBID|CHAIN|SEQUENCE: ten residues are left
    // out, three 'X' residues and a single gap character are inserted.
    String editedSeq =
            "IVGGYTCGAN" +
            "XXX"+ // inserted
            "TVPYQVSLNS" +
            // "GYHFCGGSLI" left out here
            "NSQWVVSAAH" +
            "-CYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNT"+
            "LNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNM"+
            "FCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN";

    // Expected residue number per position of editedSeq; null marks the
    // inserted characters, which have no counterpart in the structure.
    String[] expectedNumbers = new String[] {
            "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
            null, null, null,
            "26", "27", "28", "29", "30", "31", "32", "33", "34", "37",
            // residues "38" through "47" belong to the left-out segment
            "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
            null,"58", "59", "60", "61", "62", "63", "64", "65", "66", "67",
            "69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
            "79", "80", "81", "82", "83", "84", "85", "86", "87", "88",
            "89", "90", "91", "92", "93", "94", "95", "96", "97", "98",
            "99", "100", "101", "102", "103", "104", "105", "106", "107",
            "108", "109", "110", "111", "112", "113", "114", "115", "116",
            "117", "118", "119", "120", "121", "122", "123", "124", "125",
            "127", "128", "129", "130", "132", "133", "134", "135", "136",
            "137", "138", "139", "140", "141", "142", "143", "144", "145",
            "146", "147", "148", "149", "150", "151", "152", "153", "154",
            "155", "156", "157", "158", "159", "160", "161", "162", "163",
            "164", "165", "166", "167", "168", "169", "170", "171", "172",
            "173", "174", "175", "176", "177", "178", "179", "180", "181",
            "182", "183", "184A", "184", "185", "186", "187", "188A", "188",
            "189", "190", "191", "192", "193", "194", "195", "196", "197",
            "198", "199", "200", "201", "202", "203", "204", "209", "210",
            "211", "212", "213", "214", "215", "216", "217", "219", "220",
            "221A", "221", "222", "223", "224", "225", "226", "227", "228",
            "229", "230", "231", "232", "233", "234", "235", "236", "237",
            "238", "239", "240", "241", "242", "243", "244", "245"
    };

    System.err.println("Note: the following 10 warnings about missing residues are expected.");
    ProteinSequence query = new ProteinSequence(editedSeq);
    ResidueNumber[] matched = StructureSequenceMatcher.matchSequenceToStructure(query, struct1);

    assertEquals("Wrong length!", editedSeq.length(), matched.length);

    for (int pos = 0; pos < editedSeq.length(); pos++) {
        ResidueNumber number = matched[pos];
        char queryAa = editedSeq.charAt(pos);

        if (number != null) {
            // Matched position: look the residue up again and compare both
            // its numbering and its amino acid type.
            Group group = struct1.findGroup(number.getChainName(), number.toString());
            assertNotNull(group);
            String threeLetter = group.getPDBName();
            assertNotNull(threeLetter);
            Character oneLetter = StructureTools.get1LetterCodeAmino(threeLetter);
            assertEquals("Wrong PDB number at position "+pos,
                    expectedNumbers[pos], group.getResidueNumber().toString());
            assertEquals("Wrong amino acid at position "+pos,
                    Character.valueOf(queryAa), oneLetter);
            continue;
        }

        // Unmatched position: legitimate only for gaps and unknown residues.
        if (queryAa != '-' && queryAa != 'X') {
            fail("Incorrectly marked as missing residue at pos "+pos+" aa "+queryAa);
        }
    }
}
/**
 * Checks that {@code removeGaps} strips every '-' and '.' character from a
 * gapped sequence, leaving the residues in their original order.
 *
 * @throws CompoundNotFoundException if the gapped string cannot be turned
 *         into a {@link ProteinSequence}
 */
@Test
public void testRemoveGaps1() throws CompoundNotFoundException {
    final String expected = "ACDEFGHIKLMNPQRSTVWY";
    ProteinSequence withGaps = new ProteinSequence("--ACDE-F-GHI..KLM-NPQRSTVWY--");

    String actual = StructureSequenceMatcher.removeGaps(withGaps).getSequenceAsString();

    assertEquals(expected, actual);
}
}