/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure.io;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeNotNull;
import static org.junit.Assume.assumeTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.Group;
import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureIO;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
import org.junit.Test;
/**
 * Tests the parsing of some difficult mmCIF files, for instance those
 * containing multi-line quoting using ";\n" as delimiters.
 * Feel free to add any other difficult case here.
 *
 * <p>Most tests load an entry by PDB id through {@link StructureIO} or
 * {@link AtomCache}; the remaining ones read mmCIF files from the test
 * classpath.
 *
 * @author duarte_j
 */
public class TestDifficultMmCIFFiles {

    /**
     * The 2KSA mmCIF contains a 5 model NMR structure. The first residue of the sequence is not visible
     * and the models should all begin with Asp indexed as residue #2.
     *
     * @throws IOException if thrown while loading the structure
     * @throws StructureException if thrown while loading the structure
     */
    @Test
    public void test2KSA() throws IOException, StructureException {
        AtomCache cache = new AtomCache();
        FileParsingParameters params = cache.getFileParsingParams();
        params.setParseBioAssembly(true);
        params.setAlignSeqRes(true);
        StructureIO.setAtomCache(cache);
        cache.setUseMmCif(true);

        Structure sCif = StructureIO.getStructure("2KSA");
        assertNotNull(sCif);

        // Check, for every model, that the first chain begins with the correct first residue.
        for (int i = 0; i < sCif.nrModels(); i++) {
            List<Chain> chains = sCif.getModel(i);
            // Chain A first residue should start at ASP 2..
            // but if replaceGroupSeqPos(PdbxPolySeqScheme ppss) is used, this is incorrect and will be 1.
            int firstSeqNum = chains.get(0).getAtomGroup(0).getResidueNumber().getSeqNum().intValue();
            assertEquals("Wrong first residue number in model " + i, 2, firstSeqNum);
        }
    }

    /**
     * Loads 2BI6 from mmCIF and checks that it is reported as an NMR
     * (non-crystallographic) entry with more than one revision record.
     *
     * @throws IOException if thrown while loading the structure
     * @throws StructureException if thrown while loading the structure
     */
    @Test
    public void test2BI6() throws IOException, StructureException {
        // In this entry _struct_conf contains multiline quoting (quoting with "\n;" ) in a non-loop field
        // It seems that at the moment the field is not parsed by the mmCIF parser, anyway let's
        // keep this here if in the future it is
        AtomCache cache = new AtomCache();
        StructureIO.setAtomCache(cache);
        cache.setUseMmCif(true);

        Structure sCif = StructureIO.getStructure("2BI6");
        assertNotNull(sCif);

        // an NMR entry
        assertFalse(sCif.isCrystallographic());
        assertTrue(sCif.isNmr());
        assertTrue(sCif.getPDBHeader().getRevisionRecords().size() > 1);
    }

    /**
     * Loads 1GQO both with and without the mmCIF setting enabled and checks
     * that the two structures agree on the number of bioassemblies and on
     * being crystallographic, and that assemblies 1 and 2 of the mmCIF
     * structure have 60 transforms each.
     *
     * @throws IOException if thrown while loading the structure
     * @throws StructureException if thrown while loading the structure
     */
    @Test
    public void test1GQO() throws IOException, StructureException {
        // In this entry _pdbx_struct_assembly_gen contains multiline quoting (quoting with "\n;" ) in loop field
        AtomCache cache = new AtomCache();
        FileParsingParameters params = cache.getFileParsingParams();
        params.setParseBioAssembly(true);
        StructureIO.setAtomCache(cache);

        cache.setUseMmCif(false);
        Structure sPdb = StructureIO.getStructure("1GQO");
        cache.setUseMmCif(true);
        Structure sCif = StructureIO.getStructure("1GQO");

        // Check both structures before dereferencing either of them.
        assertNotNull(sPdb);
        assertNotNull(sCif);

        Map<Integer, BioAssemblyInfo> mapPdb = sPdb.getPDBHeader().getBioAssemblies();
        Map<Integer, BioAssemblyInfo> mapCif = sCif.getPDBHeader().getBioAssemblies();
        assertNotNull(mapPdb);
        assertNotNull(mapCif);

        assertEquals(mapPdb.size(), mapCif.size());
        assertEquals(60, mapCif.get(1).getTransforms().size());
        assertEquals(60, mapCif.get(2).getTransforms().size());

        // an X-RAY entry
        assertTrue(sPdb.isCrystallographic());
        assertTrue(sCif.isCrystallographic());
        assertFalse(sPdb.isNmr());
        assertFalse(sCif.isNmr());
    }

    /**
     * Loads 2PTC from mmCIF and checks that the first chain is named "E" and
     * that the residue number of its first atom group carries the same chain name.
     *
     * @throws IOException if thrown while loading the structure
     * @throws StructureException if thrown while loading the structure
     */
    @Test
    public void testResidueNumbers() throws IOException, StructureException {
        AtomCache cache = new AtomCache();
        cache.setUseMmCif(true);

        Structure s = cache.getStructure("2PTC");
        assertNotNull(s);

        Chain c = s.getChainByIndex(0);
        assertEquals("Wrong first chain", "E", c.getName());

        Group res = c.getAtomGroup(0);
        ResidueNumber resNum = res.getResidueNumber();
        assertEquals("Groups have wrong chain in resnum", "E", resNum.getChainName());
    }

    /**
     * Reads a classpath mmCIF file whose single chain has the 4-letter id
     * "ABCD" and checks that the chain can be listed and looked up by that id.
     * The test is skipped (not failed) when the resource is not available.
     *
     * @throws IOException if thrown while reading the file
     * @throws StructureException declared for parity with the other tests
     * @throws URISyntaxException if the resource URL cannot be converted to a URI
     */
    @Test
    public void test4letterChains() throws IOException, StructureException, URISyntaxException {
        String filename = "/1hh0_4char.cif.gz";
        URL url = getClass().getResource(filename);
        // NOTE: assumeNotNull takes varargs, so the message string is just one more
        // (never-null) argument; the effective check is on url.
        assumeNotNull("Can't find resource " + filename, url);

        File file = new File(url.toURI());
        assumeTrue(file.exists());

        MMCIFFileReader reader = new MMCIFFileReader();
        Structure s = reader.getStructure(file);
        assertNotNull("Failed to load structure from jar", s);

        List<Chain> chains = s.getChains();
        assertEquals("Wrong number of chains", 1, chains.size());

        Chain chain = chains.get(0);
        assertEquals("Wrong chain ID", "ABCD", chain.getId());

        Chain chain2 = s.getPolyChainByPDB("ABCD");
        assertNotNull(chain2);
        assertEquals(chain, chain2);
    }

    /**
     * This is to test the issue discussed here:
     * http://www.globalphasing.com/startools/
     * Essentially single quote characters (') are valid not only for quoting, but also as parts of
     * data values as long as some rules of the STAR format are followed.
     * For instance Phenix produces mmCIF files with non-quoted strings containing single quote characters.
     *
     * @throws IOException if thrown while reading or parsing the resource
     */
    @Test
    public void testQuotingCornerCase() throws IOException {
        String resource = "/org/biojava/nbio/structure/io/difficult_mmcif_quoting.cif";

        SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
        FileParsingParameters fileParsingParams = new FileParsingParameters();
        fileParsingParams.setAlignSeqRes(true);
        consumer.setFileParsingParameters(fileParsingParams);

        MMcifParser parser = new SimpleMMcifParser();
        parser.addMMcifConsumer(consumer);

        // try-with-resources closes the stream even when parsing fails; a null
        // resource is tolerated by the statement and reported by the assertion below.
        try (InputStream inStream = getClass().getResourceAsStream(resource)) {
            assertNotNull("Can't find resource " + resource, inStream);
            // Explicit charset so decoding does not depend on the platform default.
            parser.parse(new BufferedReader(new InputStreamReader(inStream, StandardCharsets.UTF_8)));
        }

        Structure s = consumer.getStructure();
        assertNotNull(s);
    }

    /**
     * The last category in 2KLI mmCIF file is _pdbx_struct_oper_list, which is needed for
     * the biounit annotation.
     * This test makes sure that the last category in a mmCIF file is not missed because
     * of its position as last one in file.
     *
     * @throws IOException if thrown while loading the structure
     * @throws StructureException if thrown while loading the structure
     */
    @Test
    public void test2KLI() throws IOException, StructureException {
        AtomCache cache = new AtomCache();
        FileParsingParameters params = cache.getFileParsingParams();
        params.setParseBioAssembly(true);
        StructureIO.setAtomCache(cache);
        cache.setUseMmCif(true);

        Structure sCif = StructureIO.getStructure("2KLI");
        assertNotNull(sCif);

        Map<Integer, BioAssemblyInfo> mapCif = sCif.getPDBHeader().getBioAssemblies();
        assertNotNull(mapCif);
    }
}