// TestDifficultMmCIFFiles.java example
//
// Explorer
// biojava-master
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeNotNull;
import static org.junit.Assume.assumeTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.Map;

import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.Group;
import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureIO;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
import org.junit.Test;

/**
 * Tests the parsing of some difficult mmCIF files, for instance those
 * containing multi-line quoting that uses ";\n" as delimiters.
 * Feel free to add any other difficult case here.
 *
 * @author duarte_j
 */
public class TestDifficultMmCIFFiles {

	/**
	 * The 2KSA mmCIF contains a 5 model NMR structure.  The first residue of the sequence is not visible
	 * and the models should all begin with Asp indexed as residue #2.
	 *
	 * @throws IOException if the structure cannot be read
	 * @throws StructureException if the structure cannot be parsed
	 */
	@Test
	public void test2KSA() throws IOException, StructureException {
		AtomCache cache = new AtomCache();

		FileParsingParameters params = cache.getFileParsingParams();
		params.setParseBioAssembly(true);
		params.setAlignSeqRes(true);

		// Register the cache only once, after its parsing parameters have been configured.
		StructureIO.setAtomCache(cache);

		cache.setUseMmCif(true);
		Structure sCif = StructureIO.getStructure("2KSA");

		assertNotNull(sCif);

		// Check every model: each one must begin with the correct first residue.
		for (int i = 0; i < sCif.nrModels(); i++) {
			List<Chain> chains = sCif.getModel(i);

			// Chain A first residue should start at ASP 2..
			// but if replaceGroupSeqPos(PdbxPolySeqScheme ppss) is used, this is incorrect and will be 1.
			assertEquals(2, chains.get(0).getAtomGroup(0).getResidueNumber().getSeqNum().intValue());
		}
	}

	/**
	 * In 2BI6, _struct_conf contains multiline quoting (quoting with "\n;") in a non-loop field.
	 * It seems that at the moment the field is not parsed by the mmCIF parser; the test is kept
	 * anyway in case it is parsed in the future.
	 *
	 * @throws IOException if the structure cannot be read
	 * @throws StructureException if the structure cannot be parsed
	 */
	@Test
	public void test2BI6() throws IOException, StructureException {
		AtomCache cache = new AtomCache();

		StructureIO.setAtomCache(cache);

		cache.setUseMmCif(true);
		Structure sCif = StructureIO.getStructure("2BI6");

		assertNotNull(sCif);

		// an NMR entry
		assertFalse(sCif.isCrystallographic());
		assertTrue(sCif.isNmr());

		assertTrue(sCif.getPDBHeader().getRevisionRecords().size() > 1);
	}

	/**
	 * In 1GQO, _pdbx_struct_assembly_gen contains multiline quoting (quoting with "\n;") in a
	 * loop field. The bioassembly information read from the PDB and the mmCIF formats is compared.
	 *
	 * @throws IOException if the structure cannot be read
	 * @throws StructureException if the structure cannot be parsed
	 */
	@Test
	public void test1GQO() throws IOException, StructureException {
		AtomCache cache = new AtomCache();

		FileParsingParameters params = cache.getFileParsingParams();
		params.setParseBioAssembly(true);

		// Register the cache only once, after its parsing parameters have been configured.
		StructureIO.setAtomCache(cache);

		cache.setUseMmCif(false);
		Structure sPdb = StructureIO.getStructure("1GQO");

		cache.setUseMmCif(true);
		Structure sCif = StructureIO.getStructure("1GQO");

		// Both structures are dereferenced below, so fail clearly if either is missing.
		assertNotNull(sPdb);
		assertNotNull(sCif);

		Map<Integer,BioAssemblyInfo> mapPdb = sPdb.getPDBHeader().getBioAssemblies();
		Map<Integer,BioAssemblyInfo> mapCif = sCif.getPDBHeader().getBioAssemblies();

		assertNotNull(mapPdb);
		assertNotNull(mapCif);

		assertEquals(mapPdb.size(), mapCif.size());

		assertEquals(60, mapCif.get(1).getTransforms().size());
		assertEquals(60, mapCif.get(2).getTransforms().size());

		// an X-RAY entry
		assertTrue(sPdb.isCrystallographic());
		assertTrue(sCif.isCrystallographic());

		assertFalse(sPdb.isNmr());
		assertFalse(sCif.isNmr());
	}

	/**
	 * Checks that the first chain of 2PTC, and the residue number of its first atom group,
	 * both carry the chain name "E" when the entry is read from mmCIF.
	 *
	 * @throws IOException if the structure cannot be read
	 * @throws StructureException if the structure cannot be parsed
	 */
	@Test
	public void testResidueNumbers() throws IOException, StructureException {
		AtomCache cache = new AtomCache();
		cache.setUseMmCif(true);

		Structure s = cache.getStructure("2PTC");
		Chain c = s.getChainByIndex(0);
		// JUnit's signature is (message, expected, actual).
		assertEquals("Wrong first chain", "E", c.getName());

		Group res = c.getAtomGroup(0);
		ResidueNumber resNum = res.getResidueNumber();

		assertEquals("Groups have wrong chain in resnum", "E", resNum.getChainName());
	}

	/**
	 * Reads a local mmCIF file whose only chain has the 4-letter identifier "ABCD" and checks
	 * that the chain can be retrieved both by position and by that identifier.
	 * The test is skipped when the resource file is not available.
	 *
	 * @throws IOException if the file cannot be read
	 * @throws StructureException if the chain lookup fails
	 * @throws URISyntaxException if the resource URL cannot be converted to a URI
	 */
	@Test
	public void test4letterChains() throws IOException, StructureException, URISyntaxException {
		String filename = "/1hh0_4char.cif.gz";
		URL url = getClass().getResource(filename);
		// assumeNotNull(Object...) has no message parameter, so use assumeTrue to report one.
		assumeTrue("Can't find resource " + filename, url != null);

		File file = new File(url.toURI());
		assumeTrue(file.exists());

		MMCIFFileReader reader = new MMCIFFileReader();
		Structure s = reader.getStructure(file);

		assertNotNull("Failed to load structure from jar", s);

		List<Chain> chains = s.getChains();
		assertEquals("Wrong number of chains", 1, chains.size());

		Chain chain = chains.get(0);
		assertEquals("Wrong chain ID", "ABCD", chain.getId());

		Chain chain2 = s.getPolyChainByPDB("ABCD");
		assertNotNull(chain2);
		assertEquals(chain, chain2);
	}

	/**
	 * This is to test the issue discussed here:
	 * http://www.globalphasing.com/startools/
	 * Essentially single quote characters (') are valid not only for quoting, but also as parts of
	 * data values as long as some rules of the STAR format are followed.
	 * For instance Phenix produces mmCIF files with non-quoted strings containing single quote characters
	 *
	 * @throws IOException if the resource cannot be read
	 */
	@Test
	public void testQuotingCornerCase () throws IOException {
		String resource = "/org/biojava/nbio/structure/io/difficult_mmcif_quoting.cif";

		SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();

		FileParsingParameters fileParsingParams = new FileParsingParameters();
		fileParsingParams.setAlignSeqRes(true);
		consumer.setFileParsingParameters(fileParsingParams);

		MMcifParser parser = new SimpleMMcifParser();
		parser.addMMcifConsumer(consumer);

		// try-with-resources closes the stream even when parsing fails.
		try (InputStream inStream = getClass().getResourceAsStream(resource)) {
			// Fail with a clear message instead of a NullPointerException if the resource is missing.
			assertNotNull("Can't find resource " + resource, inStream);

			parser.parse(new BufferedReader(new InputStreamReader(inStream)));
		}

		Structure s = consumer.getStructure();

		assertNotNull(s);
	}

	/**
	 * The last category in 2KLI mmCIF file is _pdbx_struct_oper_list, which is needed for
	 * the biounit annotation.
	 * This test makes sure that the last category in a mmCIF file is not missed because
	 * of its position as last one in file.
	 *
	 * @throws IOException if the structure cannot be read
	 * @throws StructureException if the structure cannot be parsed
	 */
	@Test
	public void test2KLI() throws IOException, StructureException {
		AtomCache cache = new AtomCache();

		FileParsingParameters params = cache.getFileParsingParams();
		params.setParseBioAssembly(true);

		// Register the cache only once, after its parsing parameters have been configured.
		StructureIO.setAtomCache(cache);

		cache.setUseMmCif(true);
		Structure sCif = StructureIO.getStructure("2KLI");

		assertNotNull(sCif);

		Map<Integer,BioAssemblyInfo> mapCif = sCif.getPDBHeader().getBioAssemblies();

		assertNotNull(mapCif);
	}
}