TestHeaderOnly.java example

Explorer
biojava-master
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import static org.junit.Assert.assertNotNull;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.biojava.nbio.structure.Chain;
import org.biojava.nbio.structure.Group;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.StructureIO;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.LocalPDBDirectory.FetchBehavior;
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TestHeaderOnly {

	private static final Logger logger = LoggerFactory.getLogger(TestHeaderOnly.class);


	private final String pdbID = "1REP";

	/**
	 * All groups are expected to be empty.
	 *
	 * @throws StructureException
	 * @throws IOException
	 */
	@Test
	public void testHeaderOnly() throws StructureException, IOException {
		// Get either PDB or mmCIF with a headerOnly = true.

		// Test 1: with PDB
		AtomCache cache = new AtomCache();
		cache.setUseMmCif(false);

		FileParsingParameters params = new FileParsingParameters();
		params.setHeaderOnly(true);
		// params.setAlignSeqRes(true);  // Now this is default.
		cache.setFileParsingParams(params);

		StructureIO.setAtomCache(cache);

		Structure sPDB = StructureIO.getStructure(pdbID);

		Assert.assertEquals(false, doSeqResHaveAtoms(sPDB));
		
		// Test 2: with mmCIF
		cache.setUseMmCif(true);

		Structure sCIF = StructureIO.getStructure(pdbID);
		Assert.assertEquals(false, doSeqResHaveAtoms(sCIF));

		
	}

	/**
	 * Test that with alignSeqRes, expected Group(s) have Atoms, while others
	 * are present with correct sequence but empty.
	 *
	 * @throws StructureException
	 * @throws IOException
	 */
	@Test
	public void testAlignSeqres() throws StructureException, IOException {
		// Get either PDB or mmCIF with a headerOnly = false.

		// Test 1: with PDB
		AtomCache cache = new AtomCache();
		cache.setUseMmCif(false);

		FileParsingParameters params = new FileParsingParameters();
		params.setHeaderOnly(false);
		// params.setAlignSeqRes(true);  // Now this is default.
		cache.setFileParsingParams(params);

		StructureIO.setAtomCache(cache);

		Structure sPDB = StructureIO.getStructure(pdbID);
		Assert.assertEquals(true, doSeqResHaveAtoms(sPDB));
		check1REPChainC(sPDB); // Check particular residues to be aligned.

		// Test 2: with mmCIF
		cache.setUseMmCif(true);

		Structure sCIF = StructureIO.getStructure(pdbID);
		Assert.assertEquals(true, doSeqResHaveAtoms(sCIF));
		check1REPChainC(sCIF); // Check particular residues to be aligned.
	}

	// A better test follows that uses local files.
	// @Test
	public void testSpeed() {
		// Force using a file reader.
		MMCIFFileReader fr = new MMCIFFileReader();
		FileParsingParameters par = new FileParsingParameters();
		//par.setAlignSeqRes(true);
		// par.setHeaderOnly(true);
		par.setHeaderOnly(false);
		fr.setFileParsingParameters(par);
		fr.setFetchBehavior(FetchBehavior.FETCH_FILES);

		Structure s = null;
		long start = System.nanoTime();
		try {
			// Medium sized structure parsed in 0.549s (no header) vs .676s (header) ~ 20% faster
			s = fr.getStructureById("4WZ6");
			// A larger structure could be parsed ~ 4.991s (no header) vs 5.867s (header) ~ 16% faster
			// s = fr.getStructureById("4V60");
		} catch (IOException e) {
			e.printStackTrace();
			System.exit(1);
		}
		long stop = System.nanoTime();
		double diff = (stop - start) / 1000000000.0;
		logger.info(String.format("[%s] Elapsed time: %.3f s", s.getIdentifier(), diff));
	}

	// Test using local files.
	@Test
	public void testSpeed2() throws StructureException, IOException {
		// Test the file parsing speed when the files are already downloaded.

		InputStream cifStream = new GZIPInputStream(this.getClass().getResourceAsStream("/4hhb.cif.gz"));
		InputStream pdbStream = new GZIPInputStream(this.getClass().getResourceAsStream("/4hhb.pdb.gz"));

		assertNotNull(cifStream);

		FileParsingParameters params = new FileParsingParameters();
		params.setHeaderOnly(true);  // Flip this true/false to compare parsing speed.

		logger.info("Testing PDB parsing speed");
		PDBFileParser pdbpars = new PDBFileParser();
		pdbpars.setFileParsingParameters(params);
		//pdbpars.setLoadChemCompInfo(true);
		long start = System.nanoTime();
		Structure s1 = pdbpars.parsePDBFile(pdbStream) ;
		long stop = System.nanoTime();
		double diff = (stop - start) / 1000000000.0;
		logger.info(String.format("[%s] Elapsed time: %.3f s", s1.getIdentifier(), diff));

		MMcifParser mmcifpars = new SimpleMMcifParser();
		SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
		consumer.setFileParsingParameters(params);
		mmcifpars.addMMcifConsumer(consumer);

		logger.info("Testing mmCIF parsing speed");
		start = System.nanoTime();
		mmcifpars.parse(cifStream) ;
		Structure s2 = consumer.getStructure();
		stop = System.nanoTime();
		diff = (stop - start) / 1000000000.0;
		logger.info(String.format("[%s] Elapsed time: %.3f s", s2.getIdentifier(), diff));

		/* Running from an SSD..
		 * PDB .165s (all atom) -> 0.009s (only header)  95% faster.
		 * mmCIF 0.323s (no header) -> 0.175s (only header) 45% faster.
		 */
	}

	/**
	 * Scan through SeqResGroups, returns true if any have Atoms.
	 * @param s
	 * @return
	 */
	public boolean doSeqResHaveAtoms(Structure s) {
		for (int i = 0; i < s.nrModels(); i++) {
			for (Chain c : s.getChains(i)) {
				for (Group g : c.getSeqResGroups()) {
					if (hasAtoms(g)) return true; // Found some Atoms in a Seqres group.
				}
			}
		}

		return false;
	}

	/**
	 * Does a group have any Atom(s)?
	 *
	 * @param g : a group
	 * @return true if has any Atom(s)
	 */
	public boolean hasAtoms(Group g) {
		if (g.getAtoms().size() > 0) return true;
		return false;
	}

	/**
	 * Check that the gapped residues have no atoms, but that ungapped residues
	 * have atoms.
	 *
	 * @param s: Structure to test.
	 */
	public void check1REPChainC(Structure s) throws StructureException {
		String sequence = "MAETAVINHKKRKNSPRIVQSNDLTEAAYSLSRDQKRMLYLFVDQIRK" +
				"SDGTLQEHDGICEIHVAKYAEIFGLTSAEASKDIRQALKSFAGKEVVFYRPEEDAGDE" +
				"KGYESFPWFIKPAHSPSRGLYSVHINPYLIPFFIGLQNRFTQFRLSETKEITNPYAMR" +
				"LYESLCQYRKPDGSGIVSLKIDWIIERYQLPQSYQRMPDFRRRFLQVCVNEINSRTPM" +
				"RLSYIEKKKGRQTTHIVFSFRDITSMTTG";

		boolean [] shouldMatch = new boolean[sequence.length()];
		for (int i = 0; i < sequence.length(); i++) shouldMatch[i] = true;

		// 1-14 is gap
		for (int i = 0; i < 14; i++) shouldMatch[i] = false;

		// 50-55 is gap
		for (int i = 49; i < 55; i++) shouldMatch[i] = false;

		// 98-109 is gap
		for (int i = 97; i < 109; i++) shouldMatch[i] = false;

		// 247-251 is gap
		for (int i = 246; i < 251; i++) shouldMatch[i] = false;

		Chain c = s.getPolyChainByPDB("C");

		List<Group> seqres = c.getSeqResGroups();

		// Check lengths
		Assert.assertEquals(sequence.length(), seqres.size());

		// Check sequences.
		Assert.assertEquals(sequence, c.getSeqResSequence());

		for (int i = 0; i < sequence.length(); i++) {
			Assert.assertEquals(shouldMatch[i], hasAtoms(seqres.get(i)));
		}
	}

	/**
	 *
	 * @param seqres : a list of Group(s)
	 * @return a String representing these Groups
	 */
	public String getSequenceString(List<Group> seqres) {
		StringBuilder sb = new StringBuilder();

		for (Group g : seqres) {
			ChemComp c = g.getChemComp();
			sb.append(c.getOne_letter_code());
		}

		return sb.toString();
	}
}