// FastaStructureParser.java example
//
// Explorer
// biojava-master
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;


/**
 * Reads a protein sequence from a fasta file and attempts to match it to a
 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
 * the output, allowing structural alignments to be read from fasta files.
 *
 * <p>Structures are loaded from an AtomCache. For this to work, the accession
 * for each protein should be parsed from the fasta header line into a form
 * understood by {@link AtomCache#getStructure(String)}.
 *
 * <p>Lowercase letters are sometimes used to specify unaligned residues.
 * This information can be preserved by using a CasePreservingSequenceCreator,
 * which allows the case of residues to be accessed through the
 * {@link ProteinSequence#getUserCollection()} method.
 *
 * <p>Typical usage is to construct the parser, call {@link #process()} once,
 * and then read the results through the getters. All getters return
 * {@code null} until {@link #process()} has completed successfully.
 *
 * @author Spencer Bliven
 *
 */
public class FastaStructureParser {

	// inputs
	private final FastaReader<ProteinSequence, AminoAcidCompound> reader;
	private final AtomCache cache;

	// cached results; all remain null until process() completes successfully
	private String[] accessions;
	private ProteinSequence[] sequences;
	private Structure[] structures;
	private ResidueNumber[][] residues;

	/**
	 * Creates a parser that reads fasta records from an input stream.
	 *
	 * @param is stream containing the fasta data
	 * @param headerParser parser used by the underlying {@link FastaReader}
	 *  for fasta header lines
	 * @param sequenceCreator creator used by the underlying {@link FastaReader}
	 *  to build each sequence
	 * @param cache source from which structures are fetched by accession
	 */
	public FastaStructureParser(InputStream is,
			SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
			SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
			AtomCache cache)
	{
		this(new FastaReader<ProteinSequence, AminoAcidCompound>(
				is, headerParser, sequenceCreator), cache);
	}

	/**
	 * Creates a parser that reads fasta records from a file.
	 *
	 * @param file file containing the fasta data
	 * @param headerParser parser used by the underlying {@link FastaReader}
	 *  for fasta header lines
	 * @param sequenceCreator creator used by the underlying {@link FastaReader}
	 *  to build each sequence
	 * @param cache source from which structures are fetched by accession
	 * @throws FileNotFoundException if the underlying {@link FastaReader}
	 *  reports that the file cannot be found
	 */
	public FastaStructureParser(File file,
			SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
			SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
			AtomCache cache) throws FileNotFoundException
	{
		this(new FastaReader<ProteinSequence, AminoAcidCompound>(
				file, headerParser, sequenceCreator), cache);
	}

	/**
	 * Creates a parser around an existing fasta reader.
	 *
	 * @param reader reader that supplies the protein sequences
	 * @param cache source from which structures are fetched by accession
	 */
	public FastaStructureParser(FastaReader<ProteinSequence, AminoAcidCompound> reader,
			AtomCache cache) {
		this.reader = reader;
		this.cache = cache;
	}


	/**
	 * Parses the fasta file and loads it into memory.
	 *
	 * <p>Information can be subsequently accessed through
	 * {@link #getSequences()},
	 * {@link #getStructures()},
	 * {@link #getResidues()}, and
	 * {@link #getAccessions()}.
	 *
	 * <p>Processing happens only once: after a successful call, further calls
	 * return immediately and the cached values are kept. If an exception is
	 * thrown, no partial results are stored and all getters continue to
	 * return {@code null}. Whether a later retry can succeed depends on
	 * whether the underlying reader can be processed a second time.
	 *
	 * @throws IOException if the fasta reader fails while reading its input
	 * @throws StructureException if a structure cannot be loaded from the
	 *  cache or matched against its sequence
	 */
	public void process() throws IOException, StructureException {
		if (sequences != null) {
			// only process once, then keep the cached values
			return;
		}

		LinkedHashMap<String, ProteinSequence> sequenceMap = reader.process();

		// Build everything in locals so that a failure partway through the
		// loop cannot leave this object flagged as processed while holding
		// partially filled arrays.
		ProteinSequence[] parsedSequences =
				sequenceMap.values().toArray(new ProteinSequence[0]);
		int count = parsedSequences.length;
		String[] parsedAccessions = new String[count];
		Structure[] loadedStructures = new Structure[count];
		ResidueNumber[][] matchedResidues = new ResidueNumber[count][];

		// Match each sequence to a series of PDB residue numbers
		for (int i = 0; i < count; i++) {
			parsedAccessions[i] = parsedSequences[i].getAccession().getID();

			loadedStructures[i] = cache.getStructure(parsedAccessions[i]);

			matchedResidues[i] = StructureSequenceMatcher.matchSequenceToStructure(
					parsedSequences[i], loadedStructures[i]);

			assert( matchedResidues[i].length == parsedSequences[i].getLength());
		}

		// Publish only after every sequence succeeded. 'sequences' doubles as
		// the "already processed" flag, so it is assigned last.
		accessions = parsedAccessions;
		structures = loadedStructures;
		residues = matchedResidues;
		sequences = parsedSequences;
	}


	/**
	 * Gets the protein sequences read from the Fasta file.
	 * Returns null if {@link #process()} has not been called.
	 * The internal array is returned directly, without copying.
	 * @return An array of ProteinSequences from
	 *  parsing the fasta file, or null if process() hasn't been called.
	 */
	public ProteinSequence[] getSequences() {
		return sequences;
	}

	/**
	 * Gets the protein structures mapped from the Fasta file.
	 * Returns null if {@link #process()} has not been called.
	 * The internal array is returned directly, without copying.
	 * @return An array of Structures for each protein
	 *  in the fasta file, or null if process() hasn't been called.
	 */
	public Structure[] getStructures() {
		return structures;
	}

	/**
	 * For each residue in the fasta file, return the ResidueNumber in the
	 * corresponding structure. If the residue cannot be found in the structure,
	 * that entry will be null. This can happen if that residue was not included
	 * in the PDB file (eg disordered residues), if the fasta sequence does not
	 * match the PDB sequence, or if errors occur during the matching process.
	 * The internal array is returned directly, without copying.
	 * @return A 2D array of ResidueNumbers, or null if process() hasn't been called.
	 * @see StructureSequenceMatcher#matchSequenceToStructure(ProteinSequence, Structure)
	 */
	public ResidueNumber[][] getResidues() {
		return residues;
	}

	/**
	 * Gets the protein accessions parsed from the Fasta file headers.
	 * Returns null if {@link #process()} has not been called.
	 * The internal array is returned directly, without copying.
	 * @return An array of accession IDs, one for each protein
	 *  in the fasta file, or null if process() hasn't been called.
	 */
	public String[] getAccessions() {
		return accessions;
	}
}