/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.structure.io;
import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
/**
 * Reads a protein sequence from a fasta file and attempts to match it to a
 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
 * the output, allowing structural alignments to be read from fasta files.
 *
 * <p>Structures are loaded from an AtomCache. For this to work, the accession
 * for each protein should be parsed from the fasta header line into a form
 * understood by {@link AtomCache#getStructure(String)}.
 *
 * <p>Lowercase letters are sometimes used to specify unaligned residues.
 * This information can be preserved by using a CasePreservingSequenceCreator,
 * which allows the case of residues to be accessed through the
 * {@link ProteinSequence#getUserCollection()} method.
 *
 * <p>Typical use: construct the parser, call {@link #process()} once, then
 * read the results through the getters. All getters return {@code null}
 * until {@link #process()} has completed successfully. The getters hand out
 * the internal arrays directly (no defensive copies are made).
 *
 * @author Spencer Bliven
 *
 */
public class FastaStructureParser {

    // inputs
    private final FastaReader<ProteinSequence, AminoAcidCompound> reader;
    private final AtomCache cache;

    // cache processed data; all four stay null until process() succeeds
    private String[] accessions;
    private ProteinSequence[] sequences;
    private Structure[] structures;
    private ResidueNumber[][] residues;

    /**
     * Creates a parser that reads fasta data from a stream.
     *
     * @param is stream containing the fasta data
     * @param headerParser parser used by the underlying {@link FastaReader}
     *  for fasta header lines
     * @param sequenceCreator creator used by the underlying {@link FastaReader}
     *  to build each sequence
     * @param cache source of the structures looked up in {@link #process()}
     */
    public FastaStructureParser(InputStream is,
            SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
            SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
            AtomCache cache)
    {
        this(new FastaReader<ProteinSequence, AminoAcidCompound>(
                is, headerParser, sequenceCreator), cache);
    }

    /**
     * Creates a parser that reads fasta data from a file.
     *
     * @param file file containing the fasta data
     * @param headerParser parser used by the underlying {@link FastaReader}
     *  for fasta header lines
     * @param sequenceCreator creator used by the underlying {@link FastaReader}
     *  to build each sequence
     * @param cache source of the structures looked up in {@link #process()}
     * @throws FileNotFoundException propagated from the {@link FastaReader}
     *  constructor
     */
    public FastaStructureParser(File file,
            SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
            SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
            AtomCache cache) throws FileNotFoundException
    {
        this(new FastaReader<ProteinSequence, AminoAcidCompound>(
                file, headerParser, sequenceCreator), cache);
    }

    /**
     * Creates a parser around an existing fasta reader.
     *
     * @param reader reader that supplies the protein sequences
     * @param cache source of the structures looked up in {@link #process()}
     */
    public FastaStructureParser(FastaReader<ProteinSequence, AminoAcidCompound> reader,
            AtomCache cache) {
        this.reader = reader;
        this.cache = cache;
        this.accessions = null;
        this.sequences = null;
        this.structures = null;
        this.residues = null;
    }

    /**
     * Parses the fasta file and loads it into memory.
     *
     * Information can be subsequently accessed through
     * {@link #getSequences()},
     * {@link #getStructures()},
     * {@link #getResidues()}, and
     * {@link #getAccessions()}.
     *
     * <p>Processing happens at most once: after a successful call, further
     * calls return immediately and the cached results are kept. If an
     * exception is thrown, no results are stored and all getters keep
     * returning {@code null}; a later call will run the processing again
     * (whether the underlying reader can be read a second time is up to the
     * reader, not this class).
     *
     * @throws IOException if an I/O error occurs while parsing the fasta
     *  input or loading a structure
     * @throws StructureException if a structure cannot be loaded or matched
     */
    public void process() throws IOException, StructureException {
        if (sequences != null) {
            // only process once, then keep the cached values
            return;
        }

        LinkedHashMap<String, ProteinSequence> sequenceMap = reader.process();
        ProteinSequence[] parsedSequences =
                sequenceMap.values().toArray(new ProteinSequence[0]);
        int count = parsedSequences.length;

        // Work on locals so that a failure half-way through the loop cannot
        // leave the parser with partially filled arrays that look complete.
        String[] parsedAccessions = new String[count];
        Structure[] loadedStructures = new Structure[count];
        ResidueNumber[][] matchedResidues = new ResidueNumber[count][];

        // Match each sequence to a series of PDB Residue numbers
        for (int i = 0; i < count; i++) {
            parsedAccessions[i] = parsedSequences[i].getAccession().getID();
            loadedStructures[i] = cache.getStructure(parsedAccessions[i]);
            matchedResidues[i] = StructureSequenceMatcher.matchSequenceToStructure(
                    parsedSequences[i], loadedStructures[i]);

            assert matchedResidues[i].length == parsedSequences[i].getLength();
        }

        // Publish everything only after the whole file was handled.
        // 'sequences' goes last because it doubles as the "processed" flag.
        accessions = parsedAccessions;
        structures = loadedStructures;
        residues = matchedResidues;
        sequences = parsedSequences;
    }

    /**
     * Gets the protein sequences read from the Fasta file.
     * Returns null if {@link #process()} has not completed successfully.
     * @return An array of ProteinSequences from
     *  parsing the fasta file, or null if process() hasn't been called.
     */
    public ProteinSequence[] getSequences() {
        return sequences;
    }

    /**
     * Gets the protein structures mapped from the Fasta file.
     * Returns null if {@link #process()} has not completed successfully.
     * @return An array of Structures for each protein
     *  in the fasta file, or null if process() hasn't been called.
     */
    public Structure[] getStructures() {
        return structures;
    }

    /**
     * For each residue in the fasta file, return the ResidueNumber in the
     * corresponding structure. If the residue cannot be found in the structure,
     * that entry will be null. This can happen if that residue was not included
     * in the PDB file (eg disordered residues), if the fasta sequence does not
     * match the PDB sequence, or if errors occur during the matching process.
     * @return A 2D array of ResidueNumbers, or null if process() hasn't been called.
     * @see StructureSequenceMatcher#matchSequenceToStructure(ProteinSequence, Structure)
     */
    public ResidueNumber[][] getResidues() {
        return residues;
    }

    /**
     * Gets the protein accessions mapped from the Fasta file.
     * Returns null if {@link #process()} has not completed successfully.
     * @return An array with the accession ID of each protein
     *  in the fasta file, or null if process() hasn't been called.
     */
    public String[] getAccessions() {
        return accessions;
    }
}