/** * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * * Created on 2013-05-28 * Created by Douglas Myers-Turnbull * * @since 3.0.6 */ package org.biojava.nbio.structure.io; import org.biojava.nbio.core.alignment.template.AlignedSequence; import org.biojava.nbio.core.alignment.template.SequencePair; import org.biojava.nbio.structure.*; import org.biojava.nbio.structure.align.model.AFPChain; import org.biojava.nbio.structure.align.util.AlignmentTools; import org.biojava.nbio.structure.align.xml.AFPChainXMLConverter; import org.biojava.nbio.core.exceptions.CompoundNotFoundException; import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; import org.biojava.nbio.core.sequence.io.CasePreservingProteinSequenceCreator; import org.biojava.nbio.core.sequence.io.FastaReader; import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser; import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; import org.biojava.nbio.core.sequence.template.Sequence; import org.biojava.nbio.core.util.SequenceTools; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.*; /** * A collection of static utilities to convert between {@link AFPChain AFPChains} and {@link FastaSequence FastaSequences}. * * @author dmyersturnbull * @see StructureSequenceMatcher * @see FastaStructureParser * @see SeqRes2AtomAligner */ public class FastaAFPChainConverter { private final static Logger logger = LoggerFactory.getLogger(FastaAFPChainConverter.class); public static AFPChain cpFastaToAfpChain(String first, String second, Structure structure, int cpSite) throws StructureException, CompoundNotFoundException { ProteinSequence s1 = new ProteinSequence(first); s1.setUserCollection(getAlignedUserCollection(first)); ProteinSequence s2 = new ProteinSequence(second); s2.setUserCollection(getAlignedUserCollection(second)); return cpFastaToAfpChain(s1, s2, structure, cpSite); } /** * Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site * {@link cpSite} residues to the right. * * @param fastaFile A FASTA file containing exactly 2 sequences, the first unpermuted and the second permuted * @param cpSite * The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence * are acceptable * @throws IOException * @throws StructureException */ public static AFPChain cpFastaToAfpChain(File fastaFile, Structure structure, int cpSite) throws IOException, StructureException { InputStream inStream = new FileInputStream(fastaFile); SequenceCreatorInterface<AminoAcidCompound> creator = new CasePreservingProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()); SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser = new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(); FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(inStream, headerParser, creator); LinkedHashMap<String, ProteinSequence> sequences = fastaReader.process(); inStream.close(); Iterator<ProteinSequence> iter = sequences.values().iterator(); ProteinSequence first = iter.next(); ProteinSequence second = iter.next(); return cpFastaToAfpChain(first, second, structure, cpSite); } /** * Takes a structure and sequence corresponding to an alignment between a structure or sequence and itself (or even a structure with a sequence), where the result has a circular permutation site * {@link cpSite} residues to the right. * * @param first The unpermuted sequence * @param second The sequence permuted by cpSite * @param cpSite * The number of residues from the beginning of the sequence at which the circular permutation site occurs; can be positive or negative; values greater than the length of the sequence * are acceptable * @throws StructureException */ public static AFPChain cpFastaToAfpChain(ProteinSequence first, ProteinSequence second, Structure structure, int cpSite) throws StructureException { if (structure == null) { throw new IllegalArgumentException("The structure is null"); } if (first == null) { throw new IllegalArgumentException("The sequence is null"); } // we need to find the ungapped CP site int gappedCpShift = 0; int ungappedCpShift = 0; while (ungappedCpShift < Math.abs(cpSite)) { char c; try { if (cpSite <= 0) { c = second.getSequenceAsString().charAt(gappedCpShift); } else { c = second.getSequenceAsString().charAt(first.getLength()-1 - gappedCpShift); } } catch (StringIndexOutOfBoundsException e) { throw new IllegalArgumentException("CP site of " + cpSite + " is wrong"); } if (c != '-') { ungappedCpShift++; } gappedCpShift++; } Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure); Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure); // can't use cloneCAArray because it doesn't set parent group.chain.structure ProteinSequence antipermuted = null; try { antipermuted = new ProteinSequence(SequenceTools.permuteCyclic(second.getSequenceAsString(), gappedCpShift)); } catch (CompoundNotFoundException e) { // this can't happen, the original sequence comes from a ProteinSequence logger.error("Unexpected error while creating protein sequence: {}. This is most likely a bug.",e.getMessage() ); } ResidueNumber[] residues = StructureSequenceMatcher.matchSequenceToStructure(first, structure); ResidueNumber[] antipermutedResidues = StructureSequenceMatcher.matchSequenceToStructure(antipermuted, structure); ResidueNumber[] nonpermutedResidues = new ResidueNumber[antipermutedResidues.length]; SequenceTools.permuteCyclic(antipermutedResidues, nonpermutedResidues, -gappedCpShift); // nullify ResidueNumbers that have a lowercase sequence character if (first.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(first, residues); } if (second.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(second, nonpermutedResidues); } // for (int i = 0; i < residues.length; i++) { // if (residues[i] == null) { // System.out.print("="); // } else { // System.out.print(sequence.getSequenceAsString().charAt(i)); // } // } // System.out.println(); // for (int i = 0; i < residues.length; i++) { // if (nonpermutedResidues[i] == null) { // System.out.print("="); // } else { // System.out.print(second.getSequenceAsString().charAt(i)); // } // } // System.out.println(); return buildAlignment(ca1, ca2, residues, nonpermutedResidues); } /** * Reads the file {@code fastaFile}, expecting exactly two sequences which give a pairwise alignment. Uses this and two structures to create an AFPChain corresponding to the alignment. Uses a * {@link CasePreservingProteinSequenceCreator} and assumes that a residue is aligned if and only if it is given by an uppercase letter. * * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure) * @throws IOException * @throws StructureException */ public static AFPChain fastaFileToAfpChain(File fastaFile, Structure structure1, Structure structure2) throws IOException, StructureException { InputStream inStream = new FileInputStream(fastaFile); SequenceCreatorInterface<AminoAcidCompound> creator = new CasePreservingProteinSequenceCreator( AminoAcidCompoundSet.getAminoAcidCompoundSet()); SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser = new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(); FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>( inStream, headerParser, creator); LinkedHashMap<String, ProteinSequence> sequences = fastaReader.process(); inStream.close(); return fastaToAfpChain(sequences, structure1, structure2); } /** * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The * sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures. * @throws StructureException * @throws CompoundNotFoundException */ public static AFPChain fastaStringToAfpChain(String sequence1, String sequence2, Structure structure1, Structure structure2) throws StructureException, CompoundNotFoundException { ProteinSequence seq1 = new ProteinSequence(sequence1); ProteinSequence seq2 = new ProteinSequence(sequence2); return fastaToAfpChain(seq1, seq2, structure1, structure2); } /** * Uses two sequences each with a corresponding structure to create an AFPChain corresponding to the alignment. Provided only for convenience since FastaReaders return such maps. * * @param sequences * A Map containing exactly two entries from sequence names as Strings to gapped ProteinSequences; the name is ignored * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure) * @throws StructureException */ public static AFPChain fastaToAfpChain(Map<String, ProteinSequence> sequences, Structure structure1, Structure structure2) throws StructureException { if (sequences.size() != 2) { throw new IllegalArgumentException("There must be exactly 2 sequences, but there were " + sequences.size()); } if (structure1 == null || structure2 == null) { throw new IllegalArgumentException("A structure is null"); } List<ProteinSequence> seqs = new ArrayList<ProteinSequence>(); List<String> names = new ArrayList<String>(2); for (Map.Entry<String, ProteinSequence> entry : sequences.entrySet()) { seqs.add(entry.getValue()); names.add(entry.getKey()); } return fastaToAfpChain(seqs.get(0), seqs.get(1), structure1, structure2); } /** * TODO Write comment * @param sequence1 * @param sequence2 * @param structure1 * @param structure2 * @return * @throws StructureException * @throws CompoundNotFoundException */ public static AFPChain fastaToAfpChain(String sequence1, String sequence2, Structure structure1, Structure structure2) throws StructureException, CompoundNotFoundException { ProteinSequence s1 = new ProteinSequence(sequence1); s1.setUserCollection(getAlignedUserCollection(sequence1)); ProteinSequence s2 = new ProteinSequence(sequence2); s2.setUserCollection(getAlignedUserCollection(sequence2)); return fastaToAfpChain(s1, s2, structure1, structure2); } /** * Returns an AFPChain corresponding to the alignment between {@code structure1} and {@code structure2}, which is given by the gapped protein sequences {@code sequence1} and {@code sequence2}. The * sequences need not correspond to the entire structures, since local alignment is performed to match the sequences to structures. Assumes that a residue is aligned if and only if it is given by * an uppercase letter. * @param sequence1 <em>Must</em> have {@link ProteinSequence#getUserCollection()} set to document upper- and lower-case as aligned and unaligned; see {@link #getAlignedUserCollection(String)} * @throws StructureException */ public static AFPChain fastaToAfpChain(ProteinSequence sequence1, ProteinSequence sequence2, Structure structure1, Structure structure2) throws StructureException { if (structure1 == null || structure2 == null) { throw new IllegalArgumentException("A structure is null"); } if (sequence1 == null || sequence2 == null) { throw new IllegalArgumentException("A sequence is null"); } Atom[] ca1 = StructureTools.getRepresentativeAtomArray(structure1); Atom[] ca2 = StructureTools.getRepresentativeAtomArray(structure2); ResidueNumber[] residues1 = StructureSequenceMatcher.matchSequenceToStructure(sequence1, structure1); ResidueNumber[] residues2 = StructureSequenceMatcher.matchSequenceToStructure(sequence2, structure2); // nullify ResidueNumbers that have a lowercase sequence character if (sequence1.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence1, residues1); } if (sequence2.getUserCollection() != null) { CasePreservingProteinSequenceCreator.setLowercaseToNull(sequence2, residues2); } return buildAlignment(ca1, ca2, residues1, residues2); } /** * Provided only for convenience. * * @see #fastaToAfpChain(ProteinSequence, ProteinSequence, Structure, Structure) * @throws StructureException */ public static AFPChain fastaToAfpChain(SequencePair<Sequence<AminoAcidCompound>, AminoAcidCompound> alignment, Structure structure1, Structure structure2) throws StructureException { List<AlignedSequence<Sequence<AminoAcidCompound>, AminoAcidCompound>> seqs = alignment.getAlignedSequences(); StringBuilder sb1 = new StringBuilder(); for (AminoAcidCompound a : seqs.get(0)) { sb1.append(a.getBase()); } try { ProteinSequence seq1 = new ProteinSequence(sb1.toString()); StringBuilder sb2 = new StringBuilder(); for (AminoAcidCompound a : seqs.get(1)) { sb1.append(a.getBase()); } ProteinSequence seq2 = new ProteinSequence(sb2.toString()); LinkedHashMap<String, ProteinSequence> map = new LinkedHashMap<String, ProteinSequence>(); map.put(structure1.getName(), seq1); map.put(structure2.getName(), seq2); return fastaToAfpChain(map, structure1, structure2); } catch (CompoundNotFoundException e) { logger.error("Unexpected error while creating protein sequences: {}. This is most likely a bug.",e.getMessage()); return null; } } /** * Builds an {@link AFPChain} from already-matched arrays of atoms and residues. * * @param ca1 * An array of atoms in the first structure * @param ca2 * An array of atoms in the second structure * @param residues1 * An array of {@link ResidueNumber ResidueNumbers} in the first structure that are aligned. Only null ResidueNumbers are considered to be unaligned * @param residues2 * An array of {@link ResidueNumber ResidueNumbers} in the second structure that are aligned. Only null ResidueNumbers are considered to be unaligned * @throws StructureException */ private static AFPChain buildAlignment(Atom[] ca1, Atom[] ca2, ResidueNumber[] residues1, ResidueNumber[] residues2) throws StructureException { // remove any gap // this includes the ones introduced by the nullifying above List<ResidueNumber> alignedResiduesList1 = new ArrayList<ResidueNumber>(); List<ResidueNumber> alignedResiduesList2 = new ArrayList<ResidueNumber>(); for (int i = 0; i < residues1.length; i++) { if (residues1[i] != null && residues2[i] != null) { alignedResiduesList1.add(residues1[i]); alignedResiduesList2.add(residues2[i]); } } ResidueNumber[] alignedResidues1 = alignedResiduesList1.toArray(new ResidueNumber[alignedResiduesList1.size()]); ResidueNumber[] alignedResidues2 = alignedResiduesList2.toArray(new ResidueNumber[alignedResiduesList2.size()]); AFPChain afpChain = AlignmentTools.createAFPChain(ca1, ca2, alignedResidues1, alignedResidues2); afpChain.setAlgorithmName("unknown"); AlignmentTools.updateSuperposition(afpChain, ca1, ca2); afpChain.setBlockSize(new int[] {afpChain.getNrEQR()}); afpChain.setBlockRmsd(new double[] {afpChain.getTotalRmsdOpt()}); afpChain.setBlockGap(new int[] {afpChain.getGapLen()}); return afpChain; } /** * Takes a protein sequence string with capital and lowercase letters and sets its {@link ProteinSequence#getUserCollection() user collection} to record which letters are uppercase (aligned) and which are lowercase (unaligned). * @param sequence Make sure <em>not</em> to use {@link ProteinSequence#getSequenceAsString()} for this, as it won't preserve upper- and lower-case */ public static List<Object> getAlignedUserCollection(String sequence) { List<Object> aligned = new ArrayList<Object>(sequence.length()); for (char c : sequence.toCharArray()) { aligned.add(Character.isUpperCase(c)); } return aligned; } /** * Prints out the XML representation of an AFPChain from a file containing exactly two FASTA sequences. * * @param args * A String array of fasta-file structure-1-name structure-2-name * @throws StructureException * @throws IOException */ public static void main(String[] args) throws StructureException, IOException { if (args.length != 3) { System.err.println("Usage: FastaAFPChainConverter fasta-file structure-1-name structure-2-name"); return; } File fasta = new File(args[0]); Structure structure1 = StructureTools.getStructure(args[1]); Structure structure2 = StructureTools.getStructure(args[2]); if (structure1 == null) throw new IllegalArgumentException("No structure for " + args[1] + " was found"); if (structure2 == null) throw new IllegalArgumentException("No structure for " + args[2] + " was found"); AFPChain afpChain = fastaFileToAfpChain(fasta, structure1, structure2); String xml = AFPChainXMLConverter.toXML(afpChain); System.out.println(xml); } }