StructureSequenceMatcher.java example

Explorer
biojava-master
/**
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created by Spencer Bliven
 *
 */
package org.biojava.nbio.structure.io;

import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix;
import org.biojava.nbio.core.alignment.template.AlignedSequence;
import org.biojava.nbio.core.alignment.template.SequencePair;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStreamReader;
import java.util.*;


/**
 * A utility class with methods for matching ProteinSequences with
 * Structures.
 * @author Spencer Bliven
 *
 */
public class StructureSequenceMatcher {

	private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class);

	/**
	 * Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in
	 * {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty.
	 * The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not.
	 * @param sequence The input protein sequence
	 * @param wholeStructure The structure from which to take a substructure
	 * @return The resulting structure
	 * @throws StructureException
	 * @see {@link #matchSequenceToStructure(ProteinSequence, Structure)}
	 */
	public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) {
		ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure);
		Structure structure = wholeStructure.clone();
		structure.setChains(new ArrayList<>());
//		structure.getHetGroups().clear();
		Chain currentChain = null;
		for (ResidueNumber rn : rns) {
			if (rn == null) continue;
			Group group; // note that we don't clone
			try {
				group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn);
			} catch (StructureException e) {
				throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e);
			}
			Chain chain = new ChainImpl();
			chain.setName(group.getChain().getName());
			chain.setId(group.getChain().getId());
			if (currentChain == null || !currentChain.getId().equals(chain.getId())) {
				structure.addChain(chain);
				chain.setEntityInfo(group.getChain().getEntityInfo());
				chain.setStructure(structure);
				chain.setSwissprotId(group.getChain().getSwissprotId());
				chain.setId(group.getChain().getId());
				chain.setName(group.getChain().getName());
				currentChain = chain;
			}
			currentChain.addGroup(group);
		}
		return structure;
	}

	/**
	 * Generates a ProteinSequence corresponding to the sequence of struct,
	 * and maintains a mapping from the sequence back to the original groups.
	 *
	 * Chains are appended to one another. 'X' is used for heteroatoms.
	 *
	 * @param struct Input structure
	 * @param groupIndexPosition An empty map, which will be populated with
	 *  (residue index in returned ProteinSequence) -> (Group within struct)
	 * @return A ProteinSequence with the full sequence of struct. Chains are
	 *  concatenated in the same order as the input structures
	 *
	 * @see {@link SeqRes2AtomAligner#getFullAtomSequence(List, Map)}, which
	 * 	does the heavy lifting.
	 *
	 */
	public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map<Integer,Group> groupIndexPosition ) {

		if( groupIndexPosition != null) {
			groupIndexPosition.clear();
		}

		StringBuilder seqStr = new StringBuilder();

		for(Chain chain : struct.getChains()) {
			List<Group> groups = chain.getAtomGroups();
			Map<Integer,Integer> chainIndexPosition = new HashMap<Integer, Integer>();
			int prevLen = seqStr.length();

			// get the sequence for this chain
			String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false);
			seqStr.append(chainSeq);


			// fix up the position to include previous chains, and map the value back to a Group
			for(Integer seqIndex : chainIndexPosition.keySet()) {
				Integer groupIndex = chainIndexPosition.get(seqIndex);
				groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex));
			}
		}

		ProteinSequence s = null;
		try {
			s = new ProteinSequence(seqStr.toString());
		} catch (CompoundNotFoundException e) {
			// I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24
			// we can log an error if it does, it would mean there's a bad bug somewhere
			logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage());
		}

		return s;
	}

	/**
	 * Given a sequence and the corresponding Structure, get the ResidueNumber
	 * for each residue in the sequence.
	 *
	 * <p>Smith-Waterman alignment is used to match the sequences. Residues
	 * in the sequence but not the structure or mismatched between sequence
	 * and structure will have a null atom, while
	 * residues in the structure but not the sequence are ignored with a warning.
	 * @param seq The protein sequence. Should match the sequence of struct very
	 * 	closely.
	 * @param struct The corresponding protein structure
	 * @return A list of ResidueNumbers of the same length as seq, containing
	 *  either the corresponding residue or null.
	 */
	public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) {

		//1. Create ProteinSequence for struct while remembering to which group each residue corresponds
		Map<Integer,Group> atomIndexPosition   = new HashMap<Integer, Group>();

		ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition);

		// TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which)
		//2. Run Smith-Waterman to get the alignment
		// Identity substitution matrix with +1 for match, -1 for mismatch
		// TODO
		SubstitutionMatrix<AminoAcidCompound> matrix =
			new SimpleSubstitutionMatrix<AminoAcidCompound>(
					AminoAcidCompoundSet.getAminoAcidCompoundSet(),
					(short)1, (short)-1 );
		matrix = new SimpleSubstitutionMatrix<AminoAcidCompound>(
				AminoAcidCompoundSet.getAminoAcidCompoundSet(),
				new InputStreamReader(
						SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")),
		"blosum100");
		SequencePair<ProteinSequence, AminoAcidCompound> pair =
			Alignments.getPairwiseAlignment(seq, structSeq,
					PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix);

		//System.out.print(pair.toString());

		//3. Convert the alignment back to Atoms
		AlignedSequence<ProteinSequence,AminoAcidCompound> alignedSeq = pair.getQuery();
		AlignedSequence<ProteinSequence,AminoAcidCompound> alignedStruct = pair.getTarget();


		assert(alignedSeq.getLength() == alignedStruct.getLength());

//		System.out.println(pair.toString(80));
//		System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(),
//				alignedSeq.getLength()-alignedSeq.getNumGaps(),
//				alignedStruct.getLength()-alignedStruct.getNumGaps());

		ResidueNumber[] ca = new ResidueNumber[seq.getLength()];

		for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed
			//skip missing residues from sequence. These probably represent alignment errors
			if(alignedSeq.isGap(pos)) {
				int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;
				assert(structIndex > 0);//should be defined since seq gap

				Group g = atomIndexPosition.get(structIndex);

				logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.",
						g.getChainId(),
						g.getResidueNumber().toString(),
						g.getChain().getStructure().getPDBCode() );
				continue;
			}

			if(! alignedStruct.isGap(pos) ) {
				int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed
				int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed
				Group g = atomIndexPosition.get(structIndex);

				assert(0<=seqIndex && seqIndex < ca.length);

				ca[seqIndex] = g.getResidueNumber(); //remains null for gaps
			}
		}
		return ca;
	}


	/**
	 * Removes all gaps ('-') from a protein sequence
	 * @param gapped
	 * @return
	 */
	public static ProteinSequence removeGaps(ProteinSequence gapped) {
		final String[] gapStrings = {"-","."};

		StringBuilder seq = new StringBuilder();

		CompoundSet<AminoAcidCompound> aaSet = gapped.getCompoundSet();
		AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length];

		for(int i=0;i<gapStrings.length;i++) {
			gaps[i] = aaSet.getCompoundForString(gapStrings[i]);
		}

		for(int i=1; i<=gapped.getLength();i++) { //1-indexed
			AminoAcidCompound aa = gapped.getCompoundAt(i);
			boolean isGap = false;
			for(AminoAcidCompound gap : gaps) {
				if( aa.equals(gap)) {
					isGap = true;
					break;
				}
			}
			if(!isGap) {
				seq.append(aa.getShortName());
			}
		}

		ProteinSequence ungapped = null;
		try {
			ungapped = new ProteinSequence(seq.toString());
		} catch (CompoundNotFoundException e) {
			// this can't happen, if it does there's a bug somewhere
			logger.error("Could not create ungapped protein sequence, found unknown compounds: {}. This is most likely a bug.", e.getMessage());
		}

		return ungapped;
	}

	/**
	 * Creates a new list consisting of all columns of gapped where no row
	 * contained a null value.
	 *
	 * Here, "row" refers to the first index and "column" to the second, eg
	 * gapped.get(row).get(column)
	 * @param gapped A rectangular matrix containing null to mark gaps
	 * @return A new List without columns containing nulls
	 */
	public static <T> T[][] removeGaps(final T[][] gapped) {
		if(gapped == null ) return null;
		if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length);

		final int nProts = gapped.length;
		final int protLen = gapped[0].length; // length of gapped proteins

		// Verify that input is rectangular
		for(int i=0;i<nProts;i++) {
			if(gapped[i].length != protLen) {
				throw new IllegalArgumentException(String.format(
						"Expected a rectangular array, but row 0 has %d elements " +
						"while row %d has %d.", protLen,i,gapped[i].length));

			}
		}

		// determine where gaps exist in any structures
		boolean[] isGap = new boolean[protLen];
		int gaps = 0;
		for(int j=0;j<protLen;j++) {
			for(int i=0;i<nProts;i++) {
				if(gapped[i][j] == null ) {
					isGap[j] = true;
					gaps++;
					break; //go to next position
				}
			}
		}

		// Create ungapped array
		T[][] ungapped = Arrays.copyOf(gapped,nProts);
		final int ungappedLen = protLen-gaps;
		for(int i=0;i<nProts;i++) {
			ungapped[i] = Arrays.copyOf(gapped[i],ungappedLen);
			int k = 0;
			for(int j=0;j<protLen;j++) {
				if(!isGap[j]) { //skip gaps
					assert(gapped[i][j] != null);
					ungapped[i][k] = gapped[i][j];
					k++;
				}
			}
			assert(k == ungappedLen);
		}

		return ungapped;
	}
}