/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ /* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package bio.pih.genoogle.alignment; import bio.pih.genoogle.seq.SymbolList; /* * Created on 05.09.2005 * */ /** * Smith and Waterman developed an efficient dynamic programing algorithm to perform local sequence * alignments, which returns the most conserved region of two sequences (longes common substring * with modifications). This algorithm is performed by the method <code>pairwiseAlignment</code> of * this class. It uses affine gap penalties if and only if the expenses of a delete or insert * operation are unequal to the expenses of gap extension. This uses significantly more memory (four * times as much) and increases the runtime if swaping is performed. * * @author Felipe Albrecht * @author Andreas Dräger * @author Gero Greiner * @since 1.5 */ public class GenoogleSmithWaterman extends GenoogleSequenceAlignment { /** * Expenses for insterts. */ protected int insert; /** * Expenses for deletes. */ protected int delete; /** * Expenses for the extension of a gap. */ protected int gapExt; /** * Expenses for matches. */ protected int match; /** * Expenses for replaces. */ protected int replace; /* * Variables needed for traceback */ int score = Integer.MIN_VALUE; String[] align = new String[2]; String path = null; int identitySize; private static final long serialVersionUID = 2884980510887845616L; /** * Constructs the new SmithWaterman alignment object. Alignments are only performed, if the * alphabet of the given <code>SubstitutionMatrix</code> equals the alpabet of both the query * and the target <code>Sequence</code>. The alignment parameters here are expenses and not * scores as they are in the <code>NeedlemanWunsch</code> object. scores are just given by * multipliing the expenses with <code>(-1)</code>. For example you could use parameters like * "-2, 5, 3, 3, 0". If the expenses for gap extension are equal to the cost of starting a gap * (delete or insert), no affine gap penalties are used, which saves memory. * * @param match * expenses for a match * @param replace * expenses for a replace operation * @param insert * expenses for a gap opening in the query sequence * @param delete * expenses for a gap opening in the target sequence * @param gapExtend * expenses for the extension of a gap which was started earlier. */ public GenoogleSmithWaterman(int match, int replace, int insert, int delete, int gapExtend) { this.insert = insert; this.delete = delete; this.gapExt = gapExtend; this.match = match; this.replace = replace; } int maxI = 0, maxJ = 0, queryStart = 0, targetStart = 0; /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#pairwiseAlignment(bio.pih.genoogle.seq.SymbolList, bio.pih.genoogle.seq.SymbolList) */ public int pairwiseAlignment(SymbolList query, SymbolList subject) { int[][] scoreMatrix = new int[query.getLength() + 1][subject.getLength() + 1]; int builderLength = (int) (Math.max(query.getLength(), subject.getLength()) * 1.20); StringBuilder pathBuilder = new StringBuilder(builderLength); StringBuilder[] alignBuilder = new StringBuilder[2]; alignBuilder[0] = new StringBuilder(builderLength); alignBuilder[1] = new StringBuilder(builderLength); /* * Use affine gap panalties. */ if ((gapExt != delete) || (gapExt != insert)) { affinedGapAlignment(query, subject, scoreMatrix, pathBuilder, alignBuilder); /* * No affine gap penalties to save memory. */ } else { nonAfinedGapAlignment(query, subject, scoreMatrix, pathBuilder, alignBuilder); } this.score = scoreMatrix[maxI][maxJ]; this.path = pathBuilder.reverse().toString(); this.align[0] = alignBuilder[0].reverse().toString(); this.align[1] = alignBuilder[1].reverse().toString(); return this.score; } private void nonAfinedGapAlignment(SymbolList query, SymbolList subject, int[][] scoreMatrix, StringBuilder pathBuilder, StringBuilder[] alignBuilder) { int i; int j; int k = 3; for (i = 0; i <= query.getLength(); i++) scoreMatrix[i][0] = 0; for (j = 0; j <= subject.getLength(); j++) scoreMatrix[0][j] = 0; for (i = 1; i <= query.getLength(); i++) { int from = Math.max(1, i - k); int to = Math.min(subject.getLength(), i + k); for (j = from; j <= to; j++) { scoreMatrix[i][j] = max(0, scoreMatrix[i - 1][j] + delete, scoreMatrix[i][j - 1] + insert, scoreMatrix[i - 1][j - 1] + matchReplace(query, subject, i, j)); if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) { maxI = i; maxJ = j; } } } /* * Here starts the traceback for non-affine gap penalities */ backtrace(query, subject, scoreMatrix, pathBuilder, alignBuilder); } private void affinedGapAlignment(SymbolList query, SymbolList subject, int[][] scoreMatrix, StringBuilder pathBuilder, StringBuilder[] alignBuilder) { int i; int j; int[][] E = new int[query.getLength() + 1][subject.getLength() + 1]; // Inserts int[][] F = new int[query.getLength() + 1][subject.getLength() + 1]; // Deletes scoreMatrix[0][0] = 0; E[0][0] = F[0][0] = Integer.MIN_VALUE; for (i = 1; i <= query.getLength(); i++) { scoreMatrix[i][0] = F[i][0] = 0; E[i][0] = Integer.MIN_VALUE; } for (j = 1; j <= subject.getLength(); j++) { scoreMatrix[0][j] = E[0][j] = 0; F[0][j] = Integer.MIN_VALUE; } for (i = 1; i <= query.getLength(); i++) for (j = 1; j <= subject.getLength(); j++) { E[i][j] = Math.max(E[i][j - 1], scoreMatrix[i][j - 1] + insert) + gapExt; F[i][j] = Math.max(F[i - 1][j], scoreMatrix[i - 1][j] + delete) + gapExt; scoreMatrix[i][j] = max(0, E[i][j], F[i][j], scoreMatrix[i - 1][j - 1] + matchReplace(query, subject, i, j)); if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) { maxI = i; maxJ = j; } } /* * Here starts the traceback for affine gap penalities */ boolean[] gap_extend = { false, false }; j = maxJ; for (i = maxI; i > 0;) { do { // only Deletes or Inserts or Replaces possible. // That's not what we want to have. if (scoreMatrix[i][j] == 0) { queryStart = i; targetStart = j; i = j = 0; // Match/Replace } else if ((Math.abs(scoreMatrix[i][j] - (scoreMatrix[i - 1][j - 1] + matchReplace(query, subject, i, j))) < 0.0001) && !(gap_extend[0] || gap_extend[1])) { if (query.symbolAt(i) == subject.symbolAt(j)) { pathBuilder.append('|'); identitySize++; } else { pathBuilder.append(' '); } alignBuilder[0].append(query.symbolAt(i--)); alignBuilder[1].append(subject.symbolAt(j--)); // Insert || finish gap if extended gap is // opened } else if (scoreMatrix[i][j] == E[i][j] || gap_extend[0]) { // check if gap has been extended or freshly // opened gap_extend[0] = Math.abs(E[i][j] - (scoreMatrix[i][j - 1] + insert + gapExt)) > 0.0001; alignBuilder[0].append('-'); alignBuilder[1].append(subject.symbolAt(j--)); pathBuilder.append(' '); // Delete || finish gap if extended gap is // opened } else { // check if gap has been extended or freshly // opened gap_extend[1] = Math.abs(F[i][j] - (scoreMatrix[i - 1][j] + delete + gapExt)) > 0.0001; alignBuilder[0].append(query.symbolAt(i--)); alignBuilder[1].append('-'); pathBuilder.append(' '); } } while (j > 0); } } /** * Backtrace phase for the Smith-Waterman algorithm. */ private void backtrace(SymbolList query, SymbolList subject, int[][] scoreMatrix, StringBuilder pathBuilder, StringBuilder[] alignBuilder) { int i; int j; j = maxJ; for (i = maxI; i > 0;) { do { // only Deletes or Inserts or Replaces possible. // That's not what // we want to have. if (scoreMatrix[i][j] == 0) { queryStart = i; targetStart = j; i = j = 0; // Match/Replace } else if (Math.abs(scoreMatrix[i][j] - (scoreMatrix[i - 1][j - 1] + matchReplace(query, subject, i, j))) < 0.0001) { if (query.symbolAt(i) == subject.symbolAt(j)) { pathBuilder.append('|'); identitySize++; } else { pathBuilder.append(' '); } alignBuilder[0].append(query.symbolAt(i--)); alignBuilder[1].append(subject.symbolAt(j--)); // Insert } else if (Math.abs(scoreMatrix[i][j] - (scoreMatrix[i][j - 1] + insert)) < 0.0001) { alignBuilder[0].append('-'); alignBuilder[1].append(subject.symbolAt(j--)); pathBuilder.append(' '); // Delete } else { alignBuilder[0].append(query.symbolAt(i--)); alignBuilder[1].append('-'); pathBuilder.append(' '); } } while (j > 0); } } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getQueryAligned() */ @Override public String getQueryAligned() { return align[0]; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getTargetAligned() */ @Override public String getTargetAligned() { return align[1]; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getPath() */ @Override public String getPath() { return path; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getQueryStart() */ @Override public int getQueryStart() { return queryStart + 1; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getQueryEnd() */ @Override public int getQueryEnd() { return maxI; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getTargetStart() */ @Override public int getTargetStart() { return targetStart + 1; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getTargetEnd() */ @Override public int getTargetEnd() { return maxJ; } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getScore() */ @Override public int getScore() { return score; } /** * This method computes the scores for the substution of the i-th symbol of query by the j-th * symbol of subject. * * @param query * The query sequence * @param subject * The target sequence * @param i * The position of the symbol under consideration within the query sequence (starting * from one) * @param j * The position of the symbol under consideration within the target sequence * @return The score for the given substitution. */ private int matchReplace(SymbolList query, SymbolList subject, int i, int j) { // Symbols are singletons, so it is possible to use '=='. if (query.symbolAt(i) == subject.symbolAt(j)) { return match; } else { return replace; } } /* (non-Javadoc) * @see bio.pih.genoogle.alignment.X#getIdentitySize() */ @Override public int getIdentitySize() { return identitySize; } }