/* * $Id: SmithWatermanGotoh.java,v 1.10 2006/02/09 13:27:36 ahmed Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package jaligner; import jaligner.matrix.Matrix; import java.util.logging.Logger; /** * An implementation of the Smith-Waterman algorithm with Gotoh's improvement * for biological local pairwise sequence alignment. * * <strong>Recursive definition:</strong> * <ul> * <li> * <strong>Base conditions:</strong> * <ul> * <li><code>V(0, 0) = 0</code></li> * <li><code>V(i, 0) = E(i, 0) = W<sub>g</sub> + iW<sub>s</sub></code></li> * <li><code>V(0, j) = F(0, j) = W<sub>g</sub> + jW<sub>s</sub></code></li> * </ul> * </li> * <li> * <strong>Recurrence relation:</strong> * <ul> * <li><code>V(i, j) = max{E(i, j), F(i, j), G(i, j)}</code>, where:</li> * <li><code>G(i, j) = V(i - 1, j - 1) + similarity(S<sub>i</sub>, T<sub>j</sub>)</code></li> * <li><code>E(i, j) = max{E(i, j - 1) + W<sub>s</sub>, V(i, j - 1) + W<sub>g</sub> + W<sub>s</sub>}</code></li> * <li><code>F(i, j) = max{F(i - 1, j) + W<sub>s</sub>, V(i - 1, j) + W<sub>g</sub> + W<sub>s</sub>}</code></li> * </ul> * </ul> * * @author Ahmed Moustafa (ahmed@users.sf.net) */ public final class SmithWatermanGotoh { /** * Hidden constructor */ private SmithWatermanGotoh() { super(); } /** * Logger */ private static final Logger logger = Logger .getLogger(SmithWatermanGotoh.class.getName()); /** * Aligns two sequences by Smith-Waterman (local) * * @param s1 * sequene #1 ({@link Sequence}) * @param s2 * sequene #2 ({@link Sequence}) * @param matrix * scoring matrix ({@link Matrix}) * @param o * open gap penalty * @param e * extend gap penalty * @return alignment object contains the two aligned sequences, the * alignment score and alignment statistics * @see Sequence * @see Matrix */ public static Alignment align(Sequence s1, Sequence s2, Matrix matrix, float o, float e) { //Sequence s1=new Sequence(s3); //Sequence s2=new Sequence(s4); //logger.info("Started..."); long start = System.currentTimeMillis(); float[][] scores = matrix.getScores(); int m = s1.length() + 1; int n = s2.length() + 1; byte[] pointers = new byte[m * n]; // Initializes the boundaries of the traceback matrix to STOP. for (int i = 0, k = 0; i < m; i++, k += n) { pointers[k] = Directions.STOP; } for (int j = 1; j < n; j++) { pointers[j] = Directions.STOP; } short[] sizesOfVerticalGaps = new short[m * n]; short[] sizesOfHorizontalGaps = new short[m * n]; for (int i = 0, k = 0; i < m; i++, k += n) { for (int j = 0; j < n; j++) { sizesOfVerticalGaps[k + j] = sizesOfHorizontalGaps[k + j] = 1; } } Cell cell = SmithWatermanGotoh.construct(s1, s2, scores, o, e, pointers, sizesOfVerticalGaps, sizesOfHorizontalGaps); Alignment alignment = SmithWatermanGotoh.traceback(s1, s2, matrix, pointers, cell, sizesOfVerticalGaps, sizesOfHorizontalGaps); alignment.setOriginalSequence1(s1); alignment.setOriginalSequence2(s2); alignment.setMatrix(matrix); alignment.setOpen(o); alignment.setExtend(e); if (s1.getId() != null) { alignment.setName1(s1.getId()); } if (s2.getId() != null) { alignment.setName2(s2.getId()); } //logger.info("Finished in " + (System.currentTimeMillis() - start) //+ " milliseconds"); return alignment; } /** * Constructs directions matrix for the traceback * * @param s1 * sequence #1 * @param s2 * sequence #2 * @param matrix * scoring matrix * @param o * open gap penalty * @param e * extend gap penalty * @return The cell where the traceback starts. */ private static Cell construct(Sequence s1, Sequence s2, float[][] matrix, float o, float e, byte[] pointers, short[] sizesOfVerticalGaps, short[] sizesOfHorizontalGaps) { //logger.info("Started..."); long start = System.currentTimeMillis(); char[] a1 = s1.toArray(); char[] a2 = s2.toArray(); int m = s1.length() + 1; int n = s2.length() + 1; float f; // score of alignment x1...xi to y1...yi if xi aligns to yi float[] g = new float[n]; // score if xi aligns to a gap after yi float h; // score if yi aligns to a gap after xi float[] v = new float[n]; // best score of alignment x1...xi to // y1...yi float vDiagonal; g[0] = Float.NEGATIVE_INFINITY; h = Float.NEGATIVE_INFINITY; v[0] = 0; for (int j = 1; j < n; j++) { g[j] = Float.NEGATIVE_INFINITY; v[j] = 0; } float similarityScore, g1, g2, h1, h2; Cell cell = new Cell(); for (int i = 1, k = n; i < m; i++, k += n) { h = Float.NEGATIVE_INFINITY; vDiagonal = v[0]; for (int j = 1, l = k + 1; j < n; j++, l++) { similarityScore = matrix[a1[i - 1]][a2[j - 1]]; // Fill the matrices f = vDiagonal + similarityScore; g1 = g[j] - e; g2 = v[j] - o; if (g1 > g2) { g[j] = g1; sizesOfVerticalGaps[l] = (short) (sizesOfVerticalGaps[l - n] + 1); } else { g[j] = g2; } h1 = h - e; h2 = v[j - 1] - o; if (h1 > h2) { h = h1; sizesOfHorizontalGaps[l] = (short) (sizesOfHorizontalGaps[l - 1] + 1); } else { h = h2; } vDiagonal = v[j]; v[j] = maximum(f, g[j], h, 0); // Determine the traceback direction if (v[j] == 0) { pointers[l] = Directions.STOP; } else if (v[j] == f) { pointers[l] = Directions.DIAGONAL; } else if (v[j] == g[j]) { pointers[l] = Directions.UP; } else { pointers[l] = Directions.LEFT; } // Set the traceback start at the current cell i, j and score if (v[j] > cell.getScore()) { cell.set(i, j, v[j]); } } } //logger.info("Finished in " + (System.currentTimeMillis() - start) //+ " milliseconds"); return cell; } /** * Returns the alignment of two sequences based on the passed array of * pointers * * @param s1 * sequence #1 * @param s2 * sequence #2 * @param m * scoring matrix * @param cell * The cell where the traceback starts. * @return {@link Alignment}with the two aligned sequences and alignment * score. * @see Cell * @see Alignment */ private static Alignment traceback(Sequence s1, Sequence s2, Matrix m, byte[] pointers, Cell cell, short[] sizesOfVerticalGaps, short[] sizesOfHorizontalGaps) { //logger.info("Started..."); long start = System.currentTimeMillis(); char[] a1 = s1.toArray(); char[] a2 = s2.toArray(); float[][] scores = m.getScores(); int n = s2.length() + 1; Alignment alignment = new Alignment(); alignment.setScore(cell.getScore()); int maxlen = s1.length() + s2.length(); // maximum length after the // aligned sequences char[] reversed1 = new char[maxlen]; // reversed sequence #1 char[] reversed2 = new char[maxlen]; // reversed sequence #2 char[] reversed3 = new char[maxlen]; // reversed markup int len1 = 0; // length of sequence #1 after alignment int len2 = 0; // length of sequence #2 after alignment int len3 = 0; // length of the markup line int identity = 0; // count of identitcal pairs int similarity = 0; // count of similar pairs int gaps = 0; // count of gaps char c1, c2; int i = cell.getRow(); // traceback start row int j = cell.getCol(); // traceback start col int k = i * n; boolean stillGoing = true; // traceback flag: true -> continue & false // -> stop while (stillGoing) { switch (pointers[k + j]) { case Directions.UP: for (int l = 0, len = sizesOfVerticalGaps[k + j]; l < len; l++) { reversed1[len1++] = a1[--i]; reversed2[len2++] = Alignment.GAP; reversed3[len3++] = Markups.GAP; k -= n; gaps++; } break; case Directions.DIAGONAL: c1 = a1[--i]; c2 = a2[--j]; k -= n; reversed1[len1++] = c1; reversed2[len2++] = c2; if (c1 == c2) { reversed3[len3++] = Markups.IDENTITY; identity++; similarity++; } else if (scores[c1][c2] > 0) { reversed3[len3++] = Markups.SIMILARITY; similarity++; } else { reversed3[len3++] = Markups.MISMATCH; } break; case Directions.LEFT: for (int l = 0, len = sizesOfHorizontalGaps[k + j]; l < len; l++) { reversed1[len1++] = Alignment.GAP; reversed2[len2++] = a2[--j]; reversed3[len3++] = Markups.GAP; gaps++; } break; case Directions.STOP: stillGoing = false; } } alignment.setSequence1(reverse(reversed1, len1)); alignment.setStart1(i); alignment.setSequence2(reverse(reversed2, len2)); alignment.setStart2(j); alignment.setMarkupLine(reverse(reversed3, len3)); alignment.setIdentity(identity); alignment.setGaps(gaps); alignment.setSimilarity(similarity); //logger.info("Finished in " + (System.currentTimeMillis() - start) //+ " milliseconds"); return alignment; } /** * Returns the maximum of 4 float numbers. * * @param a * float #1 * @param b * float #2 * @param c * float #3 * @param d * float #4 * @return The maximum of a, b, c and d. */ private static float maximum(float a, float b, float c, float d) { if (a > b) { if (a > c) { return a > d ? a : d; } else { return c > d ? c : d; } } else if (b > c) { return b > d ? b : d; } else { return c > d ? c : d; } } /** * Reverses an array of chars * * @param a * @param len * @return the input array of char reserved */ private static char[] reverse(char[] a, int len) { char[] b = new char[len]; for (int i = len - 1, j = 0; i >= 0; i--, j++) { b[j] = a[i]; } return b; } }