package com.milaboratory.core.alignment; import com.milaboratory.core.Range; import com.milaboratory.core.mutations.MutationsBuilder; import com.milaboratory.core.sequence.NucleotideSequence; /** * @author Dmitry Bolotin * @author Stanislav Poslavsky */ public class BandedAffineAligner { private BandedAffineAligner() { } /** * Classical Banded Alignment with affine gap scoring. * * Both sequences must be highly similar. * * Align 2 sequence completely (i.e. while first sequence will be aligned against whole second sequence). * * @param scoring scoring system * @param seq1 first sequence * @param seq2 second sequence * @param offset1 offset in first sequence * @param length1 length of first sequence's part to be aligned * @param offset2 offset in second sequence * @param length2 length of second sequence's part to be aligned * @param width width of banded alignment matrix. In other terms max allowed number of indels * @param mutations mutations array where all mutations will be kept * @param cache matrix cache */ public static int align0(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int offset2, final int length2, final int width, final MutationsBuilder<NucleotideSequence> mutations, final MatrixCache cache) { if (length1 == 0 && length2 == 0) return 0; int size1 = length1 + 1, size2 = length2 + 1; cache.prepareMatrices(size1, size2, width, scoring); BandedMatrix main = cache.main; BandedMatrix gapIn1 = cache.gapIn1; BandedMatrix gapIn2 = cache.gapIn2; int i, j; int match, gap1, gap2, to; final int gapExtensionPenalty = scoring.getGapExtensionPenalty(); for (i = 0; i < length1; ++i) { to = Math.min(i + main.getRowFactor() - main.getColumnDelta() + 1, length2); for (j = Math.max(0, i - main.getColumnDelta()); j < to; ++j) { match = main.get(i, j) + scoring.getScore(seq1.codeAt(offset1 + i), seq2.codeAt(offset2 + j)); gap1 = Math.max(main.get(i + 1, j) + scoring.getGapOpenPenalty(), gapIn1.get(i + 1, j) + gapExtensionPenalty); gap2 = Math.max(main.get(i, j + 1) + scoring.getGapOpenPenalty(), gapIn2.get(i, j + 1) + gapExtensionPenalty); gapIn1.set(i + 1, j + 1, gap1); gapIn2.set(i + 1, j + 1, gap2); main.set(i + 1, j + 1, Math.max(match, Math.max(gap1, gap2))); } } to = mutations.size(); i = length1 - 1; j = length2 - 1; int pScore = main.get(i + 1, j + 1); byte c1, c2; while (i >= 0 || j >= 0) { if (i >= 0 && pScore == gapIn2.get(i + 1, j + 1)) { if (pScore == gapIn2.get(i, j + 1) + gapExtensionPenalty) pScore = gapIn2.get(i, j + 1); else pScore = main.get(i, j + 1); mutations.appendDeletion(offset1 + i, seq1.codeAt(offset1 + i)); --i; } else if (j >= 0 && pScore == gapIn1.get(i + 1, j + 1)) { if (pScore == gapIn1.get(i + 1, j) + gapExtensionPenalty) pScore = gapIn1.get(i + 1, j); else pScore = main.get(i + 1, j); mutations.appendInsertion(offset1 + i + 1, seq2.codeAt(offset2 + j)); --j; } else if (i >= 0 && j >= 0 && pScore == main.get(i, j) + scoring.getScore(c1 = seq1.codeAt(offset1 + i), c2 = seq2.codeAt(offset2 + j))) { pScore = main.get(i, j); if (c1 != c2) mutations.appendSubstitution(offset1 + i, c1, c2); --i; --j; } else throw new RuntimeException(); } mutations.reverseRange(to, mutations.size()); return main.get(length1, length2); } public static BandedSemiLocalResult semiLocalRight0(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, int length1, final int offset2, int length2, final int width, final MutationsBuilder<NucleotideSequence> mutations, final MatrixCache cache) { if (length1 == 0 || length2 == 0) return new BandedSemiLocalResult(offset1 - 1, offset2 - 1, 0); int minLength = Math.min(length1, length2) + width; length1 = Math.min(length1, minLength); length2 = Math.min(length2, minLength); int size1 = length1 + 1, size2 = length2 + 1; cache.prepareMatrices(size1, size2, width, scoring); BandedMatrix main = cache.main; BandedMatrix gapIn1 = cache.gapIn1; BandedMatrix gapIn2 = cache.gapIn2; int i, j; int match, gap1, gap2, to; int maxI = -1, maxJ = -1, maxScore = 0; final int gapExtensionPenalty = scoring.getGapExtensionPenalty(); for (i = 0; i < length1; ++i) { to = Math.min(i + main.getRowFactor() - main.getColumnDelta() + 1, length2); for (j = Math.max(0, i - main.getColumnDelta()); j < to; ++j) { match = main.get(i, j) + scoring.getScore(seq1.codeAt(offset1 + i), seq2.codeAt(offset2 + j)); gap1 = Math.max(main.get(i + 1, j) + scoring.getGapOpenPenalty(), gapIn1.get(i + 1, j) + gapExtensionPenalty); gap2 = Math.max(main.get(i, j + 1) + scoring.getGapOpenPenalty(), gapIn2.get(i, j + 1) + gapExtensionPenalty); gapIn1.set(i + 1, j + 1, gap1); gapIn2.set(i + 1, j + 1, gap2); int score = Math.max(match, Math.max(gap1, gap2)); main.set(i + 1, j + 1, score); if (score > maxScore) { maxScore = score; maxI = i; maxJ = j; } } } to = mutations.size(); i = maxI; j = maxJ; int pScore = main.get(i + 1, j + 1); byte c1, c2; while (i >= 0 || j >= 0) { if (i >= 0 && pScore == gapIn2.get(i + 1, j + 1)) { if (pScore == gapIn2.get(i, j + 1) + gapExtensionPenalty) pScore = gapIn2.get(i, j + 1); else pScore = main.get(i, j + 1); mutations.appendDeletion(offset1 + i, seq1.codeAt(offset1 + i)); --i; } else if (j >= 0 && pScore == gapIn1.get(i + 1, j + 1)) { if (pScore == gapIn1.get(i + 1, j) + gapExtensionPenalty) pScore = gapIn1.get(i + 1, j); else pScore = main.get(i + 1, j); mutations.appendInsertion(offset1 + i + 1, seq2.codeAt(offset2 + j)); --j; } else if (i >= 0 && j >= 0 && pScore == main.get(i, j) + scoring.getScore(c1 = seq1.codeAt(offset1 + i), c2 = seq2.codeAt(offset2 + j))) { pScore = main.get(i, j); if (c1 != c2) mutations.appendSubstitution(offset1 + i, c1, c2); --i; --j; } else throw new RuntimeException(); } mutations.reverseRange(to, mutations.size()); return new BandedSemiLocalResult(offset1 + maxI, offset2 + maxJ, maxScore); } public static BandedSemiLocalResult semiLocalLeft0(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, int offset1, int length1, int offset2, int length2, final int width, final MutationsBuilder<NucleotideSequence> mutations, final MatrixCache cache) { if (length1 == 0 || length2 == 0) return new BandedSemiLocalResult(offset1 + length1, offset2 + length2, 0); offset1 += length1; offset2 += length2; int minLength = Math.min(length1, length2) + width; length1 = Math.min(length1, minLength); length2 = Math.min(length2, minLength); offset1 -= length1; offset2 -= length2; int size1 = length1 + 1, size2 = length2 + 1; cache.prepareMatrices(size1, size2, width, scoring); BandedMatrix main = cache.main; BandedMatrix gapIn1 = cache.gapIn1; BandedMatrix gapIn2 = cache.gapIn2; int i, j; int match, gap1, gap2, to; int maxI = -1, maxJ = -1, maxScore = 0; final int gapExtensionPenalty = scoring.getGapExtensionPenalty(); for (i = 0; i < length1; ++i) { to = Math.min(i + main.getRowFactor() - main.getColumnDelta() + 1, length2); for (j = Math.max(0, i - main.getColumnDelta()); j < to; ++j) { match = main.get(i, j) + scoring.getScore(seq1.codeAt(offset1 + length1 - 1 - i), seq2.codeAt(offset2 + length2 - 1 - j)); gap1 = Math.max(main.get(i + 1, j) + scoring.getGapOpenPenalty(), gapIn1.get(i + 1, j) + gapExtensionPenalty); gap2 = Math.max(main.get(i, j + 1) + scoring.getGapOpenPenalty(), gapIn2.get(i, j + 1) + gapExtensionPenalty); gapIn1.set(i + 1, j + 1, gap1); gapIn2.set(i + 1, j + 1, gap2); int score = Math.max(match, Math.max(gap1, gap2)); main.set(i + 1, j + 1, score); if (score > maxScore) { maxScore = score; maxI = i; maxJ = j; } } } i = maxI; j = maxJ; int pScore = main.get(i + 1, j + 1); byte c1, c2; while (i >= 0 || j >= 0) { if (i >= 0 && pScore == gapIn2.get(i + 1, j + 1)) { if (pScore == gapIn2.get(i, j + 1) + gapExtensionPenalty) pScore = gapIn2.get(i, j + 1); else pScore = main.get(i, j + 1); mutations.appendDeletion(offset1 + length1 - 1 - i, seq1.codeAt(offset1 + length1 - 1 - i)); --i; } else if (j >= 0 && pScore == gapIn1.get(i + 1, j + 1)) { if (pScore == gapIn1.get(i + 1, j) + gapExtensionPenalty) pScore = gapIn1.get(i + 1, j); else pScore = main.get(i + 1, j); mutations.appendInsertion(offset1 + length1 - 1 - i, seq2.codeAt(offset2 + length2 - 1 - j)); --j; } else if (i >= 0 && j >= 0 && pScore == main.get(i, j) + scoring.getScore(c1 = seq1.codeAt(offset1 + length1 - 1 - i), c2 = seq2.codeAt(offset2 + length2 - 1 - j))) { pScore = main.get(i, j); if (c1 != c2) mutations.appendSubstitution(offset1 + length1 - 1 - i, c1, c2); --i; --j; } else throw new RuntimeException(); } return new BandedSemiLocalResult(offset1 + length1 - 1 - maxI, offset2 + length2 - 1 - maxJ, maxScore); } public static BandedSemiLocalResult semiGlobalRight0(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int addedNucleotides1, final int offset2, final int length2, final int addedNucleotides2, final int width, final MutationsBuilder<NucleotideSequence> mutations, final MatrixCache cache) { int size1 = length1 + 1, size2 = length2 + 1; cache.prepareMatrices(size1, size2, width, scoring); BandedMatrix main = cache.main; BandedMatrix gapIn1 = cache.gapIn1; BandedMatrix gapIn2 = cache.gapIn2; int i, j; int match, gap1, gap2, to; int gapExtensionPenalty = scoring.getGapExtensionPenalty(); for (i = 0; i < length1; ++i) { to = Math.min(i + main.getRowFactor() - main.getColumnDelta() + 1, length2); for (j = Math.max(0, i - main.getColumnDelta()); j < to; ++j) { match = main.get(i, j) + scoring.getScore(seq1.codeAt(offset1 + i), seq2.codeAt(offset2 + j)); gap1 = Math.max(main.get(i + 1, j) + scoring.getGapOpenPenalty(), gapIn1.get(i + 1, j) + gapExtensionPenalty); gap2 = Math.max(main.get(i, j + 1) + scoring.getGapOpenPenalty(), gapIn2.get(i, j + 1) + gapExtensionPenalty); gapIn1.set(i + 1, j + 1, gap1); gapIn2.set(i + 1, j + 1, gap2); main.set(i + 1, j + 1, Math.max(match, Math.max(gap1, gap2))); } } int maxI = 0, maxJ = 0, maxScore = Integer.MIN_VALUE; j = length2; for (i = length1 - addedNucleotides1; i < size1; ++i) if (maxScore < main.get(i, j)) { maxScore = main.get(i, j); maxI = i - 1; maxJ = j - 1; } i = length1; for (j = length2 - addedNucleotides2; j < size2; ++j) if (maxScore < main.get(i, j)) { maxScore = main.get(i, j); maxI = i - 1; maxJ = j - 1; } to = mutations.size(); i = maxI; j = maxJ; int pScore = main.get(i + 1, j + 1); byte c1, c2; while (i >= 0 || j >= 0) { if (i >= 0 && pScore == gapIn2.get(i + 1, j + 1)) { if (pScore == gapIn2.get(i, j + 1) + gapExtensionPenalty) pScore = gapIn2.get(i, j + 1); else pScore = main.get(i, j + 1); mutations.appendDeletion(offset1 + i, seq1.codeAt(offset1 + i)); --i; } else if (j >= 0 && pScore == gapIn1.get(i + 1, j + 1)) { if (pScore == gapIn1.get(i + 1, j) + gapExtensionPenalty) pScore = gapIn1.get(i + 1, j); else pScore = main.get(i + 1, j); mutations.appendInsertion(offset1 + i + 1, seq2.codeAt(offset2 + j)); --j; } else if (i >= 0 && j >= 0 && pScore == main.get(i, j) + scoring.getScore(c1 = seq1.codeAt(offset1 + i), c2 = seq2.codeAt(offset2 + j))) { pScore = main.get(i, j); if (c1 != c2) mutations.appendSubstitution(offset1 + i, c1, c2); --i; --j; } else throw new RuntimeException(); } mutations.reverseRange(to, mutations.size()); return new BandedSemiLocalResult(offset1 + maxI, offset2 + maxJ, maxScore); } public static BandedSemiLocalResult semiGlobalLeft0(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int addedNucleotides1, final int offset2, final int length2, final int addedNucleotides2, final int width, final MutationsBuilder<NucleotideSequence> mutations, final MatrixCache cache) { int size1 = length1 + 1, size2 = length2 + 1; cache.prepareMatrices(size1, size2, width, scoring); BandedMatrix main = cache.main; BandedMatrix gapIn1 = cache.gapIn1; BandedMatrix gapIn2 = cache.gapIn2; int i, j; int match, gap1, gap2, to; int gapExtensionPenalty = scoring.getGapExtensionPenalty(); for (i = 0; i < length1; ++i) { to = Math.min(i + main.getRowFactor() - main.getColumnDelta() + 1, length2); for (j = Math.max(0, i - main.getColumnDelta()); j < to; ++j) { match = main.get(i, j) + scoring.getScore(seq1.codeAt(offset1 + length1 - 1 - i), seq2.codeAt(offset2 + length2 - 1 - j)); gap1 = Math.max(main.get(i + 1, j) + scoring.getGapOpenPenalty(), gapIn1.get(i + 1, j) + gapExtensionPenalty); gap2 = Math.max(main.get(i, j + 1) + scoring.getGapOpenPenalty(), gapIn2.get(i, j + 1) + gapExtensionPenalty); gapIn1.set(i + 1, j + 1, gap1); gapIn2.set(i + 1, j + 1, gap2); main.set(i + 1, j + 1, Math.max(match, Math.max(gap1, gap2))); } } int maxI = 0, maxJ = 0, maxScore = Integer.MIN_VALUE; j = length2; for (i = length1 - addedNucleotides1; i < size1; ++i) if (maxScore < main.get(i, j)) { maxScore = main.get(i, j); maxI = i - 1; maxJ = j - 1; } i = length1; for (j = length2 - addedNucleotides2; j < size2; ++j) if (maxScore < main.get(i, j)) { maxScore = main.get(i, j); maxI = i - 1; maxJ = j - 1; } i = maxI; j = maxJ; int pScore = main.get(i + 1, j + 1); byte c1, c2; while (i >= 0 || j >= 0) { if (i >= 0 && pScore == gapIn2.get(i + 1, j + 1)) { if (pScore == gapIn2.get(i, j + 1) + gapExtensionPenalty) pScore = gapIn2.get(i, j + 1); else pScore = main.get(i, j + 1); mutations.appendDeletion(offset1 + length1 - 1 - i, seq1.codeAt(offset1 + length1 - 1 - i)); --i; } else if (j >= 0 && pScore == gapIn1.get(i + 1, j + 1)) { if (pScore == gapIn1.get(i + 1, j) + gapExtensionPenalty) pScore = gapIn1.get(i + 1, j); else pScore = main.get(i + 1, j); mutations.appendInsertion(offset1 + length1 - 1 - i, seq2.codeAt(offset2 + length2 - 1 - j)); --j; } else if (i >= 0 && j >= 0 && pScore == main.get(i, j) + scoring.getScore(c1 = seq1.codeAt(offset1 + length1 - 1 - i), c2 = seq2.codeAt(offset2 + length2 - 1 - j))) { pScore = main.get(i, j); if (c1 != c2) mutations.appendSubstitution(offset1 + length1 - 1 - i, c1, c2); --i; --j; } else throw new RuntimeException(); } return new BandedSemiLocalResult(offset1 + length1 - 1 - maxI, offset2 + length2 - 1 - maxJ, maxScore); } public static Alignment<NucleotideSequence> align(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int offset2, final int length2, final int width) { MutationsBuilder<NucleotideSequence> mutations = new MutationsBuilder<>(NucleotideSequence.ALPHABET); int score = align0(scoring, seq1, seq2, offset1, length1, offset2, length2, width, mutations, new MatrixCache()); return new Alignment<>(seq1, mutations.createAndDestroy(), new Range(offset1, offset1 + length1), new Range(offset2, offset2 + length2), score); } public static Alignment<NucleotideSequence> align(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int width) { return align(scoring, seq1, seq2, 0, seq1.size(), 0, seq2.size(), width); } public static Alignment<NucleotideSequence> semiLocalRight(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int offset2, final int length2, final int width) { MutationsBuilder<NucleotideSequence> mutations = new MutationsBuilder<>(NucleotideSequence.ALPHABET); BandedSemiLocalResult res = semiLocalRight0(scoring, seq1, seq2, offset1, length1, offset2, length2, width, mutations, new MatrixCache()); return new Alignment<>(seq1, mutations.createAndDestroy(), new Range(offset1, res.sequence1Stop + 1), new Range(offset2, res.sequence2Stop + 1), res.score); } public static Alignment<NucleotideSequence> semiLocalRight(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int width) { return semiLocalRight(scoring, seq1, seq2, 0, seq1.size(), 0, seq2.size(), width); } public static Alignment<NucleotideSequence> semiLocalLeft(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int offset2, final int length2, final int width) { MutationsBuilder<NucleotideSequence> mutations = new MutationsBuilder<>(NucleotideSequence.ALPHABET); BandedSemiLocalResult res = semiLocalLeft0(scoring, seq1, seq2, offset1, length1, offset2, length2, width, mutations, new MatrixCache()); return new Alignment<>(seq1, mutations.createAndDestroy(), new Range(res.sequence1Stop, offset1 + length1), new Range(res.sequence2Stop, offset2 + length2), res.score); } public static Alignment<NucleotideSequence> semiLocalLeft(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int width) { return semiLocalLeft(scoring, seq1, seq2, 0, seq1.size(), 0, seq2.size(), width); } public static Alignment<NucleotideSequence> semiGlobalRight(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int addedNucleotides1, final int offset2, final int length2, final int addedNucleotides2, final int width) { MutationsBuilder<NucleotideSequence> mutations = new MutationsBuilder<>(NucleotideSequence.ALPHABET); BandedSemiLocalResult res = semiGlobalRight0(scoring, seq1, seq2, offset1, length1, addedNucleotides1, offset2, length2, addedNucleotides2, width, mutations, new MatrixCache()); return new Alignment<>(seq1, mutations.createAndDestroy(), new Range(offset1, res.sequence1Stop + 1), new Range(offset2, res.sequence2Stop + 1), res.score); } public static Alignment<NucleotideSequence> semiGlobalLeft(final AffineGapAlignmentScoring<NucleotideSequence> scoring, final NucleotideSequence seq1, final NucleotideSequence seq2, final int offset1, final int length1, final int addedNucleotides1, final int offset2, final int length2, final int addedNucleotides2, final int width) { MutationsBuilder<NucleotideSequence> mutations = new MutationsBuilder<>(NucleotideSequence.ALPHABET); BandedSemiLocalResult res = semiGlobalLeft0(scoring, seq1, seq2, offset1, length1, addedNucleotides1, offset2, length2, addedNucleotides2, width, mutations, new MatrixCache()); return new Alignment<>(seq1, mutations.createAndDestroy(), new Range(res.sequence1Stop, offset1 + length1), new Range(res.sequence2Stop, offset2 + length2), res.score); } public static final class MatrixCache { private final CachedIntArray mainCache, gapIn1Cache, gapIn2Cache; private BandedMatrix main, gapIn1, gapIn2; public MatrixCache() { this.mainCache = new CachedIntArray(); this.gapIn1Cache = new CachedIntArray(); this.gapIn2Cache = new CachedIntArray(); } private void prepareMatrices(int size1, int size2, int width, AffineGapAlignmentScoring<NucleotideSequence> scoring) { BandedMatrix main = this.main = new BandedMatrix(mainCache, size1, size2, width); BandedMatrix gapIn1 = this.gapIn1 = new BandedMatrix(gapIn1Cache, size1, size2, width); BandedMatrix gapIn2 = this.gapIn2 = new BandedMatrix(gapIn2Cache, size1, size2, width); for (int i = main.getRowFactor() - main.getColumnDelta(); i > 0; --i) { int v = scoring.getGapOpenPenalty() + scoring.getGapExtensionPenalty() * (i - 1); main.set(0, i, v); gapIn1.set(0, i, v); gapIn2.set(0, i, BandedMatrix.DEFAULT_VALUE); } for (int i = main.getColumnDelta(); i > 0; --i) { int v = scoring.getGapOpenPenalty() + scoring.getGapExtensionPenalty() * (i - 1); main.set(i, 0, v); gapIn1.set(i, 0, BandedMatrix.DEFAULT_VALUE); gapIn2.set(i, 0, v); } main.set(0, 0, 0); gapIn1.set(0, 0, BandedMatrix.DEFAULT_VALUE); gapIn2.set(0, 0, BandedMatrix.DEFAULT_VALUE); } } }