/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009,2010,2011,2012 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ package bio.pih.genoogle.search; import java.io.IOException; import java.util.List; import java.util.ListIterator; import java.util.concurrent.CountDownLatch; import bio.pih.genoogle.alignment.DividedSubstitutionMatrixSmithWaterman; import bio.pih.genoogle.alignment.SubstitutionMatrix; import bio.pih.genoogle.encoder.SequenceEncoder; import bio.pih.genoogle.io.AbstractSequenceDataBank; import bio.pih.genoogle.io.Utils; import bio.pih.genoogle.io.proto.Io.StoredSequence; import bio.pih.genoogle.search.results.HSP; import bio.pih.genoogle.search.results.Hit; import bio.pih.genoogle.search.results.SearchResults; import bio.pih.genoogle.seq.SymbolList; import com.google.common.collect.Lists; /** * Class responsible to extend and align the HSPs. * * @author albrecht */ public class SequenceAligner implements Runnable { private final CountDownLatch countDown; private final RetrievedSequenceAreas retrievedAreas; private final SearchResults sr; private final StoredSequence storedSequence; private final SequenceEncoder encoderDatabankConverted; // private final SequenceEncoder encoderDatabankReduced; // private final SequenceEncoder encoderDatabankInputReader; private final AbstractSequenceDataBank databank; private final SubstitutionMatrix substitutionTable; private final IndexSearcher[] indexes; /** * @param countDown * Synchronizer use to wait until all HSPs from all Sub sub banks * are extended and aligned. * @param retrievedAreas * retrievedAre which the HSPs that will be extended and * retrieved. * @param sr * Where the results are stored. */ public SequenceAligner(CountDownLatch countDown, IndexSearcher[] indexes, RetrievedSequenceAreas retrievedAreas, SearchResults sr, AbstractSequenceDataBank databank) throws IOException { this(countDown, indexes, retrievedAreas, sr, databank, databank.getEncoder(), databank.getEncoder(), databank.getEncoder(), SubstitutionMatrix.DUMMY); } public SequenceAligner(CountDownLatch countDown, IndexSearcher[] indexes, RetrievedSequenceAreas retrievedAreas, SearchResults sr, AbstractSequenceDataBank databank, SequenceEncoder encoderDatabankInputReader, SequenceEncoder encoderDatabankConverted, SequenceEncoder encoderDatabankReduced, SubstitutionMatrix substitutionTable) throws IOException { this.countDown = countDown; this.indexes = indexes; this.retrievedAreas = retrievedAreas; this.sr = sr; this.databank = databank; this.substitutionTable = substitutionTable; this.storedSequence = retrievedAreas.getStoredSequence(); this.encoderDatabankConverted = encoderDatabankConverted; } @Override public void run() { try { extendAndAlignHSPs(this.retrievedAreas, this.storedSequence); } catch (Exception e) { sr.addFail(e); } catch (AssertionError ae) { ae.printStackTrace(); } finally { countDown.countDown(); } } private void extendAndAlignHSPs(RetrievedSequenceAreas retrievedAreas, StoredSequence storedSequence) throws Exception { int[] encodedDatabankSequence = Utils.getEncodedSequenceAsArray(storedSequence); int targetLength = SequenceEncoder.getSequenceLength(encodedDatabankSequence); int offset = (indexes.length / 2); String databankSequence = encoderDatabankConverted.decodeIntegerArrayToString(encodedDatabankSequence); Hit hit = new Hit(storedSequence.getName(), storedSequence.getGi(), storedSequence.getDescription(), storedSequence.getAccession(), targetLength, databank.getAbsolutParent().getName()); List<RetrievedArea>[] areas = retrievedAreas.getAreas(); for (int i = 0; i < retrievedAreas.getFrames(); i++) { if (areas[i].size() > 0) { IndexSearcher searcher = indexes[i]; SymbolList query = searcher.getQuery(); int queryLength = query.getLength(); int[] encodedQuery = encoderDatabankConverted.encodeSymbolListToIntegerArray(searcher.getQuery()); List<ExtendSequences> extendedSequences = extendAreas(encodedDatabankSequence, targetLength, queryLength, encodedQuery, areas[i], searcher); extendedSequences = mergeExtendedAreas(extendedSequences); alignHSPs(hit, query, queryLength, targetLength, extendedSequences, searcher, databankSequence); } } List<RetrievedArea>[] reverseComplementAreas = retrievedAreas.getReverseComplementAreas(); for (int i = 0; i < retrievedAreas.getFrames(); i++) { if (reverseComplementAreas[i].size() > 0) { IndexSearcher searcher = indexes[i+offset]; SymbolList query = searcher.getQuery(); int queryLength = query.getLength(); int[] reverseEncodedQuery = encoderDatabankConverted.encodeSymbolListToIntegerArray(query); List<ExtendSequences> rcExtendedSequences = extendAreas(encodedDatabankSequence, targetLength, queryLength, reverseEncodedQuery, reverseComplementAreas[i], searcher); rcExtendedSequences = mergeExtendedAreas(rcExtendedSequences); alignHSPs(hit, query, queryLength, targetLength, rcExtendedSequences, searcher, databankSequence); } } sr.addHit(hit); } private List<ExtendSequences> extendAreas(int[] encodedSequence, int targetLength, int queryLength, int[] encodedQuery, List<RetrievedArea> areas, IndexSearcher searcher) { List<ExtendSequences> extendedSequencesList = Lists.newLinkedList(); for (int i = 0; i < areas.size(); i++) { RetrievedArea retrievedArea = areas.get(i); int sequenceAreaBegin = retrievedArea.getSequenceAreaBegin(); int sequenceAreaEnd = retrievedArea.getSequenceAreaEnd(); if (sequenceAreaEnd > targetLength) { sequenceAreaEnd = targetLength; } int queryAreaBegin = retrievedArea.getQueryAreaBegin(); int queryAreaEnd = retrievedArea.getQueryAreaEnd(); if (queryAreaEnd > queryLength) { queryAreaBegin = queryLength; } ExtendSequences extensionResult = ExtendSequences.doExtension(encodedQuery, queryAreaBegin, queryAreaEnd, encodedSequence, sequenceAreaBegin, sequenceAreaEnd, searcher.getSearchParams().getSequencesExtendDropoff(), encoderDatabankConverted, substitutionTable, searcher.getReadFrame()); if (extendedSequencesList.contains(extensionResult)) { continue; } extendedSequencesList.add(extensionResult); } return extendedSequencesList; } private void alignHSPs(Hit hit, SymbolList query, int queryLength, int targetLength, List<ExtendSequences> extendedSequencesList, IndexSearcher searcher, String reducedDatabankSequence) { String queryString = query.seqString(); for (ExtendSequences extensionResult : extendedSequencesList) { DividedSubstitutionMatrixSmithWaterman smithWaterman = new DividedSubstitutionMatrixSmithWaterman(substitutionTable, -5, -5, 2000); int beginQuerySegment; int endnQuerySegment; int beginTargetSegment; int endTargetSegment; beginQuerySegment = extensionResult.getBeginQuerySegment(); endnQuerySegment = extensionResult.getEndQuerySegment(); beginTargetSegment = extensionResult.getBeginTargetSegment(); endTargetSegment = extensionResult.getEndTargetSegment(); String targetSubSequence = reducedDatabankSequence.substring(beginTargetSegment, endTargetSegment); String querySubSequence = queryString.substring(beginQuerySegment, endnQuerySegment); smithWaterman.pairwiseAlignment(querySubSequence, targetSubSequence); double normalizedScore = searcher.getStatistics().nominalToNormalizedScore(smithWaterman.getScore()); double evalue = searcher.getStatistics().calculateEvalue(normalizedScore); HSP hsp = searcher.createHSP(extensionResult, smithWaterman, normalizedScore, evalue, queryLength, targetLength); hit.addHSP(hsp); } } /** * Check if the extended areas has overlapped positions and merge them. * * @param extendedSequences * @return {@link List} of {@link ExtendSequences} that are merged when they * have overlapped areas. */ private List<ExtendSequences> mergeExtendedAreas(List<ExtendSequences> extendedSequences) { ListIterator<ExtendSequences> iterator1 = extendedSequences.listIterator(); while (iterator1.hasNext()) { ExtendSequences extSeqs1 = iterator1.next(); ListIterator<ExtendSequences> iterator2 = extendedSequences.listIterator(iterator1.nextIndex()); while (iterator2.hasNext()) { ExtendSequences extSeqs2 = iterator2.next(); ExtendSequences merged = tryToMerge(extSeqs1, extSeqs2); if (merged != null) { extendedSequences.remove(extSeqs1); extendedSequences.remove(extSeqs2); extendedSequences.add(merged); return mergeExtendedAreas(extendedSequences); } } } return extendedSequences; } /** * Check if the {@link ExtendSequences} seq1 and seq2 are overlapped. * * @param seq1 * an {@link ExtendSequences} * @param seq2 * an {@link ExtendSequences} * @return a merged {@link ExtendSequences} or <code>null</code> if was not * merged. */ private ExtendSequences tryToMerge(ExtendSequences seq1, ExtendSequences seq2) { if (seq1.getReadFrame() != seq2.getReadFrame()) { return null; } int seq1QueryBegin = seq1.getBeginQuerySegment(); int seq1QueryEnd = seq1.getEndQuerySegment(); int seq1TargetBegin = seq1.getBeginTargetSegment(); int seq1TargetEnd = seq1.getEndTargetSegment(); int seq2QueryBegin = seq2.getBeginQuerySegment(); int seq2QueryEnd = seq2.getEndQuerySegment(); int seq2TargetBegin = seq2.getBeginTargetSegment(); int seq2TargetEnd = seq2.getEndTargetSegment(); int queryEnd = Math.max(seq1QueryEnd, seq2QueryEnd); int targetEnd = Math.max(seq1TargetEnd, seq2TargetEnd); if (Utils.contains(seq2TargetBegin, seq2TargetEnd, seq1TargetBegin, seq1TargetEnd) || Utils.contains(seq1TargetBegin, seq1TargetEnd, seq2TargetBegin, seq2TargetEnd)) { if ((Utils.isIn(seq1QueryBegin, seq1QueryEnd, seq2QueryBegin)) || Utils.isIn(seq2QueryBegin, seq2QueryEnd, seq1QueryBegin)) { return new ExtendSequences(seq1.getEncodedQuery(), seq2.getEncodedTarget(), Math.min(seq1QueryBegin, seq2QueryBegin), queryEnd, Math.min(seq1TargetBegin, seq2TargetBegin), targetEnd, seq1.getReadFrame()); } } if (Utils.contains(seq1QueryBegin, seq1QueryEnd, seq2QueryBegin, seq2QueryEnd) || Utils.contains(seq2QueryBegin, seq2QueryEnd, seq1QueryBegin, seq1QueryEnd)) { if ((Utils.isIn(seq1TargetBegin, seq1TargetEnd, seq2TargetBegin)) || Utils.isIn(seq2TargetBegin, seq2TargetEnd, seq1TargetBegin)) { return new ExtendSequences(seq1.getEncodedQuery(), seq2.getEncodedTarget(), Math.min(seq1QueryBegin, seq2QueryBegin), queryEnd, Math.min(seq1TargetBegin, seq2TargetBegin), targetEnd, seq1.getReadFrame()); } } if ((Utils.isIn(seq1QueryBegin, seq1QueryEnd, seq2QueryBegin)) || Utils.isIn(seq2QueryBegin, seq2QueryEnd, seq1QueryBegin)) { if (Utils.contains(seq2TargetBegin, seq2TargetEnd, seq1TargetBegin, seq1TargetEnd) || Utils.contains(seq1TargetBegin, seq1TargetEnd, seq2TargetBegin, seq2TargetEnd)) { return new ExtendSequences(seq1.getEncodedQuery(), seq2.getEncodedTarget(), Math.min(seq1QueryBegin, seq2QueryBegin), queryEnd, Math.min(seq1TargetBegin, seq2TargetBegin), targetEnd, seq1.getReadFrame()); } } if ((Utils.isIn(seq1TargetBegin, seq1TargetEnd, seq2TargetBegin)) || Utils.isIn(seq2TargetBegin, seq2TargetEnd, seq1TargetBegin)) { if (Utils.contains(seq1QueryBegin, seq1QueryEnd, seq2QueryBegin, seq2QueryEnd) || Utils.contains(seq2QueryBegin, seq2QueryEnd, seq1QueryBegin, seq1QueryEnd)) { return new ExtendSequences(seq1.getEncodedQuery(), seq2.getEncodedTarget(), Math.min(seq1QueryBegin, seq2QueryBegin), queryEnd, Math.min(seq1TargetBegin, seq2TargetBegin), targetEnd, seq1.getReadFrame()); } } if (Utils.contains(seq2TargetBegin, seq2TargetEnd, seq1TargetBegin, seq1TargetEnd) || Utils.contains(seq1TargetBegin, seq1TargetEnd, seq2TargetBegin, seq2TargetEnd)) { if (Utils.contains(seq1QueryBegin, seq1QueryEnd, seq2QueryBegin, seq2QueryEnd) || Utils.contains(seq2QueryBegin, seq2QueryEnd, seq1QueryBegin, seq1QueryEnd)) { return new ExtendSequences(seq1.getEncodedQuery(), seq2.getEncodedTarget(), Math.min(seq1QueryBegin, seq2QueryBegin), queryEnd, Math.min(seq1TargetBegin, seq2TargetBegin), targetEnd, seq1.getReadFrame()); } } if (Utils.isIn(seq1QueryBegin, seq1QueryEnd, seq2QueryBegin)) { if (Utils.isIn(seq1TargetBegin, seq1TargetEnd, seq2TargetBegin)) { return new ExtendSequences(seq1.getEncodedQuery(), seq1.getEncodedTarget(), seq1QueryBegin, queryEnd, seq1TargetBegin, targetEnd, seq1.getReadFrame()); } } if (Utils.isIn(seq2QueryBegin, seq2QueryEnd, seq1QueryBegin)) { if (Utils.isIn(seq2TargetBegin, seq2TargetEnd, seq1TargetBegin)) { return new ExtendSequences(seq1.getEncodedQuery(), seq1.getEncodedTarget(), seq2QueryBegin, queryEnd, seq2TargetBegin, targetEnd, seq1.getReadFrame()); } } return null; } }