/*
 * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
 * Copyright (C) 2008,2009, 2010, 2011, 2012 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
 *
 * For further information check the LICENSE file.
 */

package bio.pih.genoogle.search;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.CountDownLatch;

import org.apache.log4j.Logger;

import bio.pih.genoogle.alignment.GenoogleSequenceAlignment;
import bio.pih.genoogle.encoder.MaskEncoder;
import bio.pih.genoogle.encoder.SequenceEncoder;
import bio.pih.genoogle.index.ValueOutOfBoundsException;
import bio.pih.genoogle.io.IndexedSequenceDataBank;
import bio.pih.genoogle.io.RemoteSimilaritySequenceDataBank;
import bio.pih.genoogle.search.results.HSP;
import bio.pih.genoogle.seq.SymbolList;
import bio.pih.genoogle.statistics.Statistics;

import com.google.common.collect.Lists;

/**
 * Runnable task that searches the index of one data bank for the sub-sequences of a query
 * slice and merges the areas it retrieves into an array shared with other searchers.
 *
 * <p>
 * The task encodes every window of {@code subSequenceLength} symbols of the query slice,
 * looks each encoded window up in the data bank index, and collects the resulting
 * {@link RetrievedArea}s per data bank sequence. Any {@link Throwable} raised while running
 * is appended to the {@code fails} list and the {@link CountDownLatch} is always counted
 * down, so a waiting caller is released whether the search succeeded or not.
 *
 * <p>
 * NOTE: merging into the shared {@code retrievedAreas} array performs no locking (see the
 * TODO markers in the merge code).
 *
 * @author albrecht
 */
public class IndexSearcher implements Runnable {

    private static final Logger logger = Logger.getLogger(IndexSearcher.class.getName());

    protected final long id;
    protected final SearchParams sp;
    protected final SequenceEncoder encoder;
    protected final IndexedSequenceDataBank databank;

    private final Statistics statistics;
    private final List<RetrievedArea>[] retrievedAreas;
    private final CountDownLatch countDown;
    private final int subSequenceLength;
    private final SymbolList fullQuery;
    private final int offset;
    private final int[] encodedQuery;
    private final String sliceQuery;
    private final List<Throwable> fails;
    private final int readFrame;

    /**
     * Creates a searcher with an explicit encoder and sub-sequence length.
     *
     * @param id identifier of this searcher, used in {@link #toString()} and log messages.
     * @param sp search parameters.
     * @param databank indexed data bank to search.
     * @param encoder encoder used to encode query windows when the data bank has no mask encoder.
     * @param subSequenceLength length of each query window; also the minimum slice length.
     * @param sliceQuery the slice of the query this searcher handles.
     * @param offset value added to each window index to obtain its query position.
     * @param fullQuery the complete query.
     * @param encodedQuery the encoded query, exposed through {@link #getEncodedQuery()}.
     * @param retrievedAreas shared per-sequence array that receives the retrieved areas.
     * @param statistics statistics object, exposed through {@link #getStatistics()}.
     * @param countDown latch counted down once when {@link #run()} finishes.
     * @param fails list that receives any {@link Throwable} thrown by {@link #run()}.
     * @param readFrame read frame used by {@code createHSP} for remote-similarity data banks.
     */
    public IndexSearcher(long id, SearchParams sp, IndexedSequenceDataBank databank, SequenceEncoder encoder,
            int subSequenceLength, String sliceQuery, int offset, SymbolList fullQuery, int[] encodedQuery,
            List<RetrievedArea>[] retrievedAreas, Statistics statistics, CountDownLatch countDown,
            List<Throwable> fails, int readFrame) {
        this.id = id;
        this.sp = sp;
        this.databank = databank;
        this.sliceQuery = sliceQuery;
        this.offset = offset;
        this.fullQuery = fullQuery;
        this.encodedQuery = encodedQuery;
        this.retrievedAreas = retrievedAreas;
        this.statistics = statistics;
        this.countDown = countDown;
        this.fails = fails;
        this.encoder = encoder;
        this.subSequenceLength = subSequenceLength;
        this.readFrame = readFrame;
    }

    /**
     * Creates a searcher that takes its encoder and sub-sequence length from the data bank:
     * the mask encoder's pattern length when the data bank has a mask encoder, otherwise the
     * data bank's own sub-sequence length.
     */
    public IndexSearcher(long id, SearchParams sp, IndexedSequenceDataBank databank, String sliceQuery, int offset,
            SymbolList fullQuery, int[] encodedQuery, List<RetrievedArea>[] retrievedAreas, Statistics statistics,
            CountDownLatch countDown, List<Throwable> fails, int readFrame) {
        this(id, sp, databank, databank.getEncoder(),
                databank.getMaskEncoder() == null
                        ? databank.getSubSequenceLength()
                        : databank.getMaskEncoder().getPatternLength(),
                sliceQuery, offset, fullQuery, encodedQuery, retrievedAreas, statistics, countDown, fails,
                readFrame);
    }

    /**
     * @return the searcher id followed by the marker {@code " (direct) "}.
     */
    @Override
    public String toString() {
        return Long.toString(id) + " (direct) ";
    }

    /**
     * Runs the index search for the query slice.
     *
     * <p>
     * Slices shorter than {@code subSequenceLength} are skipped with a log message. Otherwise
     * the slice is encoded, looked up in the index, and the retrieved areas are merged into
     * the shared array unless the retrieved data already wrote into that very array. Failures
     * are recorded in {@code fails}; the latch is counted down in every case.
     */
    @Override
    public void run() {
        try {
            int queryLength = sliceQuery.length();
            if (queryLength < subSequenceLength) {
                logger.info("Sequence: \"" + sliceQuery + "\" is too short. Its length is " + queryLength
                        + " but should to be at least " + subSequenceLength + ".");
                return;
            }

            logger.info("[" + this.toString() + "] Begining the search at " + databank.getName()
                    + " with the sequence with " + sliceQuery.length() + " bases and min subSequenceLength >= "
                    + this.sp.getMinHspLength());

            int[] iess = getEncodedSubSequences(sliceQuery, databank.getMaskEncoder());

            long init = System.currentTimeMillis();
            IndexRetrievedData retrievedData = getIndexPositions(iess, offset);
            retrievedData.finish();

            List<RetrievedArea>[] retrievedAreasArray = retrievedData.getRetrievedAreasArray();
            // When the retrieved data used the shared array itself there is nothing to merge.
            if (this.retrievedAreas != retrievedAreasArray) {
                mergeIntoSharedAreas(retrievedAreasArray);
            }

            logger.info("[" + this.toString() + "] Index search time:" + (System.currentTimeMillis() - init)
                    + " with " + retrievedData.hits + " hits.");
        } catch (Throwable t) {
            // Recorded rather than rethrown so the failure reaches whoever reads the list.
            fails.add(t);
        } finally {
            countDown.countDown();
        }
    }

    /**
     * Merges the areas retrieved by this searcher into the shared {@code retrievedAreas} array.
     *
     * <p>
     * For each index, a missing shared list is replaced by the local list. Otherwise every new
     * area is offered to every existing area through {@code testAndSet}; a new area is appended
     * exactly once, and only when no existing area absorbed it. (The previous implementation
     * appended the new area once per existing area that rejected it, which produced duplicates
     * and re-added areas that another existing area had already absorbed.)
     *
     * @param localAreasArray areas retrieved by this searcher, indexed like {@code retrievedAreas}.
     */
    private void mergeIntoSharedAreas(List<RetrievedArea>[] localAreasArray) {
        final int length = localAreasArray.length;
        for (int i = 0; i < length; i++) {
            List<RetrievedArea> localRetrievedAreas = localAreasArray[i];
            // TODO LOCK HERE BY THE "I"
            if (localRetrievedAreas == null) {
                continue;
            }

            List<RetrievedArea> retrievedAreasList = retrievedAreas[i];
            // LOCK HERE
            if (retrievedAreasList == null) {
                retrievedAreas[i] = localRetrievedAreas;
                continue;
            }

            List<RetrievedArea> toAdd = Lists.newArrayList();
            for (RetrievedArea newArea : localRetrievedAreas) {
                boolean absorbed = false;
                // Deliberately no early exit: testAndSet is still invoked on every existing
                // area, as before, so each existing area sees the same sequence of calls.
                for (RetrievedArea existingArea : retrievedAreasList) {
                    if (existingArea.testAndSet(newArea.getQueryAreaBegin(), newArea.getSequenceAreaBegin(),
                            sp.getMaxSubSequencesDistance(), subSequenceLength)) {
                        absorbed = true;
                    }
                }
                if (!absorbed) {
                    toAdd.add(newArea);
                }
            }
            retrievedAreasList.addAll(toAdd);
        }
    }

    /**
     * Looks up every encoded sub-sequence in the index and gathers the matches.
     *
     * <p>
     * When the slice has the same length as the full query, the shared {@code retrievedAreas}
     * array is handed to the {@link IndexRetrievedData}; otherwise it is created without it.
     *
     * @param iess encoded sub-sequences, one per window of the slice.
     * @param offset value added to the window index to obtain the query position.
     * @return the data gathered from the index.
     */
    private IndexRetrievedData getIndexPositions(final int[] iess, final int offset)
            throws ValueOutOfBoundsException, IOException {
        IndexRetrievedData retrievedData;
        if (fullQuery.getLength() == sliceQuery.length()) {
            retrievedData = new IndexRetrievedData(databank.getNumberOfSequences(), sp, subSequenceLength, this,
                    this.retrievedAreas);
        } else {
            retrievedData = new IndexRetrievedData(databank.getNumberOfSequences(), sp, subSequenceLength, this);
        }

        for (int ss = 0; ss < iess.length; ss++) {
            retrieveIndexPosition(iess[ss], retrievedData, ss + offset);
        }
        return retrievedData;
    }

    /**
     * Adds to {@code retrievedData} every index position matching one encoded sub-sequence.
     */
    private void retrieveIndexPosition(int encodedSubSequence, IndexRetrievedData retrievedData, int queryPos)
            throws ValueOutOfBoundsException, IOException {
        final long[] indexPositions = databank.getMatchingSubSequence(encodedSubSequence);
        for (int i = 0; i < indexPositions.length; i++) {
            retrievedData.addSubSequenceInfoIntRepresention(queryPos, indexPositions[i]);
        }
    }

    /**
     * Encodes every window of {@code subSequenceLength} characters of the query with the
     * sequence encoder.
     *
     * @param querySequence query text; callers ensure it has at least {@code subSequenceLength} characters.
     * @return one encoded value per window, in query order.
     */
    private int[] getEncodedSubSequences(String querySequence) {
        int size = querySequence.length() - (subSequenceLength - 1);
        int[] iess = new int[size];
        for (int i = 0; i < size; i++) {
            String subSequence = querySequence.substring(i, i + subSequenceLength);
            iess[i] = encoder.encodeSubSequenceToInteger(subSequence);
        }
        return iess;
    }

    /**
     * Encodes every window of the query, applying the mask encoder when one is given and
     * falling back to the plain sequence encoder when {@code maskEncoder} is {@code null}.
     *
     * @param querySequence query text.
     * @param maskEncoder mask encoder of the data bank, or {@code null}.
     * @return one encoded value per window, in query order.
     */
    private int[] getEncodedSubSequences(String querySequence, MaskEncoder maskEncoder) {
        if (maskEncoder == null) {
            return getEncodedSubSequences(querySequence);
        }

        int patternLength = maskEncoder.getPatternLength();
        int size = querySequence.length() - (patternLength - 1);
        int[] iess = new int[size];
        for (int i = 0; i < size; i++) {
            iess[i] = maskEncoder.applyMask(i, i + patternLength, querySequence);
        }
        return iess;
    }

    /** @return the complete query. */
    public SymbolList getQuery() {
        return fullQuery;
    }

    /** @return the encoded query given at construction. */
    public int[] getEncodedQuery() {
        return encodedQuery;
    }

    /** @return the statistics object given at construction. */
    public Statistics getStatistics() {
        return statistics;
    }

    /** @return the data bank being searched. */
    public IndexedSequenceDataBank getDatabank() {
        return databank;
    }

    /** @return the search parameters. */
    public SearchParams getSearchParams() {
        return sp;
    }

    /** @return the read frame given at construction. */
    public int getReadFrame() {
        return readFrame;
    }

    /**
     * Builds an {@link HSP} from an extension result and its alignment.
     *
     * <p>
     * Positions are the segment begin of the extension plus the alignment's start/end. For a
     * {@link RemoteSimilaritySequenceDataBank} each position {@code p} is then mapped to
     * {@code (p - 1) * 3 + readFrame} (presumably translating positions of a 3-to-1 reduced
     * sequence back to the original coordinates - confirm against the data bank).
     *
     * @param extensionResult extension providing the begin of the query and target segments.
     * @param smithWaterman alignment providing start/end positions inside those segments.
     * @param normalizedScore normalized score passed through to the HSP.
     * @param evalue e-value passed through to the HSP.
     * @param queryLength not used by this implementation.
     * @param targetLength not used by this implementation.
     * @return the created HSP.
     */
    protected HSP createHSP(ExtendSequences extensionResult, GenoogleSequenceAlignment smithWaterman,
            double normalizedScore, double evalue, int queryLength, int targetLength) {
        int queryStart = getQueryStart(extensionResult, smithWaterman);
        int queryEnd = getQueryEnd(extensionResult, smithWaterman);
        int targetStart = getTargetStart(extensionResult, smithWaterman);
        int targetEnd = getTargetEnd(extensionResult, smithWaterman);

        if (databank instanceof RemoteSimilaritySequenceDataBank) {
            queryStart = ((queryStart - 1) * 3) + this.readFrame;
            queryEnd = ((queryEnd - 1) * 3) + this.readFrame;
            targetStart = ((targetStart - 1) * 3) + this.readFrame;
            targetEnd = ((targetEnd - 1) * 3) + this.readFrame;

            assert queryStart >= 1;
            assert queryEnd <= fullQuery.getLength() * 3;
            assert targetStart >= 1;
        }

        return new HSP(smithWaterman, queryStart, queryEnd, targetStart, targetEnd, normalizedScore, evalue);
    }

    /** @return begin of the query segment plus the alignment's query start. */
    private int getQueryStart(ExtendSequences extensionResult, GenoogleSequenceAlignment smithWaterman) {
        return extensionResult.getBeginQuerySegment() + smithWaterman.getQueryStart();
    }

    /** @return begin of the query segment plus the alignment's query end. */
    private int getQueryEnd(ExtendSequences extensionResult, GenoogleSequenceAlignment smithWaterman) {
        return extensionResult.getBeginQuerySegment() + smithWaterman.getQueryEnd();
    }

    /** @return begin of the target segment plus the alignment's target start. */
    private int getTargetStart(ExtendSequences extensionResult, GenoogleSequenceAlignment smithWaterman) {
        return extensionResult.getBeginTargetSegment() + smithWaterman.getTargetStart();
    }

    /** @return begin of the target segment plus the alignment's target end. */
    private int getTargetEnd(ExtendSequences extensionResult, GenoogleSequenceAlignment smithWaterman) {
        return extensionResult.getBeginTargetSegment() + smithWaterman.getTargetEnd();
    }
}