/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ package bio.pih.genoogle.tests.index; import java.io.File; import java.io.IOException; import junit.framework.TestCase; import bio.pih.genoogle.encoder.SequenceEncoder; import bio.pih.genoogle.encoder.SequenceEncoderFactory; import bio.pih.genoogle.index.IndexConstructionException; import bio.pih.genoogle.index.MemoryInvertedIndex; import bio.pih.genoogle.index.SubSequenceIndexInfo; import bio.pih.genoogle.index.ValueOutOfBoundsException; import bio.pih.genoogle.index.builder.InvertedIndexBuilder; import bio.pih.genoogle.io.IndexedSequenceDataBank; import bio.pih.genoogle.seq.DNAAlphabet; import bio.pih.genoogle.seq.IllegalSymbolException; import bio.pih.genoogle.seq.LightweightSymbolList; import bio.pih.genoogle.seq.Sequence; import bio.pih.genoogle.seq.SymbolList; import bio.pih.genoogle.util.SymbolListWindowIterator; import bio.pih.genoogle.util.SymbolListWindowIteratorFactory; /** * Tests for the {@link MemorySubSequencesInvertedIndex} * * @author albrecht */ public class SubSequencesArrayIndexTest_8 extends TestCase { private static final String MASK = "11111111"; private static final int SUB_SEQUENCE_LENGTH = MASK.length(); private static final SequenceEncoder ENCODER = SequenceEncoderFactory.getEncoder(DNAAlphabet.SINGLETON, MASK.length()); IndexedSequenceDataBank dataBank; SequenceEncoder encoder; @Override protected void setUp() throws Exception { this.dataBank = new IndexedSequenceDataBank("TestDB", DNAAlphabet.SINGLETON, SUB_SEQUENCE_LENGTH, ENCODER, MASK, File.createTempFile(this.getName(), ".tmp"), null); encoder = SequenceEncoderFactory.getEncoder(DNAAlphabet.SINGLETON, 8); } @Override protected void tearDown() { this.dataBank = null; } private void populateNonSoRandomSequences(IndexedSequenceDataBank dataBank) throws IllegalSymbolException, IOException, IndexConstructionException { InvertedIndexBuilder indexBuilder = new InvertedIndexBuilder(dataBank); indexBuilder.constructIndex(); String stringSequence = "CATGACTGGCATCAGTGCATGCATGCAGTCAGTATATATGACGC"; Sequence ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 1"); indexBuilder.addSequence(1, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "ACATGCTCGATGTGTGTGTATCAGTACTGACCTAGCATGACTCAGTACACATGACGTCATCATGTAGCGTCTAGACTGACTACGTACGACTGCATACGACTATCAGACTGACTACGCATGACGTACGTGTACGTACTGATGACGTACTATCGTAGCATGACTACGTACGACTGAC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 1"); indexBuilder.addSequence(2, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "ATGCTAGCATTCAGTACGTACGCATGATGCTAGATCGCATGACTAGCACGTACTGCATCGTGTGTGTCATGTGACTGAC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 2"); indexBuilder.addSequence(3, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 3"); indexBuilder.addSequence(4, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 4"); indexBuilder.addSequence(5, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 5"); indexBuilder.addSequence(6, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 6"); indexBuilder.addSequence(7, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCAACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 7"); indexBuilder.addSequence(8, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "ATCTGAGTCATGCGATCAGTGTTGGTCATGTCAGGTCAGTACTACGTAGCATGCATGCATACGATCGACTATATTGCATGAC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 8"); indexBuilder.addSequence(9, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "AAAAAAACAAAAAAAGAAAAAAATTTTTTTGCATCAGATTTTTTTTCAGTACTGCATGACTACTGTGAC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 9"); indexBuilder.addSequence(10, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "TGCAGTACGTACGTGTTGAGTGCTATGCATGTTTAGGCGCGGCGCTAGCATGCATCAGACGCATACGTGTACGTACGTACTGATTCAGACTGAC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 10"); indexBuilder.addSequence(11, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "ACGTAGCTTACTATTGATATGAGTCGTGACGACTGACTACGTACGTACGACTGACTACGTATCGTCAGCTGCGTCATGCATTACTGACTGACTGAGTCTGATCATGACTTGACTGACTGACTGGTACTACGTGTACTACGTGTACTACGTAGCTACGACGTACGTACTGGTACTGACTGACGTGTACGCTAGCATGCATCGATGACGTACGTGATCTACTGACTGTACTGACTGGTACGACTACGTACGACTGACTGACTGACTACGATGCTGACTGACGTTGACGTACTGAC"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "Sequence 11"); indexBuilder.addSequence(12, encoder.encodeSymbolListToIntegerArray(ss)); stringSequence = "GGTTAATAAACGCAACGACAGTAATCCCCCGCTGCCATAGTGACAGACCGAGAGAAGCGAGCGGAGAAACCATAATATAATTTACCACTTACCTATTCATTTATCTACAGAAACAATGGACAACTCCGGCAAAGAAAAGGAGGCTATTCAGCTCATGGCTGAAGCCGACAAGAAAGTGAAGTCTTCCGGCTCTTTTTTAGGAGGAATGTTTGGAGGAAATCACAAAGTGGAGGAGGCTTGTGAGATGTACGCCAGAGCCGCCAACATGTTCAAAATGGCCAAGAACTGGAGTGCTGCAGGCAATGCTTTCTGTCAGGCAGCCAGAATTCATATGCAGCTTCAGAATAAACACGATTCTGCCACCAGCTACGTTGATGCTGGAAACGCCTTCAAGAAAGCAGATCCCAAGAGGCTATCAAGTGCTTAAACGCAGCAATTGATATTTACACAGACATGGTAAGATGTTTTTGTAGCTGTCAAAATCATATAATGTTGAGCCAGGCTGTTCTATTCCTGTACTGTGTTTGATCTGTGAACATTTTAAACGGCTACACA"; ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "NM_001045156.1"); indexBuilder.addSequence(13, encoder.encodeSymbolListToIntegerArray(ss)); indexBuilder.finishConstruction(); MemoryInvertedIndex index = dataBank.getIndex(); index.loadFromFile(); } public void testIfFindSubSequences() throws IllegalSymbolException, ValueOutOfBoundsException, IOException, IndexConstructionException { populateNonSoRandomSequences(dataBank); MemoryInvertedIndex index = dataBank.getIndex(); long[] matchingSubSequence = index.getMatchingSubSequence(LightweightSymbolList.createDNA("AAAAAAAA")); assertEquals(8, matchingSubSequence.length); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[0])); assertEquals(0, SubSequenceIndexInfo.getStart(matchingSubSequence[0])); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[1])); assertEquals(8, SubSequenceIndexInfo.getStart(matchingSubSequence[1])); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[2])); assertEquals(16, SubSequenceIndexInfo.getStart(matchingSubSequence[2])); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[3])); assertEquals(24, SubSequenceIndexInfo.getStart(matchingSubSequence[3])); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[4])); assertEquals(32, SubSequenceIndexInfo.getStart(matchingSubSequence[4])); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[5])); assertEquals(40, SubSequenceIndexInfo.getStart(matchingSubSequence[5])); assertEquals(4, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[6])); assertEquals(48, SubSequenceIndexInfo.getStart(matchingSubSequence[6])); matchingSubSequence = index.getMatchingSubSequence(LightweightSymbolList.createDNA("GCATGCAT")); assertEquals(2, matchingSubSequence.length); assertEquals(1, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[0])); assertEquals(16, SubSequenceIndexInfo.getStart(matchingSubSequence[0])); assertEquals(9, SubSequenceIndexInfo.getSequenceId(matchingSubSequence[1])); assertEquals(48, SubSequenceIndexInfo.getStart(matchingSubSequence[1])); String stringSequence = "GGTTAATAAACGCAACGACAGTAATCCCCCGCTGCCATAGTGACAGACCGAGAGAAGCGAGCGGAGAAACCATAATATAATTTACCACTTACCTATTCATTTATCTACAGAAACAATGGACAACTCCGGCAAAGAAAAGGAGGCTATTCAGCTCATGGCTGAAGCCGACAAGAAAGTGAAGTCTTCCGGCTCTTTTTTAGGAGGAATGTTTGGAGGAAATCACAAAGTGGAGGAGGCTTGTGAGATGTACGCCAGAGCCGCCAACATGTTCAAAATGGCCAAGAACTGGAGTGCTGCAGGCAATGCTTTCTGTCAGGCAGCCAGAATTCATATGCAGCTTCAGAATAAACACGATTCTGCCACCAGCTACGTTGATGCTGGAAACGCCTTCAAGAAAGCAGATCCCAAGAGGCTATCAAGTGCTTAAACGCAGCAATTGATATTTACACAGACATGGTAAGATGTTTTTGTAGCTGTCAAAATCATATAATGTTGAGCCAGGCTGTTCTATTCCTGTACTGTGTTTGATCTGTGAACATTTTAAACGGCTACACA"; Sequence ss = new Sequence(DNAAlphabet.SINGLETON, stringSequence, "NM_001045156.1"); SymbolListWindowIterator iterator = SymbolListWindowIteratorFactory.getNotOverlappedFactory().newSymbolListWindowIterator( ss, 8); int pos = 0; while (iterator.hasNext()) { SymbolList symbolList = iterator.next(); int encodedSubSequence = encoder.encodeSubSequenceToInteger(symbolList); matchingSubSequence = index.getMatchingSubSequence(encodedSubSequence); assertTrue(matchingSubSequence.length > 0); int sequenceId = SubSequenceIndexInfo.getSequenceId(matchingSubSequence[0]); int start = SubSequenceIndexInfo.getStart(matchingSubSequence[0]); assertEquals(pos, start); assertEquals(13, sequenceId); pos += SUB_SEQUENCE_LENGTH; } } }