/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools.reference;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.StringUtil;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
/**
* Test the indexed fasta sequence file reader.
*/
public class IndexedFastaSequenceFileTest{
private static File TEST_DATA_DIR = new File("testdata/htsjdk/samtools/reference");
private static File SEQUENCE_FILE = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta");
private static File SEQUENCE_FILE_NODICT = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.nodict.fasta");
private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" +
"TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG" +
"GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT";
private final String lastBasesOfChr20 = "ttgtctgatgctcatattgt";
private final int CHR20_LENGTH = 1000000;
@DataProvider(name="homosapiens")
public Object[][] provideSequenceFile() throws FileNotFoundException {
return new Object[][] { new Object[]
{ new IndexedFastaSequenceFile(SEQUENCE_FILE) },
{ new IndexedFastaSequenceFile(SEQUENCE_FILE_NODICT) }};
}
@DataProvider(name="comparative")
public Object[][] provideOriginalAndNewReaders() throws FileNotFoundException {
return new Object[][] {
new Object[] { ReferenceSequenceFileFactory.getReferenceSequenceFile(SEQUENCE_FILE),
new IndexedFastaSequenceFile(SEQUENCE_FILE) },
new Object[] { ReferenceSequenceFileFactory.getReferenceSequenceFile(SEQUENCE_FILE, true),
new IndexedFastaSequenceFile(SEQUENCE_FILE) },};
}
@Test(dataProvider="homosapiens")
public void testOpenFile(IndexedFastaSequenceFile sequenceFile) {
long startTime = System.currentTimeMillis();
Assert.assertNotNull(sequenceFile);
long endTime = System.currentTimeMillis();
CloserUtil.close(sequenceFile);
System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="homosapiens")
public void testFirstSequence(IndexedFastaSequenceFile sequenceFile) {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,firstBasesOfChrM.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),firstBasesOfChrM,"First n bases of chrM are incorrect");
CloserUtil.close(sequenceFile);
System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="homosapiens")
public void testFirstSequenceExtended(IndexedFastaSequenceFile sequenceFile) {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,extendedBasesOfChrM.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),extendedBasesOfChrM,"First n bases of chrM are incorrect");
CloserUtil.close(sequenceFile);
System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="homosapiens")
public void testReadStartingInCenterOfFirstLine(IndexedFastaSequenceFile sequenceFile) {
final int bytesToChopOff = 5;
String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",
bytesToChopOff + 1,
bytesToChopOff + truncated.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),truncated,"First n bases of chrM are incorrect");
CloserUtil.close(sequenceFile);
System.err.printf("testReadStartingInCenterOfFirstLine runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="homosapiens")
public void testReadStartingInCenterOfMiddleLine(IndexedFastaSequenceFile sequenceFile) {
final int bytesToChopOff = 120;
String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",
bytesToChopOff + 1,
bytesToChopOff + truncated.length());
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),truncated,"First n bases of chrM are incorrect");
CloserUtil.close(sequenceFile);
System.err.printf("testReadStartingInCenterOfMiddleLine runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="comparative")
public void testFirstCompleteContigRead(ReferenceSequenceFile originalSequenceFile, IndexedFastaSequenceFile sequenceFile) {
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSequence("chrM");
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chrM is incorrect");
CloserUtil.close(originalSequenceFile);
CloserUtil.close(sequenceFile);
System.err.printf("testFirstCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="homosapiens",expectedExceptions=SAMException.class)
public void testReadThroughEndOfContig(IndexedFastaSequenceFile sequenceFile) {
long startTime = System.currentTimeMillis();
try {
sequenceFile.getSubsequenceAt("chrM",16500,16600);
}
finally {
long endTime = System.currentTimeMillis();
CloserUtil.close(sequenceFile);
System.err.printf("testReadThroughEndOfContig runtime: %dms%n", (endTime - startTime)) ;
}
}
@Test(dataProvider="homosapiens",expectedExceptions=SAMException.class)
public void testReadPastEndOfContig(IndexedFastaSequenceFile sequenceFile) {
long startTime = System.currentTimeMillis();
try {
sequenceFile.getSubsequenceAt("chrM",16800,16900);
}
finally {
long endTime = System.currentTimeMillis();
CloserUtil.close(sequenceFile);
System.err.printf("testReadPastEndOfContig runtime: %dms%n", (endTime - startTime)) ;
}
}
@Test(dataProvider="comparative")
public void testLastCompleteContigRead(ReferenceSequenceFile originalSequenceFile, IndexedFastaSequenceFile sequenceFile) {
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
while( !expectedSequence.getName().equals("chr20") )
expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSequence("chr20");
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chrX_random is incorrect");
CloserUtil.close(originalSequenceFile);
CloserUtil.close(sequenceFile);
System.err.printf("testLastCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="homosapiens")
public void testLastOfChr20(IndexedFastaSequenceFile sequenceFile) {
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr20",
CHR20_LENGTH - lastBasesOfChr20.length()+1,
CHR20_LENGTH);
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),lastBasesOfChr20,"First n bases of chr1 are incorrect");
CloserUtil.close(sequenceFile);
System.err.printf("testFirstOfChr1 runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="comparative")
public void testFirstElementOfIterator(ReferenceSequenceFile originalSequenceFile,IndexedFastaSequenceFile sequenceFile) {
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.nextSequence();
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(), "chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(), 0,"Sequence contig index is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chrM is incorrect");
CloserUtil.close(originalSequenceFile);
CloserUtil.close(sequenceFile);
System.err.printf("testFirstElementOfIterator runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="comparative")
public void testNextElementOfIterator(ReferenceSequenceFile originalSequenceFile, IndexedFastaSequenceFile sequenceFile) {
// Skip past the first one and load the second one.
originalSequenceFile.nextSequence();
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
sequenceFile.nextSequence();
ReferenceSequence sequence = sequenceFile.nextSequence();
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct");
Assert.assertEquals(sequence.length(),expectedSequence.length(),"Sequence size is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chr1 is incorrect");
CloserUtil.close(originalSequenceFile);
CloserUtil.close(sequenceFile);
System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ;
}
@Test(dataProvider="comparative")
public void testReset(ReferenceSequenceFile originalSequenceFile, IndexedFastaSequenceFile sequenceFile) {
// Skip past the first one and load the second one.
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
sequenceFile.nextSequence();
sequenceFile.nextSequence();
sequenceFile.reset();
ReferenceSequence sequence = sequenceFile.nextSequence();
long endTime = System.currentTimeMillis();
Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct");
Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct");
Assert.assertEquals(sequence.length(),expectedSequence.length(), "Sequence size is not correct");
Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chrM is incorrect");
CloserUtil.close(originalSequenceFile);
CloserUtil.close(sequenceFile);
System.err.printf("testReset runtime: %dms%n", (endTime - startTime)) ;
}
@Test(expectedExceptions = FileNotFoundException.class)
public void testMissingFile() throws Exception {
new IndexedFastaSequenceFile(new File(TEST_DATA_DIR, "non-existent.fasta"));
Assert.fail("FileNotFoundException should have been thrown");
}
}