// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package tests.it.crs4.seal.common;

import it.crs4.seal.common.AlignOp;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.BamInputFormat;
import it.crs4.seal.common.BamInputFormat.BamRecordReader;
import it.crs4.seal.common.AbstractTaggedMapping;
import it.crs4.seal.common.Utils;

import org.junit.*;
import static org.junit.Assert.*;

import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.commons.codec.binary.Base64;
import org.seqdoop.hadoop_bam.FileVirtualSplit;

/**
 * Unit tests for {@link BamInputFormat} and its record reader.
 *
 * Each test writes a small base64-encoded BAM file to a temp location,
 * builds a (virtual) input split over it, and verifies what the
 * BamRecordReader produces.
 */
@Ignore("Bam input not ready yet")
public class TestBamInputFormat
{
	// A base64-encoded BAM file with one read.
	/*
	@HD	VN:1.0	GO:none	SO:coordinate
	@SQ	SN:chr6	LN:249250621
	ERR020229.100000/1	81	chr6	3558357	37	91M	=	3558678	-400
	AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA
	5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C
	XT:A:U	NM:i:0	SM:i:37	AM:i:0	X0:i:1	X1:i:0	XM:i:0	XO:i:0	XG:i:0	MD:Z:91
	*/
	public static final String oneRecord = "H4sIBAAAAAAA/wYAQkMCAGkAc3L0ZbRiYGBw8HDhDPOzMtQz4HT3t8rLz0vlDPa3Ss7PL0rJzEssSeVyCA7kDPazSs4oMuP08bMyMrE0MjUwMzLkYgTqZgVikAyDrfNtPgA/6Q/JUwAAAB+LCAQAAAAAAP8GAEJDAgDeAE2KQW7CMBBFhwV7cFzhRGPHMxISK5pkEYmlSVAXVVqVFilqL9ErEGXh43AYTsEhCI5gwdMf/a/Ru8Cd83sJ0ZKjCXzB3+P30pTwfx2G3X6fFVlRbNZ5NvKaw2kKIFed70XXOeWPW3F0cq5kL51XSnly0q+IqVaKnBCqm8mYKWamEB1zmlDAGEqMnqUpMkaW2ZKwC2RGNkHEUX4ildqgYWsJBWoMSI02qO2PO3w0FXw31dKFbrNq0uahx/0Z7q2Cpv7d5HAD4/H1G+4AAAAfiwgEAAAAAAD/BgBCQwIAGwADAAAAAAAAAAAA";

	// A base64-encoded BAM file with two reads forming one pair ("Read/1", "Read/2").
	/*
	@HD	VN:1.0	GO:none	SO:coordinate
	@SQ	SN:1	LN:249250621
	Read/2	153	1	10018	25	101M	=	10018	0
	CTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTTAACCTAACCCTAACCCTAACCCTTAACCTAACCCTAACCCTAACCCTAACCCAACCCTAACC
	##@@A;;4<2<=@.>.@7?5='?B;@B>EGEADAD?@<.>5@B<A0>>>:=>EE@EF@BGEF:ECFEAEDEEEDE>:=>:9AFEFDGFHHHHDHFHFHHHH
	Read/1	117	1	10018	0	*	=	10018	0
	TTAGGGCTAGGGCTAGGGCTAGGGCTAGGGTTAGGGTTAGGGCTAGGGCTAGGGCTAGGGCTAGGGCTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGG
	##########DD=A2DBD><D:??.@D8DF>FBDFGDFFHHHADFFHEEFEEC=AFFCFF9GFEGECBGEFHHHGHHHHEHGHEHHHHGHHHHGHHGHHFH
	*/
	public static final String twoRecords = "H4sIBAAAAAAA/wYAQkMCAGQAc3L0ZTRnYGBw8HDhDPOzMtQz4HT3t8rLz0vlDPa3Ss7PL0rJzEssSeVyCA7kDAYq4PTxszIysTQyNTAzMuRiBGplAmJDBlvn23wAVOTHhU0AAAAfiwgEAAAAAAD/BgBCQwIA9wBtjz1OxDAQhYcUSxWhmAg28d/YM2OnQ3ACCprtVnsDJPYIHGC7vQKnoeBi2IFAin36rHl+08z7gh+FDHA97NQVfMBxlVUdjq9vD0+w3wBMXfxFLeak/sO172JQ8aSwabzHcbzVnTa+ta2/c73ZuDD6YDkxEpLzurW9DxpvrLWDscyexYfEMnAURibm8upq2KKwUJJcRFnybD7XNWCnAN4v13iEc/8yrTjPTJfChefmT0QGOwpkNQ3OtZ7uSawEkkRSD8E6mIU5GhSJItskpWWsXco+1WO5DM7LL81I/gZTG9aYigEAAB+LCAQAAAAAAP8GAEJDAgAbAAMAAAAAAAAAAAA=";

	private Configuration conf;
	private TaskAttemptContext context;
	private InputSplit split;
	private File tempFile;           // temp .bam file each test writes its data to
	private LongWritable key;
	private ReadPair pair;
	private BamInputFormat format;

	/** Creates a fresh temp file, configuration and input format for each test. */
	@Before
	public void setup() throws IOException
	{
		tempFile = File.createTempFile("test_bam_input_format", ".bam");
		conf = new Configuration();
		key = null;
		pair = null;
		format = new BamInputFormat();
		context = Utils.getTaskAttemptContext(conf);
	}

	/** Removes the temp file created by {@link #setup}. */
	@After
	public void tearDown()
	{
		tempFile.delete();
	}

	/**
	 * Decodes the given base64 text and writes the resulting bytes to tempFile.
	 *
	 * @param b64 base64-encoded BAM data
	 * @return the number of bytes written
	 */
	private long writeToTemp(String b64) throws IOException
	{
		// Below, we use Base64.decodeBase64(byte[]) for compatibility with apache commons 1.3,
		// which is bundled with Hadoop 0.20.  Decode through an explicit charset so the
		// result never depends on the platform's default encoding (base64 is pure ASCII).
		byte[] data = Base64.decodeBase64(b64.getBytes("US-ASCII"));

		BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile));
		try {
			out.write(data);
		}
		finally {
			out.close(); // close even if write() throws, so we don't leak the stream
		}
		return data.length;
	}

	/**
	 * Wraps a plain FileSplit into the single FileVirtualSplit that
	 * BamInputFormat generates for it.
	 */
	private FileVirtualSplit makeVirtualSplit(FileSplit fsplit) throws IOException
	{
		List<InputSplit> list = new ArrayList<InputSplit>(1);
		list.add(fsplit);
		list = format.getVirtualSplits(list, conf);
		assertEquals(1, list.size());
		return (FileVirtualSplit)list.get(0);
	}

	/** Convenience overload: reader over the entire file. */
	private BamRecordReader createReaderForData(String b64Data) throws IOException, InterruptedException
	{
		return createReaderForData(b64Data, 0, -1);
	}

	/**
	 * Writes the given base64 BAM data to the temp file and returns an
	 * initialized record reader over the byte range [splitStart, splitEnd).
	 *
	 * @param splitEnd end of the split; a negative value means "end of file"
	 */
	private BamRecordReader createReaderForData(String b64Data, long splitStart, long splitEnd)
	    throws IOException, InterruptedException
	{
		long fileSize = writeToTemp(b64Data);
		if (splitEnd < 0)
			splitEnd = fileSize;

		FileSplit fsplit = new FileSplit(new Path(tempFile.toURI().toString()), splitStart, splitEnd, null);
		split = makeVirtualSplit(fsplit);
		BamRecordReader reader = (BamRecordReader)format.createRecordReader(split, context);
		reader.initialize(split, context);
		return reader;
	}

	@Test
	public void testReadFromStart() throws IOException, NoSuchFieldException, InterruptedException
	{
		BamRecordReader reader = createReaderForData(oneRecord);
		assertEquals(0.0, reader.getProgress(), 0.01);

		boolean retval = reader.nextKeyValue();
		assertTrue(retval);
		pair = reader.getCurrentValue();
		AbstractTaggedMapping map = pair.getRead1();
		assertNotNull(map);
		assertNull(pair.getRead2());

		// test that the record has been read correctly
		assertEquals("ERR020229.100000/1", map.getName());
		assertEquals(81, map.getFlag());
		assertEquals("chr6", map.getContig());
		assertEquals(3558357, map.get5Position());
		assertEquals(37, map.getMapQ());
		assertEquals("91M", AlignOp.cigarStr(map.getAlignment()));
		assertTrue(map.isTemplateLengthAvailable());
		// NOTE(review): TLEN in the record above is -400; this assertion expects the
		// magnitude -- confirm getTemplateLength() is documented to return |TLEN|.
		assertEquals(400, map.getTemplateLength());
		assertEquals("AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA", map.getSequenceString());
		assertEquals("5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C", map.getBaseQualitiesString());
		assertEquals(0, map.getIntTag("NM"));
		assertEquals(37, map.getIntTag("SM"));
		assertEquals(0, map.getIntTag("AM"));
		assertEquals(1, map.getIntTag("X0"));
		assertEquals(0, map.getIntTag("X1"));
		assertEquals(0, map.getIntTag("XM"));
		assertEquals(0, map.getIntTag("XO"));
		assertEquals(0, map.getIntTag("XG"));
		assertEquals("U", map.getTag("XT"));
		assertEquals("91", map.getTag("MD"));

		// This test on the progress is kind of arbitrary.  BamInputFormat doesn't report
		// very accurate progress due the underlying implementation in Hadoop-BAM.
		assertTrue(reader.getProgress() > 0.5);

		retval = reader.nextKeyValue();
		assertFalse(retval);
	}

	@Ignore("test doesn't work. Need appropriate split location")
	@Test
	public void testReadStartInMiddle() throws IOException, InterruptedException
	{
		BamRecordReader reader = createReaderForData(twoRecords, 102, -1);
		assertEquals(0.0, reader.getProgress(), 0.01);

		boolean retval = reader.nextKeyValue();
		assertTrue(retval);
		pair = reader.getCurrentValue();
		assertEquals("Read", pair.getName());
		assertNull("reader didn't skip the first record", pair.getRead2());
		AbstractTaggedMapping map = pair.getAnyRead();
		assertEquals("Read/1", map.getName());
		assertEquals(1.0, reader.getProgress(), 0.21);

		retval = reader.nextKeyValue();
		assertFalse(retval);
	}

	@Test
	public void testSliceEndsBeforeEndOfFile() throws IOException, InterruptedException
	{
		BamRecordReader reader = createReaderForData(twoRecords, 0, 370);
		boolean retval = reader.nextKeyValue();
		assertTrue(retval);
		assertFalse("BamRecordReader is reading a record that starts after the end of the slice", reader.nextKeyValue());
	}

	@Test
	public void testProgress() throws IOException, InterruptedException
	{
		// getProgress() in BamRecordReader isn't really accurate, at least on a small
		// scale such as the two records we're using for this unit test.
		BamRecordReader reader = createReaderForData(twoRecords);
		assertEquals(0.0, reader.getProgress(), 0.01);
		reader.nextKeyValue();
		//assertEquals(0.5, reader.getProgress(), 0.2);
		reader.nextKeyValue();
		assertEquals(1.0, reader.getProgress(), 0.21);
	}

	@Test
	public void testClose() throws IOException, InterruptedException
	{
		BamRecordReader reader = createReaderForData(oneRecord);
		// doesn't really do anything but exercise the code
		reader.close();
	}

	public static void main(String args[]) {
		org.junit.runner.JUnitCore.main(TestBamInputFormat.class.getName());
	}
}