// Copyright (C) 2011-2012 CRS4. // // This file is part of Seal. // // Seal is free software: you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // Seal is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // // You should have received a copy of the GNU General Public License along // with Seal. If not, see <http://www.gnu.org/licenses/>. package tests.it.crs4.seal.common; import it.crs4.seal.common.AlignOp; import it.crs4.seal.common.ReadPair; import it.crs4.seal.common.SamInputFormat; import it.crs4.seal.common.SamInputFormat.SamRecordReader; import it.crs4.seal.common.AbstractTaggedMapping; import it.crs4.seal.common.Utils; import org.junit.*; import static org.junit.Assert.*; import java.io.BufferedOutputStream; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.nio.ByteBuffer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptID; public class TestSamInputFormat { public static final String oneRecord = "ERR020229.100000/1 81 chr6 3558357 37 91M = 3558678 400 AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA 5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:91"; public static final String twoRecords = "Read/2 153 1 10018 25 101M = 10018 0 CTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTTAACCTAACCCTAACCCTAACCCTTAACCTAACCCTAACCCTAACCCTAACCCAACCCTAACC ##@@A;;4<2<=@.>.@7?5='?B;@B>EGEADAD?@<.>5@B<A0>>>:=>EE@EF@BGEF:ECFEAEDEEEDE>:=>:9AFEFDGFHHHHDHFHFHHHH" + "\n" + "Read/1 117 1 10018 0 * = 10018 0 TTAGGGCTAGGGCTAGGGCTAGGGCTAGGGTTAGGGTTAGGGCTAGGGCTAGGGCTAGGGCTAGGGCTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGG ##########DD=A2DBD><D:??.@D8DF>FBDFGDFFHHHADFFHEEFEEC=AFFCFF9GFEGECBGEFHHHGHHHHEHGHEHHHHGHHHHGHHGHHFH"; public static final String unmapped = "UNMAPPED 93 * * 0 * = 3558678 * AGCTT 5:CB:"; private Configuration conf; private FileSplit split; private File tempFile; private File tempGz; private LongWritable key; private ReadPair pair; @Before public void setup() throws IOException { tempFile = File.createTempFile("test_sam_input_format", ".sam"); tempGz = File.createTempFile("test_sam_input_format", ".gz"); conf = new Configuration(); key = null; pair = null; } @After public void tearDown() { tempFile.delete(); tempGz.delete(); split = null; } private void writeToTemp(String s) throws IOException { PrintWriter out = new PrintWriter( new BufferedWriter( new FileWriter(tempFile) ) ); out.write(s); out.close(); } private SamRecordReader createReaderForOneRecord() throws IOException { writeToTemp(oneRecord); split = new FileSplit(new Path(tempFile.toURI().toString()), 0, oneRecord.length(), null); SamRecordReader reader = new SamRecordReader(); reader.initialize(split, Utils.getTaskAttemptContext(conf)); return reader; } @Test public void testReadFromStart() throws IOException, NoSuchFieldException { SamRecordReader reader = createReaderForOneRecord(); assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.nextKeyValue(); assertTrue(retval); assertEquals(new LongWritable(0), reader.getCurrentKey()); pair = reader.getCurrentValue(); AbstractTaggedMapping map = pair.getRead1(); assertNotNull(map); assertNull(pair.getRead2()); // test that the record has been read correctly assertEquals("ERR020229.100000/1", map.getName()); assertEquals(81, map.getFlag()); assertEquals("chr6", map.getContig()); assertEquals(3558357, map.get5Position()); assertEquals(37, map.getMapQ()); assertEquals("91M", AlignOp.cigarStr(map.getAlignment())); assertTrue(map.isTemplateLengthAvailable()); assertEquals(400, map.getTemplateLength()); ByteBuffer buffer = map.getSequence(); assertEquals("AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA", new String(buffer.array(), buffer.position(), (buffer.limit() - buffer.position()))); buffer = map.getBaseQualities(); assertEquals("5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C", new String(buffer.array(), buffer.position(), (buffer.limit() - buffer.position()))); assertEquals(0, map.getIntTag("NM")); assertEquals(37, map.getIntTag("SM")); assertEquals(0, map.getIntTag("AM")); assertEquals(1, map.getIntTag("X0")); assertEquals(0, map.getIntTag("X1")); assertEquals(0, map.getIntTag("XM")); assertEquals(0, map.getIntTag("XO")); assertEquals(0, map.getIntTag("XG")); assertEquals("U", map.getTag("XT")); assertEquals("91", map.getTag("MD")); assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.nextKeyValue(); assertFalse(retval); } @Test public void testReadStartInMiddle() throws IOException { writeToTemp(twoRecords); split = new FileSplit(new Path(tempFile.toURI().toString()), 10, twoRecords.length() - 10, null); SamRecordReader reader = new SamRecordReader(); reader.initialize(split, Utils.getTaskAttemptContext(conf)); assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.nextKeyValue(); assertTrue(retval); assertEquals(new LongWritable(241), reader.getCurrentKey()); pair = reader.getCurrentValue(); assertEquals("Read", pair.getName()); assertNull(pair.getRead2()); AbstractTaggedMapping map = pair.getAnyRead(); assertEquals("Read/1", map.getName()); assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.nextKeyValue(); assertFalse(retval); } @Test public void testSliceEndsBeforeEndOfFile() throws IOException { writeToTemp(twoRecords); // slice ends at position 10--i.e. somewhere in the first record. The second record should not be read. split = new FileSplit(new Path(tempFile.toURI().toString()), 0, 10, null); SamRecordReader reader = new SamRecordReader(); reader.initialize(split, Utils.getTaskAttemptContext(conf)); boolean retval = reader.nextKeyValue(); assertTrue(retval); assertEquals(new LongWritable(0), reader.getCurrentKey()); assertFalse("SamRecordReader is reading a record that starts after the end of the slice", reader.nextKeyValue()); } @Test public void testProgress() throws IOException { writeToTemp(twoRecords); split = new FileSplit(new Path(tempFile.toURI().toString()), 0, twoRecords.length(), null); SamRecordReader reader = new SamRecordReader(); reader.initialize(split, Utils.getTaskAttemptContext(conf)); assertEquals(0.0, reader.getProgress(), 0.01); reader.nextKeyValue(); assertEquals(0.5, reader.getProgress(), 0.01); reader.nextKeyValue(); assertEquals(1.0, reader.getProgress(), 0.01); } @Test public void testClose() throws IOException { SamRecordReader reader = createReaderForOneRecord(); // doesn't really do anything but exercise the code reader.close(); } @Test public void testGzCompressedInput() throws IOException { // write gzip-compressed data GzipCodec codec = new GzipCodec(); PrintWriter out = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); out.write(twoRecords); out.close(); // now try to read it split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoRecords.length(), null); SamRecordReader reader = new SamRecordReader(); reader.initialize(split, Utils.getTaskAttemptContext(conf)); boolean retval = reader.nextKeyValue(); assertTrue(retval); assertEquals("Read/2", reader.getCurrentValue().getAnyRead().getName()); retval = reader.nextKeyValue(); assertTrue(retval); assertEquals("Read/1", reader.getCurrentValue().getAnyRead().getName()); } @Test(expected=RuntimeException.class) public void testCompressedSplit() throws IOException { // write gzip-compressed data GzipCodec codec = new GzipCodec(); PrintWriter out = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); out.write(twoRecords); out.close(); // now try to read it starting from the middle SamInputFormat inputFormat = new SamInputFormat(); split = new FileSplit(new Path(tempGz.toURI().toString()), 10, twoRecords.length(), null); RecordReader<LongWritable, ReadPair> reader = inputFormat.createRecordReader(split, Utils.getTaskAttemptContext(conf)); } public static void main(String args[]) { org.junit.runner.JUnitCore.main(TestSamInputFormat.class.getName()); } }