package org.apache.hadoop.io.simpleseekableformat;

import java.io.OutputStream;

import junit.framework.Assert;
import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPrematureEOFException;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class TestSimpleSeekableFormatCodec extends TestCase {

  /**
   * Test the SimpleSeekableFormatCodec with TextInputFormat.
   */
  public void testTextInputFormat() throws Exception {
    testTextInputFormat(null);
    testTextInputFormat(GzipCodec.class);
  }

  public void testTextInputFormat(final Class<? extends CompressionCodec> codecClass)
      throws Exception {
    // Basic tests with no truncation.
    testTextInputFormat(codecClass, 1, 0, -1);
    testTextInputFormat(codecClass, 1000, 0, -1);
    testTextInputFormat(codecClass, 1000 * 1000, 0, -1);
    // Truncate the file at various positions.
    // SimpleSeekableFormat should read partial files without any exceptions.
    testTextInputFormat(codecClass, 1000 * 1000, 0, 0);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 11);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 1024);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 500 * 1000);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 1024 * 1024);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 1024 * 1024 + 100);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 1024 * 1024 + 1024);
    testTextInputFormat(codecClass, 1000 * 1000, 0, 1024 * 1024 + 50 * 1000);
    // Flush every 500 records.
    testTextInputFormat(codecClass, 1000 * 1000, 500, 500 * 1000);
    testTextInputFormat(codecClass, 1000 * 1000, 500, 1024 * 1024 + 50 * 1000);
  }

  String getRecord(int recordId) {
    return "" + recordId;
  }

  /**
   * @param truncateSize  a negative value means do not truncate the file.
   */
  @SuppressWarnings("deprecation")
  void testTextInputFormat(final Class<? extends CompressionCodec> codecClass,
      final int numRecords, final int flushRecords, final int truncateSize)
      throws Exception {

    System.out.println("STARTING: numRecords=" + numRecords
        + " flushRecords=" + flushRecords
        + " truncateSize=" + truncateSize);

    final String fileName = System.getProperty("user.dir") + "/test.ssf";
    Path path = new Path(fileName);
    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.getLocal(conf);

    // Set the codec class for compression inside the .ssf file.
    if (codecClass != null) {
      conf.setClass(SimpleSeekableFormat.FILEFORMAT_SSF_CODEC_CONF,
          codecClass, CompressionCodec.class);
    }

    // Write an SSF file to the FileSystem.
    {
      OutputStream fileOut = fs.create(path);
      if (truncateSize >= 0) {
        fileOut = new UtilsForTests.TruncatedOutputStream(fileOut, truncateSize);
      }
      SimpleSeekableFormatCodec codec =
          ReflectionUtils.newInstance(SimpleSeekableFormatCodec.class, conf);
      OutputStream dataOut = codec.createOutputStream(fileOut);
      // Try flush() with no data.
      dataOut.flush();
      for (int r = 0; r < numRecords; r++) {
        if (flushRecords > 0 && r % flushRecords == 0) {
          dataOut.flush();
        }
        dataOut.write((getRecord(r) + "\n").getBytes("UTF-8"));
      }
      dataOut.close();
    }

    // Check the file size.
    FileStatus fileStatus = fs.getFileStatus(path);

    // Make TextInputFormat recognize .ssf via the codec configuration.
    JobConf job = new JobConf();
    job.set("fs.default.name", "file:///");
    job.set("io.compression.codecs", SimpleSeekableFormatCodec.class.getName());
    TextInputFormat textInputFormat = new TextInputFormat();
    textInputFormat.configure(job);

    // Open the file using TextInputFormat.
    TextInputFormat.addInputPath(job, path);
    InputSplit[] splits = textInputFormat.getSplits(job, 1);
    Assert.assertEquals(1, splits.length);
    RecordReader<LongWritable, Text> recordReader =
        textInputFormat.getRecordReader(splits[0], job, Reporter.NULL);

    // Verify the data.
    LongWritable key = recordReader.createKey();
    Text value = recordReader.createValue();
    int correctRecords = 0;
    boolean hitException = false;
    for (int r = 0; r < numRecords; r++) {
      boolean gotData = false;
      try {
        gotData = recordReader.next(key, value);
      } catch (CodecPrematureEOFException e) {
        // Ignore this exception: gotData stays false and we treat it
        // as the end of the file.
        hitException = true;
        System.out.println("Hit CodecPrematureEOFException.");
      }
      if (truncateSize < 0) {
        // Assert only if the file is not truncated.
        Assert.assertTrue(gotData);
      }
      if (gotData) {
        Assert.assertEquals(getRecord(r), value.toString());
        correctRecords++;
      } else {
        // Failed to get more records.
        break;
      }
    }

    // Verify EOF.
    if (!hitException) {
      boolean eof = !recordReader.next(key, value);
      Assert.assertTrue(eof);
    }
    recordReader.close();

    // Print stats.
    System.out.println("DONE: File length=" + fileStatus.getLen()
        + " records=" + numRecords
        + " correctRecords=" + correctRecords
        + " truncateSize=" + truncateSize);

    // Clean up.
    fs.delete(path);
  }
}