/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.BufferedReader; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PushbackInputStream; import java.io.SequenceInputStream; import java.nio.ByteBuffer; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.charset.Charset; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.LinkedList; import java.util.StringTokenizer; import java.util.Vector; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.zip.CRC32; import java.util.zip.CheckedInputStream; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.crawl.common.shared.Constants; import org.commoncrawl.io.NIODataSink; import org.commoncrawl.protocol.shared.ArcFileHeaderItem; import org.commoncrawl.protocol.shared.ArcFileItem; import org.junit.Assert; import org.junit.Test; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.record.Buffer; import org.apache.hadoop.util.StringUtils; /** * Decompresses ARC file and returns documents as individual ArcFileItems * * @author rana * */ public final class ArcFileReader extends InflaterInputStream implements NIODataSink { // //////////////////////////////////////////////////////////////////////////////// // data members // //////////////////////////////////////////////////////////////////////////////// /** logging **/ private static final Log LOG = LogFactory.getLog(ArcFileReader.class); /** internal ByteBuffer wrapper for queuing byte buffers **/ private static final class BufferItem { public BufferItem(ByteBuffer bufferItem) { _buffer = bufferItem; } public ByteBuffer _buffer; }; /** * helper dummy stream used to get around some stupid design decisions in * InflaterInputStream */ private static InputStream _dummyStream = new InputStream() { @Override public int read() throws IOException { return 0; } }; /** blocking consumer queue **/ private LinkedBlockingQueue<BufferItem> _consumerQueue = null; /** 32 bit crc **/ private CRC32 _crc = new CRC32(); /** flag indicating that this arc file has a header item **/ private boolean _hasHeaderItem = true; /** the arc file header , when available **/ private String _arcFileHeader = null; /** End Of Stream Indicator **/ private boolean _eosReached = false; /** block size used for various operations **/ public static final int DEFAULT_BLOCK_SIZE = 32 * 1024; /** default buffer queue size **/ public static final int DEFAULT_BUFFER_QUEUE_SIZE = 1024; /** default timeout value **/ public static final int DEFAULT_TIMEOUT_VALUE = -1; /** allocation block size **/ private static int _blockSize = DEFAULT_BLOCK_SIZE; /** buffer queue size **/ private static int _bufferQueueSize = DEFAULT_BUFFER_QUEUE_SIZE; /** timeout value for blocking input stream operations **/ private static int _ioTimeoutValue = DEFAULT_TIMEOUT_VALUE; /** stream pos variable **/ private int _streamPos = 0; // //////////////////////////////////////////////////////////////////////////////// // public API // //////////////////////////////////////////////////////////////////////////////// /** * Costructs a new ArcFileReader object with specified block size (for * allocations) */ public ArcFileReader() { super(_dummyStream, new Inflater(true), _blockSize); // set up buffer queue ... _consumerQueue = new LinkedBlockingQueue<BufferItem>(_bufferQueueSize); // setup the proper stream... super.in = new PushbackInputStream(new InputStream() { ByteBuffer _activeBuffer = null; byte oneByteArray[] = new byte[1]; @Override public int read() throws IOException { if (read(oneByteArray, 0, 1) != -1) { return oneByteArray[0] & 0xff; } return -1; } @Override public int read(byte b[], int off, int len) throws IOException { if (_activeBuffer == null || _activeBuffer.remaining() == 0) { BufferItem nextItem = null; try { // when io timeout is not specified, block indefinitely... if (_ioTimeoutValue == -1) { nextItem = _consumerQueue.take(); } // otherwise wait for specified time on io else { nextItem = _consumerQueue.poll(_ioTimeoutValue, TimeUnit.MILLISECONDS); if (nextItem == null) { throw new IOException("IO Timeout waiting for Buffer"); } } } catch (InterruptedException e) { throw new IOException("Thread Interrupted waiting for Buffer"); } if (nextItem._buffer == null) { _eosReached = true; // EOF CONDITION ... return -1; } else { _activeBuffer = nextItem._buffer; } } final int sizeAvailable = _activeBuffer.remaining(); final int readSize = Math.min(sizeAvailable, len); _activeBuffer.get(b, off, readSize); _streamPos += readSize; return readSize; } }, _blockSize); } /** set the default block size **/ public static void setBlockSize(int blockSize) { _blockSize = blockSize; } /** set the buffer queue size **/ public static void setBufferQueueSize(int bufferQueueSize) { _bufferQueueSize = bufferQueueSize; } /** set the default timeout value for blocking io operations **/ public static void setIOTimeoutValue(int timeoutInMilliseconds) { _ioTimeoutValue = timeoutInMilliseconds; } /** indicate whether this arc file has a header item **/ public void setArcFileHasHeaderItemFlag(boolean value) { _hasHeaderItem = value; } /** * Reset all interal variables and get the Reader ready to process a new * ArcFile */ public void resetState() { _arcFileHeader = null; _consumerQueue.clear(); _crc.reset(); _eosReached = false; resetInflater(); } /** * Checks to see if additional ArcFileItems can be extracted from the current * ARC File Stream This is a BLOCKING CALL - it will block on * _consumerQueue.take if no data is available... * * @return true if another ArcFileItem can be extracted from the stream. * @throws IOException * if an error occurs processing ARC file data */ public boolean hasMoreItems() throws IOException { if (_arcFileHeader == null && _hasHeaderItem) { readARCHeader(); } return readHeader(); } /** * blocking call to retrieve next ArcFileItem contained within an ARC File * * @return Fully constructed ArcFileItem * @throws IOException */ public void getNextItem(ArcFileItem itemOut) throws IOException { // preserve incoming arc file name ... String arcFileName = itemOut.getArcFileName(); // reset item itemOut.clear(); // restore arc file name itemOut.setArcFileName(arcFileName); // read content _crc.reset(); // and reset inflater resetInflater(); // set the arc file stream positon up front itemOut.setArcFilePos(getARCFileStreamPos()); ArcFileBuilder builder = new ArcFileBuilder(itemOut); // read header line buffer for (;;) { byte scanBuffer[] = new byte[_blockSize]; ByteBuffer byteBuffer = ByteBuffer.wrap(scanBuffer); // read up to scan buffer size of data ... int readAmount = read(scanBuffer, 0, scanBuffer.length); if (readAmount != -1) { // update crc calculation _crc.update(scanBuffer, 0, readAmount); // and limit byte buffer ... byteBuffer.limit(readAmount); // and then input data input builder builder.inputData(byteBuffer); } else { // validate crc and header length ... readTrailer(); builder.finish(); // set the compressed content size ... itemOut.setArcFileSize(getARCFileStreamPos() - itemOut.getArcFilePos()); return; } } } /** * NIODataSink method - called by implementor when all ARC File data has been * exhauseted * */ public void finished() { try { _consumerQueue.put(new BufferItem(null)); } catch (InterruptedException e) { } } /** * NIODataSink method - called by the implementor to queue up compressed ARC * File data for processing */ public void available(ByteBuffer availableReadBuffer) { try { _consumerQueue.put(new BufferItem(availableReadBuffer)); } catch (InterruptedException e) { } } // //////////////////////////////////////////////////////////////////////////////// // internal helpers // //////////////////////////////////////////////////////////////////////////////// private void resetInflater() { inf.reset(); } private void readARCHeader() throws IOException { readHeader(); byte accumBuffer[] = new byte[4096]; int accumAmount = 0; int readAmt = 0; while ((readAmt = this.read(accumBuffer, accumAmount, accumBuffer.length - accumAmount)) > 0) { accumAmount += readAmt; if (accumAmount == accumBuffer.length) { throw new IOException("Invalid ARC File Header"); } } if (readAmt == 0 || accumAmount == 0) { throw new IOException("Invalid ARC File Header"); } else { // calculate header crc ... _crc.reset(); _crc.update(accumBuffer, 0, accumAmount); // validate crc and header length ... readTrailer(); // and decode header string ... _arcFileHeader = new String(accumBuffer, 0, accumAmount, "ISO-8859-1"); } } /** * GZIP Code derived from GZIPInputStream code */ // GZIP header magic number. private final static int GZIP_MAGIC = 0x8b1f; /* * File header flags. */ private final static int FHCRC = 2; // Header CRC private final static int FEXTRA = 4; // Extra field private final static int FNAME = 8; // File name private final static int FCOMMENT = 16; // File comment /* * Reads GZIP member header. */ private boolean readHeader() throws IOException { if (!_eosReached) { CheckedInputStream in = new CheckedInputStream(this.in, _crc); _crc.reset(); try { // Check header magic if (readUShort(in) != GZIP_MAGIC) { throw new IOException("Not in GZIP format"); } // Check compression method if (readUByte(in) != 8) { throw new IOException("Unsupported compression method"); } // Read flags int flg = readUByte(in); // Skip MTIME, XFL, and OS fields skipBytes(in, 6); // Skip optional extra field if ((flg & FEXTRA) == FEXTRA) { skipBytes(in, readUShort(in)); } // Skip optional file name if ((flg & FNAME) == FNAME) { while (readUByte(in) != 0) ; } // Skip optional file comment if ((flg & FCOMMENT) == FCOMMENT) { while (readUByte(in) != 0) ; } // Check optional header CRC if ((flg & FHCRC) == FHCRC) { int v = (int) _crc.getValue() & 0xffff; if (readUShort(in) != v) { throw new IOException("Corrupt GZIP header"); } } return true; } catch (EOFException e) { } } return false; } /* * Reads GZIP member trailer. */ private void readTrailer() throws IOException { PushbackInputStream in = (PushbackInputStream) this.in; int n = inf.getRemaining(); if (n > 0) { in.unread(buf, len - n, n); } // Uses left-to-right evaluation order if ((readUInt(in) != _crc.getValue()) || // rfc1952; ISIZE is the input size modulo 2^32 (readUInt(in) != (inf.getBytesWritten() & 0xffffffffL))) throw new IOException("Corrupt GZIP trailer"); } /* * Reads unsigned integer in Intel byte order. */ private static long readUInt(InputStream in) throws IOException { long s = readUShort(in); return ((long) readUShort(in) << 16) | s; } /* * Reads unsigned short in Intel byte order. */ private static int readUShort(InputStream in) throws IOException { int b = readUByte(in); return ((int) readUByte(in) << 8) | b; } /* * Reads unsigned byte. */ private static int readUByte(InputStream in) throws IOException { int b = in.read(); if (b == -1) { throw new EOFException(); } if (b < -1 || b > 255) { // Report on this.in, not argument in; see read{Header, Trailer}. throw new IOException("read() returned value out of range -1..255: " + b); } return b; } private byte[] tmpbuf = new byte[128]; /* * Skips bytes of input data blocking until all bytes are skipped. Does not * assume that the input stream is capable of seeking. */ private void skipBytes(InputStream in, int n) throws IOException { while (n > 0) { int len = in.read(tmpbuf, 0, n < tmpbuf.length ? n : tmpbuf.length); if (len == -1) { throw new EOFException(); } n -= len; } } /** * calculated raw arc file stream pos (taking into account any buffered data * contained within PushBackInputStream * * @return current stream position in bytes * @throws IOException * if error occurs */ private final int getARCFileStreamPos() throws IOException { PushbackInputStream in = (PushbackInputStream) this.in; return _streamPos - in.available(); } /** * * ArcFileBuilder helper class - used to construct ArcFileItem objects from an * ARC File Entry in a stateful manner * */ public static class ArcFileBuilder { // various states of processing an ARC FILE private enum State { LookingForMetadata, LookingForHeaderTerminator, ReadingContent, Finished } // ARC FILE HEADER TIMESTAMP FORMAT // Note: Not Thread-Safe, so every instance of builder needs its own copy // ... SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat( "yyyyMMddHHmmss"); // ArcFileItem this builder returns ArcFileItem _item = null; // the content buffer associated this item ... Buffer _buffer = new Buffer(); // Builder State State _state = State.LookingForMetadata; // Queued Input State LinkedList<ByteBuffer> _buffers = new LinkedList<ByteBuffer>(); // Active Input Buffer ByteBuffer _activeBuffer = null; // last matched pattern char byte lastMatchChar = 0; // match count int matchCount = 0; // End Of Stream Indicator boolean eos = false; // Charsets used during decoding process static Charset UTF8_Charset = Charset.forName("UTF8"); static Charset ASCII_Charset = Charset.forName("ASCII"); /** * Constructor * * @param itemToConstruct * - the ArcFileItem to build */ public ArcFileBuilder(ArcFileItem itemToConstruct) { _item = itemToConstruct; } private final boolean checkForCRLFTerminator(byte matchingChar) { if (matchingChar == '\n') { switch (matchCount) { case 1: ++matchCount; break; case 3: matchCount = 0; return true; default: matchCount = 0; } } else if (matchingChar == '\r') { switch (matchCount) { case 2: ++matchCount; break; default: matchCount = 1; break; } } else { matchCount = 0; } return false; } /** check for terminator pattern **/ private final boolean checkForTerminator(byte matchingChar) { boolean terminatorFound = false; switch (_state) { // metadata line is terminated by a single line-feed case LookingForMetadata: { if (matchingChar == '\n') { terminatorFound = true; } } break; // http headers are terminated by the standard crlf-crlf pattern case LookingForHeaderTerminator: { terminatorFound = checkForCRLFTerminator(matchingChar); } break; } if (terminatorFound) { // reset state ... matchCount = 0; // if active buffer contains no more characters... if (_activeBuffer.remaining() == 0) { // add entire active buffer to input state _activeBuffer.rewind(); _buffers.addLast(_activeBuffer); _activeBuffer = null; } else { // otherwise, slice buffer at current position, and // add one buffer to input state, and make the other current ByteBuffer oldBuffer = _activeBuffer; _activeBuffer = _activeBuffer.slice(); oldBuffer.limit(oldBuffer.position()); oldBuffer.rewind(); _buffers.addLast(oldBuffer); } } return terminatorFound; } /** * newInputStream * * @param buf * - ByteBuffer to wrap as an InputStream * @return InputStream - wrapped InputStream object */ private static InputStream newInputStream(final ByteBuffer buf) { return new InputStream() { public synchronized int read() throws IOException { if (!buf.hasRemaining()) { return -1; } return buf.get(); } public synchronized int read(byte[] bytes, int off, int len) throws IOException { // Read only what's left len = Math.min(len, buf.remaining()); buf.get(bytes, off, len); return len; } }; } /** construct a reader given a list of ByteBuffers **/ private static InputStreamReader readerFromScanBufferList( LinkedList<ByteBuffer> buffers, Charset charset) throws IOException { Vector<InputStream> inputStreams = new Vector<InputStream>(); for (ByteBuffer buffer : buffers) { inputStreams.add(newInputStream(buffer)); } buffers.clear(); SequenceInputStream seqInputStream = new SequenceInputStream(inputStreams .elements()); ; return new InputStreamReader(seqInputStream, charset); } /** construct a single line from the current input state **/ private final String readLine(Charset charset) throws IOException { BufferedReader reader = new BufferedReader(readerFromScanBufferList( _buffers, charset)); return reader.readLine(); } /** process the metadata line of an ARC File Entry **/ private final void processMetadataLine(String metadata) throws IOException { StringTokenizer tokenizer = new StringTokenizer(metadata, " "); int tokenCount = 0; while (tokenizer.hasMoreElements() && tokenCount <= 5) { switch (++tokenCount) { // URI case 1: { _item.setUri(tokenizer.nextToken()); } break; // Host IP Address case 2: { _item.setHostIP(tokenizer.nextToken()); } break; // Timestamp case 3: { String timeStamp = tokenizer.nextToken(); try { _item.setTimestamp(TIMESTAMP14.parse(timeStamp).getTime()); } catch (ParseException e) { LOG.error("Invalid Timestamp Encountered in Item Metdata. URL:" + _item.getUri() + " Timestamp:" + timeStamp + " Metadata:" + metadata); _item.setTimestamp(0); } } break; // MimeType case 4: { _item.setMimeType(tokenizer.nextToken()); } break; // and Record Length case 5: { _item.setRecordLength(Integer.parseInt(tokenizer.nextToken())); } break; } } } /** extract http headers from the current input state **/ private final void processHeaders() throws IOException { BufferedReader reader = new BufferedReader(readerFromScanBufferList( _buffers, ArcFileBuilder.UTF8_Charset)); String line = null; _item.setFieldDirty(ArcFileItem.Field_HEADERITEMS); while ((line = reader.readLine()) != null) { if (line.length() != 0) { int colonPos = line.indexOf(':'); ArcFileHeaderItem item = new ArcFileHeaderItem(); if (colonPos != -1 && colonPos != line.length() - 1) { item.setItemKey(line.substring(0, colonPos)); item.setItemValue(line.substring(colonPos + 1)); // if this is our special truncation flag ... if (item.getItemKey().equals( Constants.ARCFileHeader_ContentTruncated)) { String parts[] = item.getItemValue().split(","); for (String part : parts) { if (part.equals(ArcFileItem.Flags .toString(ArcFileItem.Flags.TruncatedInInflate))) { _item.setFlags(_item.getFlags() | ArcFileItem.Flags.TruncatedInDownload); } else if (part.equals(ArcFileItem.Flags .toString(ArcFileItem.Flags.TruncatedInInflate))) { _item.setFlags(_item.getFlags() | ArcFileItem.Flags.TruncatedInInflate); } } } } else { item.setItemValue(line); } _item.getHeaderItems().add(item); } } } /** transition from the current input state to the next input state **/ private final void transitionState() throws IOException { switch (_state) { case LookingForMetadata: { // decode the string as a utf-8 string processMetadataLine(readLine(ASCII_Charset)); // and advance to next state ... _state = ArcFileBuilder.State.LookingForHeaderTerminator; } break; case LookingForHeaderTerminator: { // found header terminator processHeaders(); // and advance to next state ... _state = ArcFileBuilder.State.ReadingContent; // and set up arc file item for read ... _buffer.setCapacity(_blockSize); } break; } } /** * inform builder that input for the current item has been exhauseted * * @return ArcFileItem - the fully constructed ArcFileItem object if * construction was successfull * @throws IOException * - if building fails */ public final void finish() throws IOException { if (_state == State.ReadingContent) { _state = State.Finished; // generate warning in case of zero content edge case ... if (_buffer.getCount() == 0) { LOG .error("ArcFileBuilder Encountered Item with Zero Length Content. URI:" + _item.getUri()); } else { _item.setContent(_buffer); _buffer = new Buffer(); } _item = null; } else { throw new IOException( "ArcBuilder finish calledin Invalid State. State:" + _state + " ArcFile:" + _item.getArcFileName() + " Position:" + _item.getArcFilePos() + " Item URI:" + _item.getUri()); } } /** * Input Data into the builder * * @param buffer * - a piece of uncompressed content * @throws IOException * - throws exception if building fails */ public final void inputData(ByteBuffer buffer) throws IOException { // set the buffer as the active buffer ... _activeBuffer = buffer; // scan looking for terminator while (_activeBuffer != null && _activeBuffer.remaining() != 0) { // if not reading content then if (_state != ArcFileBuilder.State.ReadingContent) { // read a byte at a time ... byte b = _activeBuffer.get(); // and if the byte is a delimiter ... if (b == '\r' || b == '\n') { // and check for pattern match (terminator match) if (checkForTerminator(b)) { transitionState(); } } // otherwise reset pattern buffer else { matchCount = 0; } } else { // calculate available storage in buffer ... int available = _buffer.getCapacity() - _buffer.getCount(); // if we need more room ... if (available < _activeBuffer.remaining()) { // figure out how much to grow buffer by ... int growByAmount = Math.max(_activeBuffer.remaining() - available, _blockSize * 2); // and grow the buffer ... _buffer.setCapacity(_buffer.getCapacity() + growByAmount); } // copy the buffer data in one go ... _buffer.append(_activeBuffer.array(), _activeBuffer.position() + _activeBuffer.arrayOffset(), _activeBuffer.remaining()); _activeBuffer = null; } } // now if we reached the end of the buffer while scanning for a token ... if (_activeBuffer != null) { // add entire buffer to buffer list ... _activeBuffer.rewind(); _buffers.add(_activeBuffer); _activeBuffer = null; } } } // //////////////////////////////////////////////////////////////////////////////// // test routines // //////////////////////////////////////////////////////////////////////////////// public void checkCRLFStateMachine() throws Exception { ArcFileItem item = new ArcFileItem(); ArcFileBuilder builder = new ArcFileBuilder(item); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n')); Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r')); Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n')); } @Test public void testReader(InputStream stream) throws IOException { setIOTimeoutValue(30000); resetState(); Thread thread = new Thread(new Runnable() { public void run() { try { while (hasMoreItems()) { ArcFileItem item = new ArcFileItem(); getNextItem(item); LOG.info("GOT Item URL:" + item.getUri() + " StreamPos:" + item.getArcFilePos() + " Content Length:" + item.getContent().getCount()); for (ArcFileHeaderItem headerItem : item.getHeaderItems()) { if (headerItem.isFieldDirty(ArcFileHeaderItem.Field_ITEMKEY)) { // LOG.info("Header Item:" + headerItem.getItemKey() + " :" + // headerItem.getItemValue()); } else { // LOG.info("Header Item:" + headerItem.getItemValue()); } } // LOG.info("Content Length:" + item.getContent().getCount()); // LOG.info("Content:"); /* * ByteArrayInputStream inputStream = new * ByteArrayInputStream(item.getContent * ().getReadOnlyBytes(),0,item.getContent().getCount()); * BufferedReader reader = new BufferedReader(new * InputStreamReader(inputStream,Charset.forName("ASCII"))); String * line = null; while ((line = reader.readLine()) != null) { * LOG.info(line); } */ } LOG.info("NO MORE ITEMS... BYE"); } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); } } }); // run the thread ... thread.start(); ReadableByteChannel channel = Channels .newChannel(stream); try { int totalBytesRead = 0; for (;;) { ByteBuffer buffer = ByteBuffer.allocate(32768); int bytesRead = channel.read(buffer); // LOG.info("Read "+bytesRead + " From File"); if (bytesRead == -1) { finished(); break; } else { buffer.flip(); totalBytesRead += buffer.remaining(); available(buffer); } } } finally { channel.close(); } // now wait for thread to die ... LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE"); try { thread.join(); } catch (InterruptedException e) { e.printStackTrace(); } LOG.info("Done Reading File.... ArcFileThread to DIED"); } public static void main(String[] args)throws IOException { Configuration conf = new Configuration(); Path inputPath = new Path(args[0]); FileSystem fs = FileSystem.get(inputPath.toUri(),conf); FSDataInputStream stream = fs.open(inputPath); try { ArcFileReader reader = new ArcFileReader(); reader.testReader(stream); } finally { stream.close(); } } }