/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.LinkedList;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import java.util.zip.ZipException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.record.Buffer;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.NIODataSink;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.protocol.shared.ArcFileItem;
/**
 *
 * StreamingArcFileReader - Decoder capable of extracting ArcFileItem(s) from an ARC file in a non-blocking, streaming
 * manner.
 *
 * @author rana
 *
 */
public final class StreamingArcFileReader implements NIODataSink {
//////////////////////////////////////////////////////////////////////////////////
// data members
//////////////////////////////////////////////////////////////////////////////////
/** logging **/
private static final Log LOG = LogFactory.getLog(StreamingArcFileReader.class);
/** max expected arc header size - the leading ARC file header record may not grow past this many bytes **/
private static final int MAX_ARCHEADER_SIZE = 4096;
/** block size used for various operations (scan buffer allocation, content growth) **/
private static final int BLOCK_SIZE = 32 * 1024;
/** internal ByteBuffer wrapper for queuing byte buffers -
 * an item whose _buffer is null is the end-of-stream sentinel queued by finished() **/
private static final class BufferItem {
  public BufferItem(ByteBuffer bufferItem) {
    _buffer = bufferItem;
  }
  public ByteBuffer _buffer;
};
/** blocking consumer queue - producer enqueues via available()/finished(), decoder side dequeues **/
private LinkedBlockingQueue<BufferItem> _consumerQueue = new LinkedBlockingQueue<BufferItem>();
/** current count of queued-but-unconsumed compressed bytes (guarded by synchronized(this)) */
private int _bytesAvailable = 0;
/** 32 bit crc - accumulated over each gzip member for trailer validation **/
private CRC32 _crc = new CRC32();
/** End Of Stream Indicator - set by finished() **/
private boolean _eosReached = false;
/** arc file header accumulator **/
private byte[] _arcFileHeader = new byte[MAX_ARCHEADER_SIZE];
/** number of bytes accumulated so far in _arcFileHeader **/
private int _arcFileHeaderSize = 0;
/** input streams - _rawInput serves queued bytes, _checkedInput layers the CRC on top **/
private InputStream _rawInput = null;
private CheckedInputStream _checkedInput = null;
/** uncompressed bytes read for the current gzip member (checked against the trailer's ISIZE) **/
private int _contentBytesRead = 0;
/** inflater object - 'true' selects raw deflate (no zlib wrapper), as used inside gzip members **/
private Inflater _inflater = new Inflater(true);
/** the active input buffer **/
private ByteBuffer _activeInputBuffer = null;
/** flag indicating that this arc file has a header item **/
private boolean _hasHeaderItem = true;
// absolute count of compressed-stream bytes consumed so far
long _streamPos = 0;
// compressed-stream offset at which the current gzip member started
long _arcFileStartOffset;
// fixed portion of a gzip header: magic(2) + method(1) + flags(1) + MTIME/XFL/OS(6)
private final static int FixedHeaderBytes = 2 + 1 + 1 + 6;
/** top-level decoder state machine **/
enum ReadState {
  ReadingArcHeader, // expecting the gzip header of the ARC file header record
  ReadingArcHeaderData, // inflating the ARC file header record body
  ReadingArcHeaderTrailer, // consuming the header record's gzip trailer
  ReadingEntryHeader, // expecting the gzip header of the next entry
  ReadingEntryData, // inflating entry content into the builder
  ReadingEntryTrailer, // consuming the entry's gzip trailer
  Done // end of stream fully drained
}
/** sub-states for incremental gzip header parsing **/
enum HeaderReadState {
  ReadingFixedHeader, // waiting for the 10 fixed header bytes
  ReadingFlagValues // consuming optional FEXTRA/FNAME/FCOMMENT/FHCRC fields
}
ReadState _readState = ReadState.ReadingArcHeader;
/** builder for the entry currently being decoded (null between entries) **/
ArcFileBuilder _builder = null;
HeaderReadState _headerReadState = HeaderReadState.ReadingFixedHeader;
/** FLG bits still pending for the gzip header currently being parsed **/
int _headerFlags = 0;
/** length of the FEXTRA field, or -1 until its two length bytes have been read **/
int _headerExtraBytes = -1;
//////////////////////////////////////////////////////////////////////////////////
// public API
//////////////////////////////////////////////////////////////////////////////////
/**
 * Constructs a new StreamingArcFileReader object
 *
 * @param hasArcFileHeader if false, the stream carries no leading ARC file
 * header record and decoding starts directly at entry records
 */
public StreamingArcFileReader(boolean hasArcFileHeader) {
  // FIX: remember the configuration so resetState() restores the same mode;
  // previously _hasHeaderItem stayed true even when constructed with false,
  // so a reset reader would wrongly expect a header record
  _hasHeaderItem = hasArcFileHeader;
  // setup the proper stream...
  _rawInput = new InputStream() {
    byte oneByteArray[] = new byte[1];
    @Override
    public synchronized int available() throws IOException {
      return _bytesAvailable;
    }
    @Override
    public int read() throws IOException {
      // BUGFIX: require exactly one byte; the old '!= -1' test returned a
      // stale byte when the array read produced 0 bytes, and the extra
      // _streamPos++ here double-counted (the array overload already counts)
      if (read(oneByteArray, 0, 1) == 1) {
        return oneByteArray[0] & 0xff;
      }
      return -1;
    }
    @Override
    public int read(byte b[], int off, int len) throws IOException {
      // refill the active buffer from the consumer queue when exhausted
      if (_activeInputBuffer == null || _activeInputBuffer.remaining() == 0) {
        _activeInputBuffer = null;
        BufferItem nextItem = null;
        try {
          if (_consumerQueue.size() != 0) {
            nextItem = _consumerQueue.take();
          }
        } catch (InterruptedException e) {
          // treated as "no data yet"; restore the flag so callers can observe it
          Thread.currentThread().interrupt();
        }
        if (nextItem != null) {
          if (nextItem._buffer == null) {
            // null buffer is the end-of-stream sentinel queued by finished()
            return -1;
          }
          else {
            _activeInputBuffer = nextItem._buffer;
          }
        }
      }
      // BUGFIX: was '||', which dereferenced a null _activeInputBuffer (NPE)
      // and made the non-blocking 'return 0' branch below unreachable
      if (_activeInputBuffer != null && _activeInputBuffer.remaining() != 0) {
        final int sizeAvailable = _activeInputBuffer.remaining();
        final int sizeToRead = Math.min(sizeAvailable, len);
        _activeInputBuffer.get(b, off, sizeToRead);
        _streamPos += sizeToRead;
        synchronized (this) {
          _bytesAvailable -= sizeToRead;
        }
        return sizeToRead;
      }
      else {
        // non-blocking contract: no data currently queued
        return 0;
      }
    }
  };
  _checkedInput = new CheckedInputStream(_rawInput, _crc);
  if (!hasArcFileHeader) {
    _readState = ReadState.ReadingEntryHeader;
  }
}
/**
 * Reset all internal variables and get the Reader ready to process a new ArcFile
 */
public void resetState() {
  _readState = ReadState.ReadingArcHeader;
  _builder = null;
  _headerReadState = HeaderReadState.ReadingFixedHeader;
  _headerFlags = 0;
  _headerExtraBytes = -1;
  _activeInputBuffer = null;
  _consumerQueue.clear();
  _crc.reset();
  _eosReached = false;
  _bytesAvailable = 0;
  // BUGFIX: these counters were previously left over from the prior file,
  // corrupting header accumulation, offsets and trailer validation on reuse
  _arcFileHeaderSize = 0;
  _contentBytesRead = 0;
  _streamPos = 0;
  _arcFileStartOffset = 0;
  resetInflater();
}
/** indicate whether this arc file has a header item - when false, getNextItem()
 * skips the ARC file header record and decodes entry records directly **/
public void setArcFileHasHeaderItemFlag(boolean value) {
  _hasHeaderItem = value;
}
/** tri-state result returned by hasMoreItems() **/
enum TriStateResult {
  // decoder cannot decide until more compressed data is queued
  NeedsMoreData,
  // queued data remains - further items may be decodable
  MoreItems,
  // end of stream reached and all queued data consumed
  NoMoreItems,
}
/**
 * Checks whether additional ArcFileItems may still be extracted from the current
 * ARC File stream - NON-BLOCKING version.
 *
 * @return TriStateResult.MoreItems when queued data remains to decode,
 * TriStateResult.NoMoreItems once end of stream has been reached and drained,
 * TriStateResult.NeedsMoreData when the decoder needs more input to decide
 * @throws IOException declared for API symmetry; this check itself does not read
 */
public synchronized TriStateResult hasMoreItems() throws IOException {
  // data still queued - assume further items can be decoded from it
  if (_bytesAvailable != 0) {
    return TriStateResult.MoreItems;
  }
  // nothing queued and the producer signaled end-of-stream - we are done
  if (_eosReached) {
    _readState = ReadState.Done;
    return TriStateResult.NoMoreItems;
  }
  // nothing queued but the stream is still open - caller must feed more data
  return TriStateResult.NeedsMoreData;
}
/**
 * Attempts to inflate and read the next ArcFileItem from bytes available - NON-BLOCKING version
 *
 * @return Fully constructed ArcFileItem or NULL if not enough data is available to service the request
 * @throws EOFException if end of stream is reached decoding item, or generic IOException if a corrupt stream is detected
 */
public ArcFileItem getNextItem() throws IOException {
  // check state ... any state up to and including ReadingArcHeaderTrailer means
  // the leading ARC file header record has not been fully consumed yet
  if (_readState.ordinal() <= ReadState.ReadingArcHeaderTrailer.ordinal()) {
    if (_hasHeaderItem) {
      if (readARCHeader()) {
        _crc.reset();
        _readState = ReadState.ReadingEntryHeader;
      }
    }
    else {
      // skip arc header
      _readState = ReadState.ReadingEntryHeader;
    }
  }
  // if reading header for entry (readHeader returns false until the full gzip
  // header has been consumed - we simply fall through and return null below)
  if (_readState == ReadState.ReadingEntryHeader) {
    if (readHeader()) {
      _readState = ReadState.ReadingEntryData;
      // reset crc accumulator
      _crc.reset();
      // and allocate a fresh builder object ..
      _builder = new ArcFileBuilder();
    }
  }
  // if reading data for entry ...
  if (_readState == ReadState.ReadingEntryData) {
    // read header line buffer
    for (;;) {
      byte scanBuffer[] = new byte[BLOCK_SIZE];
      ByteBuffer byteBuffer = ByteBuffer.wrap(scanBuffer);
      // read up to scan buffer size of data ...
      int readAmount = readInflatedBytes(scanBuffer, 0, scanBuffer.length);
      // if we did not read any bytes ... return immediately ...
      // (not enough compressed data queued yet - caller retries later)
      if (readAmount == 0) {
        return null;
      }
      else if (readAmount != -1) {
        // update crc value ...
        _crc.update(scanBuffer, 0, readAmount);
        // update content bytes read
        _contentBytesRead += readAmount;
        // and setup buffer pointers ...
        byteBuffer.position(0);
        byteBuffer.limit(readAmount);
        // and input data into builder ...
        _builder.inputData(byteBuffer);
      }
      // -1 indicates eos (the current gzip member's deflate stream ended)
      else {
        // reset inflater ...
        resetInflater();
        // and transition to reading trailing bytes
        _readState = ReadState.ReadingEntryTrailer;
        break;
      }
    }
  }
  if (_readState == ReadState.ReadingEntryTrailer) {
    // validate crc and header length ...
    if (readTrailer()) {
      // transition to assumed state ...
      _readState = ReadState.ReadingEntryHeader;
      // get the arc file item
      ArcFileItem itemOut = _builder.finish();
      itemOut.setArcFilePos((int) _arcFileStartOffset);
      // reset builder
      _builder = null;
      // reset crc
      _crc.reset();
      // if no more data coming down the pipe...
      if (_rawInput.available() == 0 && _eosReached) {
        // transition to done state ...
        _readState = ReadState.Done;
      }
      return itemOut;
    }
  }
  return null;
}
/**
 * NIODataSink method - called by implementor when all ARC File data has been exhausted
 *
 */
public void finished() {
  // a null-buffer item is the end-of-stream sentinel; queueing it wakes any
  // consumer blocked in take()
  _consumerQueue.add(new BufferItem(null));
  _eosReached = true;
}
/**
 * NIODataSink method - called by the implementor to queue up compressed ARC File
 * data for processing.
 *
 * @param availableReadBuffer buffer positioned at the compressed data; ownership
 * transfers to the reader
 */
public void available(ByteBuffer availableReadBuffer) {
  // BUGFIX: capture the byte count BEFORE publishing the buffer to the queue -
  // the consumer thread may take the buffer and advance its position first,
  // which made the old post-enqueue remaining() call racy (wrong byte count)
  final int bytesQueued = availableReadBuffer.remaining();
  _consumerQueue.add(new BufferItem(availableReadBuffer));
  synchronized (this) {
    _bytesAvailable += bytesQueued;
  }
}
//////////////////////////////////////////////////////////////////////////////////
// internal helpers
//////////////////////////////////////////////////////////////////////////////////
/** reset the shared inflater so it can start decoding a fresh gzip member **/
private void resetInflater() {
  _inflater.reset();
}
/**
 * Inflate up to len bytes of decompressed data into b, feeding the inflater
 * from queued compressed buffers as needed - NON-BLOCKING.
 *
 * @return number of bytes produced, 0 if more compressed input is required,
 * or -1 when the current gzip member's deflate stream has ended
 * @throws EOFException if the end-of-stream sentinel is hit mid-member
 * @throws ZipException if the compressed data is malformed
 */
private int readInflatedBytes(byte[] b, int off, int len) throws IOException {
  if (b == null) {
    throw new NullPointerException();
  } else if (off < 0 || len < 0 || len > b.length - off) {
    throw new IndexOutOfBoundsException();
  } else if (len == 0) {
    return 0;
  }
  try {
    // try to output some bytes from the inflater
    int n;
    while ((n = _inflater.inflate(b, off, len)) == 0) {
      if (_inflater.finished() || _inflater.needsDictionary()) {
        // these are EOS conditions for the current member;
        // first reclaim any input the inflater buffered but did not consume
        // (those bytes belong to the gzip trailer / the next member)
        if (_inflater.getRemaining() != 0) {
          if (_activeInputBuffer == null) {
            throw new RuntimeException("Bad State");
          }
          else {
            // give the unconsumed bytes back to the raw stream accounting ...
            synchronized (this) {
              _bytesAvailable += _inflater.getRemaining();
              _streamPos -= _inflater.getRemaining();
            }
            // ... and rewind the buffer cursor over them
            _activeInputBuffer.position(_activeInputBuffer.position() - _inflater.getRemaining());
          }
        }
        return -1;
      }
      // we produced no output .. check to see if we have more input to feed
      if (_inflater.needsInput()) {
        if (_activeInputBuffer == null || _activeInputBuffer.remaining() == 0) {
          _activeInputBuffer = null;
          if (_consumerQueue.size() != 0) {
            BufferItem nextItem = null;
            try {
              nextItem = _consumerQueue.take();
            } catch (InterruptedException e) {
              LOG.error(StringUtils.stringifyException(e));
              // BUGFIX: restore the interrupt flag instead of swallowing it
              Thread.currentThread().interrupt();
            }
            // BUGFIX: nextItem is null when take() was interrupted - the old
            // code dereferenced it unconditionally (NullPointerException)
            if (nextItem != null) {
              if (nextItem._buffer == null) {
                // end-of-stream sentinel while a member is still open
                throw new EOFException();
              }
              else {
                _activeInputBuffer = nextItem._buffer;
              }
            }
          }
        }
        if (_activeInputBuffer == null) {
          // no compressed data queued - tell the caller to come back later
          return 0;
        }
        else {
          // feed the buffer to the inflater ...
          _inflater.setInput(_activeInputBuffer.array(), _activeInputBuffer.position(), _activeInputBuffer.remaining());
          // decrement bytes available ...
          synchronized (this) {
            _bytesAvailable -= _activeInputBuffer.remaining();
            _streamPos += _activeInputBuffer.remaining();
          }
          // and mark the fed bytes as consumed from the buffer's point of view
          _activeInputBuffer.position(_activeInputBuffer.position() + _activeInputBuffer.remaining());
        }
      }
    }
    return n;
  } catch (DataFormatException e) {
    String s = e.getMessage();
    throw new ZipException(s != null ? s : "Invalid ZLIB data format");
  }
}
/**
 * Incrementally decode the leading ARC file header record (its own gzip
 * member) - NON-BLOCKING.
 *
 * @return true once the header member (gzip header, data and trailer) has been
 * fully consumed and validated
 * @throws IOException if the accumulated header exceeds MAX_ARCHEADER_SIZE or
 * the member is corrupt
 */
private boolean readARCHeader() throws IOException {
  if (_readState == ReadState.ReadingArcHeader) {
    if (readHeader()) {
      LOG.info("*** Found Fixed Header. Reading Metadata");
      // reset crc here...
      _crc.reset();
      // and transition to reading data state ..
      _readState = ReadState.ReadingArcHeaderData;
    }
  }
  if (_readState == ReadState.ReadingArcHeaderData) {
    int readAmount = 0;
    while ((readAmount = readInflatedBytes(_arcFileHeader, _arcFileHeaderSize, _arcFileHeader.length - _arcFileHeaderSize)) > 0) {
      LOG.info("*** Read:" + readAmount + " Metadata Bytes");
      // BUGFIX: the length argument used to be '_arcFileHeaderSize + readAmount',
      // which crc'd the wrong span (and could run past the array end); only the
      // newly read bytes starting at _arcFileHeaderSize belong in the crc
      _crc.update(_arcFileHeader, _arcFileHeaderSize, readAmount);
      // increment content bytes read ...
      _contentBytesRead += readAmount;
      // and update length ...
      _arcFileHeaderSize += readAmount;
    }
    if (_arcFileHeaderSize == MAX_ARCHEADER_SIZE) {
      throw new IOException("Invalid ARC File Header. Exceeded Arc File Header Size:" + _arcFileHeaderSize);
    }
    else if (readAmount == -1) {
      LOG.info("*** ARC File Header Size is:" + _arcFileHeaderSize);
      // reached eos ...
      // reset inflater
      resetInflater();
      // go to next state
      _readState = ReadState.ReadingArcHeaderTrailer;
    }
  }
  if (_readState == ReadState.ReadingArcHeaderTrailer) {
    // read trailing bytes in gzip stream ...
    if (readTrailer()) {
      return true;
    }
  }
  return false;
}
/**
 * GZIP Code derived from GZIPInputStream code
 */
// GZIP header magic number (stored little-endian in the stream).
public final static int GZIP_MAGIC = 0x8b1f;
/*
 * File header flag bits (FLG byte of the gzip header).
 */
private final static int FHCRC = 2; // Header CRC
private final static int FEXTRA = 4; // Extra field
private final static int FNAME = 8; // File name
private final static int FCOMMENT = 16; // File comment
/*
 * Reads GZIP member header incrementally - NON-BLOCKING.
 * Returns true once the full header (fixed part plus any optional
 * FEXTRA/FNAME/FCOMMENT/FHCRC fields) has been consumed; false when the
 * decoder must wait for more queued data.
 */
private boolean readHeader() throws IOException {
  if (_rawInput.available() == 0 && _eosReached) {
    throw new EOFException();
  }
  switch (_headerReadState) {
    case ReadingFixedHeader: {
      // only start once the entire fixed-size portion is queued
      if (_rawInput.available() >= FixedHeaderBytes) {
        _arcFileStartOffset = _streamPos;
        // reset crc accumulator first ...
        _crc.reset();
        // reset content bytes read counter ..
        _contentBytesRead = 0;
        // Check header magic
        if (readUShort(_checkedInput) != GZIP_MAGIC) {
          throw new IOException("Not in GZIP format");
        }
        // Check compression method
        if (readUByte(_checkedInput) != 8) {
          throw new IOException("Unsupported compression method");
        }
        // Read flags
        _headerFlags = readUByte(_checkedInput);
        // Skip MTIME, XFL, and OS fields
        skipBytes(_checkedInput, 6);
        _headerReadState = HeaderReadState.ReadingFlagValues;
        // deliberate fall-through into flag-field processing
      }
      else {
        break;
      }
    }
    case ReadingFlagValues: {
      boolean advanceToNext = true;
      // Skip optional extra field
      if ((_headerFlags & FEXTRA) == FEXTRA) {
        advanceToNext = false;
        if (_headerExtraBytes == -1) {
          if (_checkedInput.available() >= 2) {
            _headerExtraBytes = readUShort(_checkedInput);
          }
        }
        if (_headerExtraBytes != -1) {
          if (_checkedInput.available() >= _headerExtraBytes) {
            // skip the requisite bytes
            skipBytes(_checkedInput, _headerExtraBytes);
            // mask out current flag value ...
            _headerFlags &= ~FEXTRA;
            // set advanceToNext flag
            advanceToNext = true;
          }
        }
      }
      // scan past the zero-terminated FNAME field, then FCOMMENT
      while (advanceToNext && (_headerFlags & (FNAME | FCOMMENT)) != 0) {
        int activeFlag = FCOMMENT;
        if ((_headerFlags & FNAME) == FNAME)
          activeFlag = FNAME;
        advanceToNext = false;
        while (_checkedInput.available() != 0) {
          // keep scanning for null terminator
          if (readUByte(_checkedInput) == 0) {
            _headerFlags &= ~activeFlag;
            advanceToNext = true;
            // BUGFIX: stop at the terminator - the old code kept consuming
            // every queued byte, eating the next field / compressed data
            break;
          }
        }
      }
      if (advanceToNext && (_headerFlags & FHCRC) == FHCRC) {
        if (_checkedInput.available() >= 2) {
          int v = (int) _crc.getValue() & 0xffff;
          if (readUShort(_checkedInput) != v) {
            throw new IOException("Corrupt GZIP header");
          }
          _headerFlags &= ~FHCRC;
        }
      }
      // all optional fields consumed - the member header is complete
      if (_headerFlags == 0 && _headerReadState == HeaderReadState.ReadingFlagValues) {
        // reset header state variables...
        _headerReadState = HeaderReadState.ReadingFixedHeader;
        _headerFlags = 0;
        _headerExtraBytes = -1;
        return true;
      }
    }
    break;
  }
  return false;
}
/** size in bytes of a GZIP member trailer (CRC32 + ISIZE) **/
private final static int GZIP_TRAILER_BYTES = 8;
/*
 * Reads GZIP member trailer - NON-BLOCKING; returns true once the trailer has
 * been consumed and validated, false when not enough bytes are queued yet.
 */
private boolean readTrailer() throws IOException {
  if (_rawInput.available() >= GZIP_TRAILER_BYTES) {
    // Uses left-to-right evaluation order
    if ((readUInt(_rawInput) != _crc.getValue()) ||
        // rfc1952; ISIZE is the input size modulo 2^32
        // BUGFIX: mask to an unsigned 32-bit value - the plain int comparison
        // sign-extends once the content counter goes negative (> 2GB)
        (readUInt(_rawInput) != (_contentBytesRead & 0xffffffffL)))
      throw new IOException("Corrupt GZIP trailer");
    return true;
  }
  return false;
}
/*
 * Reads an unsigned 32-bit integer stored in Intel (little-endian) byte order.
 */
private static long readUInt(InputStream in) throws IOException {
  final long low16 = readUShort(in);
  final long high16 = readUShort(in);
  return (high16 << 16) | low16;
}
/*
 * Reads an unsigned 16-bit integer stored in Intel (little-endian) byte order.
 */
private static int readUShort(InputStream in) throws IOException {
  final int low = readUByte(in);
  final int high = readUByte(in);
  return (high << 8) | low;
}
/*
 * Reads a single unsigned byte, throwing EOFException at end of stream and
 * IOException if the underlying stream misbehaves.
 */
private static int readUByte(InputStream in) throws IOException {
  final int value = in.read();
  if (value == -1) {
    throw new EOFException();
  }
  // guard against a broken InputStream implementation returning an illegal value
  if (value < -1 || value > 255) {
    throw new IOException("read() returned value out of range -1..255: " + value);
  }
  return value;
}
/** scratch buffer used when skipping over unwanted header bytes **/
private byte[] tmpbuf = new byte[128];
/*
 * Consumes exactly n bytes from the stream without assuming it is seekable,
 * throwing EOFException if the stream ends first.
 */
private void skipBytes(InputStream in, int n) throws IOException {
  int remaining = n;
  while (remaining > 0) {
    final int bytesRead = in.read(tmpbuf, 0, Math.min(remaining, tmpbuf.length));
    if (bytesRead == -1) {
      throw new EOFException();
    }
    remaining -= bytesRead;
  }
}
/**
 *
 * ArcFileBuilder helper class - used to construct ArcFileItem objects from an ARC File Entry in a stateful manner
 *
 */
private static class ArcFileBuilder {
  // various states of processing an ARC FILE entry
  private enum State {
    LookingForMetadata, // scanning for the LF-terminated metadata line
    LookingForHeaderTerminator, // scanning for the CRLF-CRLF end of the headers
    ReadingContent // appending all remaining bytes as content
  }
  // ARC FILE HEADER TIMESTAMP FORMAT
  // NOTE(review): SimpleDateFormat is not thread-safe; this is per-builder
  // state, so it is safe only while each builder is driven by a single thread
  SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat("yyyyMMddHHmmss");
  // ArcFileItem this builder returns
  ArcFileItem _item = new ArcFileItem();
  // underlying content buffer
  Buffer _content = new Buffer();
  // Builder State
  State _state = State.LookingForMetadata;
  // Queued Input State - buffers accumulated while scanning for a terminator
  LinkedList<ByteBuffer> _buffers = new LinkedList<ByteBuffer>();
  // Active Input Buffer
  ByteBuffer _activeBuffer = null;
  // Pattern Buffer - for capturing termination patterns (longest is CRLF-CRLF)
  byte patternBuffer[] = new byte[4];
  // Captured Pattern Length
  int patternSize = 0;
  // End Of Stream Indicator
  // NOTE(review): never written within the visible code - appears vestigial
  boolean eos = false;
  // Charsets used during decoding process
  static Charset UTF8_Charset = Charset.forName("UTF8");
  static Charset ASCII_Charset = Charset.forName("ASCII");
/** check whether the bytes captured in patternBuffer form the terminator for the
 * current parse state; on a match, split the active buffer so that everything up
 * to (and including) the terminator is queued in _buffers for the line readers **/
private final boolean checkForTerminator() {
  boolean terminatorFound = false;
  switch (_state) {
    // metadata line is terminated by a single line-feed
    case LookingForMetadata: {
      if (patternBuffer[0] == '\n') {
        terminatorFound = true;
      }
    }
    break;
    // http headers are terminated by the standard crlf-crlf pattern
    case LookingForHeaderTerminator: {
      if (patternSize == 4
          && patternBuffer[0] == '\r'
          && patternBuffer[1] == '\n'
          && patternBuffer[2] == '\r'
          && patternBuffer[3] == '\n') {
        terminatorFound = true;
      }
    }
    break;
  }
  if (terminatorFound) {
    // if active buffer contains no more characters...
    if (_activeBuffer.remaining() == 0) {
      // add entire active buffer to input state
      _activeBuffer.rewind();
      _buffers.addLast(_activeBuffer);
      _activeBuffer = null;
    }
    else {
      // otherwise, slice buffer at current position, and
      // add one buffer to input state, and make the other current
      ByteBuffer oldBuffer = _activeBuffer;
      _activeBuffer = _activeBuffer.slice();
      oldBuffer.limit(oldBuffer.position());
      oldBuffer.rewind();
      _buffers.addLast(oldBuffer);
    }
  }
  return terminatorFound;
}
/** newInputStream
 *
 * @param buf - ByteBuffer to wrap as an InputStream
 * @return InputStream - wrapped InputStream object
 */
private static InputStream newInputStream(final ByteBuffer buf) {
  return new InputStream() {
    public synchronized int read() throws IOException {
      if (!buf.hasRemaining()) {
        return -1;
      }
      // BUGFIX: mask to 0..255 - returning the raw (signed) byte meant any
      // 0xFF content byte came back as -1 and was misread as end-of-stream,
      // violating the InputStream.read() contract
      return buf.get() & 0xff;
    }
    public synchronized int read(byte[] bytes, int off, int len) throws IOException {
      // Read only what's left
      len = Math.min(len, buf.remaining());
      buf.get(bytes, off, len);
      return len;
    }
  };
}
/** construct a reader spanning a list of ByteBuffers; the list is drained as a
 * side effect, since ownership of the buffers passes to the returned reader **/
private static InputStreamReader readerFromScanBufferList(LinkedList<ByteBuffer> buffers, Charset charset) throws IOException {
  // Vector is required here because SequenceInputStream consumes an Enumeration
  final Vector<InputStream> streams = new Vector<InputStream>();
  for (ByteBuffer buffer : buffers) {
    streams.add(newInputStream(buffer));
  }
  buffers.clear();
  return new InputStreamReader(new SequenceInputStream(streams.elements()), charset);
}
/** read a single text line (decoded with the given charset) from the queued input buffers **/
private final String readLine(Charset charset) throws IOException {
  return new BufferedReader(readerFromScanBufferList(_buffers, charset)).readLine();
}
/** process the metadata line of an ARC File Entry - the space-separated fields
 * are, in order: URI, host IP, 14-digit timestamp, mime type, record length **/
private final void processMetadataLine(String metadata) throws IOException {
  final StringTokenizer tokens = new StringTokenizer(metadata, " ");
  int fieldIndex = 0;
  while (tokens.hasMoreElements() && fieldIndex <= 5) {
    ++fieldIndex;
    if (fieldIndex == 1) {
      // URI
      _item.setUri(tokens.nextToken());
    }
    else if (fieldIndex == 2) {
      // Host IP Address
      _item.setHostIP(tokens.nextToken());
    }
    else if (fieldIndex == 3) {
      // Timestamp
      String timestamp = tokens.nextToken();
      try {
        _item.setTimestamp(TIMESTAMP14.parse(timestamp).getTime());
      } catch (ParseException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw new IOException("Invalid Timestamp in Metadata");
      }
      catch (NumberFormatException e) {
        LOG.error("Number Format Exception Parsing Metadata Line:" + metadata + " TimeStamp:" + timestamp);
        throw e;
      }
    }
    else if (fieldIndex == 4) {
      // MimeType
      _item.setMimeType(tokens.nextToken());
    }
    else if (fieldIndex == 5) {
      // and Record Length
      _item.setRecordLength(Integer.parseInt(tokens.nextToken()));
    }
  }
}
/** extract http headers from the current input state and append them to the
 * item's header list; the special content-truncation header is decoded into
 * the item's flags **/
private final void processHeaders() throws IOException {
  BufferedReader reader = new BufferedReader(readerFromScanBufferList(_buffers, ArcFileBuilder.UTF8_Charset));
  String line = null;
  _item.setFieldDirty(ArcFileItem.Field_HEADERITEMS);
  while ((line = reader.readLine()) != null) {
    if (line.length() != 0) {
      int colonPos = line.indexOf(':');
      ArcFileHeaderItem item = new ArcFileHeaderItem();
      if (colonPos != -1 && colonPos != line.length() - 1) {
        item.setItemKey(line.substring(0, colonPos));
        item.setItemValue(line.substring(colonPos + 1));
        // if this is our special truncation flag ...
        if (item.getItemKey().equals(Constants.ARCFileHeader_ContentTruncated)) {
          String parts[] = item.getItemValue().split(",");
          for (String part : parts) {
            // BUGFIX: this first comparison previously tested the
            // TruncatedInInflate string too (copy-paste), leaving the
            // else-if branch dead and mis-attributing the truncation cause
            if (part.equals(ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload))) {
              _item.setFlags(_item.getFlags() | ArcFileItem.Flags.TruncatedInDownload);
            }
            else if (part.equals(ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate))) {
              _item.setFlags(_item.getFlags() | ArcFileItem.Flags.TruncatedInInflate);
            }
          }
        }
      }
      else {
        // no usable key/value separator - store the whole line as a value-only header
        item.setItemValue(line);
      }
      _item.getHeaderItems().add(item);
    }
  }
}
/** advance the builder to its next parse state once the current section's
 * terminator has been consumed: metadata line -> http headers -> content **/
private final void transitionState() throws IOException {
  if (_state == State.LookingForMetadata) {
    // the queued buffers now hold exactly the metadata line
    String metadataline = readLine(ASCII_Charset);
    try {
      processMetadataLine(metadataline);
    }
    catch (NumberFormatException e) {
      LOG.error("Error Parsing Metadata Line:" + metadataline + " Length:" + metadataline.length());
      throw e;
    }
    _state = ArcFileBuilder.State.LookingForHeaderTerminator;
  }
  else if (_state == State.LookingForHeaderTerminator) {
    // the queued buffers now hold the complete header section
    processHeaders();
    _state = ArcFileBuilder.State.ReadingContent;
    // pre-size the content buffer for the read phase
    _content.setCapacity(BLOCK_SIZE);
  }
}
/** inform builder that input for the current item has been exhausted
 *
 * @return ArcFileItem - the fully constructed ArcFileItem object if construction was successful
 * @throws IOException - if the stream ended before any content was accumulated
 */
public final ArcFileItem finish() throws IOException {
  final boolean complete = (_state == State.ReadingContent) && (_content.getCount() != 0);
  if (!complete) {
    throw new IOException("Incomplete ARC File Data Stream");
  }
  // hand the accumulated content to the item and detach it from the builder
  _item.setContent(_content);
  _content = new Buffer();
  return _item;
}
/**
 * Input Data into the builder
 *
 * @param buffer - a piece of uncompressed content
 * @throws IOException - throws exception if building fails
 */
public final void inputData(ByteBuffer buffer) throws IOException {
  // set the buffer as the active buffer ...
  _activeBuffer = buffer;
  // scan looking for terminator
  while (_activeBuffer != null && _activeBuffer.remaining() != 0) {
    // if not reading content then
    if (_state != ArcFileBuilder.State.ReadingContent) {
      // read a byte at a time ...
      byte b = _activeBuffer.get();
      // and if the byte is a delimiter ...
      if (b == '\r' || b == '\n') {
        // BUGFIX: guard the pattern accumulator - a run of 5+ consecutive
        // delimiter bytes used to overflow the 4-byte patternBuffer
        // (ArrayIndexOutOfBoundsException); slide the window left so the
        // most recent 4 delimiters are always the ones tested
        if (patternSize == patternBuffer.length) {
          System.arraycopy(patternBuffer, 1, patternBuffer, 0, patternBuffer.length - 1);
          patternSize = patternBuffer.length - 1;
        }
        // add it to our pattern buffer
        patternBuffer[patternSize++] = b;
        // and check for pattern match (terminator match)
        if (checkForTerminator()) {
          transitionState();
        }
      }
      // otherwise reset pattern buffer
      else {
        patternSize = 0;
      }
    }
    else {
      // calculate available storage in buffer ...
      int available = _content.getCapacity() - _content.getCount();
      // if we need more room ...
      if (available < _activeBuffer.remaining()) {
        // figure out how much to grow buffer by ...
        int growByAmount = Math.max(_activeBuffer.remaining() - available, BLOCK_SIZE * 2);
        // and grow the buffer ...
        _content.setCapacity(_content.getCapacity() + growByAmount);
      }
      // copy the buffer data in one go ...
      _content.append(_activeBuffer.array(), _activeBuffer.position() + _activeBuffer.arrayOffset(), _activeBuffer.remaining());
      _activeBuffer = null;
    }
  }
  // now if we reached the end of the buffer while scanning for a token ...
  if (_activeBuffer != null) {
    // add entire buffer to buffer list ...
    _activeBuffer.rewind();
    _buffers.add(_activeBuffer);
    _activeBuffer = null;
  }
}
}
//////////////////////////////////////////////////////////////////////////////////
// test routines
//////////////////////////////////////////////////////////////////////////////////
/** exercise the reader against an on-disk ARC file: a background thread polls
 * for decoded items while this thread feeds the file in BLOCK_SIZE chunks.
 * NOTE(review): getNextItem()/hasMoreItems() run on the spawned thread while
 * available()/finished() run on the caller's thread - the hand-off relies on
 * the blocking queue plus the _bytesAvailable lock **/
public void testReader(File arcFileItem) throws Exception {
  resetState();
  // consumer thread: decode items as data becomes available
  Thread thread = new Thread(new Runnable() {
    public void run() {
      try {
        TriStateResult result;
        while ((result = hasMoreItems()) != TriStateResult.NoMoreItems) {
          if (result == TriStateResult.MoreItems) {
            ArcFileItem item = null;
            // poll until the decoder has accumulated enough data for one item
            while ((item = getNextItem()) == null) {
              LOG.info("Waiting to Read Next Item...");
              try {
                Thread.sleep(1000);
              } catch (InterruptedException e) {
              }
            }
            LOG.info("GOT Item URL:" + item.getUri() + " OFFSET:" + item.getArcFilePos() + " ContentSize:" + item.getContent().getCount());
            for (ArcFileHeaderItem headerItem : item.getHeaderItems()) {
              if (headerItem.isFieldDirty(ArcFileHeaderItem.Field_ITEMKEY)) {
                //LOG.info("Header Item:" + headerItem.getItemKey() + " :" + headerItem.getItemValue());
              }
              else {
                //LOG.info("Header Item:" + headerItem.getItemValue());
              }
            }
            //LOG.info("Content Length:" + item.getContent().getCount());
          }
          else {
            // LOG.info("Has More Items Returned Need More Data. Sleeping");
            try {
              Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
          }
        }
        LOG.info("NO MORE ITEMS... BYE");
      }
      catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
      }
    }
  });
  // run the thread ...
  thread.start();
  // producer side: stream the file into the reader in BLOCK_SIZE chunks
  ReadableByteChannel channel = Channels.newChannel(new FileInputStream(arcFileItem));
  try {
    for (;;) {
      ByteBuffer buffer = ByteBuffer.allocate(BLOCK_SIZE);
      int bytesRead = channel.read(buffer);
      LOG.info("Read " + bytesRead + " From File");
      if (bytesRead == -1) {
        finished();
        break;
      }
      else {
        buffer.flip();
        available(buffer);
      }
    }
  }
  finally {
    channel.close();
  }
  // now wait for thread to die ...
  LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE");
  thread.join();
  LOG.info("Done Reading File.... ArcFileThread to DIED");
}
/** simple command-line driver: decode the ARC file named by args[0] **/
public static void main(String[] args) {
  final File inputFile = new File(args[0]);
  final StreamingArcFileReader reader = new StreamingArcFileReader(true);
  try {
    reader.testReader(inputFile);
  } catch (Exception e) {
    e.printStackTrace();
  }
}
}