ArchiveReader.java example

Explorer

webarchive-commons-master
- src
  - main
    - java
      - org
        archive
        RecoverableRecordFormatException.java
        extract
        CDXExtractorOutput.java
        DumpingExtractorOutput.java
        ExtractingResourceFactoryMapper.java
        ExtractingResourceProducer.java
        ExtractorOutput.java
        FilteredExtractorOuput.java
        JSONViewExtractorOutput.java
        ProducerUtils.java
        RealCDXExtractorOutput.java
        ResourceExtractor.java
        ResourceFactoryMapper.java
        WARCMetadataRecordExtractorOutput.java
        WATExtractorOutput.java
        format
        ArchiveFileConstants.java
        arc
        ARCConstants.java
        ARCFormatException.java
        ARCMetaData.java
        ARCMetaDataParser.java
        FiledescRecord.java
        FiledescRecordParser.java
        cdx
        CDX09Line.java
        CDX11Line.java
        CDXFieldConstants.java
        CDXFile.java
        CDXInputSource.java
        CDXLine.java
        CDXLineFactory.java
        FieldSplitFormat.java
        FieldSplitLine.java
        MultiCDXInputSource.java
        StandardCDXLineFactory.java
        dns
        DNSParseException.java
        DNSRecord.java
        DNSResponse.java
        DNSResponseParser.java
        gzip
        GZIPConstants.java
        GZIPDecoder.java
        GZIPFExtraRecord.java
        GZIPFExtraRecords.java
        GZIPFooter.java
        GZIPFormatException.java
        GZIPHeader.java
        GZIPMemberSeries.java
        GZIPMemberWriter.java
        GZIPMemberWriterCommittedOutputStream.java
        GZIPSeriesMember.java
        GZIPStaticHeader.java
        zipnum
        LineBufferingIterator.java
        MultiBlockIterator.java
        SummaryBlockIterator.java
        SummaryLine.java
        TimestampBestPickDedupIterator.java
        TimestampCustomDedupIterator.java
        TimestampDedupIterator.java
        ZipNumBlockLoader.java
        ZipNumCluster.java
        ZipNumIndex.java
        ZipNumParams.java
        ZipNumWriter.java
        http
        DumpingHTTPParseObserver.java
        HttpConstants.java
        HttpHeader.java
        HttpHeaderObserver.java
        HttpHeaderParser.java
        HttpHeaders.java
        HttpMessage.java
        HttpMessageParser.java
        HttpParseException.java
        HttpParseObserver.java
        HttpRequest.java
        HttpRequestMessage.java
        HttpRequestMessageObserver.java
        HttpRequestMessageParser.java
        HttpRequestParser.java
        HttpResponse.java
        HttpResponseMessage.java
        HttpResponseMessageObserver.java
        HttpResponseMessageParser.java
        HttpResponseParser.java
        json
        CompoundORJSONPathSpec.java
        CrossProductOfLists.java
        JSONPathSpec.java
        JSONPathSpecFactory.java
        JSONUtils.java
        JSONView.java
        SimpleJSONPathSpec.java
        text
        charset
        CharsetDetector.java
        RotatingCharsetDetector.java
        StandardCharsetDetector.java
        html
        CDATALexer.java
        LexParser.java
        NodeUtils.java
        ParseObserver.java
        warc
        WARCConstants.java
        WARCRecordWriter.java
        hadoop
        ArchiveJSONViewLoader.java
        ArchiveMetadataLoader.java
        FilenameInputFormat.java
        PerMapOutputFormat.java
        ResourceContext.java
        ResourceInputFormat.java
        ResourceRecordReader.java
        func
        JSONViewEvalFunc.java
        TupleFunc.java
        URLResolverFunc.java
        httpclient
        ConfigurableX509TrustManager.java
        HttpRecorderGetMethod.java
        HttpRecorderMethod.java
        HttpRecorderPostMethod.java
        SingleHttpConnectionManager.java
        ThreadLocalHttpConnectionManager.java
        io
        ArchiveFileConstants.java
        ArchiveReader.java
        ArchiveReaderFactory.java
        ArchiveRecord.java
        ArchiveRecordHeader.java
        ArraySeekInputStream.java
        BufferedSeekInputStream.java
        CharSubSequence.java
        CompositeFileInputStream.java
        CompositeFileReader.java
        Endian.java
        GZIPMembersInputStream.java
        GenerationFileHandler.java
        GenericReplayCharSequence.java
        GzipHeader.java
        HeaderedArchiveRecord.java
        LoudObjectOutputStream.java
        MiserOutputStream.java
        NoGzipMagicException.java
        ObjectPlusFilesInputStream.java
        ObjectPlusFilesOutputStream.java
        OriginSeekInputStream.java
        Preformatter.java
        RandomAccessInputStream.java
        RandomAccessOutputStream.java
        ReadSource.java
        RecorderIOException.java
        RecorderLengthExceededException.java
        RecorderTimeoutException.java
        RecorderTooMuchHeaderException.java
        RecordingInputStream.java
        RecordingOutputStream.java
        RecoverableIOException.java
        ReplayCharSequence.java
        ReplayInputStream.java
        RepositionableInputStream.java
        SafeSeekInputStream.java
        SeekInputStream.java
        SeekReader.java
        SeekReaderCharSequence.java
        SinkHandlerLogThread.java
        UTF8Bytes.java
        WriterPool.java
        WriterPoolMember.java
        WriterPoolSettings.java
        arc
        ARC2WCDX.java
        ARCConstants.java
        ARCLocation.java
        ARCReader.java
        ARCReaderFactory.java
        ARCRecord.java
        ARCRecordMetaData.java
        ARCUtils.java
        ARCWriter.java
        ARCWriterPool.java
        WriterPoolSettingsData.java
        warc
        WARCConstants.java
        WARCReader.java
        WARCReaderFactory.java
        WARCRecord.java
        WARCRecordInfo.java
        WARCWriter.java
        WARCWriterPool.java
        WARCWriterPoolSettings.java
        WARCWriterPoolSettingsData.java
        net
        DownloadURLConnection.java
        FTPException.java
        PublicSuffixes.java
        md5
        Handler.java
        Md5URLConnection.java
        rsync
        Handler.java
        RsyncURLConnection.java
        resource
        AbstractEmptyResource.java
        AbstractResource.java
        MetaData.java
        Resource.java
        ResourceConstants.java
        ResourceContainer.java
        ResourceFactory.java
        ResourceParseException.java
        ResourceProducer.java
        TransformingResourceProducer.java
        arc
        ARCResource.java
        ARCResourceFactory.java
        record
        FiledescResource.java
        FiledescResourceFactory.java
        generic
        GenericResourceProducer.java
        GenericStreamResource.java
        gzip
        GZIPMetaData.java
        GZIPResource.java
        GZIPResourceContainer.java
        html
        ExtractingParseObserver.java
        HTMLMetaData.java
        HTMLResource.java
        HTMLResourceFactory.java
        http
        HTTPHeadersResource.java
        HTTPHeadersResourceFactory.java
        HTTPRequestResource.java
        HTTPRequestResourceFactory.java
        HTTPResponseResource.java
        HTTPResponseResourceFactory.java
        producer
        ARCFile.java
        EnvelopedResourceFile.java
        WARCFile.java
        warc
        WARCResource.java
        WARCResourceFactory.java
        record
        DNSResource.java
        DNSResourceFactory.java
        WARCJSONMetaDataResource.java
        WARCJSONMetaDataResourceFactory.java
        WARCMetaDataResource.java
        WARCMetaDataResourceFactory.java
        streamcontext
        AbstractBufferingStream.java
        ByteArrayWrappedStream.java
        HDFSStream.java
        HTTP11Stream.java
        RandomAccessFileStream.java
        SimpleStream.java
        Stream.java
        StreamWrappedInputStream.java
        uid
        RecordIDGenerator.java
        UUIDGenerator.java
        url
        AggressiveIACanonicalizerRules.java
        AggressiveIAURLCanonicalizer.java
        BasicURLCanonicalizer.java
        CanonicalizeRules.java
        CanonicalizerConstants.java
        DefaultIACanonicalizerRules.java
        DefaultIAURLCanonicalizer.java
        ExtractRule.java
        GoogleURLCanonicalizer.java
        HandyURL.java
        IAURLCanonicalizer.java
        LaxURI.java
        LaxURLCodec.java
        NonMassagingIAURLCanonicalizer.java
        OrdinaryIACanonicalizerRules.java
        OrdinaryIAURLCanonicalizer.java
        RewriteRule.java
        SURT.java
        SURTTokenizer.java
        URLCanonicalizer.java
        URLKeyMaker.java
        URLParser.java
        URLRegexTransformer.java
        UrlSurtRangeComputer.java
        UsableURI.java
        UsableURIFactory.java
        WaybackURLKeyMaker.java
        util
        ArchiveUtils.java
        Base32.java
        ByteOp.java
        CrossProduct.java
        DateUtils.java
        DevUtils.java
        FileNameSpec.java
        FileUtils.java
        GeneralURIStreamFactory.java
        Grep.java
        HMACSigner.java
        IAUtils.java
        InetAddressUtil.java
        InterruptibleCharSequence.java
        IterableLineIterator.java
        LaxHttpParser.java
        MimetypeUtils.java
        NestedMap.java
        PrefixSet.java
        ProcessUtils.java
        ProgressStatisticsReporter.java
        PropertyUtils.java
        Recorder.java
        Reporter.java
        SURT.java
        StreamCopy.java
        StringFieldExtractor.java
        StringParse.java
        SurtPrefixSet.java
        TextUtils.java
        TmpDirTestCase.java
        anvl
        ANVLRecord.java
        Element.java
        Label.java
        SubElement.java
        Value.java
        binsearch
        AbstractSeekableLineReader.java
        ByteBufferInputStream.java
        FieldExtractingSLR.java
        FileSearchTool.java
        SeekCDXBenchmarker.java
        SeekableLineReader.java
        SeekableLineReaderFactory.java
        SeekableLineReaderIterator.java
        SortedTextFile.java
        WrappedSeekableLineReader.java
        impl
        HDFSSeekableLineReader.java
        HDFSSeekableLineReaderFactory.java
        HTTPSeekableLineReader.java
        HTTPSeekableLineReaderFactory.java
        MappedSeekableLineReader.java
        MappedSeekableLineReaderFactory.java
        NIOSeekableLineReader.java
        NIOSeekableLineReaderFactory.java
        RandomAccessFileSeekableLineReader.java
        RandomAccessFileSeekableLineReaderFactory.java
        http
        ApacheHttp31SLR.java
        ApacheHttp31SLRFactory.java
        ApacheHttp43SLR.java
        ApacheHttp43SLRFactory.java
        HTTPURLConnSLR.java
        HTTPURLConnSLRFactory.java
        io
        BytesReadObserver.java
        CRCInputStream.java
        CRCOutputStream.java
        CommitedOutputStream.java
        EOFNotifyingInputStream.java
        EOFObserver.java
        MultiMemberOpenJDKGZIPInputStream.java
        NotifyingInputStream.java
        PushBackOneByteInputStream.java
        RuntimeIOException.java
        iterator
        AbstractPeekableIterator.java
        BoundedStringIterator.java
        CachingStringFilter.java
        CloseableCompositeIterator.java
        CloseableIterator.java
        CloseableIteratorUtil.java
        CloseableIteratorWrapper.java
        FilterStringIterator.java
        LineReadingIterator.java
        LookaheadIterator.java
        PeekableIterator.java
        PrefixMatchStringIterator.java
        RegexLineIterator.java
        SortedCompositeIterator.java
        StartBoundedStringIterator.java
        StringFilter.java
        StringTransformer.java
        TransformingIteratorWrapper.java
        TransformingPrefixStringFilter.java
        zip
        GZIPMembersInputStream.java
        GzipHeader.java
        NoGzipMagicException.java
        OpenJDK7GZIPInputStream.java
        OpenJDK7InflaterInputStream.java
  - test
    - java
      - org
        archive
        extract
        RealCDXExtractorOutputTest.java
        format
        dns
        DNSResponseParserTest.java
        gzip
        GZIPMemberSeriesTest.java
        GZIPMemberWriterTest.java
        zipnum
        ZipNumWriterTest.java
        http
        HttpRequestMessageParserTest.java
        HttpResponseParserTest.java
        json
        CompoundORJSONPathSpecTest.java
        JSONPathSpecFactoryTest.java
        JSONViewTest.java
        SimpleJSONPathSpecTest.java
        text
        html
        CDATALexerTest.java
        io
        ArchiveReaderFactoryTest.java
        BufferedSeekInputStreamTest.java
        HeaderedArchiveRecordTest.java
        RecordingInputStreamTest.java
        RecordingOutputStreamTest.java
        ReplayCharSequenceTest.java
        RepositionableInputStreamTest.java
        arc
        ARCReaderFactoryTest.java
        ARCWriterPoolTest.java
        ARCWriterTest.java
        warc
        WARCReaderFactoryTest.java
        WARCWriterTest.java
        net
        PublicSuffixesTest.java
        resource
        html
        ExtractingParseObserverTest.java
        HTMLMetaDataTest.java
        uid
        UUIDGeneratorTest.java
        url
        AggressiveIAURLCanonicalizerTest.java
        BasicURLCanonicalizerTest.java
        HandyURLTest.java
        IAURLCanonicalizerTest.java
        OrdinaryIAURLCanonicalizerTest.java
        URLParserTest.java
        URLRegexTransformerTest.java
        UsableURIFactoryTest.java
        UsableURITest.java
        WaybackURLKeyMakerTest.java
        util
        ArchiveUtilsTest.java
        ByteOpTest.java
        CrossProductTest.java
        FileUtilsTest.java
        InterruptibleCharSequenceTest.java
        MimetypeUtilsTest.java
        PropertyUtilsTest.java
        StringFieldExtractorTest.java
        TestUtils.java
        anvl
        ANVLRecordTest.java
        binsearch
        SortedTextFileTest.java
        iterator
        CachingStringFilterTest.java
        FilterStringIteratorTest.java
        SortedCompositeIteratorTest.java
        zip
        GZIPMembersInputStreamTest.java

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io;


import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.archive.util.MimetypeUtils;
import org.archive.util.zip.GZIPMembersInputStream;

import com.google.common.io.CountingInputStream;


/**
 * Reader for an Archive file of Archive {@link ArchiveRecord}s.
 * @author stack
 * @version $Date$ $Version$
 */
public abstract class ArchiveReader implements ArchiveFileConstants, Iterable<ArchiveRecord>, Closeable {    
    /**
     * True if this Archive file is compressed. Starts out false.
     */
    private boolean compressed = false;
    
    /**
     * True if we should digest as we read. Starts out true.
     */
    private boolean digest = true;
    
    /**
     * True if the parse should be strict. Starts out false.
     */
    private boolean strict = false;
    
    /**
     * Archive file input stream.
     *
     * Keep it around so we can close it when done.
     *
     * <p>Not assigned by the constructor of this class; presumably set by
     * subclasses. Should support at least 1 byte mark/reset (the record
     * iterator probes for more content with a one byte mark/read/reset).
     * Make it protected so subclasses have access.
     */
    protected InputStream in = null;
    
    /**
     * Maximum number of recoverable exceptions in a row.
     * If more than this number occur in a row, we'll let out the exception
     * rather than go back in for yet another retry.
     */
    public static final int MAX_ALLOWED_RECOVERABLES = 10;
    

    /**
     * The Record currently being read.
     *
     * Keep this ongoing reference so we'll close the record even if the caller
     * doesn't.
     */
    private ArchiveRecord currentRecord = null;
    
    /**
     * Descriptive string for the Archive file we're going against:
     * full path, url, etc. -- depends on context in which file was made.
     */
    private String identifier = null;
    
    /**
     * Archive file version. Null until a version has been set.
     */
    private String version = null;
    
    
    protected ArchiveReader() {
        super();
    }
    
    /**
     * Records the identifier of the Archive file this reader goes against.
     * Convenience for subclass constructors.
     *
     * @param i Identifier for the Archive file.
     */
    protected void initialize(final String i) {
        this.setReaderIdentifier(i);
    }
    
    /**
     * Opens a buffered stream onto the passed file. Convenience for
     * constructors.
     *
     * <p>Note that <code>offset</code> is not used by this implementation:
     * the returned stream is positioned at the start of the file and it is
     * left to the caller to move to the wanted position.
     *
     * @param f File to read.
     * @param offset Offset at which the caller wants to start reading
     * (ignored here).
     * @return Buffered InputStream over <code>f</code>.
     * @throws IOException If the file cannot be opened.
     */
    protected InputStream getInputStream(final File f, final long offset)
    throws IOException {
        return new BufferedInputStream(new FileInputStream(f));
    }

    /**
     * @return True if this Archive file has been flagged as compressed.
     */
    public boolean isCompressed() {
        return compressed;
    }

    /**
     * Get record at passed <code>offset</code>.
     *
     * <p>Any outstanding record is cleaned up first. Only forward movement
     * is supported: the stream is advanced from its current position up to
     * <code>offset</code>.
     *
     * @param offset Byte index into file at which a record starts.
     * @return An Archive Record reference.
     * @throws IOException If the underlying stream fails, or if it ends
     * before <code>offset</code> is reached (reported as an
     * {@link EOFException}).
     * @throws UnsupportedOperationException If <code>offset</code> is
     * before the current position.
     */
    public ArchiveRecord get(long offset) throws IOException {
        cleanupCurrentRecord();
        long posn = positionForRecord(in);
        if (offset < posn) {
            throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
        }
        // InputStream.skip() is allowed to skip fewer bytes than asked for,
        // so keep going until the whole gap has been consumed.
        long remaining = offset - posn;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() < 0) {
                // A zero return from skip() does not prove end of stream;
                // a failed single byte read does.
                throw new EOFException("hit EOF " + remaining
                    + " bytes short of requested offset " + offset);
            } else {
                // The probing read consumed one byte of the gap.
                remaining--;
            }
        }
        return createArchiveRecord(this.in, offset);
    }
    
    /**
     * @return Archive Record created against the current position of the
     * stream.
     * @throws IOException
     */
    public ArchiveRecord get() throws IOException {
        final InputStream stream = this.in;
        final long currentOffset = positionForRecord(in);
        return createArchiveRecord(stream, currentOffset);
    }

    /**
     * Closes the underlying stream, if there is one, and drops our
     * reference to it. Does nothing when there is no stream.
     *
     * @throws IOException If closing the underlying stream fails.
     */
    public void close() throws IOException {
        if (this.in == null) {
            return;
        }
        this.in.close();
        this.in = null;
    }
    
    /**
     * Closes the current record, if there is one, moves to its end and
     * then forgets it.
     * @throws IOException
     */
    protected void cleanupCurrentRecord() throws IOException {
        if (this.currentRecord == null) {
            // No record outstanding, nothing to clean up.
            return;
        }
        this.currentRecord.close();
        gotoEOR(this.currentRecord);
        this.currentRecord = null;
    }
    
    /**
     * Return an Archive Record homed on <code>offset</code> into
     * <code>is</code>.
     *
     * <p>Within this class it is called by both <code>get</code> variants,
     * each time with this reader's own stream.
     * @param is Stream to read Record from.
     * @param offset Offset to find Record at.
     * @return ArchiveRecord instance.
     * @throws IOException
     */
    protected abstract ArchiveRecord createArchiveRecord(InputStream is,
    	long offset)
    throws IOException;
    
    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     *
     * <p>Called during cleanup of the current record, after that record
     * has been closed.
     * @param record Record whose end we are to move to.
     * @throws IOException
     */
    protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
    
    /**
     * @return File extension for the Archive file type read by the
     * implementing reader (presumably without a leading dot -- confirm
     * against implementations).
     */
    public abstract String getFileExtension();
    /**
     * @return File extension for the Archive file type read by the
     * implementing reader (presumably with a leading dot -- confirm
     * against implementations).
     */
    public abstract String getDotFileExtension();

    /**
     * @return Version of this Archive file, or null if none has been set.
     */
    public String getVersion() {
        return version;
    }

    /**
     * Validate the Archive file when the number of records is not known.
     *
     * Iterates over the file, throwing an exception if any record fails
     * to parse successfully.
     *
     * <p>Assumes the stream is at the start of the file.
     * @return List of all read Archive Headers.
     *
     * @throws IOException
     */
    public List<ArchiveRecordHeader> validate() throws IOException {
        // -1 tells the counting variant that the record count is unknown.
        final int unknownRecordCount = -1;
        return validate(unknownRecordCount);
    }

    /**
     * Validate the Archive file.
     *
     * Walks the records from wherever we currently are in the stream,
     * throwing an exception on the first record that does not parse or
     * that has no content and no mimetype. The reader is switched to
     * strict for the walk and is left strict afterwards.
     *
     * @param numRecords Number of records expected.  Pass -1 if number is
     * unknown.
     *
     * @return List of the headers of all records read, in read order.
     *
     * @throws IOException If a record is empty or the number of records
     * read differs from <code>numRecords</code>.
     */
    public List<ArchiveRecordHeader> validate(int numRecords) 
    throws IOException {
        final List<ArchiveRecordHeader> headers =
            new ArrayList<ArchiveRecordHeader>();
        int seen = 0;
        setStrict(true);
        final Iterator<ArchiveRecord> records = iterator();
        while (records.hasNext()) {
            seen++;
            final ArchiveRecord record = records.next();
            final boolean noContent = record.getHeader().getLength() <= 0;
            if (noContent && record.getHeader().getMimetype()
                    .equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
                throw new IOException("record content is empty.");
            }
            record.close();
            headers.add(record.getHeader());
        }

        // Only check the tally when the caller told us what to expect.
        if (numRecords != -1 && seen != numRecords) {
            throw new IOException("Count of records, " + seen
                + " is not equal to expected " + numRecords);
        }

        return headers;
    }

    /**
     * Test Archive file is valid.
     * Assumes the stream is at the start of the file.  Be aware that this
     * method makes a pass over the whole file.
     * @return True if validation ran through without any exception.
     */
    public boolean isValid() {
        try {
            validate();
            return true;
        } catch (Exception e) {
            // Any exception thrown while parsing means the file is not valid.
            return false;
        }
    }

    /**
     * @return True if the parse is strict.
     */
    public boolean isStrict() {
        return strict;
    }

    /**
     * @param s True if the parse should be strict.
     */
    public void setStrict(boolean s) {
        strict = s;
    }

    /**
     * @param d True if we should digest as we read.
     */
    public void setDigest(boolean d) {
        digest = d;
    }

    /**
     * @return True if digesting as we read is switched on.
     */
    public boolean isDigest() {
        return digest;
    }
 
    /**
     * @return Logger named after the runtime class of this reader.
     */
    protected Logger getLogger() {
        final String loggerName = getClass().getName();
        return Logger.getLogger(loggerName);
    }
    
    /**
     * Returns an ArchiveRecord iterator.
     * Any record still outstanding is cleaned up before the iterator is
     * handed out; a failure doing so surfaces as a RuntimeException
     * wrapping the IOException.
     * Of note, on IOException, especially if ZipException reading compressed
     * ARCs, rather than fail the iteration, try moving to the next record.
     * If {@link ArchiveReader#strict} is not set, this will usually succeed.
     * @return An iterator over ARC records.
     */
    public Iterator<ArchiveRecord> iterator() {
        try {
            // Eat up any record outstanding.
            cleanupCurrentRecord();
        } catch (IOException cleanupFailure) {
            throw new RuntimeException(cleanupFailure);
        }

        final Iterator<ArchiveRecord> records = new ArchiveRecordIterator();
        return records;
    }

	/**
	 * @param compressed True if this Archive file is compressed.
	 */
	protected void setCompressed(boolean compressed) {
		this.compressed = compressed;
	}

    /**
     * @return The record currently being read, or null if none.
     * The original notes say this is the arcfile header record right
     * after construction; that is up to subclasses and is not visible in
     * this class.
     * @see #get()
     */
	protected ArchiveRecord getCurrentRecord() {
		return currentRecord;
	}

	/**
	 * Remembers the passed record as the one currently being read.
	 *
	 * @param r Record to remember.
	 * @return The passed record, to allow chaining.
	 */
	protected ArchiveRecord currentRecord(final ArchiveRecord r) {
		currentRecord = r;
		return r;
	}

	/**
	 * @return The Archive file input stream, or null if there is none.
	 */
	protected InputStream getIn() {
		return this.in;
	}

	/**
	 * @param in Stream to use as the Archive file input stream.
	 */
	protected void setIn(InputStream in) {
		this.in = in;
	}

	/**
	 * @param version Version of this Archive file.
	 */
	protected void setVersion(String version) {
		this.version = version;
	}

	/**
	 * @return Descriptive string for the Archive file this reader goes
	 * against, or null if none has been set.
	 */
	public String getReaderIdentifier() {
		return identifier;
	}

	/**
	 * @param i Descriptive string for the Archive file this reader goes
	 * against.
	 */
	protected void setReaderIdentifier(final String i) {
		identifier = i;
	}
	
    /**
     * Log on stderr.
     * This method bypasses the logging system and writes the level and
     * the message, separated by a space, direct to System.err.
     * Should not generally be used; logging should go via the logging
     * system.  It's meant for the rare ERROR and WARNING messages that
     * come of cmdline usage of ARCReader.
     * Override if using ARCReader in a context where there is no stderr or
     * where you'd like output to go somewhere other than System.err.
     * @param level Level to log message at.
     * @param message Message to log.
     */
    public void logStdErr(Level level, String message) {
        final String line = level.toString() + " " + message;
        System.err.println(line);
    }
    
//    /**
//     * Add buffering to RandomAccessInputStream.
//     */
//    protected class RandomAccessBufferedInputStream
//    extends BufferedInputStream implements RepositionableStream {
//
//        public RandomAccessBufferedInputStream(RandomAccessInputStream is)
//        		throws IOException {
//            super(is);
//        }
//
//        public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
//        		throws IOException {
//            super(is, size);
//        }
//
//        public long position() throws IOException {
//            // Current position is the underlying files position
//            // minus the amount thats in the buffer yet to be read.
//            return ((RandomAccessInputStream)this.in).position() -
//            	(this.count - this.pos);
//        }
//
//        public void position(long position) throws IOException {
//            // Force refill of buffer whenever there's been a seek.
//            this.pos = 0;
//            this.count = 0;
//            ((RandomAccessInputStream)this.in).position(position);
//        }
//        
//        public int available() throws IOException {
//            // Avoid overflow on large datastreams
//            long amount = (long)in.available() + (long)(count - pos);
//            return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount;
//        }
//    }
    
    /**
     * Inner ArchiveRecord Iterator class.
     * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
     * trouble pulling record from underlying stream.
     *
     * <p>{@link #hasNext()} is not a passive probe: it first calls
     * <code>cleanupCurrentRecord()</code> on the enclosing reader and only
     * then peeks at the stream for more content.
     * @author stack
     */
    protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {
        private final Logger logger =
            Logger.getLogger(this.getClass().getName());

        /**
         * Cleans up the current record, then probes the stream for more
         * content.
         * @return True if we have more records to read.
         * @exception RuntimeException Can throw an IOException wrapped in a
         * RuntimeException if a problem reading underlying stream (Corrupted
         * gzip, etc.).  Only thrown in strict mode; otherwise the failure is
         * logged.
         */
        public boolean hasNext() {
            // Call close on any extant record.  This will scoot us past
            // any content not yet read.
            try {
                cleanupCurrentRecord();
            } catch (IOException e) {
                if (isStrict()) {
                    throw new RuntimeException(e);
                }
                if (e instanceof EOFException) {
                    // Stream ended in the middle of the record; nothing
                    // more can follow, so report end of iteration.
                    logger.warning("Premature EOF cleaning up " + 
                        currentRecord.getHeader().toString() + ": " +
                        e.getMessage());
                    return false;
                }
                // If not strict, try going again.  We might be able to skip
                // over the bad record.
                logger.log(Level.WARNING,"Trying skip of failed record cleanup of " +
                    currentRecord.getHeader().toString() + ": " +
                    e.getMessage(), e);
            }
            return innerHasNext();
        }
        
        /**
         * Peeks a single byte from the input stream using mark/reset.
         * @return True if a byte could be read; false at end of stream or
         * if the probe itself failed with an IOException (which is logged).
         */
        protected boolean innerHasNext(){
            try {
                getIn().mark(1); 
                int c = getIn().read();
                getIn().reset(); 
                return c > -1; 
            } catch (IOException e) {
                logger.log(Level.WARNING,"problem probing for more content",e);
                return false; 
            } 
        }

        /**
         * Tries to move to next record if we get
         * {@link RecoverableIOException}. If not <code>strict</code>
         * tries to move to next record if we get an
         * {@link IOException}.
         * @return Next object.
         * @exception RuntimeException Throws a runtime exception,
         * usually a wrapping of an IOException, if trouble getting
         * a record (Throws exception rather than return null).
         */
        public ArchiveRecord next() {
            // Remembered only so that error messages can say where the
            // failing record started; stays -1 if the position lookup fails.
            long offset = -1;
            try {
                offset = positionForRecord(getIn()); 
                return exceptionNext();
            } catch (IOException e) {
                if (!isStrict()) {
                    // Retry though an IOE.  Maybe we will succeed reading
                    // subsequent record.
                    try {
                        if (hasNext()) {
                            getLogger().warning("Bad Record. Trying skip " +
                                "(Record start " +  offset + "): " +
                                e.getMessage());
                            return exceptionNext();
                        }
                        // Else we are at last record.  Iterator#next is
                        // expecting value. We do not have one. Throw exception.
                        throw new RuntimeException("Retried but no next " + 
                            "record (Record start " + offset + ")", e);
                    } catch (IOException e1) {
                        throw new RuntimeException("After retry (Offset " +
                                offset + ")", e1);
                    }
                }
                throw new RuntimeException("(Record start " + offset + ")", e);
            }
        }
        
        /**
         * A next that throws exceptions and has handling of
         * recoverable exceptions moving us to next record. Can call
         * hasNext which itself may throw exceptions.
         * @return Next record.
         * @throws IOException
         * @throws RuntimeException Thrown when we've reached maximum
         * retries, or when a recoverable failure is followed by no further
         * records to retry on.
         */
        protected ArchiveRecord exceptionNext()
        throws IOException, RuntimeException {
            ArchiveRecord result = null;
            IOException ioe = null;
            // Set when we stop because the stream has no more records,
            // as opposed to having used up all the allowed retries.
            boolean exhausted = false;
            for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
                    result == null; i--) {
                ioe = null;
                try {
                    result = innerNext();
                } catch (RecoverableIOException e) {
                    ioe = e;
                    getLogger().warning(e.getMessage());
                    if (hasNext()) {
                        continue;
                    }
                    // No records left.  Throw exception rather than
                    // return null.  The caller is expecting to get
                    // back a record since they've just called
                    // hasNext.
                    exhausted = true;
                    break;
                }
            }
            if (ioe != null) {
                // Wrap the recoverable ioe in a RuntimeException so
                // it goes out past checks for IOE.
                if (exhausted) {
                    // We did not necessarily retry the maximum number of
                    // times; say what actually happened.
                    throw new RuntimeException("No records left after " +
                        "recoverable failure", ioe);
                }
                // Then we did MAX_ALLOWED_RECOVERABLES retries.
                throw new RuntimeException("Retried " +
                    MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
            }
            return result;
        }
        
        /**
         * Reads the record at the position reported for the current
         * input stream.
         * @return Record returned by the enclosing reader's get(long).
         * @throws IOException
         */
        protected ArchiveRecord innerNext() throws IOException {
            return get(positionForRecord(getIn()));
        }
        
        /**
         * Not supported.
         * @throws UnsupportedOperationException Always.
         */
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
    
    /**
     * Reports the position to associate with the record about to be read.
     * @param in Stream to query.  Must be either a GZIPMembersInputStream
     * or a CountingInputStream; anything else fails the cast.
     * @return <code>getCurrentMemberStart()</code> for a
     * GZIPMembersInputStream, otherwise the CountingInputStream's
     * <code>getCount()</code>.
     */
    protected static long positionForRecord(InputStream in) {
        if (in instanceof GZIPMembersInputStream) {
            GZIPMembersInputStream gzipIn = (GZIPMembersInputStream)in;
            return gzipIn.getCurrentMemberStart();
        }
        CountingInputStream countingIn = (CountingInputStream)in;
        return countingIn.getCount();
    }
    
    /**
     * Removes a trailing extension from a name.
     * @param name Name to strip.
     * @param ext Extension to remove, e.g. '.gz'.
     * @return <code>name</code> without the trailing <code>ext</code>, or
     * <code>name</code> unchanged if it does not end with <code>ext</code>.
     */
    protected static String stripExtension(final String name,
    		final String ext) {
        if (name.endsWith(ext)) {
            final int keep = name.length() - ext.length();
            return name.substring(0, keep);
        }
        return name;
    }
    
    /**
     * @return Name component (per {@link File#getName()}) of the reader
     * identifier, i.e. the archive file name without its parent path.
     */
    public String getFileName() {
        final File archive = new File(getReaderIdentifier());
        return archive.getName();
    }

    /**
     * @return Result of {@link #getFileName()} with the compression
     * extension and this reader's dot file extension stripped off.
     */
    public String getStrippedFileName() {
        final String fileName = getFileName();
        final String extension = getDotFileExtension();
        return getStrippedFileName(fileName, extension);
    }
    
    /**
     * Strips the compressed-file extension and then the passed file
     * extension from a name.
     * @param name Name of ARCFile.
     * @param dotFileExtension '.arc' or '.warc', etc.
     * @return short name of Archive file.
     */
    public static String getStrippedFileName(String name,
    		final String dotFileExtension) {
        // Compression suffix goes first so the format extension ends up
        // at the tail of the name for the second strip.
        final String uncompressedName = stripExtension(name,
            ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
        return stripExtension(uncompressedName, dotFileExtension);
    }
    
    /**
     * Interprets a string as a boolean flag.
     * @param value Value to test.  May be null or empty.
     * @return True if value is 'true' (in any letter case), else false.
     */
    protected static boolean getTrueOrFalse(final String value) {
        // parseBoolean is null-safe and compares against "true" ignoring
        // case, so null, empty and every other string yield false.
        return Boolean.parseBoolean(value);
    }
    
    /**
     * Writes the whole archive in the requested format.
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     * @return True if handled; false if the format is not one of the
     * dump or CDX formats handled here.
     */
    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        if (format.equals(DUMP)) {
            // No point digesting dumping.
            setDigest(false);
            dump(false);
            return true;
        }
        if (format.equals(GZIP_DUMP)) {
            // No point digesting dumping.
            setDigest(false);
            dump(true);
            return true;
        }
        // Write output as pseudo-CDX file.  See
        // http://www.archive.org/web/researcher/cdx_legend.php
        // and http://www.archive.org/web/researcher/example_cdx.php.
        if (format.equals(CDX)) {
            // CDX lines to stdout.
            cdxOutput(false);
            return true;
        }
        if (format.equals(CDX_FILE)) {
            // CDX lines to a file.
            cdxOutput(true);
            return true;
        }
        return false;
    }
    
    /**
     * Writes a CDX header line followed by one CDX line per record.
     *
     * <p>When writing to a file, the file name is the reader identifier
     * with the compressed-file extension and this reader's dot file
     * extension stripped, plus '.' and the CDX constant.  The file is
     * closed even if writing the header or any record fails.
     * @param toFile True to write to the CDX file, false to write to
     * stdout.
     * @throws IOException
     */
    protected void cdxOutput(boolean toFile)
    throws IOException {
        BufferedWriter cdxWriter = null;
        try {
            if (toFile) {
                String cdxFilename = stripExtension(getReaderIdentifier(),
                    DOT_COMPRESSED_FILE_EXTENSION);
                cdxFilename = stripExtension(cdxFilename,
                    getDotFileExtension());
                cdxFilename += ('.' + CDX);
                cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
            }

            // The 'V'/'v' column letter depends on whether the archive
            // is compressed.
            String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
                + " n g";
            if (cdxWriter != null) {
                cdxWriter.write(header);
                cdxWriter.newLine();
            } else {
                System.out.println(header);
            }

            String strippedFileName = getStrippedFileName();
            for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
                ArchiveRecord r = ii.next();
                String cdxLine = r.outputCdx(strippedFileName);
                if (cdxWriter != null) {
                    cdxWriter.write(cdxLine);
                    cdxWriter.newLine();
                } else {
                    System.out.println(cdxLine);
                }
            }
        } finally {
            // Covers failures anywhere after the file was opened,
            // including while writing the header.
            if (cdxWriter != null) {
                cdxWriter.close();
            }
        }
    }
    
    /**
     * Outputs the record returned by <code>get()</code> using the passed
     * format specifier.
     * @param format What format to use outputting.
     * @throws IOException
     * @return True if handled; false if the format is neither CDX nor
     * DUMP.
     */
    public boolean outputRecord(final String format)
    throws IOException {
        if (format.equals(CDX)) {
            final ArchiveRecord record = get();
            final String cdxLine = record.outputCdx(getStrippedFileName());
            System.out.println(cdxLine);
            return true;
        }
        if (format.equals(ArchiveFileConstants.DUMP)) {
            // No point digesting if dumping content.
            setDigest(false);
            get().dump();
            return true;
        }
        return false;
    }

    /**
     * Dump this file on STDOUT
     * @param compress True if dumped output is compressed.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public abstract void dump(final boolean compress)
    throws IOException, java.text.ParseException;
    
    /**
     * @param f Local file for the returned reader (presumably the file
     * that gets deleted on close; confirm against implementations).
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);
    
    /**
     * Outputs a single record from the passed reader, failing if the
     * reader does not handle the format.
     * @param r Reader instance to output a record from.
     * @param format What format to use outputting.
     * @throws IOException If the reader reports the format as unhandled,
     * or if the output itself fails.
     */
    protected static void outputRecord(final ArchiveReader r,
        final String format)
    throws IOException {
        final boolean handled = r.outputRecord(format);
        if (handled) {
            return;
        }
        throw new IOException("Unsupported format" +
            " (or unsupported on a single record): " + format);
    }
    
    /**
     * Builds the command-line options shared by archive readers: help,
     * offset, digest, strict and format.
     * @return Base Options object filled out with help, digest, strict, etc.
     * options.
     */
    protected static Options getOptions() {
        Options options = new Options();
        options.addOption(new Option("h","help", false,
            "Prints this message and exits."));
        options.addOption(new Option("o","offset", true,
            "Outputs record at this offset into file."));
        options.addOption(new Option("d","digest", true,
            "Pass true|false. Expensive. Default: true (SHA-1)."));
        options.addOption(new Option("s","strict", false,
            "Strict mode. Fails parse if incorrectly formatted file."));
        // Help text only: quoting of the format names is balanced so the
        // usage message reads correctly.
        options.addOption(new Option("f","format", true,
            "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
            "or 'nohead'. Default: 'cdx'."));
        return options;
    }
}