/*
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.io.arc;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.apache.commons.lang.StringUtils;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.util.InetAddressUtil;
import org.archive.util.LaxHttpParser;
import org.archive.util.TextUtils;

/**
 * An ARC file record.
 * Does not encompass the ARC record metadata line, just the record content.
 * @author stack
 */
public class ARCRecord extends ArchiveRecord implements ARCConstants {
    /**
     * HTTP status line object.
     *
     * May be null if the record is not HTTP.
     */
    private StatusLine httpStatus = null;

    /**
     * HTTP header bytes.
     *
     * If non-null and bytes are available, give out its contents before we
     * go back to the underlying stream.
     */
    private InputStream httpHeaderStream = null;

    /**
     * HTTP headers.
     *
     * Only populated after the headers have been read.
     */
    private Header [] httpHeaders = null;

    /**
     * Array of field names.
     *
     * Used to initialize <code>headerFieldNameKeys</code>.
     */
    private final String [] headerFieldNameKeysArray = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };

    /**
     * A list of the header field names found in the ARC file header on
     * the 3rd line.
     *
     * We used to read these from the ARC file's first record, 3rd line, but
     * now we hardcode them for the sake of improved performance.
     */
    private final List<String> headerFieldNameKeys =
        Arrays.asList(this.headerFieldNameKeysArray);

    /**
     * HTTP header bytes read while trying to read the HTTP header.
     */
    public long httpHeaderBytesRead = -1;

    /**
     * Record length from the metadata line.
     */
    public long recordDeclaredLength;

    /**
     * Compressed length; only meaningful if the source was compressed.
     */
    public long compressedBytes;

    /**
     * Actual payload data (not including the trailing newline);
     * should match record-declared-length.
     */
    public long uncompressedBytes;

    /**
     * Content-Length header value; only meaningful if the record is HTTP
     * and the header is present.
     */
    public long httpPayloadDeclaredLength;

    /**
     * Actual HTTP payload length; should match
     * http-payload-declared-length.
     */
    public long httpPayloadActualLength;

    /**
     * Errors encountered reading the record.
     */
    public List<ArcRecordErrors> errors = new ArrayList<ArcRecordErrors>();

    /**
     * Verbatim ARC record header string.
     */
    private String headerString;

    public String getHeaderString() {
        return this.headerString;
    }
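    /*
     * A minimal usage sketch: ARCRecord instances are normally obtained by
     * iterating an ARCReader rather than constructed directly.  Assumes the
     * companion ARCReaderFactory/ARCReader classes in this package and a
     * hypothetical file name:
     *
     *   ARCReader reader = ARCReaderFactory.get(new File("sample.arc.gz"));
     *   for (ArchiveRecord rec : reader) {
     *       ARCRecord arc = (ARCRecord) rec;
     *       System.out.println(arc.getMetaData().getUrl()
     *           + " " + arc.getStatusCode());
     *   }
     *   reader.close();
     */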
    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @param metaData Meta data.
     * @throws IOException
     */
    public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
            throws IOException {
        this(in, metaData, 0, true, false, true);
    }

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @param metaData Meta data.
     * @param bodyOffset Offset into the body.  Usually 0.
     * @param digest True if we're to calculate a digest for this record.
     * Not digesting saves about ~15% of cpu during an ARC parse.
     * @param strict Be strict parsing (Parsing stops if ARC improperly
     * formatted).
     * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
     * about ~20% of CPU during an ARC parse.
     * @throws IOException
     */
    public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
            int bodyOffset, boolean digest, boolean strict,
            final boolean parseHttpHeaders)
            throws IOException {
        super(in, metaData, bodyOffset, digest, strict);
        if (parseHttpHeaders) {
            this.httpHeaderStream = readHttpHeader();
        }
    }

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record's metadata
     * this instance is to represent.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Current offset into <code>in</code> (Used to keep
     * <code>position</code> properly aligned).  Usually 0.
     * @param digest True if we're to calculate a digest for this record.
     * Not digesting saves about ~15% of cpu during an ARC parse.
     * @param strict Be strict parsing (Parsing stops if ARC improperly
     * formatted).
     * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
     * about ~20% of CPU during an ARC parse.
     * @param isAlignedOnFirstRecord True if this is the first record to be
     * read from an archive.
     * @param version Version information to be returned to the ARCReader
     * constructing this record.
     * @throws IOException
     */
    public ARCRecord(InputStream in, final String identifier,
            final long offset, boolean digest, boolean strict,
            final boolean parseHttpHeaders,
            final boolean isAlignedOnFirstRecord, String version)
            throws IOException {
        super(in, null, 0, digest, strict);
        setHeader(parseHeaders(in, identifier, offset, strict,
            isAlignedOnFirstRecord, version));
        if (parseHttpHeaders) {
            this.httpHeaderStream = readHttpHeader();
        }
    }
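    /*
     * A sketch of the digest/parseHttpHeaders trade-off documented above:
     * callers that only need raw record bytes can skip digesting (~15% of
     * CPU) and HTTP header parsing (~20% of CPU).  The stream and header
     * values here are hypothetical placeholders.
     *
     *   InputStream in = ...;             // positioned at start of record
     *   ArchiveRecordHeader header = ...;
     *   ARCRecord fast = new ARCRecord(in, header, 0,
     *       false,   // digest
     *       false,   // strict
     *       false);  // parseHttpHeaders
     */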
    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record's metadata
     * this instance is to represent.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Current offset into <code>in</code> (Used to keep
     * <code>position</code> properly aligned).  Usually 0.
     * @param digest True if we're to calculate a digest for this record.
     * Not digesting saves about ~15% of cpu during an ARC parse.
     * @param strict Be strict parsing (Parsing stops if ARC improperly
     * formatted).
     * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
     * about ~20% of CPU during an ARC parse.
     * @throws IOException
     */
    public ARCRecord(InputStream in, final String identifier,
            final long offset, boolean digest, boolean strict,
            final boolean parseHttpHeaders)
            throws IOException {
        this(in, identifier, offset, digest, strict, parseHttpHeaders,
            false, null);
    }

    private ArchiveRecordHeader parseHeaders(final InputStream in,
            final String identifier, final long offset, final boolean strict,
            final boolean isAlignedOnFirstRecord, String version)
            throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(in, firstLineValues);
        int bodyOffset = 0;
        String origin = "";
        if (offset == 0 && isAlignedOnFirstRecord) {
            // If offset is zero and we were aligned at the first record on
            // creation (See #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one,
            // the record of ARC file meta info.  It's special.  In ARC
            // versions 1.x, the first record has three lines of meta info.
            // We've just read the first line.  There are two more.  The
            // second line has misc. info.  We're only interested in the
            // first field, the version number.  The third line is the list
            // of field names.  Here's what ARC file version 1.x meta content
            // looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \\
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(in, secondLineValues);
            version = secondLineValues.get(0) + "." +
                secondLineValues.get(1);
            origin = secondLineValues.get(2);
            // Just read over the 3rd line.  We used to parse it and use
            // values found here, but now we hardcode them to avoid having to
            // read this 3rd line even for random arc file accesses.
            bodyOffset += getTokenizedHeaderLine(in, null);
            // this.position = bodyOffset;
        }
        setBodyOffset(bodyOffset);
        return computeMetaData(this.headerFieldNameKeys, firstLineValues,
            version, origin, offset, identifier);
    }
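    /*
     * For reference, a well-formed version-1 record metadata line (the
     * values below are made up) tokenizes into exactly one value per
     * hardcoded key in headerFieldNameKeys:
     *
     *   http://example.org/ 10.0.0.1 20040107015752 text/html 2248
     *
     *   URL            -> http://example.org/
     *   IP-address     -> 10.0.0.1
     *   Archive-date   -> 20040107015752
     *   Content-type   -> text/html
     *   Archive-length -> 2248
     */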
    /**
     * Get a record header line as a list of tokens.
     *
     * We keep reading until we find a LINE_SEPARATOR, we reach the end of
     * file without finding one, or the line length is crazy.
     *
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled with string tokens.
     * @return Count of characters read.
     * @exception IOException If there is a problem reading the stream, no
     * line separator is found, EOF comes before EOL, or we didn't get the
     * minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list)
            throws IOException {
        // Preallocate usual line size.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException(
                    "Hit EOF before header EOL.");
            }
            c &= 0xff;
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed "
                    + " -- " + String.valueOf(MAX_HEADER_LINE_LENGTH)
                    + " -- or passed buffer doesn't contain a line (Read: "
                    + buffer.length() + ").  Here's"
                    + " some of what was read: "
                    + buffer.substring(0, Math.min(buffer.length(), 256)));
            }
            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer.  Skip it and try again.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Early ARCs sometimes had multiple spaces between
                    // fields.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // Reset to empty.
                buffer.setLength(0);
            } else {
                buffer.append((char)c);
            }
        }
        // List must have at least 3 elements in it and no more than 100.
        // If it has other than this, then it's a bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }
        // Save verbatim header string.
        this.headerString = StringUtils.join(list, " ");
        return read;
    }

    /**
     * Compute metadata fields.
     *
     * Here we check the meta field has the right number of items in it.
     *
     * @param keys Keys to use composing the headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param origin Origin of this ARC file.
     * @param offset Offset into the ARC file.
     * @param identifier Identifier for the hosting Reader.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If the number of keys doesn't match the number
     * of values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, String origin, long offset,
            final String identifier)
            throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInURL(values, keys.size());
                // If values still doesn't match the key size, try to do
                // further repair.
                if (keys.size() != values.size()) {
                    // Early ARCs had a space in the mimetype.
                    if (values.size() == (keys.size() + 1) &&
                            values.get(4).toLowerCase()
                                .startsWith("charset=")) {
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, values.get(3) + values.get(4));
                        nuvalues.add(4, values.get(5));
                        values = nuvalues;
                    } else if ((values.size() + 1) == keys.size() &&
                            isLegitimateIPValue(values.get(1)) &&
                            isDate(values.get(2)) &&
                            isNumber(values.get(3))) {
                        // Mimetype is empty.
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, "-");
                        nuvalues.add(4, values.get(3));
                        values = nuvalues;
                    }
                }
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note on stderr that the field was fixed.
            System.err.println(Level.WARNING.toString() +
                "Fixed spaces in metadata line at " +
                "offset " + offset +
                " Original: " + originalValues +
                ", New: " + values);
        }

        Map<String, Object> headerFields =
            new HashMap<String, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }

        // Add a check for tabs in URLs.  If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }

        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ORIGIN_FIELD_KEY, origin);
        headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));

        return new ARCRecordMetaData(identifier, headerFields);
    }
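    /*
     * A worked example of the space-in-URL repair performed by
     * fixSpaceInURL() below (the line itself is hypothetical).  A 6-token
     * line against 5 keys:
     *
     *   http://example.org/a b.html 10.0.0.1 20040107015752 text/html 2248
     *
     * passes the date and IP sanity checks, so the pre-IP fields are joined
     * with '%20' and the line becomes the expected 5 tokens:
     *
     *   http://example.org/a%20b.html 10.0.0.1 20040107015752 text/html 2248
     */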
    /**
     * Fix spaces in URLs.
     *
     * The ARCWriter used to write URLs with spaces in them into the ARC.
     * See <a
     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
     * crawl.log has URIs with spaces in them</a>.
     * This method fixes up such headers, converting all spaces found
     * to '%20'.
     * @param values List of metadata values.
     * @param requiredSize Expected size of the resultant values list.
     * @return New list if we successfully fixed up values, or the original
     * list if the fixup failed.
     */
    private List<String> fixSpaceInURL(List<String> values,
            int requiredSize) {
        // Do validity check.  The 3rd field from the end is a date of 14
        // numeric characters.  The 4th field from the end is the IP; all
        // fields before the IP should be concatenated together with a '%20'
        // joiner.
        if (!(values.size() > requiredSize) || values.size() < 4) {
            return values;
        }
        // Test the 3rd field from the end is a valid date.
        if (!isDate(values.get(values.size() - 3))) {
            return values;
        }
        // Test the 4th field from the end is a valid IP.
        if (!isLegitimateIPValue(values.get(values.size() - 4))) {
            return values;
        }
        List<String> newValues = new ArrayList<String>(requiredSize);
        StringBuffer url = new StringBuffer();
        for (int i = 0; i < (values.size() - 4); i++) {
            if (i > 0) {
                url.append("%20");
            }
            url.append(values.get(i));
        }
        newValues.add(url.toString());
        for (int i = values.size() - 4; i < values.size(); i++) {
            newValues.add(values.get(i));
        }
        return newValues;
    }

    private boolean isDate(final String date) {
        if (date.length() != 14) {
            return false;
        }
        return isNumber(date);
    }

    private boolean isNumber(final String n) {
        for (int i = 0; i < n.length(); i++) {
            if (!Character.isDigit(n.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    private boolean isLegitimateIPValue(final String ip) {
        if ("-".equals(ip)) {
            return true;
        }
        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
        return m != null && m.matches();
    }

    /**
     * Skip over the HTTP header if one is present.
     *
     * Subsequent reads will get the body.
     *
     * <p>Calling this method in the midst of reading the header
     * will make for strange results.  Otherwise it is safe to call at any
     * time, though it only makes sense to call it before reading any of the
     * ARC record content.
     *
     * <p>After calling this method, you can call
     * {@link #getHttpHeaders()} to get the read HTTP header.
     *
     * @throws IOException
     */
    public void skipHttpHeader() throws IOException {
        if (this.httpHeaderStream != null) {
            // Empty the httpHeaderStream.
            for (int available = this.httpHeaderStream.available();
                    this.httpHeaderStream != null &&
                        (available =
                            this.httpHeaderStream.available()) > 0;) {
                // We should be in this loop only once, so we should only do
                // this buffer allocation once.
                byte [] buffer = new byte[available];
                // The read nulls out httpHeaderStream when done with it, so
                // we need to check for null in the loop control line.
                read(buffer, 0, available);
            }
        }
    }

    public void dumpHttpHeader() throws IOException {
        if (this.httpHeaderStream == null) {
            return;
        }
        // Dump the httpHeaderStream to STDOUT.
        for (int available = this.httpHeaderStream.available();
                this.httpHeaderStream != null &&
                    (available = this.httpHeaderStream.available()) > 0;) {
            // We should be in this loop only once, so we should do this
            // buffer allocation once.
            byte[] buffer = new byte[available];
            // The read nulls out httpHeaderStream when done with it, so
            // we need to check for null in the loop control line.
            int read = read(buffer, 0, available);
            System.out.write(buffer, 0, read);
        }
    }
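    /*
     * A minimal sketch of getting at just the response body of an HTTP
     * record: skip the stored HTTP header, then drain the remaining record
     * content.  The record variable is assumed to come from an ARCReader
     * iteration as sketched near the top of this class.
     *
     *   ARCRecord arc = ...;
     *   arc.skipHttpHeader();
     *   ByteArrayOutputStream body = new ByteArrayOutputStream();
     *   byte[] buf = new byte[4096];
     *   for (int n; (n = arc.read(buf, 0, buf.length)) != -1;) {
     *       body.write(buf, 0, n);
     *   }
     */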
    /**
     * Read the HTTP header if one is present.  Technique borrowed from the
     * HttpClient HttpParser class.  Sets errors when found.
     *
     * @return ByteArrayInputStream with the HTTP header in it, or null if
     * there is no HTTP header.
     * @throws IOException
     */
    private InputStream readHttpHeader() throws IOException {
        // This can be helpful when simply iterating over records,
        // looking for problems.
        Logger logger = Logger.getLogger(this.getClass().getName());
        ArchiveRecordHeader h = this.getHeader();

        // If judged a record that doesn't have an HTTP header, return
        // immediately.
        String url = getHeader().getUrl();
        if (!url.startsWith("http") ||
                getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
            return null;
        }

        String statusLine;
        byte[] statusBytes;
        int eolCharCount = 0;
        int errOffset = 0;

        // Read the status line, skipping any errant HTTP headers found
        // before it.  This allows a larger number of 'corrupt' ARCs -- where
        // headers were accidentally inserted before the status line -- to be
        // readable.
        while (true) {
            statusBytes = LaxHttpParser.readRawLine(getIn());
            eolCharCount = getEolCharsCount(statusBytes);
            if (eolCharCount <= 0) {
                throw new RecoverableIOException(
                    "Failed to read http status where one was expected: "
                    + ((statusBytes == null) ? "" : new String(statusBytes)));
            }

            statusLine = EncodingUtil.getString(statusBytes, 0,
                statusBytes.length - eolCharCount,
                ARCConstants.DEFAULT_ENCODING);

            // If null or DELETED, break immediately.
            if ((statusLine == null) || statusLine.startsWith("DELETED")) {
                break;
            }

            // If it's actually the status line, break; otherwise continue
            // skipping any previous header values.
            if (!statusLine.contains(":") &&
                    StatusLine.startsWithHTTP(statusLine)) {
                break;
            }

            // Add bytes read to the error "offset" to add to position.
            errOffset += statusBytes.length;
        }

        if (errOffset > 0) {
            this.incrementPosition(errOffset);
        }

        if ((statusLine == null) ||
                !StatusLine.startsWithHTTP(statusLine)) {
            // Guard against a null status line before testing for DELETED.
            if (statusLine != null && statusLine.startsWith("DELETED")) {
                // Some old ARCs have deleted records like the following:
                // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
                // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
                // (follows ~29K spaces)
                // For now, throw a RecoverableIOException so that if
                // iterating over records, we keep going.  TODO: Later make
                // a legitimate ARCRecord from the deleted record rather
                // than throw an exception.
                throw new DeletedARCRecordIOException(statusLine);
            } else {
                this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_INVALID);
            }
        }

        try {
            this.httpStatus = new StatusLine(statusLine);
        } catch (IOException e) {
            logger.warning(e.getMessage() + " at offset: " + h.getOffset());
            this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
        }

        // Save off all bytes read.  Keep them as bytes rather than
        // converting to strings so we don't have to worry about encodings,
        // though this should never be a problem doing HTTP headers since
        // it's all supposed to be ascii.
        ByteArrayOutputStream baos =
            new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
        baos.write(statusBytes);

        // Now read the rest of the header lines looking for the separation
        // between header and body.
        for (byte [] lineBytes = null; true;) {
            lineBytes = LaxHttpParser.readRawLine(getIn());
            eolCharCount = getEolCharsCount(lineBytes);
            if (eolCharCount <= 0) {
                if (getIn().available() == 0) {
                    httpHeaderBytesRead += statusBytes.length;
                    logger.warning("HTTP header truncated at offset: "
                        + h.getOffset());
                    this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED);
                    this.setEor(true);
                    break;
                } else {
                    throw new IOException("Failed reading http headers: " +
                        ((lineBytes != null)? new String(lineBytes): null));
                }
            } else {
                httpHeaderBytesRead += lineBytes.length;
            }
            // Save the bytes read.
            baos.write(lineBytes);
            if ((lineBytes.length - eolCharCount) <= 0) {
                // We've finished reading the HTTP header.
                break;
            }
        }

        byte [] headerBytes = baos.toByteArray();
        // Save off where the body starts.
        this.getMetaData().setContentBegin(headerBytes.length);
        ByteArrayInputStream bais =
            new ByteArrayInputStream(headerBytes);
        if (!bais.markSupported()) {
            throw new IOException(
                "ByteArrayInputStream does not support mark");
        }
        bais.mark(headerBytes.length);
        // Read the status line.  Don't let it into the parseHeaders
        // function.  It doesn't know what to do with it.
        bais.read(statusBytes, 0, statusBytes.length);
        this.httpHeaders = LaxHttpParser.parseHeaders(bais,
            ARCConstants.DEFAULT_ENCODING);
        this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
        bais.reset();
        return bais;
    }
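    /*
     * Once readHttpHeader() has run (the default when parseHttpHeaders is
     * true), the parsed headers are available as commons-httpclient Header
     * objects.  A small sketch, with the record assumed to come from an
     * ARCReader iteration:
     *
     *   ARCRecord arc = ...;
     *   if (arc.getStatusCode() == 200) {
     *       for (Header hdr : arc.getHttpHeaders()) {
     *           System.out.println(hdr.getName() + ": " + hdr.getValue());
     *       }
     *   }
     */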
    private static class DeletedARCRecordIOException
            extends RecoverableIOException {
        private static final long serialVersionUID = 1L;

        public DeletedARCRecordIOException(final String reason) {
            super(reason);
        }
    }

    /**
     * Return the status code for this record.
     *
     * This method will return -1 until the HTTP header has been read.
     * @return Status code.
     */
    public int getStatusCode() {
        return (this.httpStatus == null)?
            -1: this.httpStatus.getStatusCode();
    }

    /**
     * @param bytes Array of bytes to examine for an EOL.
     * @return Count of end-of-line characters, or zero if none.
     */
    private int getEolCharsCount(byte [] bytes) {
        int count = 0;
        if (bytes != null && bytes.length >= 1 &&
                bytes[bytes.length - 1] == '\n') {
            count++;
            if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') {
                count++;
            }
        }
        return count;
    }

    /**
     * @return Meta data for this record.
     */
    public ARCRecordMetaData getMetaData() {
        return (ARCRecordMetaData)getHeader();
    }

    /**
     * @return HTTP headers (Only available after the header has been read).
     */
    public Header [] getHttpHeaders() {
        return this.httpHeaders;
    }

    /**
     * @return ArcRecordErrors encountered when reading.
     */
    public List<ArcRecordErrors> getErrors() {
        return this.errors;
    }

    /**
     * @return True if ARC record errors were found.
     */
    public boolean hasErrors() {
        return !this.errors.isEmpty();
    }

    /**
     * @return Next character in this ARCRecord's content, else -1 if at the
     * end of this record.
     * @throws IOException
     */
    public int read() throws IOException {
        int c = -1;
        if (this.httpHeaderStream != null &&
                (this.httpHeaderStream.available() > 0)) {
            // If there's an http header, return bytes from it before we go
            // to the underlying stream.
            c = this.httpHeaderStream.read();
            // If done with the header stream, null it out.
            if (this.httpHeaderStream.available() <= 0) {
                this.httpHeaderStream = null;
            }
            incrementPosition();
        } else {
            c = super.read();
        }
        return c;
    }

    public int read(byte [] b, int offset, int length) throws IOException {
        int read = -1;
        if (this.httpHeaderStream != null &&
                (this.httpHeaderStream.available() > 0)) {
            // If there's an http header, return bytes from it before we go
            // to the underlying stream.
            read = Math.min(length, this.httpHeaderStream.available());
            if (read == 0) {
                read = -1;
            } else {
                read = this.httpHeaderStream.read(b, offset, read);
            }
            // If done with the header stream, null it out.
            if (this.httpHeaderStream.available() <= 0) {
                this.httpHeaderStream = null;
            }
            incrementPosition(read);
        } else {
            read = super.read(b, offset, length);
        }
        return read;
    }
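    /*
     * A sketch of the error-reporting contract above: record-level problems
     * (invalid status line, truncated header) are accumulated rather than
     * always thrown, so a lenient scan can inspect them per record.  The
     * record variable is assumed to come from an ARCReader iteration:
     *
     *   ARCRecord arc = ...;
     *   if (arc.hasErrors()) {
     *       for (ArcRecordErrors err : arc.getErrors()) {
     *           System.err.println(arc.getMetaData().getUrl()
     *               + ": " + err);
     *       }
     *   }
     */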
    /**
     * @return Offset at which the body begins (Only known after the header
     * has been read), or -1 if none or if we haven't read the headers yet.
     * Usually the length of the HTTP headers (does not include the ARC
     * metadata line length).
     */
    public int getBodyOffset() {
        return this.getMetaData().getContentBegin();
    }

    @Override
    protected String getIp4Cdx(ArchiveRecordHeader h) {
        String result = null;
        if (h instanceof ARCRecordMetaData) {
            result = ((ARCRecordMetaData)h).getIp();
        }
        return (result != null)? result: super.getIp4Cdx(h);
    }

    @Override
    protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
        String result = null;
        if (h instanceof ARCRecordMetaData) {
            result = ((ARCRecordMetaData) h).getStatusCode();
        }
        return (result != null) ? result: super.getStatusCode4Cdx(h);
    }

    @Override
    protected String getDigest4Cdx(ArchiveRecordHeader h) {
        String result = null;
        if (h instanceof ARCRecordMetaData) {
            result = ((ARCRecordMetaData) h).getDigest();
        }
        return (result != null) ? result: super.getDigest4Cdx(h);
    }
}