/*
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;

import org.archive.util.Base32;

/**
 * Archive file Record.
 * @author stack
 * @version $Date$ $Version$
 */
public abstract class ArchiveRecord extends InputStream {
    /**
     * Minimal http response or request header length.
     *
     * I've seen in arcs content length of 1 with no header.
     */
    protected static final long MIN_HTTP_HEADER_LENGTH =
        Math.min("HTTP/1.1 200 OK\r\n".length(),
            "GET / HTTP/1.0\n\r".length());

    protected ArchiveRecordHeader header = null;

    /**
     * Stream to read this record from.
     *
     * Stream can only be read sequentially.  Will only return this record's
     * content, returning -1 if you try to read beyond the end of the current
     * record.
     *
     * <p>Streams can be markable or not.  If they are, we'll be able to roll
     * back when we've read too far.  If not markable, the assumption is that
     * the underlying stream is managing our not reading too much (this
     * pertains to skipping over the end of the ARCRecord; see
     * {@link #skip()}).
     */
    protected InputStream in = null;

    /**
     * Position within the Record content, within <code>in</code>.
     * This position is relative within this Record.  It is not the same as
     * the Archive file position.
     */
    protected long position = 0;

    /**
     * Set flag when we've reached the end-of-record.
     */
    protected boolean eor = false;

    /**
     * Compute digest on what we read and add to metadata when done.
     *
     * Currently hardcoded as SHA-1.  TODO: Remove when archive records
     * digest, or else add a facility that allows the arc reader to
     * compare the calculated digest to that which is recorded in
     * the arc.
     *
     * <p>Protected instead of private so subclasses can update and complete
     * the digest.
     */
    protected MessageDigest digest = null;
    private String digestStr = null;

    protected boolean strict = false;

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @throws IOException
     */
    public ArchiveRecord(InputStream in) throws IOException {
        this(in, null, 0, true, false);
    }

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @param header Header data.
     * @throws IOException
     */
    public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
            throws IOException {
        this(in, header, 0, true, false);
    }
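    /*
     * ArchiveRecord is abstract, so the constructors here are only
     * reachable through a subclass; ARCRecord (mentioned above) is one
     * real implementation.  A hedged sketch of the minimal subclass one
     * might write for testing -- PlainRecord is hypothetical, not part of
     * this API -- no methods here are abstract, so a constructor suffices:
     *
     *   class PlainRecord extends ArchiveRecord {
     *       PlainRecord(InputStream in, ArchiveRecordHeader header)
     *       throws IOException {
     *           super(in, header); // digesting on, non-strict by default
     *       }
     *   }
     */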
    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @param header Header data.
     * @param bodyOffset Offset into the body.  Usually 0.
     * @param digest True if we're to calculate digest for this record.  Not
     * digesting saves about 15% of cpu during an ARC parse.
     * @param strict Be strict parsing (parsing stops if the ARC is
     * improperly formatted).
     * @throws IOException
     */
    public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
            int bodyOffset, boolean digest, boolean strict)
            throws IOException {
        this.in = in;
        this.header = header;
        this.position = bodyOffset;
        if (digest) {
            try {
                this.digest = MessageDigest.getInstance("SHA1");
            } catch (NoSuchAlgorithmException e) {
                // Convert to IOE because that's more amenable to callers
                // -- they are dealing with it anyways.
                throw new IOException(e.getMessage());
            }
        }
        this.strict = strict;
    }

    public boolean markSupported() {
        return false;
    }

    /**
     * @return Header data for this record.
     */
    public ArchiveRecordHeader getHeader() {
        return this.header;
    }

    protected void setHeader(ArchiveRecordHeader header) {
        this.header = header;
    }

    /**
     * Calling close on a record skips us past this record to the next
     * record in the stream.
     *
     * It does not actually close the stream.  The underlying stream is
     * probably being used by the next arc record.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        if (this.in != null) {
            skip();
            this.in = null;
            if (this.digest != null) {
                this.digestStr = Base32.encode(this.digest.digest());
            }
        }
    }

    /**
     * @return Next character in this Record's content, else -1 if at EOR.
     * @throws IOException
     */
    public int read() throws IOException {
        int c = -1;
        if (available() > 0) {
            c = this.in.read();
            if (c == -1) {
                throw new IOException("Premature EOF before end-of-record.");
            }
            if (this.digest != null) {
                this.digest.update((byte) c);
            }
            incrementPosition();
        }
        return c;
    }

    public int read(byte[] b, int offset, int length) throws IOException {
        int read = Math.min(length, available());
        if (read == -1 || read == 0) {
            read = -1;
        } else {
            read = this.in.read(b, offset, read);
            if (read == -1) {
                String msg = "Premature EOF before end-of-record: "
                    + getHeader().getHeaderFields();
                if (isStrict()) {
                    throw new IOException(msg);
                }
                setEor(true);
                System.err.println(Level.WARNING.toString() + " " + msg);
            }
            if (this.digest != null && read >= 0) {
                this.digest.update(b, offset, read);
            }
            incrementPosition(read);
        }
        return read;
    }

    /**
     * This available is not the stream's available.  It is an available
     * based on what the stated Archive record length is minus what we've
     * read to date.
     *
     * @return Count of bytes remaining in record content.
     */
    public int available() {
        long amount = getHeader().getLength() - getPosition();
        return (amount > Integer.MAX_VALUE? Integer.MAX_VALUE: (int)amount);
    }

    /**
     * Skip over this record's content.
     *
     * @throws IOException
     */
    protected void skip() throws IOException {
        if (this.eor) {
            return;
        }

        // Read to the end of the body of the record.  Exhaust the stream.
        // Can't skip direct to end because underlying stream may be
        // compressed and we're calculating the digest for the record.
        int r = available();
        while (r > 0 && !this.eor) {
            skip(r);
            r = available();
        }
    }

    public long skip(long n) throws IOException {
        final int SKIP_BUFFERSIZE = 1024 * 4;
        byte[] b = new byte[SKIP_BUFFERSIZE];
        long total = 0;
        for (int read = 0; (total < n) && (read != -1);) {
            // Compute the chunk size in long arithmetic so a large n
            // can't overflow the int cast.
            read = (int) Math.min((long) SKIP_BUFFERSIZE, n - total);
            // TODO: Interesting: reading from a compressed stream, we only
            // get about 500 characters at a time though we ask for 4k.
            // Look at this sometime.
            read = read(b, 0, read);
            if (read <= 0) {
                read = -1;
            } else {
                total += read;
            }
        }
        return total;
    }
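    /*
     * A hedged usage sketch, not part of this class: a typical caller
     * drains a record with the read methods above, then calls close(),
     * which skips any unread remainder and seals the digest so that
     * getDigestStr() (below) has a value.  The record variable is an
     * assumption here -- it would come from whatever reader produced
     * this ArchiveRecord:
     *
     *   byte[] buf = new byte[4 * 1024];
     *   for (int n; (n = record.read(buf, 0, buf.length)) != -1;) {
     *       // ... consume this record's content ...
     *   }
     *   record.close();                      // safe to call more than once
     *   String sha1 = record.getDigestStr(); // Base32 SHA-1, or null
     */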
    /**
     * @return Returns the strict.
     */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * @param strict The strict to set.
     */
    public void setStrict(boolean strict) {
        this.strict = strict;
    }

    protected InputStream getIn() {
        return this.in;
    }

    public String getDigestStr() {
        return this.digestStr;
    }

    protected void incrementPosition() {
        this.position++;
    }

    protected void incrementPosition(final long incr) {
        this.position += incr;
    }

    public long getPosition() {
        return this.position;
    }

    protected boolean isEor() {
        return eor;
    }

    protected void setEor(boolean eor) {
        this.eor = eor;
    }

    protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
        return "-";
    }

    protected String getIp4Cdx(final ArchiveRecordHeader h) {
        return "-";
    }

    protected String getDigest4Cdx(final ArchiveRecordHeader h) {
        return getDigestStr() == null? "-": getDigestStr();
    }

    protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
        return h.getMimetype();
    }

    protected String outputCdx(final String strippedFileName)
            throws IOException {
        // Read the whole record so we get out a hash.  Should be safe
        // calling close on an already closed Record.
        close();
        ArchiveRecordHeader h = getHeader();
        StringBuilder buffer =
            new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
        buffer.append(h.getDate());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getIp4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(h.getUrl());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getMimetype4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getStatusCode4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getDigest4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(h.getOffset());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(h.getLength());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(strippedFileName != null? strippedFileName: '-');
        return buffer.toString();
    }

    /**
     * Writes output on STDOUT.
     * @throws IOException
     */
    public void dump() throws IOException {
        dump(System.out);
    }

    /**
     * Writes output on passed <code>os</code>.
     * @param os Stream to write this record's content to.
     * @throws IOException
     */
    public void dump(final OutputStream os) throws IOException {
        final byte[] outputBuffer = new byte[16 * 1024];
        int read;
        while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
            os.write(outputBuffer, 0, read);
        }
        os.flush();
    }

    /**
     * Is it likely that this record contains headers?
     * This method will return true if the body is an http response that
     * includes http response headers, or the body is an http request that
     * includes request headers, etc.  Be aware that headers in content are
     * distinct from {@link ArchiveRecordHeader} 'headers'.
     * @return True if this Record's content has headers.
     */
    public boolean hasContentHeaders() {
        final String url = getHeader().getUrl();
        if (url == null) {
            return false;
        }
        if (!url.toLowerCase().startsWith("http")) {
            return false;
        }
        if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
            return false;
        }
        return true;
    }

    protected void setBodyOffset(int bodyOffset) {
        this.position = bodyOffset;
    }
}
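// A hedged reference sketch of the line outputCdx() above builds: nine
// fields separated by single spaces, in append order -- date, IP, URL,
// mimetype, status code, digest, offset, length, and the stripped file
// name (or '-' when null).  The values below are invented for
// illustration; this base class emits '-' for the IP and status fields:
//
//   20050401123456 - http://example.org/ text/html - LQXKFS2DIGESTB32 76 2043 -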