/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io.warc;

import it.unimi.dsi.fastutil.io.RepositionableStream;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.LaxHttpParser;


/**
 * A WARC file Record.
 *
 * <p>On construction the WARC header line and named fields are read directly
 * from the supplied stream and exposed as an {@link ArchiveRecordHeader}.
 *
 * @author stack
 */
public class WARCRecord extends ArchiveRecord implements WARCConstants {
    /**
     * Matches a single whitespace character. Compiled once and shared by all
     * instances ({@link Pattern} is immutable and thread-safe).
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /**
     * Constructor. Digests the record and parses leniently (equivalent to
     * passing <code>digest = true</code> and <code>strict = false</code> to
     * the five-argument constructor).
     *
     * @param in Stream cued up to be at the start of the record this instance
     * is to represent.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Current offset into <code>in</code>.
     * @throws IOException If the WARC header cannot be read.
     */
    public WARCRecord(InputStream in, final String identifier,
            final long offset)
    throws IOException {
        this(in, identifier, offset, true, false);
    }

    /**
     * Constructor for when the headers have already been parsed.
     *
     * @param in Stream cued up just past Header Line and Named Fields.
     * @param headers Header Line and ANVL Named fields.
     * @throws IOException
     */
    public WARCRecord(InputStream in, ArchiveRecordHeader headers)
    throws IOException {
        super(in, headers, 0, true, false);
    }

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this instance
     * is to represent or, if <code>headers</code> is not null, just past the
     * Header Line and Named Fields.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Current offset into <code>in</code> (Used to keep
     * <code>position</code> properly aligned).  Usually 0.
     * @param digest True if we're to calculate digest for this record.  Not
     * digesting saves about ~15% of cpu during parse.
     * @param strict Be strict parsing (Parsing stops if file improperly
     * formatted).
     * @throws IOException If the WARC header cannot be read.
     */
    public WARCRecord(final InputStream in, final String identifier,
            final long offset, boolean digest, boolean strict)
    throws IOException {
        super(in, null, 0, digest, strict);
        // NOTE(review): parseHeaders is protected and overridable, and it is
        // invoked here from the constructor; subclasses overriding it must
        // not rely on their own fields being initialized yet.
        setHeader(parseHeaders(in, identifier, offset, strict));
    }

    /**
     * Parse WARC Header Line and Named Fields.
     *
     * <p>The returned header's field map contains every parsed named field
     * plus two synthetic entries: the absolute offset (stored as a
     * <code>Long</code>) and the reader identifier.
     *
     * @param in Stream to read.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Absolute offset into Reader.
     * @param strict Whether to be loose parsing or not. Not consulted by this
     * implementation.
     * @return An ArchiveRecordHeader.
     * @throws IOException If the stream is exhausted or empty where the
     * header line is expected, or the first line does not start with
     * <code>WARC_MAGIC</code>.
     */
    protected ArchiveRecordHeader parseHeaders(final InputStream in,
            final String identifier, final long offset, final boolean strict)
    throws IOException {
        final Map<String, Object> m = new HashMap<String, Object>();
        m.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
        m.put(READER_IDENTIFIER_FIELD_KEY, identifier);

        // Remember where the header starts so its byte length can be computed
        // afterwards. Only possible when the stream reports its position.
        long startPosition = -1;
        if (in instanceof RepositionableStream) {
            startPosition = ((RepositionableStream)in).position();
        }

        // Use the returned String directly. Wrapping it in new String(...)
        // threw a NullPointerException when readLine returned null (end of
        // stream), which made the null check below unreachable.
        final String firstLine =
            LaxHttpParser.readLine(in, WARC_HEADER_ENCODING);
        if (firstLine == null || firstLine.length() <= 0) {
            throw new IOException("Failed to read WARC_MAGIC");
        }
        if (!firstLine.startsWith(WARC_MAGIC)) {
            throw new IOException("Failed to find WARC MAGIC: " + firstLine);
        }

        // Here we start reading off the inputstream but we're reading the
        // stream direct rather than going via WARCRecord#read.  The latter
        // will keep count of bytes read, digest and fail properly if EOR too
        // soon...  We don't want digesting while reading Headers.
        //
        Header [] h = LaxHttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
        for (int i = 0; i < h.length; i++) {
            m.put(h[i].getName(), h[i].getValue());
        }

        // NOTE(review): when the stream is not a RepositionableStream the
        // header length stays at -1, so incrementPosition(-1) is called and
        // getLength()/getContentBegin() report values based on -1. Left
        // unchanged to preserve existing behaviour; confirm with callers.
        int headerLength = -1;
        if (in instanceof RepositionableStream) {
            headerLength =
                (int)(((RepositionableStream)in).position() - startPosition);
        }
        final int contentOffset = headerLength;

        incrementPosition(contentOffset);

        return new ArchiveRecordHeader() {
            private Map<String, Object> headers = m;
            private int contentBegin = contentOffset;

            public String getDate() {
                return (String)this.headers.get(HEADER_KEY_DATE);
            }

            public String getDigest() {
                return null;
                // TODO: perhaps return block-digest?
                // superclass def implies this is calculated ("only after
                // read in totality"), not pulled from header, so
                // below prior implementation was misleading
                // return (String)this.headers.get(HEADER_KEY_CHECKSUM);
            }

            public String getReaderIdentifier() {
                return (String)this.headers.get(READER_IDENTIFIER_FIELD_KEY);
            }

            public Set<String> getHeaderFieldKeys() {
                return this.headers.keySet();
            }

            public Map<String,Object> getHeaderFields() {
                return this.headers;
            }

            public Object getHeaderValue(String key) {
                return this.headers.get(key);
            }

            // Returns just the Content-Length of the warc record, or -1 when
            // the header has no Content-Length field.
            public long getContentLength() {
                Object o = this.headers.get(CONTENT_LENGTH);
                if (o == null) {
                    return -1;
                }
                long contentLength = (o instanceof Long)?
                    ((Long)o).longValue(): Long.parseLong((String)o);
                return contentLength;
            }

            // Returns the full record length: content length plus the
            // length of the header that precedes it.
            public long getLength() {
                return getContentLength() + contentOffset;
            }

            public String getMimetype() {
                return (String)this.headers.get(CONTENT_TYPE);
            }

            // Returns the absolute offset recorded at parse time, or -1 if
            // the entry is missing from the field map.
            public long getOffset() {
                Object o = this.headers.get(ABSOLUTE_OFFSET_KEY);
                if (o == null) {
                    return -1;
                }
                return (o instanceof Long)?
                    ((Long)o).longValue(): Long.parseLong((String)o);
            }

            public String getRecordIdentifier() {
                return (String)this.headers.get(RECORD_IDENTIFIER_FIELD_KEY);
            }

            public String getUrl() {
                return (String)this.headers.get(HEADER_KEY_URI);
            }

            public String getVersion() {
                return (String)this.headers.get(VERSION_FIELD_KEY);
            }

            public int getContentBegin() {
                return this.contentBegin;
            }

            @Override
            public String toString() {
                return this.headers.toString();
            }
        };
    }

    /**
     * Returns the mimetype for CDX output with all whitespace removed.
     *
     * @param h Header to take the mimetype from (passed to the superclass).
     * @return The superclass mimetype with every whitespace character
     * stripped.
     */
    @Override
    protected String getMimetype4Cdx(ArchiveRecordHeader h) {
        final String m = super.getMimetype4Cdx(h);
        // Mimetypes can have spaces in WARCs.  Emitting for CDX, just
        // squash them for now.  Later, quote them since squashing spaces won't
        // work for params that have quoted-string values.
        // NOTE(review): if the superclass can return null here (e.g. a record
        // without a Content-Type field), matcher(m) will throw a
        // NullPointerException -- confirm against ArchiveRecord.
        Matcher matcher = WHITESPACE.matcher(m);
        return matcher.replaceAll("");
    }
}