package com.digitalpebble.stormcrawler.warc;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TimeZone;
import java.util.UUID;

import org.apache.commons.codec.binary.Base32;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.hdfs.bolt.format.RecordFormat;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;

import org.apache.storm.tuple.Tuple;

/**
 * Generate a byte representation of a WARC entry from a tuple.
 * <p>
 * Two kinds of records are produced: a {@code warcinfo} record (see
 * {@link #generateWARCInfo(Map)}) and, per tuple, either a {@code response}
 * record (when the HTTP headers were stored verbatim in the metadata) or a
 * {@code resource} record (see {@link #format(Tuple)}).
 **/
@SuppressWarnings("serial")
public class WARCRecordFormat implements RecordFormat {

    private static final String WARC_VERSION = "WARC/1.0";
    private static final String CRLF = "\r\n";
    private static final byte[] CRLF_BYTES = { 13, 10 };

    /**
     * Date format used for WARC-Date values. The pattern ends with a literal
     * 'Z' (the UTC designator), so the formatter is pinned to UTC in the
     * static initialiser below. {@link SimpleDateFormat} is not thread-safe:
     * callers sharing this instance must synchronise on it.
     */
    public static final SimpleDateFormat WARC_DF = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH);

    static {
        // without this the JVM default time zone would be used while the
        // output still claims to be UTC through the literal 'Z'
        WARC_DF.setTimeZone(TimeZone.getTimeZone("UTC"));
    }

    private static final Base32 base32 = new Base32();

    /** Digest of an empty payload, used when a tuple carries no content. */
    private static final String digestNoContent = getDigestSha1(new byte[0]);

    /**
     * Computes the SHA-1 digest of a byte array.
     *
     * @param bytes
     *            the data to hash
     * @return the digest as "sha1:" followed by its Base32 encoding
     */
    public static String getDigestSha1(byte[] bytes) {
        return "sha1:" + base32.encodeAsString(DigestUtils.sha1(bytes));
    }

    /**
     * Computes the SHA-1 digest of the concatenation of two byte arrays
     * without copying them into a single array.
     *
     * @param bytes1
     *            first part of the data
     * @param bytes2
     *            second part of the data
     * @return the digest as "sha1:" followed by its Base32 encoding
     */
    public static String getDigestSha1(byte[] bytes1, byte[] bytes2) {
        MessageDigest sha1 = DigestUtils.getSha1Digest();
        sha1.update(bytes1);
        return "sha1:" + base32.encodeAsString(sha1.digest(bytes2));
    }

    /**
     * Generates a WARC info entry which can be stored at the beginning of each
     * WARC file.
     * <p>
     * The keys "WARC-Date" and "WARC-Filename" are read from the map and
     * written as record headers; every other key which does not start with
     * "WARC-" is written into the record body as a "key: value" line. If
     * "WARC-Date" is absent the current time is used, if "WARC-Filename" is
     * absent that header is omitted.
     *
     * @param fields
     *            header values and warc-fields for the record
     * @return the UTF-8 encoded warcinfo record
     **/
    public static byte[] generateWARCInfo(Map<String, String> fields) {
        StringBuilder buffer = new StringBuilder();
        buffer.append(WARC_VERSION).append(CRLF);

        appendHeader(buffer, "WARC-Type", "warcinfo");

        // retrieve the date and filename from the map
        String date = fields.get("WARC-Date");
        if (date == null) {
            // avoid emitting "WARC-Date: null"
            date = formatWarcDate(new Date());
        }
        appendHeader(buffer, "WARC-Date", date);

        String filename = fields.get("WARC-Filename");
        if (filename != null) {
            appendHeader(buffer, "WARC-Filename", filename);
        }

        appendHeader(buffer, "WARC-Record-ID", newRecordId());
        appendHeader(buffer, "Content-Type", "application/warc-fields");

        // add WARC fields
        // http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
        StringBuilder fieldsBuffer = new StringBuilder();
        for (Entry<String, String> entry : fields.entrySet()) {
            String key = entry.getKey();
            // WARC-* entries are record headers, not part of the body
            if (key.startsWith("WARC-")) {
                continue;
            }
            appendHeader(fieldsBuffer, key, entry.getValue());
        }
        String body = fieldsBuffer.toString();

        // the length is expressed in bytes, not in characters
        appendHeader(buffer, "Content-Length",
                Integer.toString(body.getBytes(StandardCharsets.UTF_8).length));

        // empty line separating the headers from the body
        buffer.append(CRLF);
        buffer.append(body);
        // a record is terminated by two CRLFs
        buffer.append(CRLF);
        buffer.append(CRLF);

        return buffer.toString().getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Serialises a tuple into a WARC record.
     * <p>
     * Reads the fields "content" (binary), "url" (String) and "metadata"
     * ({@link Metadata}) from the tuple. If the metadata holds a non-blank
     * value for "_response.headers_" a {@code response} record is written
     * with these headers preceding the content, otherwise a {@code resource}
     * record is written.
     *
     * @param tuple
     *            the tuple to serialise
     * @return the bytes of the WARC record
     * @throws RuntimeException
     *             if the URL cannot be converted into a URI or if the record
     *             cannot be serialised
     */
    @Override
    public byte[] format(Tuple tuple) {

        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        // were the headers stored as is? Can write a response element then
        String headersVerbatim = metadata.getFirstValue("_response.headers_");
        boolean hasHttpHeaders = StringUtils.isNotBlank(headersVerbatim);
        byte[] httpheaders = new byte[0];
        if (hasHttpHeaders) {
            // check that ends with an empty line
            if (!headersVerbatim.endsWith(CRLF + CRLF)) {
                headersVerbatim += CRLF + CRLF;
            }
            // explicit charset so that the length and the digest do not
            // depend on the platform default
            httpheaders = headersVerbatim.getBytes(StandardCharsets.UTF_8);
        }

        StringBuilder buffer = new StringBuilder();
        buffer.append(WARC_VERSION).append(CRLF);

        appendHeader(buffer, "WARC-Record-ID", newRecordId());

        int contentLength = 0;
        String payloadDigest = digestNoContent;
        String blockDigest;
        if (content != null) {
            contentLength = content.length;
            payloadDigest = getDigestSha1(content);
            blockDigest = getDigestSha1(httpheaders, content);
        } else {
            blockDigest = getDigestSha1(httpheaders);
        }

        // add the length of the http header
        contentLength += httpheaders.length;

        appendHeader(buffer, "Content-Length", Integer.toString(contentLength));

        // TODO get actual fetch time from metadata if any
        appendHeader(buffer, "WARC-Date", formatWarcDate(new Date()));

        // check if http headers have been stored verbatim
        // if not generate a resource instead
        String warcType = hasHttpHeaders ? "response" : "resource";
        appendHeader(buffer, "WARC-Type", warcType);

        // "WARC-IP-Address" if present
        String ip = metadata.getFirstValue("_ip_");
        if (StringUtils.isNotBlank(ip)) {
            appendHeader(buffer, "WARC-IP-Address", ip);
        }

        // must be a valid URI
        String targetURI;
        try {
            String normalised = url.replace(" ", "%20");
            targetURI = URI.create(normalised).toASCIIString();
        } catch (RuntimeException e) {
            // keep the cause so that the reason for the failure is not lost
            throw new RuntimeException("Invalid URI " + url, e);
        }
        appendHeader(buffer, "WARC-Target-URI", targetURI);

        if (hasHttpHeaders) {
            // provide a ContentType if type response
            appendHeader(buffer, "Content-Type",
                    "application/http; msgtype=response");
        } else {
            // for resources just use the content type provided by the server
            // if any
            String ct = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
            if (StringUtils.isBlank(ct)) {
                ct = "application/octet-stream";
            }
            appendHeader(buffer, "Content-Type", ct);
        }

        appendHeader(buffer, "WARC-Payload-Digest", payloadDigest);
        appendHeader(buffer, "WARC-Block-Digest", blockDigest);

        // finished writing the WARC headers, now let's serialize it
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try {
            // store the headers
            bos.write(buffer.toString().getBytes(StandardCharsets.UTF_8));
            bos.write(CRLF_BYTES);
            // the http headers
            bos.write(httpheaders);
            // the binary content itself
            if (content != null) {
                bos.write(content);
            }
            // a record is terminated by two CRLFs
            bos.write(CRLF_BYTES);
            bos.write(CRLF_BYTES);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        return bos.toByteArray();
    }

    /** Appends a "name: value" line terminated by CRLF to the target. */
    private static void appendHeader(StringBuilder target, String name,
            String value) {
        target.append(name).append(": ").append(value).append(CRLF);
    }

    /** Returns a new random record identifier of the form <urn:uuid:...>. */
    private static String newRecordId() {
        return "<urn:uuid:" + UUID.randomUUID().toString() + ">";
    }

    /**
     * Formats a date with {@link #WARC_DF}, holding its monitor because
     * SimpleDateFormat instances must not be used concurrently.
     */
    private static String formatWarcDate(Date date) {
        synchronized (WARC_DF) {
            return WARC_DF.format(date);
        }
    }
}