package org.archive.format.warc; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Date; import java.util.UUID; import org.archive.format.http.HttpConstants; import org.archive.format.http.HttpHeaders; import org.archive.util.DateUtils; import org.archive.util.StreamCopy; import com.google.common.io.FileBackedOutputStream; public class WARCRecordWriter implements WARCConstants, HttpConstants { private static final String SCHEME = "urn:uuid"; private static final String SCHEME_COLON = SCHEME + ":"; private static int DEFAULT_RAM_BUFFER = 1024 * 1024; public int ramBuffer = DEFAULT_RAM_BUFFER; // OutputStream out; // public WARCRecordWriter(OutputStream out) { // this.out = out; // } private void writeRecord(OutputStream out, HttpHeaders headers, InputStream contents, int trailingCRLFs) throws IOException { InputStream content2 = null; if(contents == null) { headers.add(CONTENT_LENGTH, "0"); } else { FileBackedOutputStream fbos = new FileBackedOutputStream(ramBuffer); long amt = StreamCopy.copy(contents, fbos) + (2 * trailingCRLFs); headers.add(CONTENT_LENGTH,String.valueOf(amt)); content2 = fbos.getSupplier().getInput(); } out.write(WARC_ID.getBytes(DEFAULT_ENCODING)); out.write(CR); out.write(LF); headers.write(out); if(content2 != null) { StreamCopy.copy(content2, out); } for(int i = 0; i < trailingCRLFs; i++) { out.write(CR); out.write(LF); } } public void writeWARCInfoRecord(OutputStream out, String filename, InputStream contents) throws IOException { // WARC/1.0 // WARC-Type: warcinfo // WARC-Date: 2010-10-08T07:00:26Z // WARC-Filename: LOC-MONTHLY-014-20101008070022-00127-crawling111.us.archive.org.warc.gz // WARC-Record-ID: <urn:uuid:05de9500-7047-4206-aa7f-346a0dc91b1f> // Content-Type: application/warc-fields // Content-Length: 600 HttpHeaders headers = new HttpHeaders(); headers.add(HEADER_KEY_TYPE, WARCINFO); headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date()); headers.add(HEADER_KEY_FILENAME, filename); headers.add(HEADER_KEY_ID, makeRecordId()); headers.add(CONTENT_TYPE,WARC_FIELDS_TYPE); writeRecord(out,headers,contents,2); } public void writeJSONMetadataRecord(OutputStream out, InputStream contents, String targetURI, Date originalDate, String origRecordId) throws IOException { // WARC-Type The type of WARC record. Set to 'metadata' // WARC-Target-URI The original URI of the primary content // WARC-Date A 14-digit timestamp that represents the instant of data capture of the primary content // WARC-Record-ID An identifier assigned to the current record that is globally unique for its period of intended use // WARC-Refers-To The WARC-Record-ID of the primary WARC record being described. // In the case of ARC records, the identifier is a combination of ARC filename and file-offset (e.g. <urn:arc:foo.arc.gz:3492>) // Content-Type The MIME type of the information contained in the metadata record's block. Set to 'application/json' // Content-Length The number of octets in the metadata record's block HttpHeaders headers = new HttpHeaders(); headers.add(HEADER_KEY_TYPE, METADATA); headers.add(HEADER_KEY_URI, targetURI); headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate)); headers.add(HEADER_KEY_ID, makeRecordId()); headers.add(HEADER_KEY_REFERS_TO, origRecordId); headers.add(CONTENT_TYPE,"application/json"); writeRecord(out, headers, contents, 1); } private String makeRecordId() { StringBuilder recID = new StringBuilder(); recID.append("<").append(SCHEME_COLON); recID.append(UUID.randomUUID().toString()); recID.append(">"); return recID.toString(); } }