package org.archive.format.warc;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import java.util.UUID;
import org.archive.format.http.HttpConstants;
import org.archive.format.http.HttpHeaders;
import org.archive.util.DateUtils;
import org.archive.util.StreamCopy;
public class WARCRecordWriter implements WARCConstants, HttpConstants
{
    /** URN scheme used for the record identifiers generated by this writer. */
    private static final String SCHEME = "urn:uuid";

    /** Leading part of every generated record identifier: "&lt;" + scheme + ":". */
    private static final String RECORD_ID_OPEN = "<" + SCHEME + ":";

    /** Closing delimiter of every generated record identifier. */
    private static final String RECORD_ID_CLOSE = ">";

    /**
     * Serializes one WARC record (version line, named fields, content block)
     * onto the given stream.
     *
     * <p>Layout produced, in the notation used by the WARC grammar:
     * <pre>warc-file   = 1*warc-record
     * warc-record = header CRLF block CRLF CRLF
     * header      = version warc-fields
     * version     = WARC_ID CRLF
     * warc-fields = *named-field CRLF
     * block       = *OCTET</pre>
     *
     * <p>A Content-Length field is appended to {@code headers} before they are
     * written, so the caller's header object is modified by this call.
     *
     * @param out      stream receiving the record bytes; it is not flushed or
     *                 closed here
     * @param headers  named fields of the record, without Content-Length
     * @param contents record block; {@code null} is written as a zero-length
     *                 block
     * @throws IOException if encoding the version line or writing to
     *                     {@code out} fails
     */
    private void writeRecord( OutputStream out,
                              HttpHeaders  headers,
                              byte[]       contents ) throws IOException
    {
        // Content-Length always goes last, after the caller-supplied fields.
        final int blockLength = ( contents == null ) ? 0 : contents.length;
        headers.add( CONTENT_LENGTH, String.valueOf( blockLength ) );

        // Version line.
        out.write( WARC_ID.getBytes( DEFAULT_ENCODING ) );
        writeLineBreak( out );

        // HttpHeaders.write() already emits the CRLF that terminates the
        // field list, so nothing extra is written between fields and block.
        headers.write( out );

        if ( contents != null )
        {
            out.write( contents );
        }

        // Every record is closed by two CRLF sequences.
        writeLineBreak( out );
        writeLineBreak( out );
    }

    /**
     * Writes a single CR LF pair to the stream.
     *
     * @param out stream to write to
     * @throws IOException if the underlying write fails
     */
    private void writeLineBreak( OutputStream out ) throws IOException
    {
        out.write( CR );
        out.write( LF );
    }

    /**
     * Writes a "warcinfo" record describing a WARC file.
     *
     * <p>The record carries, in this order: the record type, a date obtained
     * from {@code DateUtils.getLog14Date()} at the time of the call, the
     * given filename, a freshly generated record id and the warc-fields
     * content type. Illustrative header layout (values are examples only):
     * <pre>WARC/1.0
     * WARC-Type: warcinfo
     * WARC-Date: 2010-10-08T07:00:26Z
     * WARC-Filename: LOC-MONTHLY-014-20101008070022-00127-crawling111.us.archive.org.warc.gz
     * WARC-Record-ID: &lt;urn:uuid:05de9500-7047-4206-aa7f-346a0dc91b1f&gt;
     * Content-Type: application/warc-fields
     * Content-Length: 600</pre>
     *
     * @param out      stream receiving the record
     * @param filename value for the filename field
     * @param contents record block; {@code null} yields a zero-length block
     * @throws IOException if writing to {@code out} fails
     */
    public void writeWARCInfoRecord( OutputStream out,
                                     String       filename,
                                     byte[]       contents ) throws IOException
    {
        final HttpHeaders fields = new HttpHeaders();

        fields.add( HEADER_KEY_TYPE,     WARCINFO );
        fields.add( HEADER_KEY_DATE,     DateUtils.getLog14Date() );
        fields.add( HEADER_KEY_FILENAME, filename );
        fields.add( HEADER_KEY_ID,       makeRecordId() );
        fields.add( CONTENT_TYPE,        WARC_FIELDS_TYPE );

        writeRecord( out, fields, contents );
    }

    /**
     * Writes a "metadata" record whose block is a JSON document.
     *
     * <p>The record carries, in this order: the record type, the target URI,
     * the date formatted by {@code DateUtils.getLog14Date(Date)}, a freshly
     * generated record id, a refers-to field holding {@code origRecordId},
     * and the content type {@code application/json}.
     *
     * @param out          stream receiving the record
     * @param contents     record block (expected to be JSON; not validated
     *                     here); {@code null} yields a zero-length block
     * @param targetURI    value for the target URI field
     * @param originalDate date written into the date field
     * @param origRecordId id of the record this metadata refers to
     * @throws IOException if writing to {@code out} fails
     */
    public void writeJSONMetadataRecord( OutputStream out,
                                         byte[]       contents,
                                         String       targetURI,
                                         Date         originalDate,
                                         String       origRecordId ) throws IOException
    {
        final HttpHeaders fields = new HttpHeaders();

        fields.add( HEADER_KEY_TYPE,      METADATA );
        fields.add( HEADER_KEY_URI,       targetURI );
        fields.add( HEADER_KEY_DATE,      DateUtils.getLog14Date( originalDate ) );
        fields.add( HEADER_KEY_ID,        makeRecordId() );
        fields.add( HEADER_KEY_REFERS_TO, origRecordId );
        fields.add( CONTENT_TYPE,         "application/json" );

        writeRecord( out, fields, contents );
    }

    /**
     * Builds a new record identifier of the form
     * {@code <urn:uuid:RANDOM-UUID>} from a random UUID.
     *
     * @return the identifier, including the surrounding angle brackets
     */
    private String makeRecordId()
    {
        return RECORD_ID_OPEN + UUID.randomUUID() + RECORD_ID_CLOSE;
    }
}