package dk.kb.yggdrasil.warc; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.util.Date; import java.util.List; import org.jwat.common.ContentType; import org.jwat.common.RandomAccessFileOutputStream; import org.jwat.common.Uri; import org.jwat.warc.WarcConcurrentTo; import org.jwat.warc.WarcConstants; import org.jwat.warc.WarcDigest; import org.jwat.warc.WarcHeader; import org.jwat.warc.WarcRecord; import org.jwat.warc.WarcWriter; import org.jwat.warc.WarcWriterFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import dk.kb.yggdrasil.exceptions.ArgumentCheck; import dk.kb.yggdrasil.exceptions.YggdrasilException; /** * Wrapper class to hide away WARC writing internals. */ public class WarcWriterWrapper { /** Logging mechanism. */ private static final Logger logger = LoggerFactory.getLogger(WarcWriterWrapper.class.getName()); /** Buffer size used by the WARC reader. */ public static final int WARC_READER_BUFFER_SIZE = 8192; /** UUID of package/WARC file. */ protected String uuid; /** WARC file. */ protected File writerFile; /** WARC <code>RandomAccessFile</code>. */ protected RandomAccessFile writerRaf; /** <code>RandomAccessFile</code> as an <code>OutputStream</code> */ protected RandomAccessFileOutputStream writerRafout; /** WARC writer implementation. */ protected WarcWriter writer; /** Is the WARC file new or not. */ protected boolean bIsNew; /** * Open new or existing WARC file. * @param path parent path where the file must be created/opened * @param uuid uuid of WARC file * @return WARC writer wrapper * @throws YggdrasilException is an exception occurs */ public static WarcWriterWrapper getWriter(File path, String uuid) throws YggdrasilException { ArgumentCheck.checkExistsDirectory(path, "path"); ArgumentCheck.checkNotNullOrEmpty(uuid, "uuid"); WarcWriterWrapper w3 = null; File writerFile = new File(path, uuid); try { if (writerFile.exists() && !writerFile.isFile()) { throw new YggdrasilException("'" + uuid +"' appears to be an existing folder, this is disappointing."); } w3 = new WarcWriterWrapper(); w3.uuid = uuid; w3.writerFile = writerFile; w3.writerRaf = new RandomAccessFile(w3.writerFile, "rw"); w3.writerRaf.seek(w3.writerRaf.length()); w3.writerRafout = new RandomAccessFileOutputStream(w3.writerRaf); w3.writer = WarcWriterFactory.getWriter(w3.writerRafout, WARC_READER_BUFFER_SIZE, false); w3.writer.setExceptionOnContentLengthMismatch(true); w3.bIsNew = (w3.writerRaf.length() == 0L); } catch (FileNotFoundException e) { throw new YggdrasilException("Exception while opening WARC file", e); } catch (IOException e) { throw new YggdrasilException("Exception while opening WARC file", e); } return w3; } /** WARC file Warcinfo id. */ private Uri warcinfoRecordId; /** * Returns the WARC file Warcinfo id. * @return the WARC file Warcinfo id */ public Uri getWarcinfoRecordId() { return warcinfoRecordId; } /** * Append a Warcinfo record to WARC file. * @param warcFieldsBytes warc fields as byte array * @param blockDigest optional block digest * @return WarcRecordId of newly created record * @throws YggdrasilException if an exception occurs while writing record */ public Uri writeWarcinfoRecord(byte[] warcFieldsBytes, WarcDigest blockDigest) throws YggdrasilException { ArgumentCheck.checkNotNull(warcFieldsBytes, "warcFieldsBytes"); try { ByteArrayInputStream bin = new ByteArrayInputStream(warcFieldsBytes); warcinfoRecordId = new Uri("urn:uuid:" + uuid); WarcRecord record = WarcRecord.createRecord(writer); WarcHeader header = record.header; header.warcTypeIdx = WarcConstants.RT_IDX_WARCINFO; header.warcDate = new Date(); header.warcFilename = uuid; header.warcRecordIdUri = warcinfoRecordId; header.contentTypeStr = WarcConstants.CT_APP_WARC_FIELDS; header.warcBlockDigest = blockDigest; header.contentLength = Long.valueOf(warcFieldsBytes.length); writer.writeHeader(record); writer.streamPayload(bin); writer.closeRecord(); } catch (UnsupportedEncodingException e) { throw new YggdrasilException("Exception while writing WARC warcinfo record!", e); } catch (URISyntaxException e) { throw new YggdrasilException("Exception while writing WARC warcinfo record!", e); } catch (IOException e) { throw new YggdrasilException("Exception while writing WARC warcinfo record!", e); } logger.debug("Written Info Record '" + uuid + "'."); return warcinfoRecordId; } /** * Append a resource record to WARC file. * @param in payload input stream * @param len payload length * @param contentType payload content-type * @param blockDigest optional block digest * @param uuid The UUID for the record. * @return WarcRecordId of newly created record * @throws YggdrasilException if an exception occurs while writing record */ public Uri writeResourceRecord(InputStream in, long len, ContentType contentType, WarcDigest blockDigest, String uuid) throws YggdrasilException { ArgumentCheck.checkNotNull(in, "in"); ArgumentCheck.checkNotNull(len, "len"); ArgumentCheck.checkNotNull(contentType, "contentType"); ArgumentCheck.checkNotNull(uuid, "uuid"); Uri warcRecordIdUri = null; try { warcRecordIdUri = new Uri("urn:uuid:" + uuid); WarcRecord record = WarcRecord.createRecord(writer); WarcHeader header = record.header; header.warcTypeIdx = WarcConstants.RT_IDX_RESOURCE; header.warcDate = new Date(); header.warcWarcinfoIdUri = warcinfoRecordId; header.warcRecordIdUri = warcRecordIdUri; header.warcTargetUriUri = warcRecordIdUri; header.warcBlockDigest = blockDigest; header.contentType = contentType; header.contentLength = len; writer.writeHeader(record); writer.streamPayload(in); writer.closeRecord(); } catch (URISyntaxException e) { throw new YggdrasilException("Exception while writing WARC resource record!", e); } catch (IOException e) { throw new YggdrasilException("Exception while writing WARC resource record!", e); } logger.debug("Written Resource Record '" + uuid + "'."); return warcRecordIdUri; } /** * Append a metadata record to WARC file. * @param in payload input stream * @param len payload length * @param refersTo The refers to header element. * @param contentType payload content-type * @param blockDigest optional block digest * @param uuid The UUID for the record. * @return WarcRecordId of newly created record * @throws YggdrasilException if an exception occurs while writing record */ public Uri writeMetadataRecord(InputStream in, long len, ContentType contentType, Uri refersTo, WarcDigest blockDigest, String uuid) throws YggdrasilException { ArgumentCheck.checkNotNull(in, "in"); ArgumentCheck.checkNotNull(len, "len"); ArgumentCheck.checkNotNull(contentType, "contentType"); ArgumentCheck.checkNotNull(uuid, "uuid"); Uri warcRecordIdUri = null; try { warcRecordIdUri = new Uri("urn:uuid:" + uuid); WarcRecord record = WarcRecord.createRecord(writer); WarcHeader header = record.header; header.warcTypeIdx = WarcConstants.RT_IDX_METADATA; header.warcDate = new Date(); header.warcWarcinfoIdUri = warcinfoRecordId; header.warcRecordIdUri = warcRecordIdUri; header.warcRefersToUri = refersTo; header.warcBlockDigest = blockDigest; header.contentType = contentType; header.contentLength = len; writer.writeHeader(record); writer.streamPayload(in); writer.closeRecord(); } catch (URISyntaxException e) { throw new YggdrasilException("Exception while writing WARC metadata record!", e); } catch (IOException e) { throw new YggdrasilException("Exception while writing WARC metadata record!", e); } logger.debug("Written Metadata Record '" + uuid + "'."); return warcRecordIdUri; } /** * Append a update record to WARC file. * @param in payload input stream * @param len payload length * @param contentType payload content-type * @param refersTo The refers to header element. * @param concurrentTo List of concurrentTo header elements. * @param blockDigest optional block digest * @param uuid The UUID for the record. * @return WarcRecordId of newly created record * @throws YggdrasilException if an exception occurs while writing record */ public Uri writeUpdateRecord(InputStream in, long len, ContentType contentType, Uri refersTo, List<WarcConcurrentTo> concurrentTo, WarcDigest blockDigest, String uuid) throws YggdrasilException { ArgumentCheck.checkNotNull(in, "in"); ArgumentCheck.checkNotNull(len, "len"); ArgumentCheck.checkNotNull(contentType, "contentType"); ArgumentCheck.checkNotNull(uuid, "uuid"); ArgumentCheck.checkNotNull(concurrentTo, "concurrentTo"); Uri warcRecordIdUri = null; try { warcRecordIdUri = new Uri("urn:uuid:" + uuid); WarcRecord record = WarcRecord.createRecord(writer); WarcHeader header = record.header; header.warcTypeStr = "update"; header.warcDate = new Date(); header.warcWarcinfoIdUri = warcinfoRecordId; header.warcRecordIdUri = warcRecordIdUri; header.warcConcurrentToList.addAll(concurrentTo); header.warcRefersToUri = refersTo; header.warcBlockDigest = blockDigest; header.contentType = contentType; header.contentLength = len; writer.writeHeader(record); writer.streamPayload(in); writer.closeRecord(); } catch (URISyntaxException e) { throw new YggdrasilException("Exception while writing WARC metadata record!", e); } catch (IOException e) { throw new YggdrasilException("Exception while writing WARC metadata record!", e); } logger.debug("Written Update Record '" + uuid + "'."); return warcRecordIdUri; } /** * @return The current size of the warc file. */ public long getWarcFileSize() { return writerFile.length(); } /** * @return The Warc file. */ public File getWarcFile() { return writerFile; } /** * @return The ID for the Warc file. */ public String getWarcFileId() { return writerFile.getName(); } /** * Close writer, output stream and random access file. * @throws YggdrasilException if an exception occurs while closing associated resources */ public void close() throws YggdrasilException { try { if (writer != null) { writer.close(); writer = null; } if (writerRafout != null) { writerRafout.close(); writerRafout = null; } if (writerRaf != null) { writerRaf.close(); writerRaf = null; } } catch (IOException e) { throw new YggdrasilException("Exception closing WarcWriterWrapper!", e); } } }