/** * */ package org.archive.io.arc; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.commons.io.IOUtils; import org.archive.io.warc.TestWARCReader; import org.archive.io.warc.TestWARCRecordInfo; import org.archive.io.warc.WARCRecordInfo; import org.archive.util.DateUtils; import com.google.common.io.CountingInputStream; import java.util.Locale; /** * Fixture ARCReader. * <p>It works as ArchiveReader reading from ARC file with just one * WARC record at offset 0 (no version_block line).</p> * <p>record content is customized through {@link WARCRecordInfo}. * ({@link TestWARCRecordInfo} offers commonly-used default values and convenient factory * methods.</p> * * TODO: could separate ARC record formatting code out of ARCWriter and reuse it here. * current ARCWriter requires too much boilerplate, always writes out the first metadata * line. * * @see TestWARCReader * @contributor kenji * */ public class TestARCReader extends ARCReader { public TestARCReader(InputStream is) { setIn(is); } public TestARCReader(WARCRecordInfo recinfo) throws IOException { setIn(new CountingInputStream(buildRecordContent(recinfo))); // ARCRecord tries to read off version-block if offset==0 and // alignedOnFirstRecord is true (it is by default). As we don't have // version-block at offset 0, we disable this behavior. setAlignedOnFirstRecord(false); } @Override public ARCRecord get(long offset) throws IOException { return (ARCRecord)super.get(offset); } private String isozToDateTime14(String isoz) { try { Date d = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH).parse(isoz); return DateUtils.get14DigitDate(d); } catch (ParseException ex) { throw new RuntimeException("bad ISOZ: " + isoz, ex); } } /** * build minimally-conforming ARC record byte stream. * @param recinfo WARCRecordInfo with record metadata and content. * type parameter does not matter, be sure to set contentType to that of * payload. * @return InputStream reading from created record bits * @throws IOException */ public InputStream buildRecordContent(WARCRecordInfo recinfo) throws IOException { final char FS = ARCWriter.HEADER_FIELD_SEPARATOR; ByteArrayOutputStream buf = new ByteArrayOutputStream(); String timeStamp = isozToDateTime14(recinfo.getCreate14DigitDate()); String mimetype = recinfo.getMimetype(); long contentLen = recinfo.getContentLength(); // URL Record v1. String urlRecordLine = recinfo.getUrl() + FS + "4.4.4.4" + FS + timeStamp + FS + mimetype + FS + contentLen + ARCWriter.LINE_SEPARATOR; buf.write(urlRecordLine.getBytes("UTF-8")); IOUtils.copy(recinfo.getContentStream(), buf); buf.write(ARCWriter.LINE_SEPARATOR); return new ByteArrayInputStream(buf.toByteArray()); } }