package org.archive.io.warc; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.text.ParseException; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveRecord; import org.archive.util.anvl.ANVLRecord; import org.archive.util.anvl.Element; import com.google.common.io.CountingInputStream; /** * Fixture WARCReader. * <p>It works as ArchiveReader reading from WARC file with just one * WARC record at offset 0 (there's no "warcinfo" record).</p> * <p>Content of the record is customized through {@link WARCRecordInfo}. * ({@link TestWARCRecordInfo} offers commonly-used default values and convenient factory * methods.</p> * <p>Typical test code would be:</p> * <pre> * String payload = "hogehogehogehogehoge"; * WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse(payload); * TestWARCReader ar = new TestWARCReader(recinfo); * WARCRecord rec = (WARCRecord)ar.get(0); * </pre> * * @contributor kenji * */ public class TestWARCReader extends ArchiveReader { public static final String CRLF = "\r\n"; public TestWARCReader(InputStream is) { setIn(is); } public TestWARCReader(WARCRecordInfo recinfo) throws IOException { // not clearly stated, but ArchiveReader expects CountingInputStream. setIn(new CountingInputStream(TestWARCReader.buildRecordContent(recinfo))); } @Override public WARCRecord get(long offset) throws IOException { return (WARCRecord)super.get(offset); } @Override protected WARCRecord createArchiveRecord(InputStream is, long offset) throws IOException { return (WARCRecord)currentRecord(new WARCRecord(is, "<identifier>", offset)); } @Override protected void gotoEOR(ArchiveRecord record) throws IOException { } @Override public String getFileExtension() { return "warc"; } @Override public String getDotFileExtension() { return ".warc"; } @Override public void dump(boolean compress) throws IOException, ParseException { // TODO Auto-generated method stub } @Override public ArchiveReader getDeleteFileOnCloseReader(File f) { // TODO Auto-generated method stub return null; } /** * build minimal WARC record byte stream. * @param recinfo WARCRecordInfo with record metadata and content * @return InputStream reading from created record bits * @throws IOException */ public static InputStream buildRecordContent(WARCRecordInfo recinfo) throws IOException { ByteArrayOutputStream buf = new ByteArrayOutputStream(); Writer w = new OutputStreamWriter(buf); w.write("WARC/1.0" + CRLF); w.write("WARC-Type: " + recinfo.getType() + CRLF); if (StringUtils.isNotEmpty(recinfo.getUrl())) { w.write("WARC-Target-URI: " + recinfo.getUrl() + CRLF); } w.write("WARC-Date: " + recinfo.getCreate14DigitDate() + CRLF); if (recinfo.getExtraHeaders() != null) { ANVLRecord headers = recinfo.getExtraHeaders(); for (Element el : headers) { w.write(el.getLabel() + ": " + el.getValue() + CRLF); } } w.write("Content-Type: " + recinfo.getMimetype() + CRLF); w.write("Content-Length: " + recinfo.getContentLength() + CRLF); w.write(CRLF); w.flush(); IOUtils.copy(recinfo.getContentStream(), buf); buf.write((CRLF+CRLF).getBytes()); buf.close(); return new ByteArrayInputStream(buf.toByteArray()); } }