package dk.kb.yggdrasil.warc; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; import java.net.URL; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.UUID; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.jwat.common.Base32; import org.jwat.common.ByteCountingPushBackInputStream; import org.jwat.common.ContentType; import org.jwat.common.RandomAccessFileInputStream; import org.jwat.common.Uri; import org.jwat.warc.WarcConstants; import org.jwat.warc.WarcDigest; import org.jwat.warc.WarcHeader; import org.jwat.warc.WarcReader; import org.jwat.warc.WarcReaderFactory; import org.jwat.warc.WarcRecord; import dk.kb.yggdrasil.exceptions.YggdrasilException; @RunWith(JUnit4.class) public class TestWarcWriterWrapper { public String getUrlPath(URL url) { String path = url.getFile(); path = path.replaceAll( "%5b", "[" ); path = path.replaceAll( "%5d", "]" ); return path; } @Test public void test_warcwriterwrapper() { URL url; File file; ByteArrayInputStream in; ByteCountingPushBackInputStream pbin; ByteArrayOutputStream out = new ByteArrayOutputStream(); byte[] tmpBuf = new byte[1024]; int read; try { MessageDigest md = MessageDigest.getInstance("SHA1"); byte[] digestBytes; ContentType contentType; WarcDigest blockDigest; // Get eclipse/maven target test folder. url = this.getClass().getClassLoader().getResource(""); file = new File(getUrlPath(url)); File warcFile = new File(file, "42"); if (warcFile.exists()) { if (!warcFile.delete()) { Assert.fail("Unable to remove data from previous run!"); } } WarcWriterWrapper w3 = WarcWriterWrapper.getWriter(file, "42"); Assert.assertNotNull(w3); Assert.assertTrue(w3.bIsNew); String warcFields = "greetings: hi mom!\n"; byte[] warcFieldsBytes = warcFields.getBytes("UTF-8"); md.reset(); digestBytes = md.digest(warcFieldsBytes); blockDigest = WarcDigest.createWarcDigest("SHA1", digestBytes, "Base32", Base32.encodeArray(digestBytes)); Uri warcinfoId = w3.writeWarcinfoRecord(warcFieldsBytes, blockDigest); Assert.assertNotNull(warcinfoId); String dataStr = "very interesting data!"; String dataUUID = UUID.randomUUID().toString(); byte[] dataBytes = dataStr.getBytes("UTF-8"); md.reset(); digestBytes = md.digest(dataBytes); in = new ByteArrayInputStream(dataBytes); contentType = ContentType.parseContentType("application/binary"); blockDigest = WarcDigest.createWarcDigest("SHA1", digestBytes, "Base32", Base32.encodeArray(digestBytes)); Uri warcResourceId = w3.writeResourceRecord(in, dataBytes.length, contentType, blockDigest, dataUUID); Assert.assertNotNull(warcResourceId); Assert.assertTrue("Resource ID '" + warcResourceId + "' should contain uuid '" + dataUUID + "'", warcResourceId.toString().contains(dataUUID)); String metadataStr = "very interesting metadata!"; String metadataUUID = UUID.randomUUID().toString(); byte[] metadataBytes = metadataStr.getBytes("UTF-8"); md.reset(); digestBytes = md.digest(metadataBytes); in = new ByteArrayInputStream(metadataBytes); contentType = ContentType.parseContentType("text/xml; charset=\"utf-8\""); blockDigest = WarcDigest.createWarcDigest("SHA1", digestBytes, "Base32", Base32.encodeArray(digestBytes)); Uri warcMetadataId = w3.writeMetadataRecord(in, metadataBytes.length, contentType, warcResourceId, blockDigest, metadataUUID); Assert.assertNotNull(warcMetadataId); Assert.assertTrue("Resource ID '" + warcMetadataId + "' should contain uuid '" + metadataUUID + "'", warcMetadataId.toString().contains(metadataUUID)); w3.close(); RandomAccessFile raf = new RandomAccessFile(new File(file, "42"), "r"); RandomAccessFileInputStream rafin = new RandomAccessFileInputStream(raf); WarcReader reader = WarcReaderFactory.getReader(rafin, 8192); WarcRecord record; WarcHeader header; // WARC INFO validation record = reader.getNextRecord(); Assert.assertNotNull(record); Assert.assertTrue(record.isCompliant()); header = record.header; Assert.assertEquals(new Integer(WarcConstants.RT_IDX_WARCINFO), header.warcTypeIdx); Assert.assertNull(header.warcWarcinfoIdUri); Assert.assertEquals(warcinfoId, header.warcRecordIdUri); Assert.assertNull(header.warcRefersToUri); pbin = record.getPayload().getInputStream(); out.reset(); while ((read = pbin.read(tmpBuf)) != -1) { out.write(tmpBuf, 0, read); } out.close(); pbin.close(); Assert.assertArrayEquals(warcFieldsBytes, out.toByteArray()); // WARC Resource validation record = reader.getNextRecord(); Assert.assertNotNull(record); Assert.assertTrue(record.isCompliant()); header = record.header; Assert.assertEquals(new Integer(WarcConstants.RT_IDX_RESOURCE), header.warcTypeIdx); Assert.assertEquals(warcinfoId, header.warcWarcinfoIdUri); Assert.assertEquals(warcResourceId, header.warcRecordIdUri); Assert.assertNull(header.warcRefersToUri); Assert.assertNotNull(header.warcRecordIdStr); pbin = record.getPayload().getInputStream(); out.reset(); while ((read = pbin.read(tmpBuf)) != -1) { out.write(tmpBuf, 0, read); } out.close(); pbin.close(); Assert.assertArrayEquals(dataBytes, out.toByteArray()); // WARC Metadata validation record = reader.getNextRecord(); Assert.assertNotNull(record); Assert.assertTrue(record.isCompliant()); header = record.header; Assert.assertEquals(new Integer(WarcConstants.RT_IDX_METADATA), header.warcTypeIdx); Assert.assertEquals(warcinfoId, header.warcWarcinfoIdUri); Assert.assertEquals(warcMetadataId, header.warcRecordIdUri); Assert.assertEquals(warcResourceId, header.warcRefersToUri); pbin = record.getPayload().getInputStream(); out.reset(); while ((read = pbin.read(tmpBuf)) != -1) { out.write(tmpBuf, 0, read); } out.close(); pbin.close(); Assert.assertArrayEquals(metadataBytes, out.toByteArray()); record = reader.getNextRecord(); Assert.assertNull(record); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); Assert.fail("Unexpected exception!"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); Assert.fail("Unexpected exception!"); } catch (YggdrasilException e) { e.printStackTrace(); Assert.fail("Unexpected exception!"); } catch (FileNotFoundException e) { e.printStackTrace(); Assert.fail("Unexpected exception!"); } catch (IOException e) { e.printStackTrace(); Assert.fail("Unexpected exception!"); } } }