/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.warc;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.UTF8Bytes;
import org.archive.io.WriterPoolMember;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;
import org.archive.util.ArchiveUtils;
import org.archive.util.TmpDirTestCase;
import org.archive.util.anvl.ANVLRecord;
/**
* Test Writer and Reader.
* @author stack
* @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
*/
public class WARCWriterTest
extends TmpDirTestCase implements WARCConstants {
private static final AtomicInteger SERIAL_NO = new AtomicInteger();
RecordIDGenerator generator = new UUIDGenerator();
/**
* Suffix used in the names of WARC files made by JUnit.
*/
private static final String SUFFIX = "JUNIT";
private static final String SOME_URL = "http://www.archive.org/test/";
/**
 * Checks that {@code checkHeaderValue} accepts a plain token and throws
 * {@link IllegalArgumentException} for a value containing a space and for
 * a value containing a control character.
 */
@SuppressWarnings("unchecked")
public void testCheckHeaderLineValue() throws Exception {
    WARCWriter writer = new WARCWriter(
        SERIAL_NO,
        new WARCWriterPoolSettingsData(
            "","test",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
    // Close the writer even when one of the checks below fails.
    try {
        // A plain token must pass without an exception.
        writer.checkHeaderValue("one");

        try {
            writer.checkHeaderValue("with space");
            fail("Expected IllegalArgumentException for a value with a space");
        } catch (IllegalArgumentException expected) {
            // Expected: the value with a space was rejected.
        }

        try {
            // "\0" is the octal escape for NUL; the "x0000" that follows is
            // ordinary text, so the value holds exactly one control character.
            writer.checkHeaderValue("with\0x0000controlcharacter");
            fail("Expected IllegalArgumentException for a value with a control character");
        } catch (IllegalArgumentException expected) {
            // Expected: the value with a control character was rejected.
        }
    } finally {
        writer.close();
    }
}
/**
 * Exercises {@code checkHeaderLineMimetypeParameter}: simple mimetypes are
 * accepted, a mimetype with a parameter is returned unchanged, and a
 * folded (CRLF + space) parameter is returned on a single line.
 */
@SuppressWarnings("unchecked")
public void testMimetypes() throws IOException {
    WARCWriter writer = new WARCWriter(SERIAL_NO,
        new WARCWriterPoolSettingsData(
            "m","testM",1,false,Collections.EMPTY_LIST,Collections.EMPTY_LIST,generator));
    // Close the writer even when an assertion fails.
    try {
        writer.checkHeaderLineMimetypeParameter("text/xml");
        writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
        // Expected value goes first so failure messages read correctly.
        assertEquals("text/plain; charset=SHIFT-JIS",
            writer.checkHeaderLineMimetypeParameter(
                "text/plain; charset=SHIFT-JIS"));
        assertEquals("multipart/mixed; boundary=\"simple boundary\"",
            writer.checkHeaderLineMimetypeParameter(
                "multipart/mixed; \r\n boundary=\"simple boundary\""));
    } finally {
        writer.close();
    }
}
/**
 * Writes the sample records through {@code writeFile} twice, first with an
 * uncompressed writer and then with a compressed one, both targeting the
 * test temporary directory.
 */
public void testWriteRecord() throws IOException {
    final File[] outputDirs = new File[] {getTmpDir()};
    final String prefix = this.getClass().getName();

    // First pass: compression off.
    WARCWriter plainWriter = new WARCWriter(SERIAL_NO,
        new WARCWriterPoolSettingsData(prefix, "templateWR1", -1, false,
            Arrays.asList(outputDirs), null, generator));
    writeFile(plainWriter);
    plainWriter.close();

    // Second pass: compression on.
    WARCWriter gzipWriter = new WARCWriter(SERIAL_NO,
        new WARCWriterPoolSettingsData(prefix, "templateWR2", -1, true,
            Arrays.asList(outputDirs), null, generator));
    writeFile(gzipWriter);
    gzipWriter.close();
}
/**
 * Writes a warcinfo record followed by the basic sample records, then
 * always closes the writer and deletes the file it produced.
 * @param writer Writer to write the records with.
 * @throws IOException
 */
private void writeFile(final WARCWriter writer)
throws IOException {
    try {
        writeWarcinfoRecord(writer);
        writeBasicRecords(writer);
    } finally {
        writer.close();
        // NOTE(review): presumably getFile() is null when writing failed
        // before any file was created; guard so a NullPointerException here
        // cannot replace the exception that caused the failure.
        File written = writer.getFile();
        if (written != null) {
            written.delete();
        }
    }
}
/**
 * Writes one warcinfo record whose content is a small ANVL block
 * (labels "size" and "operator").
 * @param writer Writer to write the record with.
 * @throws IOException
 */
private void writeWarcinfoRecord(WARCWriter writer)
throws IOException {
    // Build the ANVL payload first so its length is known up front.
    ANVLRecord fields = new ANVLRecord();
    fields.addLabelValue("size", "1G");
    fields.addLabelValue("operator", "igor");
    final byte[] payload = fields.getUTF8Bytes();

    WARCRecordInfo info = new WARCRecordInfo();
    info.setType(WARCRecordType.warcinfo);
    info.setUrl(null);
    info.setCreate14DigitDate(ArchiveUtils.getLog14Date());
    info.setMimetype(ANVLRecord.MIMETYPE);
    info.setExtraHeaders(null);
    info.setEnforceLength(true);
    info.setContentStream(new ByteArrayInputStream(payload));
    info.setContentLength((long) payload.length);
    info.setRecordId(
        writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString()));

    writer.writeRecord(info);
}
/**
 * Writes ten metadata records for "http://www.archive.org/", each carrying
 * two extra header fields and a short numbered text body.
 * @param writer Writer to write the records with.
 * @throws IOException
 */
protected void writeBasicRecords(final WARCWriter writer)
throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.metadata);
    recordInfo.setUrl("http://www.archive.org/");
    recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
    recordInfo.setMimetype("no/type");
    recordInfo.setEnforceLength(true);
    ANVLRecord headerFields = new ANVLRecord();
    headerFields.addLabelValue("x", "y");
    headerFields.addLabelValue("a", "b");
    recordInfo.setExtraHeaders(headerFields);
    final UUIDGenerator idGenerator = new UUIDGenerator();
    final String content = "Any old content.";
    for (int i = 0; i < 10; i++) {
        // Request an id per record rather than reusing one id for all ten;
        // WARC record ids are meant to be unique.
        // NOTE(review): presumably each call yields a new UUID-based URI - confirm.
        URI rid = idGenerator.getQualifiedRecordID(
            TYPE, WARCRecordType.metadata.toString());
        recordInfo.setRecordId(rid);
        String body = i + ". " + content;
        byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
        recordInfo.setContentStream(new ByteArrayInputStream(bodyBytes));
        recordInfo.setContentLength((long)bodyBytes.length);
        writer.writeRecord(recordInfo);
    }
}
/**
 * Convenience overload for callers that do not need a page index.
 * @return Generic HTML content, as built by {@link #getContent(String)}
 * when given <code>null</code>.
 */
protected static String getContent() {
    final String noIndex = null;
    return getContent(noIndex);
}
/**
 * Builds a minimal HTTP 200 response (status line, Content-Type header,
 * blank line) followed by a tiny HTML page.
 * @param indexStr Index to mention in the page title and body; when
 * <code>null</code> the page is labelled "Some Page" instead.
 * @return Generic HTML Content with mention of passed <code>indexStr</code>
 * in title and body.
 */
protected static String getContent(String indexStr) {
    final String page;
    if (indexStr == null) {
        page = "Some Page";
    } else {
        page = "Page #" + indexStr;
    }
    StringBuilder response = new StringBuilder();
    response.append("HTTP/1.1 200 OK\r\n");
    response.append("Content-Type: text/html\r\n\r\n");
    response.append("<html><head><title>").append(page).append("</title></head>");
    response.append("<body>").append(page).append("</body></html>");
    return response.toString();
}
/**
 * Write one HTML resource record whose URL and content mention the index.
 * @param w Where to write.
 * @param index An index to put into the record URL and content.
 * @return Length in bytes of the record content handed to the writer.
 * @throws IOException
 */
protected int writeRandomHTTPRecord(WARCWriter w, int index)
throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.resource);
    recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
    recordInfo.setMimetype("text/html; charset=UTF-8");
    recordInfo.setRecordId(w.generateRecordId(null));
    recordInfo.setEnforceLength(true);
    String indexStr = Integer.toString(index);
    recordInfo.setUrl("http://www.one.net/id=" + indexStr);
    // Encode explicitly as UTF-8, matching the declared charset and the
    // encoding used by writeBasicRecords, rather than the platform default.
    byte[] record = getContent(indexStr).getBytes(UTF8Bytes.UTF8);
    recordInfo.setContentLength((long) record.length);
    recordInfo.setContentStream(new ByteArrayInputStream(record));
    // Add the named field for the ip.
    recordInfo.addExtraHeader(NAMED_FIELD_IP_LABEL, "127.0.0.1");
    w.writeRecord(recordInfo);
    return record.length;
}
/**
 * Fill a WARC with HTML Records.
 * Old files for <code>baseName</code> are cleaned up first; the writer is
 * closed even when writing a record fails.
 * @param baseName WARC basename.
 * @param compress Whether to compress or not.
 * @param maxSize Maximum WARC size.
 * @param recordCount How many records.
 * @return The written file.
 * @throws IOException
 */
private File writeRecords(String baseName, boolean compress,
int maxSize, int recordCount)
throws IOException {
    cleanUpOldFiles(baseName);
    File [] files = {getTmpDir()};
    WARCWriter w = new WARCWriter(SERIAL_NO, new WARCWriterPoolSettingsData(
        baseName + '-' + SUFFIX, "${prefix}", maxSize, compress, Arrays.asList(files), null, generator));
    try {
        for (int i = 0; i < recordCount; i++) {
            writeRandomHTTPRecord(w, i);
        }
    } finally {
        // Do not leave the file open if a write throws.
        w.close();
    }
    File written = w.getFile();
    // Fail with a clear message rather than a NullPointerException below.
    assertNotNull("Writer produced no file for " + baseName, written);
    assertTrue("Doesn't exist: " + written.getAbsolutePath(),
        written.exists());
    return written;
}
/**
 * Run validation of passed file.
 * Validates the whole file with one reader, then re-opens the file once
 * per record (last to first) to fetch that record by its offset and check
 * it has a non-empty mimetype. Finally checks that every record header
 * reports a positive length.
 * @param f File to validate.
 * @param recordCount Expected count of records, or -1 to skip the count
 * check.
 * @throws FileNotFoundException
 * @throws IOException
 */
private void validate(File f, int recordCount)
throws FileNotFoundException, IOException {
    List<ArchiveRecordHeader> headers;
    WARCReader reader = WARCReaderFactory.get(f);
    assertNotNull(reader);
    try {
        if (recordCount == -1) {
            headers = reader.validate();
        } else {
            headers = reader.validate(recordCount);
        }
    } finally {
        reader.close();
    }
    // Now, run through each of the records doing absolute get going from
    // the end to start. Reopen the WARC each time so no context is shared
    // between this check and the previous one.
    for (int i = headers.size() - 1; i >= 0; i--) {
        ArchiveRecordHeader h = headers.get(i);
        reader = WARCReaderFactory.get(f);
        try {
            ArchiveRecord r = reader.get(h.getOffset());
            String mimeType = r.getHeader().getMimetype();
            assertTrue("Record is bogus",
                mimeType != null && mimeType.length() > 0);
        } finally {
            // Close the reader even when the assertion fails.
            reader.close();
        }
    }
    // -1 means "no expected count"; comparing against it could never pass.
    if (recordCount != -1) {
        assertEquals("Metadatas not equal", recordCount, headers.size());
    }
    for (ArchiveRecordHeader h : headers) {
        assertTrue("Record is empty", h.getLength() > 0);
    }
}
/**
 * Writes two HTML records to an uncompressed WARC and validates the file,
 * expecting one extra record for the header record.
 */
public void testWriteRecords() throws IOException {
    final int htmlRecords = 2;
    // The header record adds one to the expected total.
    final int expectedTotal = htmlRecords + 1;
    File warc = writeRecords("writeRecords", false,
        DEFAULT_MAX_WARC_FILE_SIZE, htmlRecords);
    validate(warc, expectedTotal);
}
/**
 * Writes a compressed WARC, records the URL and offset of its second
 * record, then checks that a reader opened at that offset returns the same
 * URL and that iterating from that offset visits one record fewer than
 * iterating from the start.
 */
public void testRandomAccess() throws IOException {
    final int recordCount = 3;
    File f = writeRecords("randomAccess", true, DEFAULT_MAX_WARC_FILE_SIZE,
        recordCount);

    // Walk every record; remember the second one's URL and offset.
    String url = null;
    long offset = -1;
    long totalRecords = 0;
    WARCReader reader = WARCReaderFactory.get(f);
    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();
                totalRecords++) {
            WARCRecord ar = (WARCRecord)i.next();
            if (totalRecords == 1) {
                url = ar.getHeader().getUrl();
                offset = ar.getHeader().getOffset();
            }
        }
    } finally {
        reader.close();
    }
    // Without a second record the offset below would still be -1.
    assertTrue("Expected at least two records in " + f, offset >= 0);

    // Open directly at the remembered offset and compare URLs.
    reader = WARCReaderFactory.get(f, offset);
    try {
        ArchiveRecord ar = reader.get();
        assertEquals(url, ar.getHeader().getUrl());
        ar.close();
    } finally {
        reader.close();
    }

    // Get reader again. See how iterator works with offset.
    int count = 0;
    reader = WARCReaderFactory.get(f, offset);
    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext(); i.next()) {
            count++;
        }
    } finally {
        reader.close();
    }
    // Starting at the second record skips exactly the first one.
    assertEquals(totalRecords - 1, count);
}
/**
 * Writes two HTML records to a compressed WARC and validates the file,
 * expecting one extra record for the header record.
 */
public void testWriteRecordCompressed() throws IOException {
    final int htmlRecords = 2;
    // The header record adds one to the expected total.
    final int expectedTotal = htmlRecords + 1;
    File warc = writeRecords("writeRecordCompressed", true,
        DEFAULT_MAX_WARC_FILE_SIZE, htmlRecords);
    validate(warc, expectedTotal);
}
/**
 * Creates a writer that targets the test temporary directory, using the
 * default maximum WARC file size.
 * @param name Prefix passed to the writer settings.
 * @param compress Whether the writer should compress.
 * @return The new writer.
 */
protected WARCWriter createWARCWriter(String name,
boolean compress) {
    final List<File> outputDirs = Arrays.asList(new File[] {getTmpDir()});
    final String template = "${prefix}-" + SUFFIX;
    WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
        name, template, DEFAULT_MAX_WARC_FILE_SIZE, compress,
        outputDirs, null, generator);
    return new WARCWriter(SERIAL_NO, settings);
}
/**
 * Copies the UTF-8 bytes of a string into a new in-memory output stream.
 * The charset is named explicitly so the bytes do not depend on the
 * platform default encoding.
 * @param str String to encode.
 * @return Stream holding the encoded bytes.
 * @throws IOException
 */
protected static ByteArrayOutputStream getBaos(String str)
throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    baos.write(str.getBytes("UTF-8"));
    return baos;
}
/**
 * Writes one resource record with the given URL, mimetype and content.
 * @param w Where to write.
 * @param url URL to set on the record.
 * @param mimetype Mimetype to set on the record.
 * @param len Content length to declare for the record.
 * @param baos Holds the bytes used as the record content.
 * @throws IOException
 */
protected static void writeRecord(WARCWriter w, String url,
String mimetype, int len, ByteArrayOutputStream baos)
throws IOException {
    final byte[] payload = baos.toByteArray();

    WARCRecordInfo info = new WARCRecordInfo();
    info.setType(WARCRecordType.resource);
    info.setUrl(url);
    info.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
    info.setMimetype(mimetype);
    info.setRecordId(w.generateRecordId(null));
    info.setExtraHeaders(null);
    info.setContentStream(new ByteArrayInputStream(payload));
    info.setContentLength((long) len);
    info.setEnforceLength(true);

    w.writeRecord(info);
}
/**
 * Iterates all records of the reader, closing each one, and asserts that
 * every record after the first has the URL {@code SOME_URL}.
 * @param r Reader to iterate.
 * @return Number of records seen.
 * @throws IOException
 */
protected int iterateRecords(WARCReader r)
throws IOException {
    int seen = 0;
    Iterator<ArchiveRecord> records = r.iterator();
    while (records.hasNext()) {
        ArchiveRecord current = records.next();
        current.close();
        // The first record is exempt from the URL check.
        if (seen > 0) {
            String recordUrl = current.getHeader().getUrl();
            assertTrue("Unexpected URL " + recordUrl,
                recordUrl.equals(SOME_URL));
        }
        seen++;
    }
    return seen;
}
/**
 * Creates a writer and writes a single "text/html" record for
 * {@code SOME_URL} holding the generic HTML content.
 * @param name Name passed on to {@code createWARCWriter}.
 * @param compressed Whether the writer should compress.
 * @return The still-open writer.
 * @throws IOException
 */
protected WARCWriter createWithOneRecord(String name,
boolean compressed)
throws IOException {
    WARCWriter result = createWARCWriter(name, compressed);
    final String html = getContent();
    final int declaredLength = html.length();
    ByteArrayOutputStream body = getBaos(html);
    writeRecord(result, SOME_URL, "text/html", declaredLength, body);
    return result;
}
/**
 * Expects that writing a record whose URL contains a space does not
 * advance the writer position.
 */
public void testSpaceInURL() throws IOException {
    assertEquals("Unexpected successful writing occurred",
        0, holeyUrl("testSpaceInURL", false, " "));
}
/**
 * Expects that writing a record whose URL contains a tab does not advance
 * the writer position.
 */
public void testTabInURL() throws IOException {
    assertEquals("Unexpected successful writing occurred",
        0, holeyUrl("testTabInURL", false, "\t"));
}
/**
 * Writes one well-formed record, then attempts a second record whose URL
 * has <code>urlInsert</code> spliced into it, and reports how far the
 * writer position moved during that second write.
 * @param name Name passed on to {@code createWithOneRecord}.
 * @param compress Whether the writer should compress.
 * @param urlInsert Text inserted into the second record's URL.
 * @return Writer position after the second write minus the position
 * before it.
 * @throws IOException
 */
protected long holeyUrl(String name, boolean compress, String urlInsert)
throws IOException {
    WARCWriter writer = createWithOneRecord(name, compress);
    // Close the writer even if the second write throws.
    try {
        long startPos = writer.getPosition();
        String content = getContent();
        // Second record: its URL carries the insert that should spoil it.
        writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
            content.length(), getBaos(content));
        // The position is read here, before the finally block closes the writer.
        return writer.getPosition() - startPos;
    } finally {
        writer.close();
    }
}
/**
 * Write a WARC file for other tests to use. The file holds a single
 * "text/html" record for {@code SOME_URL} with the generic HTML content.
 * @param arcdir Directory to write to.
 * @param compress True if file should be compressed.
 * @return WARC written.
 * @throws IOException
 */
public static File createWARCFile(File arcdir, boolean compress)
throws IOException {
    File [] files = {arcdir};
    WARCWriter writer =
        new WARCWriter(SERIAL_NO,
            new WARCWriterPoolSettingsData(
                "",
                "test",
                DEFAULT_MAX_WARC_FILE_SIZE,
                compress,
                Arrays.asList(files),
                null,
                new UUIDGenerator()));
    // Close the writer even if writing the record throws.
    try {
        String content = getContent();
        writeRecord(writer, SOME_URL, "text/html", content.length(),
            getBaos(content));
    } finally {
        writer.close();
    }
    return writer.getFile();
}
// public void testSpeed() throws IOException {
// ARCWriter writer = createArcWithOneRecord("speed", true);
// // Add a record with a length that is too long.
// String content = getContent();
// final int count = 100000;
// logger.info("Starting speed write of " + count + " records.");
// for (int i = 0; i < count; i++) {
// writeRecord(writer, SOME_URL, "text/html", content.length(),
// getBaos(content));
// }
// writer.close();
// logger.info("Finished speed write test.");
// }
/**
 * Reads a few bytes of a record into a buffer at a non-zero buffer offset.
 * This used to fail because of bad math when finding where in the buffer
 * to read.
 */
public void testArcRecordOffsetReads() throws Exception {
    // Get a compressed WARC with one record.
    WriterPoolMember w =
        createWithOneRecord("testArcRecordInBufferStream", true);
    w.close();
    // Get reader on said WARC; close it whatever happens below.
    WARCReader r = WARCReaderFactory.get(w.getFile());
    try {
        final Iterator<ArchiveRecord> i = r.iterator();
        // Skip the first record, the file's leading meta record.
        i.next();
        assertTrue("Expected a record after the leading meta record",
            i.hasNext());
        // Now we're at the first and only content record.
        ArchiveRecord ar = (WARCRecord) i.next();
        // Read 4 bytes into the tail of a 17-byte buffer (offset 13).
        final byte[] buffer = new byte[17];
        final int maxRead = 4;
        int totalRead = 0;
        while (totalRead < maxRead) {
            int read = ar.read(buffer, 13 + totalRead, maxRead - totalRead);
            // Check each read on its own so a -1 (end of stream) is never
            // added into the running total.
            assertTrue("Premature end of record after " + totalRead
                + " byte(s)", read > 0);
            totalRead += read;
        }
    } finally {
        r.close();
    }
}
}