/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import junit.framework.TestCase;
import org.apache.commons.httpclient.Header;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
public class HeaderedArchiveRecordTest extends TestCase {
private static final String HTTPHEADER = "HTTP/1.1 200 OK\r\n"
+ "Last-Modified: Sun, 28 Aug 2005 14:10:55 GMT\r\n"
+ "Content-Length: 108\r\n" + "Connection: close\r\n"
+ "Content-Type: text/html\r\n" + "\r\n";
private static final String BODY = "<html>\r\n" + " <head>\r\n"
+ " <title>Neue Seite 1</title>\r\n" + " </head>\r\n"
+ " <body bgcolor=\"#000066\">\r\n" + " </body>\r\n" + "</html>";
public void testParseHttpHeadersInWARC() throws IOException {
final String url = "http://foo.maths.uq.edu.au/index.html";
// final String warcHeader = "WARC/0.10 000000000486 response " +
// url + " 20070315152520 " +
// "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " +
// "application/http; msgtype=response\r\n" +
// "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" +
// "IP-Address: 80.150.6.184\r\n" +
// "\r\n";
final String warcHeader = "WARC/0.12\r\n"
+ "MIME-Version: 1.0\r\n"
+ "WARC-Record-Type: response\r\n"
+ "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n"
+ "WARC-Date: 2006-09-19T17:20:24Z\r\n"
+ "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
+ "WARC-IP-Address: 80.150.6.184\r\n"
+ "Content-ID: <urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58>\r\n"
+ "Content-Type: application/http; msgtype=response\r\n"
+ "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
+ "\r\n";
final String hdr = warcHeader + HTTPHEADER + BODY;
WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
"READER_IDENTIFIER", 0, false, true);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
har.skipHttpHeader();
byte[] b = new byte[BODY.length()];
har.read(b);
String bodyRead = new String(b);
assertEquals(BODY, bodyRead);
assertHeaderCorrectlyParsed(har.getContentHeaders());
assertEquals("failed to retrieve Url from metadata", har.getHeader()
.getUrl(), url);
}
public void testParseHttpHeadersInARC() throws IOException {
final int len = HTTPHEADER.length() + BODY.length();
final int contentLength = BODY.length();
final String url = "http://www.ly.gov.tw:80/accpart.htm";
final String hdr = HTTPHEADER + BODY;
// Interesting difference between ARCRecord and WARCRecord is that the
// stream passed the ARCRecord is supposed to be just past the
// ARCRecord metadata line where as stream passed WARCRecord is at
// record start. TODO: Add to ARCRecord constructor that doesn't
// take an ArchiveRecordHeader but rather parses it from the stream.
ArchiveRecordHeader arh = new ArchiveRecordHeader() {
public int getContentBegin() {
// TODO: In ARCs, this is where http headers end and
// the content begins. Need to reconcile for generic
// HeaderedArchiveRecord processing. In this context, it
// makes sense setting it to zero -- HeaderedArchiveRecord
// will then figure it out.
return 0;
}
public String getDate() {
return null;
}
public String getDigest() {
return null;
}
public Set<String> getHeaderFieldKeys() {
return null;
}
public Map<String,Object> getHeaderFields() {
return null;
}
public Object getHeaderValue(String key) {
return null;
}
public long getLength() {
return len;
}
public long getContentLength() {
return contentLength;
}
public String getMimetype() {
return null;
}
public long getOffset() {
return 0;
}
public String getReaderIdentifier() {
return null;
}
public String getRecordIdentifier() {
return null;
}
public String getUrl() {
return url;
}
public String getVersion() {
return null;
}
};
ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
arh, 0, false, true, false);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
har.skipHttpHeader();
byte[] b = new byte[BODY.length()];
har.read(b);
String bodyRead = new String(b);
assertEquals(BODY, bodyRead);
assertHeaderCorrectlyParsed(har.getContentHeaders());
}
public void testEasierParseHttpHeadersInARC() throws IOException {
final String url = "http://www.archive.org/index.htm";
final String arcHeader = url
+ " 192.168.0.1 20070515111004 text/html 167568\n";
final String hdr = arcHeader + HTTPHEADER + BODY;
ARCRecord r = new ARCRecord(new ByteArrayInputStream(hdr.getBytes()),
"READER_IDENTIFIER", 0, false, true, false);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
har.skipHttpHeader();
byte[] b = new byte[BODY.length()];
har.read(b);
String bodyRead = new String(b);
assertEquals(BODY, bodyRead);
assertHeaderCorrectlyParsed(har.getContentHeaders());
assertEquals("failed to retrieve Url from metadata", har.getHeader()
.getUrl(), url);
}
private void assertHeaderCorrectlyParsed(Header[] headers) {
final List<String> orgHeaders = Arrays.asList(HTTPHEADER.split("\r\n"));
assertEquals("not all HTTP header entries have been retrieved",
orgHeaders.size(), headers.length + 1);
for (Header header : headers) {
assertTrue(orgHeaders.contains(header.getName() + ": "
+ header.getValue()));
}
}
public void testNoheaderWARC() throws IOException {
String b = "hello world";
String c = "WARC/0.12\r\nContent-Type: text/plain\r\n"
+ "Content-Length: " + b.length() + "\r\n\r\n" + b;
org.archive.io.warc.WARCRecord r = new org.archive.io.warc.WARCRecord(
new ByteArrayInputStream(c.getBytes()), "READER_IDENTIFIER", 0,
false, true);
HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);
assertTrue(har.isStrict());
}
}