/**
*
*/
package org.archive.wayback.archivalurl;
import java.io.IOException;
import javax.servlet.http.HttpServletResponse;
import junit.framework.TestCase;
import org.archive.format.http.HttpHeaders;
import org.archive.io.warc.TestWARCReader;
import org.archive.io.warc.TestWARCRecordInfo;
import org.archive.io.warc.WARCRecord;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor;
import org.archive.wayback.replay.TextReplayRenderer;
import org.archive.wayback.replay.TransparentReplayRendererTest.TestServletOutputStream;
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;
import org.archive.wayback.util.htmllex.ParseContext;
import org.archive.wayback.util.htmllex.ParseEventHandler;
import org.easymock.Capture;
import org.easymock.EasyMock;
import org.htmlparser.Node;
/**
 * Unit test for {@link ArchivalUrlSAXRewriteReplayRenderer}.
 * <p>
 * The renderer under test is wired up in {@link #setUp()} with EasyMock mocks of
 * {@link HttpServletResponse}, {@link ParseEventHandler} and {@link ResultURIConverter},
 * and with a stub {@link CharsetDetector} that always answers {@code "UTF-8"}.
 * Whatever the renderer writes to the response is collected in a
 * {@link TestServletOutputStream}.
 * </p>
 * @author kenji
 *
 */
public class ArchivalUrlSAXRewriteReplayRendererTest extends TestCase {

    /** Frameset document fed to the renderer by the FERRET_DONE flag tests. */
    private static final String FRAMESET_HTML =
            "<frameset cols=\"25%,*,25%\">\n" +
            "  <frame src=\"top.html\">\n" +
            "  <frame src=\"center.html\">\n" +
            "  <frame src=\"bottom.html\">\n" +
            "</frameset>\n";

    // collaborators, (re)created for every test in setUp()
    ResultURIConverter uriConverter;
    HttpServletResponse response;
    ParseEventHandler nodeHandler;
    WaybackRequest wbRequest;
    CaptureSearchResult result;
    // receives everything written to the mocked response's output stream
    TestServletOutputStream servletOutput = new TestServletOutputStream();
    // class under test
    ArchivalUrlSAXRewriteReplayRenderer cut;

    /**
     * {@link ParseEventHandler} that writes each node's {@link Node#toHtml()} text,
     * encoded as UTF-8, to the output stream of the {@link ReplayParseContext}.
     * Parse start and parse complete events are ignored.
     */
    public static class TestParseEventHandler implements ParseEventHandler {
        @Override
        public void handleParseStart(ParseContext context) {
        }

        @Override
        public void handleNode(ParseContext context, Node node)
                throws IOException {
            String html = node.toHtml();
            //System.out.print(html);
            ((ReplayParseContext) context).getOutputStream().write(
                    html.getBytes("UTF-8"));
        }

        @Override
        public void handleParseComplete(ParseContext context) {
        }
    }

    /* (non-Javadoc)
     * @see junit.framework.TestCase#setUp()
     */
    @Override
    protected void setUp() throws Exception {
        super.setUp();

        RedirectRewritingHttpHeaderProcessor httpHeaderProcessor = new RedirectRewritingHttpHeaderProcessor();
        httpHeaderProcessor.setPrefix("X-Archive-Orig-");
        cut = new ArchivalUrlSAXRewriteReplayRenderer(httpHeaderProcessor);

        uriConverter = EasyMock.createMock(ResultURIConverter.class);

        response = EasyMock.createMock(HttpServletResponse.class);
        // the renderer is expected to ask for the output stream exactly once.
        EasyMock.expect(response.getOutputStream()).andReturn(servletOutput);

        nodeHandler = EasyMock.createMock(ParseEventHandler.class);
        cut.setDelegator(nodeHandler);

        wbRequest = new WaybackRequest();
        wbRequest.setFrameWrapperContext(false);

        // replace default CharsetDetector (StandardCharsetDetector) with a stub
        // so as not to depend on its behavior.
        cut.setCharsetDetector(new CharsetDetector() {
            @Override
            public String getCharset(Resource httpHeadersResource,
                    Resource payloadResource, WaybackRequest wbRequest) {
                return "UTF-8";
            }
        });

        result = new CaptureSearchResult();
        result.setOriginalUrl("http://www.example.com/");
    }

    /**
     * Builds a {@link Resource} from a test WARC record created with
     * {@link TestWARCRecordInfo#createCompressedHttpResponse} for content type
     * {@code text/html}. Headers are already parsed on return.
     * @param payloadBytes HTML payload of the response
     * @return resource backed by the generated record
     * @throws IOException if the test record cannot be built or read
     */
    public static Resource createTestHtmlResource(byte[] payloadBytes) throws IOException {
        return openFirstRecord(
                TestWARCRecordInfo.createCompressedHttpResponse("text/html", payloadBytes));
    }

    /**
     * Builds a {@link Resource} from a test WARC record created with
     * {@link TestWARCRecordInfo#createRevisitHttpResponse} for content type
     * {@code text/html}. Headers are already parsed on return.
     * @param payloadBytes payload of the revisited capture; only its length is used
     * @param withHeader passed through to {@code createRevisitHttpResponse}
     * @param gzipContent passed through to {@code createRevisitHttpResponse}
     * @return resource backed by the generated record
     * @throws IOException if the test record cannot be built or read
     */
    public static Resource createTestRevisitResource(byte[] payloadBytes, boolean withHeader, boolean gzipContent) throws IOException {
        return openFirstRecord(
                TestWARCRecordInfo.createRevisitHttpResponse(
                        "text/html", payloadBytes.length, withHeader, gzipContent));
    }

    /**
     * Wraps the first record of a {@link TestWARCReader} over {@code recinfo}
     * in a {@link WarcResource} and parses its headers.
     */
    private static Resource openFirstRecord(WARCRecordInfo recinfo) throws IOException {
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }

    /**
     * Records the parse events expected by the echo tests: one handleParseStart(),
     * one handleParseComplete() and at least one handleNode(), the latter delegated
     * to a {@link TestParseEventHandler} so that node text reaches the output.
     * @return capture holding the context passed to handleNode()
     */
    private Capture<ReplayParseContext> expectEchoingParseEvents() throws Exception {
        Capture<ReplayParseContext> parseContextCapture = new Capture<ReplayParseContext>();
        nodeHandler.handleParseStart(EasyMock.<ReplayParseContext>anyObject());
        nodeHandler.handleParseComplete(EasyMock.<ReplayParseContext>anyObject());
        nodeHandler.handleNode(EasyMock.capture(parseContextCapture), EasyMock.<Node>anyObject());
        EasyMock.expectLastCall().andDelegateTo(new TestParseEventHandler()).atLeastOnce();
        return parseContextCapture;
    }

    /**
     * Records the response calls expected by the echo tests: status 200, character
     * encoding "utf-8", the Content-Length, guessed-charset and Content-Type headers,
     * and any number of non-null "X-Archive-Orig-" prefixed headers.
     * @param contentLength expected Content-Length value
     */
    private void expectRewrittenHtmlResponse(int contentLength) throws Exception {
        response.setStatus(200);
        response.setCharacterEncoding("utf-8");
        response.setHeader("Content-Length", Integer.toString(contentLength));
        response.setHeader(TextReplayRenderer.GUESSED_CHARSET_HEADER, "UTF-8");
        response.setHeader("Content-Type", "text/html");
        response.setHeader(EasyMock.matches("X-Archive-Orig-.*"), EasyMock.<String>notNull());
        EasyMock.expectLastCall().anyTimes();
    }

    /**
     * Switches all mocks to replay state, runs the renderer and verifies the mocks.
     */
    private void renderAndVerify(Resource headerResource, Resource payloadResource) throws Exception {
        EasyMock.replay(nodeHandler, response, uriConverter);

        cut.renderResource(null, response, wbRequest, result, headerResource, payloadResource, uriConverter, null);

        EasyMock.verify(nodeHandler, response, uriConverter);
    }

    /**
     * Asserts that the servlet output equals {@code payload} and that the captured
     * context resolves relative URLs against the original URL set in setUp().
     */
    private void assertEchoedOutputAndBaseUrl(String payload,
            Capture<ReplayParseContext> parseContextCapture) throws Exception {
        // NOTE: this compares output of Node.toHtml() with the original input.
        // there's a good chance of Node.toHtml() producing different text than original HTML.
        String out = servletOutput.getString();
        assertEquals("servlet output", payload, out);

        ReplayParseContext context = parseContextCapture.getValue();
        // testing indirectly because ReplayParseContext has no method returning baseUrl.
        assertEquals("baseUrl is correctly set up", "http://www.example.com/a.html", context.resolve("a.html"));
    }

    /**
     * Renders {@link #FRAMESET_HTML} with lenient response expectations and returns
     * the (non-null) context that was passed to handleParseStart().
     */
    private ReplayParseContext renderFramesetAndCaptureContext() throws Exception {
        byte[] payloadBytes = FRAMESET_HTML.getBytes("UTF-8");
        Resource payloadResource = createTestHtmlResource(payloadBytes);

        Capture<ReplayParseContext> parseContextCapture = new Capture<ReplayParseContext>();
        nodeHandler.handleParseStart(EasyMock.capture(parseContextCapture));
        nodeHandler.handleParseComplete(EasyMock.<ReplayParseContext>anyObject());
        nodeHandler.handleNode(EasyMock.<ParseContext>anyObject(), EasyMock.<Node>anyObject());
        EasyMock.expectLastCall().anyTimes();

        // do not care about these expectations in this test.
        response.setStatus(200);
        response.setCharacterEncoding("utf-8");
        response.setHeader(EasyMock.<String>notNull(), EasyMock.<String>notNull());
        EasyMock.expectLastCall().anyTimes();

        renderAndVerify(payloadResource, payloadResource);

        ReplayParseContext context = parseContextCapture.getValue();
        assertNotNull(context);
        return context;
    }

    /**
     * test basic behavior with simple input.
     * expectations:
     * <ul>
     * <li>reads <em>decoded (uncompressed)</em> contents from archive record.</li>
     * <li>calls delegator.handleParseStart() and handleParseComplete() just once, respectively.</li>
     * <li>calls HttpServletResponse.setHeader() for Content-Type, Content-Length and
     * {@link TextReplayRenderer#GUESSED_CHARSET_HEADER} (not configurable as with TextReplayRenderer
     * subclasses).</li>
     * <li>calls HttpServletResponse.setCharacterEncoding() with value "utf-8"</li>
     * <li>passes CaptureSearchResult.originalUrl to ParseContext.baseUrl</li>
     * </ul>
     * URL translation is not tested here because it is not a responsibility of this class.
     * it should be tested in a test case for {@link ParseEventHandler} implementations.
     * @throws Exception
     */
    public void testBasicBehavior() throws Exception {
        final String payload = "<HTML></HTML>\n";
        final byte[] payloadBytes = payload.getBytes("UTF-8");
        Resource payloadResource = createTestHtmlResource(payloadBytes);

        Capture<ReplayParseContext> parseContextCapture = expectEchoingParseEvents();
        expectRewrittenHtmlResponse(payloadBytes.length);

        // same resource serves as both header and payload resource.
        renderAndVerify(payloadResource, payloadResource);

        assertEchoedOutputAndBaseUrl(payload, parseContextCapture);
    }

    /**
     * test revisit record (in new format with HTTP headers).
     * Expectations are the same as in {@link #testBasicBehavior()}; only the header
     * resource differs (a revisit record instead of the payload record itself).
     * @throws Exception
     */
    public void testRevisit() throws Exception {
        final String payload = "<HTML></HTML>\n";
        final byte[] payloadBytes = payload.getBytes("UTF-8");
        Resource payloadResource = createTestHtmlResource(payloadBytes);
        // payloadResource is Content-Encoding: gzip, revisit must be gzipped, too.
        Resource headerResource = createTestRevisitResource(payloadBytes, true, true);

        Capture<ReplayParseContext> parseContextCapture = expectEchoingParseEvents();
        expectRewrittenHtmlResponse(payloadBytes.length);

        renderAndVerify(headerResource, payloadResource);

        assertEchoedOutputAndBaseUrl(payload, parseContextCapture);
    }

    // no test for old-style revisit record as headerResource, because it is caller's responsibility to
    // set headerResource = payloadResource in this case.
    // NOTE: the disabled test below is kept for reference only. It calls createTestRevisitResource()
    // with two arguments while the method takes three, so it does not compile as written.
//    /**
//     * test revisit record (in old format without HTTP headers).
//     * @throws Exception
//     */
//    public void testOldRevisit() throws Exception {
//        final String payload = "<HTML></HTML>\n";
//        final byte[] payloadBytes = payload.getBytes("UTF-8");
//        Resource payloadResource = createTestHtmlResource(payloadBytes);
//        Resource headerResource = createTestRevisitResource(payloadBytes, false);
//
//        Capture<ReplayParseContext> parseContextCapture = new Capture<ReplayParseContext>();
//        Capture<Node> nodeCapture = new Capture<Node>();
//        nodeHandler.handleParseStart(EasyMock.<ReplayParseContext>anyObject());
//        nodeHandler.handleParseComplete(EasyMock.<ReplayParseContext>anyObject());
//        TestParseEventHandler delegate = new TestParseEventHandler();
//        nodeHandler.handleNode(EasyMock.capture(parseContextCapture), EasyMock.capture(nodeCapture));
//        EasyMock.expectLastCall().andDelegateTo(delegate).atLeastOnce();
//
//        response.setStatus(200);
//        response.setCharacterEncoding("utf-8");
//        response.setHeader("Content-Length", Integer.toString(payloadBytes.length));
//        response.setHeader(TextReplayRenderer.GUESSED_CHARSET_HEADER, "UTF-8");
//        response.setHeader("Content-Type", "text/html");
//        response.setHeader(EasyMock.matches("X-Archive-Orig-.*"), EasyMock.<String>notNull());
//        EasyMock.expectLastCall().anyTimes();
//
//        EasyMock.replay(nodeHandler, response, uriConverter);
//
//        cut.renderResource(null, response, wbRequest, result, headerResource, payloadResource, uriConverter, null);
//
//        EasyMock.verify(nodeHandler, response, uriConverter);
//
//        // NOTE: this compares output of Node.toHtml() with the original input.
//        // there's a good chance of Node.toHtml() producing different text than original HTML.
//        String out = servletOutput.getString();
//        assertEquals("servlet output", payload, out);
//
//        ReplayParseContext context = parseContextCapture.getValue();
//        // testing indirectly because ReplayParseContext has no method returning baseUrl.
//        assertEquals("baseUrl is correctly set up", "http://www.example.com/a.html", context.resolve("a.html"));
//    }

    /**
     * With frameWrapperContext left at {@code false} (see setUp()), rendering a
     * frameset document must leave a non-null FERRET_DONE entry in the parse context.
     * @throws Exception
     */
    public void testDoneFlagSetForFrameset() throws Exception {
        ReplayParseContext context = renderFramesetAndCaptureContext();

        // it's kind of odd to use this constant defined in FastArchivalUrlReplayParseEventHandler.
        // ArchivalUrlSAXRewriteReplayRenderer is supposed not to be tied with particular ParseEventHandler
        // implementation.
        assertNotNull("FERRET_DONE flag is set",
                context.getData(FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY));
    }

    /**
     * With frameWrapperContext set to {@code true}, rendering the same frameset
     * document must NOT put a FERRET_DONE entry in the parse context.
     * @throws Exception
     */
    public void testDoneFlagNotSetForFrameWrapperContext() throws Exception {
        // !!! KEY SETTING OF THIS TEST !!!
        wbRequest.setFrameWrapperContext(true);

        ReplayParseContext context = renderFramesetAndCaptureContext();

        // it's kind of odd to use this constant defined in FastArchivalUrlReplayParseEventHandler.
        // ArchivalUrlSAXRewriteReplayRenderer is supposed not to be tied with particular ParseEventHandler
        // implementation.
        assertNull("FERRET_DONE flag is NOT set",
                context.getData(FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY));
    }

    // TODO: more tests
    // handles unescaped HTML entities in <script> element correctly (what exactly does "correctly" mean?)
    // HttpServletResponse gets output in UTF-8, no matter what original encoding might be.
}