package org.archive.wayback.archivalurl; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import org.archive.wayback.ResultURIConverter; import org.archive.wayback.replay.html.ReplayParseContext; import org.archive.wayback.util.htmllex.ContextAwareLexer; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; import junit.framework.TestCase; public class FastArchivalUrlReplayParseEventHandlerTest extends TestCase { public void testRewrite() throws Exception { assertEquals("<html><a href=\"http://replay.archive.org/2001/http://www.example.com/foo.html\">foo</a></html>",doEndToEnd("<html><a href=\"/foo.html\">foo</a></html>")); assertEquals("<html><a href=\"http://replay.archive.org/2001/http://www.example.com/foo.html\">foo</a></html>",doEndToEnd("<html><a href=\"foo.html\">foo</a></html>")); assertEquals("<html><a href=\"javascript:doWin('http://replay.archive.org/2001/http://www.symphony.org/')\">American Symphony Orchestra League</a></html>",doEndToEnd("<html><a href=\"javascript:doWin('http://www.symphony.org')\">American Symphony Orchestra League</a></html>")); } public String doEndToEnd(String input) throws Exception { String baseUrl = "http://www.example.com/"; String timestamp = "2001"; String outputCharset = "UTF-8"; String charSet = "UTF-8"; ByteArrayInputStream bais = new ByteArrayInputStream(input.getBytes(charSet)); FastArchivalUrlReplayParseEventHandler delegator = new FastArchivalUrlReplayParseEventHandler(); delegator.setCommentJsp(null); delegator.setJspInsertPath(null); ArchivalUrlResultURIConverter uriConverter = new ArchivalUrlResultURIConverter(); uriConverter.setReplayURIPrefix("http://replay.archive.org/"); ArchivalUrlContextResultURIConverterFactory fact = new ArchivalUrlContextResultURIConverterFactory( (ArchivalUrlResultURIConverter) uriConverter); // The URL of the page, for resolving in-page relative URLs: URL url = null; try { url = new URL(baseUrl); } catch (MalformedURLException e1) { // TODO: this shouldn't happen... e1.printStackTrace(); throw new IOException(e1.getMessage()); } // To make sure we get the length, we have to buffer it all up... ByteArrayOutputStream baos = new ByteArrayOutputStream(); // set up the context: ReplayParseContext context = new ReplayParseContext(fact,url,timestamp); context.setOutputCharset(outputCharset); context.setOutputStream(baos); context.setJspExec(null); // and finally, parse, using the special lexer that knows how to // handle javascript blocks containing unescaped HTML entities: Page lexPage = new Page(bais,charSet); Lexer lexer = new Lexer(lexPage); Lexer.STRICT_REMARKS = false; ContextAwareLexer lex = new ContextAwareLexer(lexer, context); Node node; try { while((node = lex.nextNode()) != null) { delegator.handleNode(context, node); } delegator.handleParseComplete(context); } catch (ParserException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } // At this point, baos contains the utf-8 encoded bytes of our result: return new String(baos.toByteArray(),outputCharset); } }