FastArchivalUrlReplayParseEventHandlerTest.java example

Explorer
wayback-machine-master
package org.archive.wayback.archivalurl;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;

import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.util.htmllex.ContextAwareLexer;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

import junit.framework.TestCase;

/**
 * End-to-end test for {@link FastArchivalUrlReplayParseEventHandler}: feeds
 * small HTML snippets through the lexer and the handler, then checks the
 * rewritten markup that was written to the context's output stream.
 */
public class FastArchivalUrlReplayParseEventHandlerTest extends TestCase {

	/**
	 * Checks that root-relative and document-relative {@code href} values, as
	 * well as an absolute URL embedded in a {@code javascript:} href, are
	 * rewritten to point at the replay prefix with the capture timestamp.
	 *
	 * @throws Exception if the snippet cannot be parsed or encoded
	 */
	public void testRewrite() throws Exception {
		assertEquals("<html><a href=\"http://replay.archive.org/2001/http://www.example.com/foo.html\">foo</a></html>",doEndToEnd("<html><a href=\"/foo.html\">foo</a></html>"));
		assertEquals("<html><a href=\"http://replay.archive.org/2001/http://www.example.com/foo.html\">foo</a></html>",doEndToEnd("<html><a href=\"foo.html\">foo</a></html>"));
		assertEquals("<html><a href=\"javascript:doWin('http://replay.archive.org/2001/http://www.symphony.org/')\">American Symphony Orchestra League</a></html>",doEndToEnd("<html><a href=\"javascript:doWin('http://www.symphony.org')\">American Symphony Orchestra League</a></html>"));
	}

	/**
	 * Runs {@code input} through the complete lex-and-rewrite pipeline, using
	 * {@code http://www.example.com/} as the page URL, {@code 2001} as the
	 * timestamp and {@code http://replay.archive.org/} as the replay prefix.
	 *
	 * @param input HTML document to rewrite
	 * @return the rewritten document, decoded from the UTF-8 output buffer
	 * @throws IOException if the base URL is malformed or the parser fails;
	 *         the underlying exception is attached as the cause
	 * @throws Exception for any other failure raised by the handler
	 */
	public String doEndToEnd(String input) throws Exception {
		String baseUrl = "http://www.example.com/";
		String timestamp = "2001";
		String outputCharset = "UTF-8";
		String charSet = "UTF-8";

		ByteArrayInputStream bais = new ByteArrayInputStream(input.getBytes(charSet));

		// The JSP-related settings are passed as null here; presumably this
		// keeps the handler from inserting any JSP-generated content, which
		// matches the expected strings in testRewrite() -- TODO confirm.
		FastArchivalUrlReplayParseEventHandler delegator = new FastArchivalUrlReplayParseEventHandler();
		delegator.setCommentJsp(null);
		delegator.setJspInsertPath(null);

		ArchivalUrlResultURIConverter uriConverter = new ArchivalUrlResultURIConverter();
		uriConverter.setReplayURIPrefix("http://replay.archive.org/");

		ArchivalUrlContextResultURIConverterFactory fact =
			new ArchivalUrlContextResultURIConverterFactory(uriConverter);

		// The URL of the page, for resolving in-page relative URLs:
		URL url;
		try {
			url = new URL(baseUrl);
		} catch (MalformedURLException e) {
			// Not expected for the hard-coded base URL; keep the original
			// exception as the cause so the failure stays diagnosable.
			throw new IOException(e.getMessage(), e);
		}

		// To make sure we get the length, we have to buffer it all up...
		ByteArrayOutputStream baos = new ByteArrayOutputStream();

		// set up the context:
		ReplayParseContext context =
			new ReplayParseContext(fact, url, timestamp);
		context.setOutputCharset(outputCharset);
		context.setOutputStream(baos);
		context.setJspExec(null);

		// and finally, parse, using the special lexer that knows how to
		// handle javascript blocks containing unescaped HTML entities:
		Page lexPage = new Page(bais, charSet);
		Lexer lexer = new Lexer(lexPage);
		// NOTE: STRICT_REMARKS is a static field on Lexer, so this assignment
		// affects every Lexer in the JVM, not just the one created above.
		Lexer.STRICT_REMARKS = false;
		ContextAwareLexer lex = new ContextAwareLexer(lexer, context);
		Node node;
		try {
			while ((node = lex.nextNode()) != null) {
				delegator.handleNode(context, node);
			}
			delegator.handleParseComplete(context);
		} catch (ParserException e) {
			// Chain the parser failure instead of printing it and discarding
			// its stack trace.
			throw new IOException(e.getMessage(), e);
		}

		// At this point, baos contains the utf-8 encoded bytes of our result:
		return new String(baos.toByteArray(), outputCharset);
	}
}