ArchivalUrlSAXRewriteReplayRenderer.java example

Explorer
wayback-machine-master
/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.archivalurl;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.archive.wayback.ReplayRenderer;
import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.WaybackException;
import org.archive.wayback.replay.HttpHeaderOperation;
import org.archive.wayback.replay.HttpHeaderProcessor;
import org.archive.wayback.replay.JSPExecutor;
import org.archive.wayback.replay.TagMagix;
import org.archive.wayback.replay.TextReplayRenderer;
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.charset.StandardCharsetDetector;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.util.htmllex.ContextAwareLexer;
import org.archive.wayback.util.htmllex.ParseEventHandler;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

/**
 * ReplayRenderer which attempts to rewrite text/html documents so that URL
 * references within the document load from the correct ArchivalURL AccessPoint.
 * 
 * @author brad
 *
 */
public class ArchivalUrlSAXRewriteReplayRenderer implements ReplayRenderer {
	/** Character set in which every rewritten document is sent to the client. */
	private final static String OUTPUT_CHARSET = "utf-8";
	/** Number of leading document bytes inspected when looking for a FRAMESET tag. */
	private static final int FRAMESET_SCAN_BUFFER_SIZE = 16 * 1024;
	/** Shared renderer reference; only stored and returned by this class. */
	private static ReplayRenderer frameWrappingRenderer = null;

	/** Receives the parse-start, per-node and parse-complete events. */
	private ParseEventHandler delegator = null;
	private final HttpHeaderProcessor httpHeaderProcessor;
	private CharsetDetector charsetDetector = new StandardCharsetDetector();

	/**
	 * @return the frameWrappingRenderer shared by all instances
	 */
	public static ReplayRenderer getFrameWrappingRenderer() {
		return frameWrappingRenderer;
	}

	/**
	 * @param frameWrappingRenderer the frameWrappingRenderer to share between
	 * all instances
	 */
	public static void setFrameWrappingRenderer(ReplayRenderer frameWrappingRenderer) {
		ArchivalUrlSAXRewriteReplayRenderer.frameWrappingRenderer = frameWrappingRenderer;
	}

	/**
	 * @param httpHeaderProcessor which should process HTTP headers
	 */
	public ArchivalUrlSAXRewriteReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) {
		this.httpHeaderProcessor = httpHeaderProcessor;
	}

	/**
	 * Lexes the (decoded) resource as HTML, hands every node to the configured
	 * delegator, buffers the output and then sends the processed headers,
	 * a corrected content length and the buffered bytes to the client.
	 *
	 * Assumes this is only called for appropriate doc types: html.
	 *
	 * @param httpRequest the incoming request, handed to the JSPExecutor
	 * @param httpResponse receives the status line, headers and document body
	 * @param wbRequest used for charset detection and to check whether the
	 * request is in frame-wrapper context
	 * @param result the capture being replayed; supplies the original URL and
	 * the capture timestamp
	 * @param resource the archived document
	 * @param uriConverter must be an ArchivalUrlResultURIConverter, it is cast
	 * to that type
	 * @param results all captures matching the request, handed to the
	 * JSPExecutor
	 * @throws IOException if the original URL is malformed, the document
	 * cannot be parsed, or reading/writing fails; for the first two cases the
	 * underlying exception is attached as the cause
	 * @throws IllegalStateException if no delegator has been set
	 */
	public void renderResource(HttpServletRequest httpRequest,
			HttpServletResponse httpResponse, WaybackRequest wbRequest,
			CaptureSearchResult result, Resource resource,
			ResultURIConverter uriConverter, CaptureSearchResults results)
			throws ServletException, IOException, WaybackException {

		// Fail before the response is touched rather than with a bare
		// NullPointerException after the status line has been copied:
		final ParseEventHandler handler = delegator;
		if(handler == null) {
			throw new IllegalStateException(
					"No ParseEventHandler delegator has been set");
		}

		resource = TextReplayRenderer.decodeResource(resource);

		// The URL of the page, for resolving in-page relative URLs:
		URL url;
		try {
			url = new URL(result.getOriginalUrl());
		} catch (MalformedURLException e) {
			// TODO: this shouldn't happen...
			throw asIOException(e);
		}
		// determine the character set used to encode the document bytes:
		String charSet = charsetDetector.getCharset(resource, wbRequest);

		ArchivalUrlContextResultURIConverterFactory fact =
			new ArchivalUrlContextResultURIConverterFactory(
					(ArchivalUrlResultURIConverter) uriConverter);
		// set up the context:
		ReplayParseContext context =
			new ReplayParseContext(fact,url,result.getCaptureTimestamp());

		// in case this is an HTML page with FRAMEs, peek ahead and look:
		if(!wbRequest.isFrameWrapperContext()
				&& hasFramesetTag(resource, charSet)) {
			// insert flag so we don't add FRAMESET.
			// NOTE: frameWrappingRenderer is not invoked from here; only the
			// flag is set.
			context.putData(
					FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY,"");
		}

		// copy the HTTP response code:
		HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse);

		// transform the original headers according to our headerProcessor:
		Map<String,String> headers = HttpHeaderOperation.processHeaders(
				resource, result, uriConverter, httpHeaderProcessor);

		// prepare several objects for the parse:

		// a JSPExecutor:
		JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest,
				httpResponse, wbRequest, results, result, resource);

		// To make sure we get the length, we have to buffer it all up...
		ByteArrayOutputStream baos = new ByteArrayOutputStream();

		context.setOutputCharset(OUTPUT_CHARSET);
		context.setOutputStream(baos);
		context.setJspExec(jspExec);

		// and finally, parse, using the special lexer that knows how to
		// handle javascript blocks containing unescaped HTML entities:
		Page lexPage = new Page(resource,charSet);
		Lexer lexer = new Lexer(lexPage);
		// NOTE(review): this assigns a static field of Lexer, so it affects
		// every Lexer in the JVM, not just this one.
		Lexer.STRICT_REMARKS = false;
		ContextAwareLexer lex = new ContextAwareLexer(lexer, context);
		try {
			handler.handleParseStart(context);
			Node node;
			while((node = lex.nextNode()) != null) {
				handler.handleNode(context, node);
			}
			handler.handleParseComplete(context);
		} catch (ParserException e) {
			throw asIOException(e);
		}

		// At this point, baos contains the utf-8 encoded bytes of our result:
		byte[] utf8Bytes = baos.toByteArray();
		// set the corrected length:
		headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER,
				String.valueOf(utf8Bytes.length));
		headers.put(TextReplayRenderer.GUESSED_CHARSET_HEADER, charSet);

		// send back the headers:
		HttpHeaderOperation.sendHeaders(headers, httpResponse);
		// Tomcat will always send a charset... It's trying to be smarter than
		// we are. If the original page didn't include a "charset" as part of
		// the "Content-Type" HTTP header, then Tomcat will use the default..
		// who knows what that is, or what that will do to the page..
		// let's try explicitly setting it to what we used:
		httpResponse.setCharacterEncoding(OUTPUT_CHARSET);

		httpResponse.getOutputStream().write(utf8Bytes);
	}

	/**
	 * Peeks at up to FRAMESET_SCAN_BUFFER_SIZE leading bytes of the resource
	 * and reports whether they contain a FRAMESET tag. The resource is marked
	 * before and reset after the peek.
	 *
	 * @param resource the document to inspect
	 * @param charSet name of the charset used to decode the peeked bytes
	 * @return true if TagMagix finds a FRAMESET tag in the peeked bytes
	 * @throws IOException if reading or resetting the resource fails, or the
	 * charset is not supported
	 */
	private static boolean hasFramesetTag(Resource resource, String charSet)
			throws IOException {
		// TODO: make ThreadLocal:
		byte[] buffer = new byte[FRAMESET_SCAN_BUFFER_SIZE];

		resource.mark(FRAMESET_SCAN_BUFFER_SIZE);
		// A single read() may return fewer bytes than requested, so keep
		// reading until the buffer is full or the stream ends. The total never
		// exceeds the mark limit, so reset() stays valid.
		int amtRead = 0;
		while(amtRead < buffer.length) {
			int n = resource.read(buffer, amtRead, buffer.length - amtRead);
			if(n <= 0) {
				break;
			}
			amtRead += n;
		}
		resource.reset();

		if(amtRead <= 0) {
			return false;
		}
		// decode only the bytes actually read, not the unused buffer tail:
		StringBuilder head =
			new StringBuilder(new String(buffer, 0, amtRead, charSet));
		return TagMagix.getEndOfFirstTag(head, "FRAMESET") != -1;
	}

	/**
	 * Wraps an exception in an IOException carrying the same message and the
	 * original exception as its cause, so the stack trace is not lost.
	 *
	 * @param cause the exception to wrap
	 * @return a new IOException with cause attached
	 */
	private static IOException asIOException(Exception cause) {
		IOException wrapped = new IOException(cause.getMessage());
		wrapped.initCause(cause);
		return wrapped;
	}

	/**
	 * @return the charsetDetector
	 */
	public CharsetDetector getCharsetDetector() {
		return charsetDetector;
	}

	/**
	 * @param charsetDetector the charsetDetector to set
	 */
	public void setCharsetDetector(CharsetDetector charsetDetector) {
		this.charsetDetector = charsetDetector;
	}

	/**
	 * @return the delegator
	 */
	public ParseEventHandler getDelegator() {
		return delegator;
	}

	/**
	 * @param delegator the delegator to set; must be set before
	 * renderResource is called
	 */
	public void setDelegator(ParseEventHandler delegator) {
		this.delegator = delegator;
	}
}