/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.archivalurl; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Map; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.archive.wayback.ReplayRenderer; import org.archive.wayback.ResultURIConverter; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.Resource; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.WaybackException; import org.archive.wayback.replay.HttpHeaderOperation; import org.archive.wayback.replay.HttpHeaderProcessor; import org.archive.wayback.replay.JSPExecutor; import org.archive.wayback.replay.TagMagix; import org.archive.wayback.replay.TextReplayRenderer; import org.archive.wayback.replay.charset.CharsetDetector; import org.archive.wayback.replay.charset.StandardCharsetDetector; import org.archive.wayback.replay.html.ReplayParseContext; import org.archive.wayback.util.htmllex.ContextAwareLexer; import org.archive.wayback.util.htmllex.ParseEventHandler; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; /** * ReplayRenderer which attempts to rewrite text/html documents so URLs * references within the document load from the correct ArchivalURL AccessPoint. * * @author brad * */ public class ArchivalUrlSAXRewriteReplayRenderer implements ReplayRenderer { private ParseEventHandler delegator = null; private HttpHeaderProcessor httpHeaderProcessor; private CharsetDetector charsetDetector = new StandardCharsetDetector(); private final static String OUTPUT_CHARSET = "utf-8"; private static int FRAMESET_SCAN_BUFFER_SIZE = 16 * 1024; private static ReplayRenderer frameWrappingRenderer = null; public static ReplayRenderer getFrameWrappingRenderer() { return frameWrappingRenderer; } public static void setFrameWrappingRenderer(ReplayRenderer frameWrappingRenderer) { ArchivalUrlSAXRewriteReplayRenderer.frameWrappingRenderer = frameWrappingRenderer; } /** * @param httpHeaderProcessor which should process HTTP headers */ public ArchivalUrlSAXRewriteReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { this.httpHeaderProcessor = httpHeaderProcessor; } // assume this is only called for appropriate doc types: html public void renderResource(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResult result, Resource resource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, WaybackException { resource = TextReplayRenderer.decodeResource(resource); // The URL of the page, for resolving in-page relative URLs: URL url = null; try { url = new URL(result.getOriginalUrl()); } catch (MalformedURLException e1) { // TODO: this shouldn't happen... e1.printStackTrace(); throw new IOException(e1.getMessage()); } // determine the character set used to encode the document bytes: String charSet = charsetDetector.getCharset(resource, wbRequest); ArchivalUrlContextResultURIConverterFactory fact = new ArchivalUrlContextResultURIConverterFactory( (ArchivalUrlResultURIConverter) uriConverter); // set up the context: ReplayParseContext context = new ReplayParseContext(fact,url,result.getCaptureTimestamp()); if(!wbRequest.isFrameWrapperContext()) { // in case this is an HTML page with FRAMEs, peek ahead an look: // TODO: make ThreadLocal: byte buffer[] = new byte[FRAMESET_SCAN_BUFFER_SIZE]; resource.mark(FRAMESET_SCAN_BUFFER_SIZE); int amtRead = resource.read(buffer); resource.reset(); if(amtRead > 0) { StringBuilder foo = new StringBuilder(new String(buffer,charSet)); int frameIdx = TagMagix.getEndOfFirstTag(foo, "FRAMESET"); if(frameIdx != -1) { // insert flag so we don't add FRAMESET: context.putData(FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY,""); // // top-level Frameset: Draw the frame wrapper thingy: // frameWrappingRenderer.renderResource(httpRequest, // httpResponse, wbRequest, result, resource, // uriConverter, results); // return; } } } // copy the HTTP response code: HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); // transform the original headers according to our headerProcessor: Map<String,String> headers = HttpHeaderOperation.processHeaders( resource, result, uriConverter, httpHeaderProcessor); // prepare several objects for the parse: // a JSPExecutor: JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest, httpResponse, wbRequest, results, result, resource); // To make sure we get the length, we have to buffer it all up... ByteArrayOutputStream baos = new ByteArrayOutputStream(); context.setOutputCharset(OUTPUT_CHARSET); context.setOutputStream(baos); context.setJspExec(jspExec); // and finally, parse, using the special lexer that knows how to // handle javascript blocks containing unescaped HTML entities: Page lexPage = new Page(resource,charSet); Lexer lexer = new Lexer(lexPage); Lexer.STRICT_REMARKS = false; ContextAwareLexer lex = new ContextAwareLexer(lexer, context); Node node; try { delegator.handleParseStart(context); while((node = lex.nextNode()) != null) { delegator.handleNode(context, node); } delegator.handleParseComplete(context); } catch (ParserException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } // At this point, baos contains the utf-8 encoded bytes of our result: byte[] utf8Bytes = baos.toByteArray(); // set the corrected length: headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, String.valueOf(utf8Bytes.length)); headers.put(TextReplayRenderer.GUESSED_CHARSET_HEADER, charSet); // send back the headers: HttpHeaderOperation.sendHeaders(headers, httpResponse); // Tomcat will always send a charset... It's trying to be smarter than // we are. If the original page didn't include a "charset" as part of // the "Content-Type" HTTP header, then Tomcat will use the default.. // who knows what that is, or what that will do to the page.. // let's try explicitly setting it to what we used: httpResponse.setCharacterEncoding(OUTPUT_CHARSET); httpResponse.getOutputStream().write(utf8Bytes); } /** * @return the charsetDetector */ public CharsetDetector getCharsetDetector() { return charsetDetector; } /** * @param charsetDetector the charsetDetector to set */ public void setCharsetDetector(CharsetDetector charsetDetector) { this.charsetDetector = charsetDetector; } /** * @return the delegator */ public ParseEventHandler getDelegator() { return delegator; } /** * @param delegator the delegator to set */ public void setDelegator(ParseEventHandler delegator) { this.delegator = delegator; } }