/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.archivalurl; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Map; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.archive.wayback.ReplayRenderer; import org.archive.wayback.ResultURIConverter; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.Resource; import org.archive.wayback.core.UIResults; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.WaybackException; import org.archive.wayback.replay.HttpHeaderOperation; import org.archive.wayback.replay.HttpHeaderProcessor; import org.archive.wayback.replay.JSPExecutor; import org.archive.wayback.replay.TagMagix; import org.archive.wayback.replay.TextReplayRenderer; import org.archive.wayback.replay.charset.CharsetDetector; import org.archive.wayback.replay.charset.StandardCharsetDetector; import org.archive.wayback.replay.html.ContextResultURIConverterFactory; import org.archive.wayback.replay.html.IdentityResultURIConverterFactory; import org.archive.wayback.replay.html.ReplayParseContext; import org.archive.wayback.replay.html.RewriteDirector; import org.archive.wayback.util.htmllex.ContextAwareLexer; import org.archive.wayback.util.htmllex.ParseEventHandler; import org.archive.wayback.webapp.AccessPoint; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; /** * ReplayRenderer which attempts to rewrite text/html documents so URLs * references within the document load from the correct ArchivalURL AccessPoint. * * @author brad * */ public class ArchivalUrlSAXRewriteReplayRenderer implements ReplayRenderer { private ParseEventHandler delegator = null; private HttpHeaderProcessor httpHeaderProcessor; private CharsetDetector charsetDetector = new StandardCharsetDetector(); private ContextResultURIConverterFactory converterFactory = null; private boolean rewriteHttpsOnly; private final static String OUTPUT_CHARSET = "utf-8"; private static int FRAMESET_SCAN_BUFFER_SIZE = 16 * 1024; private static ReplayRenderer frameWrappingRenderer = null; public static ReplayRenderer getFrameWrappingRenderer() { return frameWrappingRenderer; } public static void setFrameWrappingRenderer(ReplayRenderer frameWrappingRenderer) { ArchivalUrlSAXRewriteReplayRenderer.frameWrappingRenderer = frameWrappingRenderer; } /** * @param httpHeaderProcessor which should process HTTP headers */ public ArchivalUrlSAXRewriteReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { this.httpHeaderProcessor = httpHeaderProcessor; } // assume this is only called for appropriate doc types: html public void renderResource(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResult result, Resource resource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, WaybackException { renderResource(httpRequest, httpResponse, wbRequest, result, resource, resource, uriConverter, results); } @Override public void renderResource(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResult result, Resource httpHeadersResource, Resource payloadResource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, WaybackException { Resource decodedResource = TextReplayRenderer.decodeResource(httpHeadersResource, payloadResource); // The URL of the page, for resolving in-page relative URLs: // URL url = null; // try { // url = new URL(result.getOriginalUrl()); // } catch (MalformedURLException e1) { // // TODO: this shouldn't happen... // e1.printStackTrace(); // throw new IOException(e1.getMessage()); // } // determine the character set used to encode the document bytes: String charSet = charsetDetector.getCharset(httpHeadersResource, decodedResource, wbRequest); ContextResultURIConverterFactory fact = createConverterFactory(uriConverter, httpRequest, wbRequest); // set up the context: ReplayParseContext context = // new ReplayParseContext(fact,url,result.getCaptureTimestamp()); new ReplayParseContext(fact, result); context.setRewriteHttpsOnly(rewriteHttpsOnly); // XXX same code in ArchivalUrlJSStringReplayRenderer String policy = result.getOraclePolicy(); if (policy == null) { AccessPoint accessPoint = wbRequest.getAccessPoint(); if (accessPoint != null) { policy = accessPoint.getRewriteDirective(result); } } if (policy != null) { context.setOraclePolicy(policy); } if (!wbRequest.isFrameWrapperContext()) { // in case this is an HTML page with FRAMEs, peek ahead an look: // TODO: make ThreadLocal: byte buffer[] = new byte[FRAMESET_SCAN_BUFFER_SIZE]; decodedResource.mark(FRAMESET_SCAN_BUFFER_SIZE); int amtRead = decodedResource.read(buffer); decodedResource.reset(); if(amtRead > 0) { StringBuilder foo = new StringBuilder(new String(buffer,charSet)); int frameIdx = TagMagix.getEndOfFirstTag(foo, "FRAMESET"); if(frameIdx != -1) { // insert flag so we don't add FRAMESET: context.putData(FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY,""); // // top-level Frameset: Draw the frame wrapper thingy: // frameWrappingRenderer.renderResource(httpRequest, // httpResponse, wbRequest, result, resource, // uriConverter, results); // return; } } } // copy the HTTP response code: HttpHeaderOperation.copyHTTPMessageHeader(httpHeadersResource, httpResponse); // transform the original headers according to our headerProcessor: Map<String,String> headers = HttpHeaderOperation.processHeaders( httpHeadersResource, result, uriConverter, httpHeaderProcessor); // prepare several objects for the parse: // a JSPExecutor: // JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest, // httpResponse, wbRequest, results, result, decodedResource); UIResults uiResults = new UIResults(wbRequest, uriConverter, results, result, decodedResource); JSPExecutor jspExec = new JSPExecutor(httpRequest, httpResponse, uiResults); // To make sure we get the length, we have to buffer it all up... ByteArrayOutputStream baos = new ByteArrayOutputStream(); context.setOutputCharset(OUTPUT_CHARSET); context.setOutputStream(baos); context.setJspExec(jspExec); // and finally, parse, using the special lexer that knows how to // handle javascript blocks containing unescaped HTML entities: Page lexPage = new Page(decodedResource,charSet); Lexer lexer = new Lexer(lexPage); Lexer.STRICT_REMARKS = false; ContextAwareLexer lex = new ContextAwareLexer(lexer, context); Node node; try { delegator.handleParseStart(context); while((node = lex.nextNode()) != null) { delegator.handleNode(context, node); } delegator.handleParseComplete(context); } catch (ParserException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } // At this point, baos contains the utf-8 encoded bytes of our result: byte[] utf8Bytes = baos.toByteArray(); // set the corrected length: headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, String.valueOf(utf8Bytes.length)); headers.put(TextReplayRenderer.GUESSED_CHARSET_HEADER, charSet); // send back the headers: HttpHeaderOperation.sendHeaders(headers, httpResponse); // Tomcat will always send a charset... It's trying to be smarter than // we are. If the original page didn't include a "charset" as part of // the "Content-Type" HTTP header, then Tomcat will use the default.. // who knows what that is, or what that will do to the page.. // let's try explicitly setting it to what we used: httpResponse.setCharacterEncoding(OUTPUT_CHARSET); httpResponse.getOutputStream().write(utf8Bytes); } protected ContextResultURIConverterFactory createConverterFactory(ResultURIConverter uriConverter, HttpServletRequest httpRequest, WaybackRequest wbRequest) { // sam ecode in ArchivalURLJSStringTransformerReplayRenderer ContextResultURIConverterFactory fact = null; if (uriConverter instanceof ArchivalUrlResultURIConverter) { fact = new ArchivalUrlContextResultURIConverterFactory( (ArchivalUrlResultURIConverter) uriConverter); } else if (converterFactory != null) { fact = converterFactory; } else { fact = new IdentityResultURIConverterFactory(uriConverter); } return fact; } /** * @return the charsetDetector */ public CharsetDetector getCharsetDetector() { return charsetDetector; } /** * @param charsetDetector the charsetDetector to set */ public void setCharsetDetector(CharsetDetector charsetDetector) { this.charsetDetector = charsetDetector; } /** * @return the delegator */ public ParseEventHandler getDelegator() { return delegator; } /** * @param delegator the delegator to set */ public void setDelegator(ParseEventHandler delegator) { this.delegator = delegator; } public ContextResultURIConverterFactory getConverterFactory() { return converterFactory; } public void setConverterFactory( ContextResultURIConverterFactory converterFactory) { this.converterFactory = converterFactory; } public boolean isRewriteHttpsOnly() { return rewriteHttpsOnly; } public void setRewriteHttpsOnly(boolean rewriteHttpsOnly) { this.rewriteHttpsOnly = rewriteHttpsOnly; } }