/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.replay; import java.io.IOException; import java.util.List; import java.util.Map; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.archive.wayback.ReplayRenderer; import org.archive.wayback.ResultURIConverter; import org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.Resource; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadContentException; import org.archive.wayback.replay.charset.CharsetDetector; import org.archive.wayback.replay.charset.StandardCharsetDetector; import org.archive.wayback.replay.html.ContextResultURIConverterFactory; /** * {@link ReplayRenderer} for rewriting textual resource with {@link TextDocument}. * <p> * {@link Resource} is first decoded (if {@code Transfer-Encoding} is applied), * and fully loaded into {@code TextDocument} object. Then {@link #updatePage} * method is called for actual rewrite. Then updated content is sent to the response. * </p> * <p>Customization Properties: * <ul> * <li>{@code jspInserts}: a list of Servlets for annotation inserts. It is * {@code updatePage}'s responsibility to perform actual insertion.</li> * <li>{@code charsetDetector}: {@link CharsetDetector} for detecting resource's * charset. Default is {@link StandardCharsetDetector}.</li> * <li>{@code httpHeaderProcessor} (constructor arg): {@link HttpHeaderProcessor} * for rewriting resource's HTTP headers.</li> * <li>{@code guessedCharsetHeader}: name of response HTTP header for sending * out resource's charset detected by {@code charsetDetector}. Default is * {@link #GUESSED_CHARSET_HEADER}.</li> * <li>{@code pageURIConverterFactory}: You can use different URI conversion * just for {@code TextDocument} and {#updatePage} by setting this property * to non-{@code null} {@code ContextResultURIConverterFactory}. It does not * affect URI conversion for HTTP headers.</li> * </ul> * </p> * @author brad */ public abstract class TextReplayRenderer implements ReplayRenderer { public static String GUESSED_CHARSET_HEADER = "X-Archive-Guessed-Charset"; public static String ORIG_ENCODING = "X-Archive-Orig-Encoding"; private String guessedCharsetHeader = GUESSED_CHARSET_HEADER; private List<String> jspInserts = null; private HttpHeaderProcessor httpHeaderProcessor; private CharsetDetector charsetDetector = new StandardCharsetDetector(); private ContextResultURIConverterFactory pageConverterFactory = null; public TextReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { this.httpHeaderProcessor = httpHeaderProcessor; } protected abstract void updatePage(TextDocument page, HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResult result, Resource resource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException; @Override public void renderResource(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResult result, Resource resource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, BadContentException { renderResource(httpRequest, httpResponse, wbRequest, result, resource, resource, uriConverter, results); } @Override public void renderResource(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResult result, Resource httpHeadersResource, Resource payloadResource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, BadContentException { // Decode resource (such as if gzip encoded) Resource decodedResource = decodeResource(httpHeadersResource, payloadResource); HttpHeaderOperation.copyHTTPMessageHeader(httpHeadersResource, httpResponse); Map<String,String> headers = HttpHeaderOperation.processHeaders( httpHeadersResource, result, uriConverter, httpHeaderProcessor); String charSet = charsetDetector.getCharset(httpHeadersResource, decodedResource, wbRequest); ResultURIConverter pageConverter = uriConverter; if (pageConverterFactory != null) { // XXX: ad-hoc code - ContextResultURIConverterFactory should take ResultURIConverter // as argument, so that it can simply wrap the original. String replayURIPrefix = (uriConverter instanceof ArchivalUrlResultURIConverter ? ((ArchivalUrlResultURIConverter)uriConverter).getReplayURIPrefix() : ""); ResultURIConverter ruc = pageConverterFactory.getContextConverter(replayURIPrefix); if (ruc != null) pageConverter = ruc; } // Load content into an HTML page, and resolve load-time URLs: TextDocument page = new TextDocument(decodedResource, result, uriConverter); page.readFully(charSet); updatePage(page, httpRequest, httpResponse, wbRequest, result, decodedResource, pageConverter, results); // set the corrected length: int bytes = page.getBytes().length; headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, String.valueOf(bytes)); if (guessedCharsetHeader != null) { headers.put(guessedCharsetHeader, page.getCharSet()); } // send back the headers: HttpHeaderOperation.sendHeaders(headers, httpResponse); // Tomcat will always send a charset... It's trying to be smarter than // we are. If the original page didn't include a "charset" as part of // the "Content-Type" HTTP header, then Tomcat will use the default.. // who knows what that is, or what that will do to the page.. // let's try explicitly setting it to what we used: httpResponse.setCharacterEncoding(page.getCharSet()); page.writeToOutputStream(httpResponse.getOutputStream()); } /** * @return the jspInserts */ public List<String> getJspInserts() { return jspInserts; } /** * @param jspInserts the jspInserts to set */ public void setJspInserts(List<String> jspInserts) { this.jspInserts = jspInserts; } /** * @return the charsetDetector */ public CharsetDetector getCharsetDetector() { return charsetDetector; } /** * @param charsetDetector the charsetDetector to set */ public void setCharsetDetector(CharsetDetector charsetDetector) { this.charsetDetector = charsetDetector; } /** * @return the String HTTP Header used to indicate what Wayback determined * was the pages original charset */ public String getGuessedCharsetHeader() { return guessedCharsetHeader; } /** * @param guessedCharsetHeader the String HTTP Header value used to indicate * to clients what Wayback determined was the pages original charset. If set * to null, the header will be omitted. */ public void setGuessedCharsetHeader(String guessedCharsetHeader) { this.guessedCharsetHeader = guessedCharsetHeader; } public static Resource decodeResource(Resource resource) throws IOException { return decodeResource(resource, resource); } /** * return gzip-decoding wrapper Resource if Resource has {@code Content-Encoding: gzip}. * return {@code payloadResource} otherwise. * <p>if headerResource's content is gzip-compressed (i.e. {@code Content-Encoding} is "{@code gzip}"), * return a wrapping Resource that returns decoded content.</p> * <p>As a side-effect, {@code Content-Encoding} and * {@code Transfer-Encoding} headers are removed from {@code headersResource} (this happens only when * {@code headerResoruce} is gzip-compressed.). It is assumed that {@code headerResource} and * {@code payloadResource} are captures of identical response content.</p> * <p>TODO: XArchiveHttpHeaderProcessor also does HTTP header removal. Check for refactoring case.</p> * @param headersResource Resource to read HTTP headers from. * @param payloadResource Resource to read content from (same as {@code headerResource} for regular captures, * different Resource if headersResource is a revisit record.) * @return The decoded Resource. * @throws IOException */ public static Resource decodeResource(Resource headersResource, Resource payloadResource) throws IOException { Map<String, String> headers = headersResource.getHttpHeaders(); if (headers != null) { String encoding = HttpHeaderOperation.getHeaderValue(headers, HttpHeaderOperation.HTTP_CONTENT_ENCODING); if (encoding != null) { if (encoding.toLowerCase().equals(GzipDecodingResource.GZIP)) { headers.put(ORIG_ENCODING, encoding); HttpHeaderOperation.removeHeader(headers, HttpHeaderOperation.HTTP_CONTENT_ENCODING); if (HttpHeaderOperation.isChunkEncoded(headers)) { HttpHeaderOperation.removeHeader(headers, HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER); } return new GzipDecodingResource(payloadResource); } // TODO: check for other encodings? } } return payloadResource; } /** * set {@link ContextResultURIConverterFactory} that creates replacement * {@link ResultURIConverter} for this {@code TextReplayRenderer}. * If set to non-{@code null}, its {@code getContextConverter} method will be called * with {@code replayURIPrefix}. If the method returns non-{@code null}, it will be * passed to {@link #updatePage} instead of the original. * @param pageConverterFactory {@link ContextResultURIConverterFactory} */ public void setPageURIConverterFactory(ContextResultURIConverterFactory pageConverterFactory) { this.pageConverterFactory = pageConverterFactory; } /** * return text to insert. * <p>{@code jspInserts} are executed in sequence, concatenating * their output.</p> * @param page for calling {@code includeJspString} method * @param httpRequest incoming request * @param httpResponse outgoing response * @param wbRequest wayback request info * @param results captures * @param result capture being replayed * @param resource resource being replayed * @return concatenated output of {@code jspInserts} * (empty if no {@code jspInserts} is configured.) * @throws IOException error from {@link TextDocument#includeJspString} * @throws ServletException error from {@link TextDocument#includeJspString} */ protected CharSequence buildInsertText(TextDocument page, HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResults results, CaptureSearchResult result, Resource resource) throws IOException, ServletException { if (jspInserts == null || jspInserts.isEmpty()) return ""; StringBuilder toInsert = new StringBuilder(300); for (String jspName : jspInserts) { toInsert.append(page.includeJspString(jspName, httpRequest, httpResponse, wbRequest, results, result, resource)); } return toInsert; } }