/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.replay; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.archive.wayback.ResultURIConverter; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.Resource; import org.archive.wayback.core.UIResults; import org.archive.wayback.core.WaybackRequest; /** * Class which wraps functionality for converting a Resource(InputStream + * HTTP headers) into a StringBuilder, performing several common URL * resolution methods against that StringBuilder, inserting arbitrary Strings * into the page, and then converting the page back to a byte array. * * @author brad * @version $Date$, $Revision$ */ public class TextDocument { // if documents are marked up before sending to clients, the data is // decoded into a String in chunks. This is how big a chunk to decode with. private final static int C_BUFFER_SIZE = 4096; private Resource resource = null; private CaptureSearchResult result = null; private ResultURIConverter uriConverter = null; /** * the internal StringBuilder */ public StringBuilder sb = null; private String charSet = null; private byte[] resultBytes = null; /** * @param resource * @param result * @param uriConverter */ public TextDocument(Resource resource, CaptureSearchResult result, ResultURIConverter uriConverter) { this.resource = resource; this.result = result; this.uriConverter = uriConverter; } public void addBase() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref == null) { insertAtStartOfHead("<base href=\"" + pageUrl + "\" />"); } else { pageUrl = existingBaseHref; } } /** * Update URLs inside the page, so those URLs which must be correct at * page load time resolve correctly to absolute URLs. * * This means ensuring there is a BASE HREF tag, adding one if missing, * and then resolving: * FRAME-SRC, META-URL, LINK-HREF, SCRIPT-SRC * tag-attribute pairs against either the existing BASE-HREF, or the * page's absolute URL if it was missing. */ public void resolvePageUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref == null) { insertAtStartOfHead("<base href=\"" + pageUrl + "\" />"); } else { pageUrl = existingBaseHref; } String markups[][] = { {"FRAME","SRC"}, {"META","URL"}, {"LINK","HREF"}, {"SCRIPT","SRC"}, {TagMagix.ANY_TAGNAME,"background"} }; // TODO: The classic WM added a js_ to the datespec, so NotInArchives // can return an valid javascript doc, and not cause Javascript errors. for(String tagAttr[] : markups) { TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, tagAttr[0], tagAttr[1]); } TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl); } /** * Update all URLs inside the page, so they resolve correctly to absolute * URLs within the Wayback service. */ public void resolveAllPageUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref != null) { pageUrl = existingBaseHref; } ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter); // TODO: forms...? String markups[][] = { {"FRAME","SRC"}, {"META","URL"}, {"LINK","HREF"}, {"SCRIPT","SRC"}, {"IMG","SRC"}, {"A","HREF"}, {"AREA","HREF"}, {"OBJECT","CODEBASE"}, {"OBJECT","CDATA"}, {"APPLET","CODEBASE"}, {"APPLET","ARCHIVE"}, {"EMBED","SRC"}, {"IFRAME","SRC"}, {TagMagix.ANY_TAGNAME,"background"} }; for(String tagAttr[] : markups) { TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, tagAttr[0], tagAttr[1]); } TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl); } public void resolveCSSUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); } public void resolveASXRefUrls() { // TODO: get url from Resource instead of SearchResult? String pageUrl = result.getOriginalUrl(); String captureDate = result.getCaptureTimestamp(); ResultURIConverter ruc = new MMSToHTTPResultURIConverter(uriConverter); TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, "REF", "HREF"); } public void stripHTML() { String stripped = sb.toString().replaceAll("\\<.*?>",""); sb.setLength(0); sb.append(stripped); } /** * @param charSet * @throws IOException */ public void readFully(String charSet) throws IOException { this.charSet = charSet; int recordLength = (int) resource.getRecordLength(); // convert bytes to characters for charset: InputStreamReader isr = new InputStreamReader(resource, charSet); char[] cbuffer = new char[C_BUFFER_SIZE]; // slurp the whole thing into RAM: sb = new StringBuilder(recordLength); //Skip the UTF-8 BOM 0xFEFF int firstChar = isr.read(); if ((firstChar != '\uFEFF') && (firstChar != -1)) { sb.append((char)firstChar); } for (int r = -1; (r = isr.read(cbuffer, 0, C_BUFFER_SIZE)) != -1;) { sb.append(cbuffer, 0, r); } } /** * Read bytes from input stream, using best-guess for character encoding * @throws IOException */ public void readFully() throws IOException { readFully(null); } /** * @return raw bytes contained in internal StringBuilder * @throws UnsupportedEncodingException */ public byte[] getBytes() throws UnsupportedEncodingException { if(resultBytes != null) { return resultBytes; } if(sb == null) { throw new IllegalStateException("No interal StringBuffer"); } if(resultBytes == null) { resultBytes = sb.toString().getBytes(charSet); } return resultBytes; } public void setResultBytes(byte[] resultBytes) { this.resultBytes = resultBytes; } /** * Write the contents of the page to the client. * * @param os * @throws IOException */ public void writeToOutputStream(OutputStream os) throws IOException { if(sb == null) { throw new IllegalStateException("No interal StringBuffer"); } byte[] b; try { b = getBytes(); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } os.write(b); } /** * insert {@code toInsert} at the beginning of this text. * @param toInsert */ public final void insertAtStartOfDocument(CharSequence toInsert) { sb.insert(0,toInsert); } /** * @param charSequence */ public void insertAtEndOfDocument(CharSequence charSequence) { sb.append("\n" + charSequence); } /** * @param toInsert */ public void insertAtStartOfHead(CharSequence toInsert) { int insertPoint = TagMagix.getEndOfFirstTag(sb,"head"); if (-1 == insertPoint) { insertPoint = 0; } sb.insert(insertPoint,toInsert); } /** * @param toInsert */ public void insertAtEndOfBody(CharSequence toInsert) { int insertPoint = sb.lastIndexOf("</body>"); if (-1 == insertPoint) { insertPoint = sb.lastIndexOf("</BODY>"); } if (-1 == insertPoint) { insertPoint = sb.length(); } sb.insert(insertPoint,toInsert); } /** * @param toInsert */ public void insertAtStartOfBody(CharSequence toInsert) { int insertPoint = TagMagix.getEndOfFirstTag(sb,"body"); if (-1 == insertPoint) { insertPoint = 0; } sb.insert(insertPoint,toInsert); } /** * @param jspPath * @param httpRequest * @param httpResponse * @param wbRequest * @param results * @param result * @param resource * @return * * @throws IOException * @throws ServletException */ public String includeJspString(String jspPath, HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, CaptureSearchResults results, CaptureSearchResult result, Resource resource) throws ServletException, IOException { if (wbRequest.isAjaxRequest()) { return ""; } UIResults uiResults = new UIResults(wbRequest,uriConverter,results, result,resource); StringHttpServletResponseWrapper wrappedResponse = new StringHttpServletResponseWrapper(httpResponse); uiResults.forward(httpRequest, wrappedResponse, jspPath); return wrappedResponse.getStringResponse(); } /** * @param jsUrl The javascript URL to be wrapped * @return A <code><script></code> tag containing the provided javascript URL. */ public String getJSIncludeString(final String jsUrl) { return "<script type=\"text/javascript\" src=\"" + jsUrl + "\" ></script>\n"; } /** * @return the charSet */ public String getCharSet() { return charSet; } /** * @param charSet the charSet to set */ public void setCharSet(String charSet) { this.charSet = charSet; } private class SpecialResultURIConverter implements ResultURIConverter { private static final String EMAIL_PROTOCOL_PREFIX = "mailto:"; private static final String JAVASCRIPT_PROTOCOL_PREFIX = "javascript:"; private ResultURIConverter base = null; public SpecialResultURIConverter(ResultURIConverter base) { this.base = base; } @Override public String makeReplayURI(String datespec, String url) { if(url.startsWith(EMAIL_PROTOCOL_PREFIX)) { return url; } if(url.startsWith(JAVASCRIPT_PROTOCOL_PREFIX)) { return url; } return base.makeReplayURI(datespec, url); } } private class MMSToHTTPResultURIConverter implements ResultURIConverter { private static final String MMS_PROTOCOL_PREFIX = "mms://"; private static final String HTTP_PROTOCOL_PREFIX = "http://"; private ResultURIConverter base = null; public MMSToHTTPResultURIConverter(ResultURIConverter base) { this.base = base; } @Override public String makeReplayURI(String datespec, String url) { if(url.startsWith(MMS_PROTOCOL_PREFIX)) { url = HTTP_PROTOCOL_PREFIX + url.substring(MMS_PROTOCOL_PREFIX.length()); } return base.makeReplayURI(datespec, url); } } }