/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.replay.html.transformer; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.archive.wayback.replay.html.ReplayParseContext; import org.archive.wayback.replay.html.StringTransformer; /** * Translates absolute URLs found in JavaScript code block. * <p>Looks for http/https absolute URLs in JavaScript code and translates * them with {@link ReplayParseContext#contextualizeUrl(String)}.</p> * <p>You can customize the pattern for finding URLs with {@code regex} property. * Regular expression must have at least one <em>capturing group</em>, and the first * capturing group is assumed to enclose URL to be rewritten. * (new feature 2014-04-22) Any matching text preceding and * following the first group will be preserved in the output.</p> * <p>For example: if you want to replace protocol-relative URL in addition to * regular full URL in JavaScript, you could use conservative regex like: * <pre> * "[\"']((?:https?:)?//(?:[^/]+@)?[^@:/]+(?:\\.[^@:/]+)+(?:[0-9]+)?)" * </pre> * Note single/double quote preceding URL is preserved in 2014-04-22 version and on.</p> * <p>TODO: org.archive.wayback.archivalurl.ArchivalUrlJSReplayRenderer has * similar code. can be consolidated, like ArchivalURLJSStringTransformerReplayRenderer?</p> * <p>May 1, 2014: slight design change: * Now JSStringTransformer does not run it's own should-rewrite check and sends all matching * text to {@link ReplayParseContext#contextualizeUrl(String)}. More specifically it no longer * be affected by {@code rewriteHttpsOnly} flag. This is a design choice to keep * {@code StringTransformer} detached from replay mode knowledge and focus on find-and-replace URLs * </p> * @author brad * */ public class JSStringTransformer implements StringTransformer { private final static Pattern defaultHttpPattern = Pattern .compile("(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"); private Pattern pattern = defaultHttpPattern; /** * a regular expression for searching URLs in the target resource. * @param regex */ public void setRegex(String regex) { pattern = Pattern.compile(regex); } public String getRegex() { return pattern.pattern(); } public String transform(ReplayParseContext context, String input) { StringBuffer replaced = new StringBuffer(input.length()); Matcher m = pattern.matcher(input); while (m.find()) { String url = m.group(1); String pre = input.substring(m.start(), m.start(1)); String post = input.substring(m.end(1), m.end()); String origUrl = url; url = context.contextualizeUrl(url); if (url != origUrl) { // reverse some changes made to url by contextualizeUrl method, that // may break assumptions in subsequent JavaScript processing. // eg. "http://example.org" -> "/20140101012345/http://example.org/" // eg. "https://domain" + ".example.org" -> "http://domain/" + ".example.org" // eg. "https://domain." + "example.org" -> "http://domain" + "example.org" // remove trailing "/" if origUrl doesn't have it. As Wayback does not need // trailing slash, it may make sense to this everywhere. Just doing this fix // in JavaScript for now. if (url.endsWith("/") && !origUrl.endsWith("/")) { url = url.substring(0, url.length() - 1); } // add trailing "." (removed by canonicalizer) back, if origUrl has it. if (origUrl.endsWith(".") && !url.endsWith(".")) { url = url + "."; } } m.appendReplacement(replaced, Matcher.quoteReplacement(pre + url + post)); } m.appendTail(replaced); return replaced.toString(); } }