/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.replay;

import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.util.url.UrlOperations;

/**
 * Library for updating arbitrary attributes in arbitrary tags to rewrite HTML
 * documents so URI references point back into the Wayback Machine. Attempts to
 * make minimal changes so nothing gets broken during this process.
 *
 * <p>All rewriting is regex based and operates in place on a
 * {@link StringBuilder}. The compiled-pattern caches are guarded by
 * class-level synchronization; the {@code StringBuilder} passed to the
 * markup methods is mutated without any locking.
 *
 * @author brad
 * @version $Date$, $Revision: 1668 $
 */
public class TagMagix {

	// minimum length XXXX in a 'ATTR=XXXX' declaration... mostly handy
	// to keep us from trying to mark up javascript generated HTML/CSS code.
	private static final int MIN_ATTR_LENGTH = 3;

	// caches of compiled patterns; only touched from the synchronized getters
	private static final HashMap<String, Pattern> pcPatterns =
		new HashMap<String, Pattern>();

	private static final HashMap<String, Pattern> wholeTagPatterns =
		new HashMap<String, Pattern>();

	private static final HashMap<String, Pattern> attrPatterns =
		new HashMap<String, Pattern>();

	// "value"
	private static final String QUOTED_ATTR_VALUE = "(?:\"[^\">]*\")";

	// \"value\" (quotes preceded by a backslash, e.g. HTML inside a JS string)
	private static final String ESC_QUOTED_ATTR_VALUE = "(?:\\\\\"[^>\\\\]*\\\\\")";

	// 'value'
	private static final String APOSED_ATTR_VALUE = "(?:'[^'>]*')";

	// value (unquoted, runs to the next whitespace, quote or '>')
	private static final String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)";

	// left non-final on purpose: it is public, and making it final would
	// change the class's external interface
	public static String ANY_TAGNAME = "[a-z]+";

	private static final String STYLE_ATTR_NAME = "style";

	private static final String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|"
			+ APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|"
			+ RAW_ATTR_VALUE;

	private static final String cssUrlPatString =
		"(?<!@namespace [\\w\\s]{0,16})url\\s*\\(\\s*([\\\\\"']*.*?[\\\\\"']*)\\s*\\)";

	private static final String cssImportNoUrlPatString =
		"@import\\s+([\"'].+?[\"'])";

	private static final Pattern cssImportNoUrlPattern = Pattern
			.compile(cssImportNoUrlPatString);

	private static final Pattern cssUrlPattern = Pattern
			.compile(cssUrlPatString, Pattern.CASE_INSENSITIVE);

	/**
	 * get (and cache) a regex Pattern for locating an HTML attribute value
	 * within a particular tag. if found, the pattern will have the attribute
	 * value in group 1. Note that the attribute value may contain surrounding
	 * apostrophe(') or quote(") characters.
	 *
	 * @param tagName regex fragment matching the tag name
	 * @param attrName regex fragment matching the attribute name
	 * @return Pattern to match the tag-attribute's value
	 */
	private synchronized static Pattern getPattern(String tagName,
			String attrName) {

		String key = tagName + "    " + attrName;
		Pattern pc = pcPatterns.get(key);
		if (pc == null) {

			String tagPatString = "<\\s*" + tagName + "\\s+[^>]*\\b" + attrName
					+ "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";

			pc = Pattern.compile(tagPatString, Pattern.CASE_INSENSITIVE);
			pcPatterns.put(key, pc);
		}
		return pc;
	}

	/**
	 * get (and cache) a regex Pattern for locating an entire HTML start tag.
	 *
	 * @param tagName regex fragment matching the tag name
	 * @return Pattern to match the tag
	 */
	private synchronized static Pattern getWholeTagPattern(String tagName) {

		Pattern pc = wholeTagPatterns.get(tagName);
		if (pc == null) {

			String tagPatString = "<\\s*" + tagName + "((>)|(\\s+[^>]*>))";

			pc = Pattern.compile(tagPatString, Pattern.CASE_INSENSITIVE);
			wholeTagPatterns.put(tagName, pc);
		}
		return pc;
	}

	/**
	 * get (and cache) a regex Pattern for locating an attribute value within an
	 * HTML start tag. If this pattern matches, the attribute value will be in
	 * group(1), and will include surrounding quotes, or apos, if they were
	 * present in the original HTML.
	 *
	 * @param attrName regex fragment matching the attribute name
	 * @return Pattern to match the attributes value
	 */
	private synchronized static Pattern getAttrPattern(String attrName) {

		Pattern pc = attrPatterns.get(attrName);
		if (pc == null) {

			String attrPatString = "\\b" + attrName + "\\s*=\\s*("
					+ ANY_ATTR_VALUE + ")(?:\\s|>)?";

			pc = Pattern.compile(attrPatString, Pattern.CASE_INSENSITIVE);
			attrPatterns.put(attrName, pc);
		}
		return pc;
	}

	/**
	 * Width, in characters, of the delimiter that opens a matched value:
	 * 1 for a leading quote or apostrophe, 2 for a leading backslash (taken
	 * to be a backslash-escaped quote), 0 for anything else including the
	 * empty string.
	 *
	 * @param value raw text captured by one of the value patterns
	 * @return number of characters to strip from each end of value
	 */
	private static int leadingQuoteWidth(String value) {
		if (value.length() == 0) {
			return 0;
		}
		char first = value.charAt(0);
		if (first == '"' || first == '\'') {
			return 1;
		}
		if (first == '\\') {
			return 2;
		}
		return 0;
	}

	/**
	 * Rewrite URLs referenced from CSS text in page: first the quoted targets
	 * of {@code @import "..."} statements, then the arguments of
	 * {@code url(...)} expressions.
	 *
	 * @param page CSS text, modified in place
	 * @param uriConverter converter producing the replay URLs
	 * @param captureDate capture timestamp handed to uriConverter
	 * @param baseUrl URL against which relative references are resolved
	 */
	public static void markupCSSImports(StringBuilder page,
			ResultURIConverter uriConverter, String captureDate,
			String baseUrl) {
		markupTagREURIC(page,uriConverter,captureDate,baseUrl,cssImportNoUrlPattern);
		markupTagREURIC(page,uriConverter,captureDate,baseUrl,cssUrlPattern);
	}

	/**
	 * Rewrite every {@code url(...)} argument found inside a {@code style}
	 * attribute of any tag in page. Only the URL text itself is replaced;
	 * the quoting around the attribute and around the URL is left as found.
	 *
	 * <p>Attribute values shorter than MIN_ATTR_LENGTH, empty or malformed
	 * {@code url()} arguments (too short to hold both of their delimiters),
	 * and URLs starting with {@code ReplayParseContext.DATA_PREFIX} are left
	 * untouched.
	 *
	 * @param page HTML text, modified in place
	 * @param uriConverter converter producing the replay URLs
	 * @param captureDate capture timestamp handed to uriConverter
	 * @param baseUrl URL against which relative references are resolved
	 */
	public static void markupStyleUrls(StringBuilder page,
			ResultURIConverter uriConverter, String captureDate,
			String baseUrl) {

		Pattern stylePattern = getPattern(ANY_TAGNAME, STYLE_ATTR_NAME);
		Matcher matcher = stylePattern.matcher(page);

		int idx = 0;
		while (matcher.find(idx)) {
			String attrValue = matcher.group(1);
			int origAttrLength = attrValue.length();
			// offset within page of the first character of the style text
			int attrStart = matcher.start(1);
			int attrEnd = matcher.end(1);
			idx = attrEnd;
			if (origAttrLength < MIN_ATTR_LENGTH) {
				continue;
			}
			int attrQuote = leadingQuoteWidth(attrValue);
			if (origAttrLength < 2 * attrQuote) {
				// e.g. the raw value \ab: no room for two 2-char delimiters,
				// stripping would throw
				continue;
			}
			attrValue = attrValue.substring(attrQuote,
					origAttrLength - attrQuote);
			attrStart += attrQuote;

			// urlMatcher works on the immutable snapshot attrValue, so its
			// offsets stay valid; attrStart absorbs the shifts caused by
			// replacements already made in page.
			Matcher urlMatcher = cssUrlPattern.matcher(attrValue);
			int attrIdx = 0;
			while (urlMatcher.find(attrIdx)) {
				String url = urlMatcher.group(1);
				int origUrlLength = url.length();
				int urlStart = urlMatcher.start(1);
				attrIdx = urlMatcher.end(1);

				int urlQuote = leadingQuoteWidth(url);
				if (origUrlLength == 0 || origUrlLength < 2 * urlQuote) {
					// url() or a lone delimiter such as url("): nothing
					// that can safely be rewritten
					continue;
				}
				url = url.substring(urlQuote, origUrlLength - urlQuote);
				urlStart += urlQuote;

				int urlLength = url.length();
				if (url.startsWith(ReplayParseContext.DATA_PREFIX)) {
					continue;
				}
				String finalUrl = UrlOperations.resolveUrl(baseUrl,url);
				String replayUrl = uriConverter.makeReplayURI(captureDate, finalUrl);
				int delta = replayUrl.length() - urlLength;
				int replaceFrom = attrStart + urlStart;
				page.replace(replaceFrom, replaceFrom + urlLength, replayUrl);
				// keep both the outer scan position and the attribute base
				// offset aligned with the now longer/shorter page
				idx += delta;
				attrStart += delta;
			}
		}
	}

	/**
	 * Alter the HTML document in page, updating URLs in the attrName attributes
	 * of all tagName tags such that:
	 *
	 * 1) absolute URLs are prefixed with: wmPrefix + pageTS
	 * 2) server-relative URLs are prefixed with: wmPrefix + pageTS + (host of page)
	 * 3) path-relative URLs are prefixed with: wmPrefix + pageTS + (attribute URL
	 *    resolved against pageUrl)
	 *
	 * @param page HTML text, modified in place
	 * @param uriConverter converter producing the replay URLs
	 * @param captureDate capture timestamp handed to uriConverter
	 * @param baseUrl which must be absolute
	 * @param tagName regex fragment matching the tag name
	 * @param attrName regex fragment matching the attribute name
	 */
	public static void markupTagREURIC(StringBuilder page,
			ResultURIConverter uriConverter, String captureDate,
			String baseUrl, String tagName, String attrName) {

		Pattern tagPat = getPattern(tagName, attrName);
		markupTagREURIC(page,uriConverter,captureDate,baseUrl,tagPat);
	}

	/**
	 * Rewrite every URL captured in group 1 of pattern within page. The
	 * captured text, including any surrounding delimiters, is replaced by
	 * the replay URL wrapped in the delimiter detected from its first
	 * character (a value starting with a backslash is re-wrapped in
	 * backslash-quote).
	 *
	 * <p>Skipped without modification: captures shorter than
	 * MIN_ATTR_LENGTH, captures too short to hold both delimiters, captures
	 * that are empty once the delimiters are removed, short values that
	 * still begin with a quote after stripping, and URLs starting with
	 * {@code ReplayParseContext.DATA_PREFIX}.
	 *
	 * @param page text to rewrite, modified in place
	 * @param uriConverter converter producing the replay URLs
	 * @param captureDate capture timestamp handed to uriConverter
	 * @param baseUrl URL against which relative references are resolved
	 * @param pattern pattern whose group 1 captures the (possibly quoted) URL
	 */
	public static void markupTagREURIC(StringBuilder page,
			ResultURIConverter uriConverter, String captureDate,
			String baseUrl, Pattern pattern) {

		Matcher matcher = pattern.matcher(page);

		int idx = 0;
		while (matcher.find(idx)) {
			String url = matcher.group(1);
			int origUrlLength = url.length();
			int attrStart = matcher.start(1);
			int attrEnd = matcher.end(1);
			// default for every skip below: resume right after this capture
			idx = attrEnd;
			if (origUrlLength < MIN_ATTR_LENGTH) {
				continue;
			}

			String quote = "";
			char first = url.charAt(0);
			if (first == '"') {
				quote = "\"";
			} else if (first == '\'') {
				quote = "'";
			} else if (first == '\\') {
				quote = "\\\"";
			}
			int quoteWidth = quote.length();
			if (origUrlLength < 2 * quoteWidth) {
				// e.g. the raw value \ab: stripping two characters from
				// each end would throw
				continue;
			}
			url = url.substring(quoteWidth, origUrlLength - quoteWidth);
			if (url.length() == 0) {
				// e.g. \"\" : nothing left to rewrite
				continue;
			}

			// Additional border case, probably embedded string not actual url
			char inner = url.charAt(0);
			if ((inner == '\'' || inner == '"')
					&& url.length() <= MIN_ATTR_LENGTH) {
				continue;
			}

			if (url.startsWith(ReplayParseContext.DATA_PREFIX)) {
				continue;
			}

			String finalUrl = UrlOperations.resolveUrl(baseUrl,url);
			String replayUrl = quote
					+ uriConverter.makeReplayURI(captureDate, finalUrl) + quote;

			int delta = replayUrl.length() - origUrlLength;
			page.replace(attrStart, attrEnd, replayUrl);
			idx = attrEnd + delta;
		}
	}

	/**
	 * Remove one surrounding quote or apostrophe from each end of value.
	 * Values that do not start with a quote or apostrophe, or that are too
	 * short to hold both delimiters, are returned unchanged.
	 *
	 * @param value attribute value as captured, possibly quoted
	 * @return value without its surrounding quote characters
	 */
	private static String trimAttrValue(String value) {
		int length = value.length();
		if (length >= 2) {
			char first = value.charAt(0);
			if (first == '"' || first == '\'') {
				return value.substring(1, length - 1);
			}
		}
		return value;
	}

	/**
	 * find and return the ATTR value within a TAG tag inside the HTML document
	 * within the StringBuilder page. returns null if no TAG-ATTR is found.
	 *
	 * @param page HTML text to search
	 * @param tag regex fragment matching the tag name
	 * @param attr regex fragment matching the attribute name
	 * @return value of the first matching attribute with surrounding quotes
	 *         or apostrophes removed, or null if none is found.
	 */
	public static String getTagAttr(StringBuilder page, final String tag,
			final String attr) {

		Pattern daPattern = TagMagix.getPattern(tag, attr);
		Matcher matcher = daPattern.matcher(page);
		if (matcher.find(0)) {
			return trimAttrValue(matcher.group(1));
		}
		return null;
	}

	/**
	 * Search through the HTML contained in page, returning the value of a
	 * particular attribute. This version allows matching only tags that contain
	 * a particular attribute-value pair, which is useful in extracting META tag
	 * values, for example, in returning the value of the "content" attribute in
	 * a META tag that also contains an attribute "http-equiv" with a value of
	 * "Content-Type". All comparison is case-insensitive, but the value
	 * returned is the original attribute value, as unmolested as possible.
	 *
	 * If nothing matches, returns null. Note that the search stops at the
	 * first tag whose whereAttr equals whereVal: if that tag has no findAttr
	 * attribute, null is returned even if a later tag would match.
	 *
	 * @param page
	 *            StringBuilder holding HTML
	 * @param tag
	 *            String containing tagname of interest
	 * @param findAttr
	 *            name of attribute within the tag to return
	 * @param whereAttr
	 *            only match tags with an attribute whereAttr
	 * @param whereVal
	 *            only match tags with whereAttr having this value; a null
	 *            whereVal matches nothing
	 * @return the value of attribute attr in tag where the tag also contains an
	 *         attribute whereAttr, with value whereVal, or null if nothing
	 *         matches.
	 */
	public static String getTagAttrWhere(StringBuilder page, final String tag,
			final String findAttr, final String whereAttr, final String whereVal) {

		Pattern tagPattern = getWholeTagPattern(tag);
		Pattern findAttrPattern = getAttrPattern(findAttr);
		Pattern whereAttrPattern = getAttrPattern(whereAttr);
		Matcher tagMatcher = tagPattern.matcher(page);

		while (tagMatcher.find()) {
			String wholeTag = tagMatcher.group();
			Matcher whereAttrMatcher = whereAttrPattern.matcher(wholeTag);
			if (!whereAttrMatcher.find()) {
				continue;
			}
			String attrValue = trimAttrValue(whereAttrMatcher.group(1));
			// equalsIgnoreCase instead of compareToIgnoreCase(..) == 0:
			// same answer for non-null input, no NPE on a null whereVal
			if (attrValue.equalsIgnoreCase(whereVal)) {
				// this tag contains the right set, return the value for
				// the attribute findAttr:
				Matcher findAttrMatcher = findAttrPattern.matcher(wholeTag);
				if (findAttrMatcher.find()) {
					return trimAttrValue(findAttrMatcher.group(1));
				}
				return null;
			}
			// not the tag we want... maybe there is another: loop
		}

		return null;
	}

	/**
	 * find and return the href value within a BASE tag inside the HTML document
	 * within the StringBuilder page. returns null if no BASE-HREF is found.
	 *
	 * @param page HTML text to search
	 * @return URL of base-href within page, or null if none is found.
	 */
	public static String getBaseHref(StringBuilder page) {
		return getTagAttr(page, "BASE", "HREF");
	}

	/**
	 * Locate the first start tag named tag within page.
	 *
	 * @param page HTML text to search
	 * @param tag regex fragment matching the tag name
	 * @return offset just past the closing '&gt;' of the first matching start
	 *         tag, or -1 if page contains no such tag
	 */
	public static int getEndOfFirstTag(StringBuilder page, String tag) {
		Pattern tagPattern = getWholeTagPattern(tag);
		Matcher tagMatcher = tagPattern.matcher(page);
		if (tagMatcher.find()) {
			return tagMatcher.end();
		}
		return -1;
	}
}