package org.archive.wayback.archivalurl;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.replay.html.StringTransformer;
import org.archive.wayback.replay.html.rules.AttributeModifyingRule;
import org.archive.wayback.replay.html.transformer.InlineCSSStringTransformer;
import org.archive.wayback.replay.html.transformer.JSStringTransformer;
import org.archive.wayback.replay.html.transformer.MetaRefreshUrlStringTransformer;
import org.archive.wayback.replay.html.transformer.SrcsetStringTransformer;
import org.archive.wayback.replay.html.transformer.URLStringTransformer;
import org.htmlparser.nodes.TagNode;
/**
* Standard implementation of {@link AttributeRewriter}.
* <p>
* The {@code StandardAttributeRewriter} is an re-implementation of
* attribute-rewrite process that used to be embedded in
* {@link FastArchivalUrlReplayParseEventHandler}, under
* {@code AttributeRewriter} interface.
* </p>
* <p>
* It also offers simple configuration mechanism for adding/changing
* attribute rewrite rules through {@link Properties} object. New rules
* can be added easily through SpringFramework's support for {@code Properties}
* object. Well-known rewrite rules for attributes, such as {@code A/@HREF},
* {@code LINK/@HREF} and <code>*/@ONLOAD</code>, are provided by default and
* implemented with the same mechanism.
* Use {@link #setConfigProperties(Properties)} to add more rewrite rules.
* </p>
* <p>
* Just in case you need an entirely different set of rewrite rules, it is possible to
* disable default rule set by setting {@code true} to {@code defaultRulesDisabled}
* property.
* </p>
* <p>
* If you need to configure rewrite rules from different sources than {@link Properties}
* object, it'd be easiest to override {@link #loadRules()} method.
* </p>
* <p>
* Note: caller of this class may implement more complex rewrite process, and may
* by-pass this class for certain tags/attributes.
* </p>
* <p>
* While this class is similar to {@link AttributeModifyingRule} class in functionality,
* interface is more specific to attribute rewriting and it is easier to customize.
* </p>
* @see FastArchivalUrlReplayParseEventHandler
* @since 1.8.1
*/
public class StandardAttributeRewriter implements AttributeRewriter {
private static final Logger LOGGER = Logger.getLogger(StandardAttributeRewriter.class.getName());
private Map<String, StringTransformer> transformers;
private Map<String, TransformAttr> rules;
/**
* name of properties file with default rewrite rules.
* it should be in the same package as this class.
*/
public static final String CONFIG_PROPERTIES = "attribute-rewrite.properties";
private Properties configProperties;
private boolean defaultRulesDisabled = false;
private StringTransformer jsBlockTrans;
private boolean unescapeAttributeValues = true;
private Map<String, StringTransformer> customTransformers;
/**
* set StringTransformer for rewriting JavaScript attribute values.
* (event handler attributes and also {@code HREF="javascript:..."}.)
* Note changing this property after bean initialization is not supported.
* @param jsBlockTrans StringTransformer
*/
public void setJsBlockTrans(StringTransformer jsBlockTrans) {
this.jsBlockTrans = jsBlockTrans;
}
/**
* set {@link Properties} object with <em>additional</em> attribute rewrite rules.
* if you want this to replace entire rewrite rule, set {@code defaultRulesDisabled}
* to {@code true}.
* @param configProperties
*/
public void setConfigProperties(Properties configProperties) {
this.configProperties = configProperties;
}
/**
* set {@code true} if you want to disable loading default rewrite
* rules from in-class-path properties file.
* @param defaultRulesDisabled
*/
public void setDefaultRulesDisabled(boolean defaultRulesDisabled) {
this.defaultRulesDisabled = defaultRulesDisabled;
}
public boolean isDefaultRulesDisabled() {
return defaultRulesDisabled;
}
/**
* add more {@link StringTransformer}s.
* <p>Caveat: change has effect only before calling {@link #init()}.</p>
* @param customTransformers a Map with transformer name
* as keys and corresponding {@link StringTransformer}
* object as values.
*/
public void setCustomTransformers(
Map<String, StringTransformer> customTransformers) {
this.customTransformers = customTransformers;
}
/**
* set this property false if you want to disable unescaping
* (and corresponding re-escaping) of attribute values.
* <p>By default, HTML entities (such as <code>&</code>)
* in attribute values are unescaped before translation attempt,
* and then escaped back before writing out. Although this is
* supposedly the right thing to do, it has a side-effect: all
* bare "<code>&</code>" (not escaped as "<code>&</code>")
* will be replaced by "<code>&</code>". Setting this property
* to <code>false</code> disables it.</p>
* <p>As URL rewrite does neither parse nor modify query part, it
* should mostly work without unescaping. But there may be some
* corner cases where escaping is crucial. Don't set this to {@code false}
* unless it's absolutely necessary.</p>
* @param unescapeAttributeValues <code>false</code> to disable unescaping
*/
public void setUnescapeAttributeValues(boolean unescapeAttributeValues) {
this.unescapeAttributeValues = unescapeAttributeValues;
}
protected class TransformAttr implements Comparable<TransformAttr> {
public final String attrName;
private final StringTransformer transformer;
public TransformAttr alt;
public TransformAttr next;
public TransformAttr(String attrName, StringTransformer transformer) {
this.attrName = attrName;
this.transformer = transformer;
}
public boolean apply(ReplayParseContext context, TagNode tag) {
String orig = tag.getAttribute(attrName);
if (orig == null) return false;
// htmlparser does neither unescape HTML entities while it parses HTML, nor escape
// HTML special chars while writing HTML. So we take care of it for ourselves.
// ParseContext.resolve() used to unescape URL (but did not escape back). It no longer
// does in alignment with this change.
if (unescapeAttributeValues)
orig = StringEscapeUtils.unescapeHtml(orig);
String transformed = transformer.transform(context, orig);
if (unescapeAttributeValues)
transformed = StringEscapeUtils.escapeHtml(transformed);
tag.setAttribute(attrName, transformed);
return true;
}
protected int specificity() {
return 0;
}
@Override
public int compareTo(TransformAttr o) {
return specificity() - o.specificity();
}
}
protected class TransformAttrIfAttrValue extends TransformAttr {
private final String testAttrName;
private final String testValue;
public TransformAttrIfAttrValue(String attrName, StringTransformer transformer,
String testAttrName, String testAttrValue) {
super(attrName, transformer);
this.testAttrName = testAttrName;
this.testValue = testAttrValue;
}
@Override
public boolean apply(ReplayParseContext context, TagNode tag) {
String value = tag.getAttribute(testAttrName);
if (!testValue.equalsIgnoreCase(value)) return false;
return super.apply(context, tag);
}
@Override
protected int specificity() {
return 1;
}
}
public void init() throws IOException {
initTransformers();
rules = new HashMap<String, TransformAttr>();
loadRules();
}
/**
* Initialize {@code StringTransformer}s, register them to
* {@code transformers} map by their name.
*/
protected void initTransformers() {
if (jsBlockTrans == null)
jsBlockTrans = new JSStringTransformer();
URLStringTransformer anchorTrans = new URLStringTransformer();
anchorTrans.setJsTransformer(jsBlockTrans);
transformers = new HashMap<String, StringTransformer>();
transformers.put("fw", new URLStringTransformer("fw_"));
transformers.put("if", new URLStringTransformer("if_"));
transformers.put("cs", new URLStringTransformer("cs_"));
transformers.put("js", new URLStringTransformer("js_"));
transformers.put("im", new URLStringTransformer("im_"));
transformers.put("oe", new URLStringTransformer("oe_"));
transformers.put("an", anchorTrans);
transformers.put("jb", jsBlockTrans);
transformers.put("ci", new InlineCSSStringTransformer());
transformers.put("mt", new MetaRefreshUrlStringTransformer());
transformers.put("ss", new SrcsetStringTransformer());
if (customTransformers != null) {
transformers.putAll(customTransformers);
customTransformers = null;
}
}
protected void loadRules() throws IOException {
// first load default rewrite rules from in-class-path resource.
// make it a fatal error if load fails.
if (!defaultRulesDisabled) {
Properties defaultRules = new Properties();
InputStream is = getClass().getResourceAsStream(CONFIG_PROPERTIES);
defaultRules.load(is);
loadRulesFromProperties(defaultRules);
}
if (configProperties != null) {
loadRulesFromProperties(configProperties);
} else {
if (defaultRulesDisabled) {
LOGGER.warning("No attribute rewrite rule is configured"
+ " (defaultRulesDisabeld==true, and configProperties==null");
}
}
}
/**
* Regular expression for parsing tag name part of rule keys.
*/
protected static final String RE_TAG = "\\p{Alpha}[-\\p{Alnum}]*";
protected static final String RE_ATTR = RE_TAG;
protected static final Pattern RE_TAG_ATTR_TYPE =
Pattern.compile("(" + RE_TAG + "|\\*)(?:\\[(" + RE_ATTR + ")=([^]]*)\\])?\\.(" + RE_ATTR + ")\\.type");
/**
* build rewrite rule set from textual description in {@code p}.
* rule set is a collection of a sub-tree for each element, stored in {@code rules}
* Map. {@code rules} maps tag name to a tree of {@code TransformAttr}. {@code TransformAttr}
* tree is in essence a linked list of {@code TransformAttr} grouped by attribute name. In each
* group, {@code TransformAttr}s are sorted by their <em>specificity</em>. Within the same
* specificity, the {@code TransformAttr} added last comes on top (so that rules added through
* {@code Properties} override default rules.). As the order of keys in a single {@code Properties}
* is unspecified, there's no guarantee later line overrides former lines with the same key.
* @param p Properties with rewrite rules.
*/
protected void loadRulesFromProperties(Properties p) {
for (String key : p.stringPropertyNames()) {
Matcher m = RE_TAG_ATTR_TYPE.matcher(key);
if (m.matches()) {
String tagName = m.group(1);
String testAttrName = m.group(2);
String testAttrValue = m.group(3);
String attrNameMatch = m.group(4);
String transformerName = p.getProperty(key);
StringTransformer transformer = transformers.get(transformerName);
if (transformer == null) {
LOGGER.warning("Unknown transformer name \"" +
transformerName + "\" for key \"" + key + "\"");
continue;
}
TransformAttr t;
if (testAttrName != null) {
t = new TransformAttrIfAttrValue(attrNameMatch,
transformer, testAttrName, testAttrValue);
} else {
t = new TransformAttr(attrNameMatch, transformer);
}
TransformAttr ta = rules.get(tagName);
TransformAttr pta = null;
while (true) {
if (ta == null) {
if (pta == null) {
rules.put(tagName, t);
} else {
pta.next = t;
}
break;
}
// TransformAttr for the same attrName are chained
// on "alt" link, sorted by specificity, descending.
if (ta.attrName.equals(t.attrName)) {
TransformAttr palt = null;
while (true) {
if (t.compareTo(ta) >= 0) {
t.alt = ta;
t.next = ta.next;
ta.next = null;
if (palt == null) {
if (pta == null) {
rules.put(tagName, t);
} else {
pta.next = t;
}
} else {
palt.next = t;
}
break;
}
palt = ta;
ta = ta.alt;
if (ta == null) {
palt.alt = t;
break;
}
}
break;
}
pta = ta;
ta = ta.next;
}
continue;
}
LOGGER.warning("unrecogized key syntax \"" + key + "\"");
}
}
@Override
public void rewrite(ReplayParseContext context, TagNode tag) {
String tagName = tag.getTagName();
TransformAttr ta = rules.get(tagName);
while (ta != null) {
for (TransformAttr attrta = ta; attrta != null; attrta = attrta.alt) {
if (attrta.apply(context, tag))
break;
}
ta = ta.next;
}
// TODO: should we check if anyElementRules is rewriting
// an attribute which has been rewritten by per-element rules?
ta = rules.get("*");
while (ta != null) {
for (TransformAttr attrta = ta; attrta != null; attrta = attrta.alt) {
if (attrta.apply(context, tag))
break;
}
ta = ta.next;
}
}
}