/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.archivalurl;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import javax.servlet.ServletException;
import org.archive.wayback.accesscontrol.oracleclient.CustomPolicyOracleFilter;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.replay.html.StringTransformer;
import org.archive.wayback.replay.html.transformer.BlockCSSStringTransformer;
import org.archive.wayback.replay.html.transformer.InlineCSSStringTransformer;
import org.archive.wayback.replay.html.transformer.JSStringTransformer;
import org.archive.wayback.replay.html.transformer.MetaRefreshUrlStringTransformer;
import org.archive.wayback.replay.html.transformer.URLStringTransformer;
import org.archive.wayback.util.htmllex.NodeUtils;
import org.archive.wayback.util.htmllex.ParseContext;
import org.archive.wayback.util.htmllex.ParseEventHandler;
import org.htmlparser.Node;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
/**
* Lean and mean ParseEventHandler implementing current best-known server-side
* HTML rewrite rules, and should be much faster than the fully configurable
* version.
*
* @author brad
*
*/
public class FastArchivalUrlReplayParseEventHandler implements
ParseEventHandler {
/**
 * Context-data key stored once the disclaimer JSP insert has been handled
 * for the document being replayed.
 */
public static final String FERRET_DONE_KEY =
        FastArchivalUrlReplayParseEventHandler.class.toString();

/** Context-data key present while the parser is between HEAD open and close. */
protected static final String FERRET_IN_HEAD = "FERRET_IN_HEAD";

/** Context-data key stored after the head-insert JSP has been rendered. */
protected static final String FERRET_HEAD_INSERTED = "FERRET_HEAD_INSERTED";

private static final String FRAMESET_TAG = "FRAMESET";
private static final String BODY_TAG = "BODY";

/** Rewrites FRAME sources using the "fw_" flag. */
private static URLStringTransformer framesetUrlTrans =
        new URLStringTransformer("fw_");

/** Rewrites stylesheet links using the "cs_" flag. */
private static URLStringTransformer cssUrlTrans =
        new URLStringTransformer("cs_");

/** Rewrites script sources using the "js_" flag. */
private static URLStringTransformer jsUrlTrans =
        new URLStringTransformer("js_");

/** Rewrites image-like URLs using the "im_" flag. */
private static URLStringTransformer imageUrlTrans =
        new URLStringTransformer("im_");

/** JSP rendered as the disclaimer insert at the first body-level tag. */
private String jspInsertPath = "/WEB-INF/replay/DisclaimChooser.jsp";

/** JSP rendered when parsing completes; skipped when null. */
private String endJsp = "/WEB-INF/replay/ArchiveComment.jsp";

/** JSP rendered when parsing starts; skipped when null. */
private String startJsp;

/** JSP rendered around the HEAD tag; no head insert is produced when null. */
protected String headInsertJsp;

/** Tag names that do not trigger the disclaimer JSP insert. */
private final String[] okHeadTags = {
        "![CDATA[*", "![CDATA[", "?", "!DOCTYPE", "HTML", "HEAD", "BASE",
        "LINK", "META", "TITLE", "STYLE", "SCRIPT" , "BGSOUND"
};

/** Lookup view of {@link #okHeadTags}; populated by the constructor. */
private HashMap<String, Object> okHeadTagMap;

private BlockCSSStringTransformer cssBlockTrans =
        new BlockCSSStringTransformer();
private InlineCSSStringTransformer cssInlineTrans =
        new InlineCSSStringTransformer();
private StringTransformer jsBlockTrans = new JSStringTransformer();
private MetaRefreshUrlStringTransformer metaRefreshTrans =
        new MetaRefreshUrlStringTransformer();

/**
 * Default URL rewriter; the constructor and setJsBlockTrans() hand it the
 * current jsBlockTrans as its JS transformer.
 */
private URLStringTransformer anchorUrlTrans = new URLStringTransformer();
/**
 * Builds the lookup map of tags that do not trigger the disclaimer insert
 * and hands the JS block transformer to the anchor URL transformer.
 */
public FastArchivalUrlReplayParseEventHandler() {
    HashMap<String, Object> headTags =
            new HashMap<String, Object>(okHeadTags.length);
    for (int i = 0; i < okHeadTags.length; i++) {
        // only the key matters; the map is used as a set
        headTags.put(okHeadTags[i], null);
    }
    okHeadTagMap = headTags;
    anchorUrlTrans.setJsTransformer(jsBlockTrans);
}
// TODO: This should all be refactored up into an abstract base class with
// default no-op methods, allowing a subclass to only override the ones they
// want...
/**
 * Dispatches one parsed node to the matching rewrite routine and writes the
 * result to the replay output.
 *
 * @param pContext parse context; must be a ReplayParseContext
 * @param node the remark, text or tag node to process
 * @throws IOException if writing the node fails
 * @throws IllegalArgumentException if the node is none of the three
 *         supported kinds
 */
public void handleNode(ParseContext pContext, Node node)
        throws IOException {
    ReplayParseContext context = (ReplayParseContext) pContext;

    if (NodeUtils.isRemarkNode(node)) {
        // comment bodies are passed through the JS block transformer
        RemarkNode remark = (RemarkNode) node;
        String rewritten = jsBlockTrans.transform(context, remark.getText());
        remark.setText(rewritten);
        emit(context, null, node, null);
        return;
    }

    if (NodeUtils.isTextNode(node)) {
        TextNode text = (TextNode) node;
        if (context.isInCSS()) {
            handleCSSTextNode(context, text);
        } else if (context.isInScriptText()) {
            handleJSTextNode(context, text);
        } else {
            // plain content text is written unchanged
            emit(context, null, text, null);
        }
        return;
    }

    if (!NodeUtils.isTagNode(node)) {
        throw new IllegalArgumentException("Unknown node type..");
    }

    TagNode tag = (TagNode) node;
    if (NodeUtils.isOpenTagNodeNamed(tag, NodeUtils.SCRIPT_TAG_NAME)) {
        handleJSIncludeNode(context, tag);
        return;
    }
    if (!tag.isEndTag()) {
        // assume start, possibly empty:
        handleOpenTagNode(context, tag);
        return;
    }

    // closing tag: leaving HEAD clears the in-head marker
    if (tag.getTagName().equals("HEAD")) {
        context.putData(FERRET_IN_HEAD, null);
    }
    if (checkAllowTag(pContext, tag)) {
        emit(context, null, tag, null);
    }
}
/**
 * Rewrites the text of a node found inside a CSS block and writes it out.
 *
 * @param context the current replay parse context
 * @param textNode the CSS text node; its text is replaced in place
 * @throws IOException if writing the node fails
 */
private void handleCSSTextNode(ReplayParseContext context, TextNode textNode) throws IOException {
    String rewrittenCss = cssBlockTrans.transform(context, textNode.getText());
    textNode.setText(rewrittenCss);
    emit(context, null, textNode, null);
}
/**
 * Rewrites the text of a node found inside a script block and writes it out.
 *
 * @param context the current replay parse context
 * @param textNode the script text node; its text is replaced in place
 * @throws IOException if writing the node fails
 */
private void handleJSTextNode(ReplayParseContext context, TextNode textNode) throws IOException {
    String rewrittenJs = jsBlockTrans.transform(context, textNode.getText());
    textNode.setText(rewrittenJs);
    emit(context, null, textNode, null);
}
/**
 * Rewrites the SRC attribute of an opening SCRIPT tag, if present, and
 * writes the tag out.
 *
 * @param context the current replay parse context
 * @param tagNode the opening SCRIPT tag
 * @throws IOException if writing the tag fails
 */
private void handleJSIncludeNode(ReplayParseContext context, TagNode tagNode) throws IOException {
    String src = tagNode.getAttribute("SRC");
    if (src != null) {
        // TODO: This is hacky.. fix it
        // The JS block transformer is consulted only to learn whether the
        // include should be skipped (e.g. by a custom rule): a null or
        // empty answer blanks the source. The actual rewriting is always
        // done by the js_ URL transformer.
        String probe = jsBlockTrans.transform(context, src);
        boolean keepSource = (probe != null) && !probe.isEmpty();
        String toRewrite = keepSource ? src : "";
        tagNode.setAttribute("SRC", jsUrlTrans.transform(context, toRewrite));
    }
    emit(context, null, tagNode, null);
}
/**
 * Processes an opening (or empty) tag: emits the head insert and the
 * disclaimer JSP insert when due, rewrites the tag's URL-bearing
 * attributes, and writes the tag out.
 *
 * @param context the current replay parse context
 * @param tagNode the opening tag to process
 * @throws IOException if writing to the replay output fails
 */
private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode)
        throws IOException {
    boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null;
    String preEmit = null;
    String postEmit = null;
    String tagName = tagNode.getTagName();

    boolean alreadyInsertedHead =
            (context.getData(FERRET_HEAD_INSERTED) != null);
    if (!alreadyInsertedHead) {
        // At a <head> tag with no insert yet: write the tag, then the
        // insert right AFTER it, and skip all further processing.
        if (tagName.equals("HEAD")) {
            emitHeadInsert(context, tagNode, true);
            context.putData(FERRET_IN_HEAD, FERRET_IN_HEAD);
            return;
        }
        // At any other tag except <html> / <!DOCTYPE> (including <body>):
        // write the insert right BEFORE the tag, then carry on with the
        // default processing of the tag itself.
        if (!tagName.equals("HTML") && !tagName.equals("!DOCTYPE")) {
            emitHeadInsert(context, null, false);
        }
    }

    boolean inHead = (context.getData(FERRET_IN_HEAD) != null);

    // Time to insert the JSP header? Never inside css, script or head.
    if (!insertedJsp && !context.isInCSS() && !context.isInScriptText()
            && !inHead) {
        if (!okHeadTagMap.containsKey(tagName)) {
            // don't put the insert in framesets
            if (!tagName.equals(FRAMESET_TAG) && jspInsertPath != null) {
                String tmp = null;
                try {
                    tmp = context.getJspExec().jspToString(jspInsertPath);
                } catch (ServletException e) {
                    e.printStackTrace();
                }
                if (tagName.equals(BODY_TAG)) {
                    // insert it *after* the current tag
                    postEmit = tmp;
                } else {
                    // a node that belongs in the body: insert *before* it
                    preEmit = tmp;
                }
            }
            context.putData(FERRET_DONE_KEY, "");
        }
    }

    if (!rewriteTagSpecificAttributes(context, tagNode, tagName)) {
        // The tag itself is suppressed, but an insert rendered above has
        // already been marked as done; write it so it is not lost.
        if (preEmit != null || postEmit != null) {
            emit(context, preEmit, null, postEmit);
        }
        return;
    }

    // now, for *all* tags...
    transformAttr(context, tagNode, "BACKGROUND", imageUrlTrans);
    transformAttr(context, tagNode, "STYLE", cssInlineTrans);
    transformAttr(context, tagNode, "onclick", jsBlockTrans);
    emit(context, preEmit, tagNode, postEmit);
}

/**
 * Applies the attribute rewrites that depend on the tag name.
 *
 * @param context the current replay parse context
 * @param tagNode the opening tag whose attributes are rewritten in place
 * @param tagName the tag's name as returned by getTagName()
 * @return false if the tag has no specific rewrite rule and
 *         checkAllowTag() rejects it; true otherwise
 */
private boolean rewriteTagSpecificAttributes(ReplayParseContext context,
        TagNode tagNode, String tagName) {
    if (tagName.equals("A")) {
        transformAttr(context, tagNode, "HREF", anchorUrlTrans);
    } else if (tagName.equals("APPLET")) {
        transformAttr(context, tagNode, "CODEBASE", anchorUrlTrans);
        transformAttr(context, tagNode, "ARCHIVE", anchorUrlTrans);
    } else if (tagName.equals("AREA")) {
        transformAttr(context, tagNode, "HREF", anchorUrlTrans);
    } else if (tagName.equals("BASE")) {
        String orig = tagNode.getAttribute("HREF");
        if (orig != null) {
            try {
                context.setBaseUrl(new URL(orig));
                transformAttr(context, tagNode, "HREF", anchorUrlTrans);
            } catch (MalformedURLException e) {
                // an unparsable base is left untouched
                e.printStackTrace();
            }
        }
    } else if (tagName.equals("EMBED")) {
        transformAttr(context, tagNode, "SRC", anchorUrlTrans);
    } else if (tagName.equals("IFRAME")) {
        transformAttr(context, tagNode, "SRC", anchorUrlTrans);
    } else if (tagName.equals("IMG")) {
        transformAttr(context, tagNode, "SRC", imageUrlTrans);
    } else if (tagName.equals("INPUT")) {
        transformAttr(context, tagNode, "SRC", imageUrlTrans);
    } else if (tagName.equals("FORM")) {
        transformAttr(context, tagNode, "ACTION", anchorUrlTrans);
    } else if (tagName.equals("FRAME")) {
        transformAttr(context, tagNode, "SRC", framesetUrlTrans);
    } else if (tagName.equals("LINK")) {
        if (transformAttrWhere(context, tagNode, "REL", "STYLESHEET",
                "HREF", cssUrlTrans)) {
            // no-op
        } else if (transformAttrWhere(context, tagNode, "REL",
                "SHORTCUT ICON", "HREF", imageUrlTrans)) {
            // no-op
        } else {
            transformAttr(context, tagNode, "HREF", anchorUrlTrans);
        }
    } else if (tagName.equals("META")) {
        transformAttrWhere(context, tagNode, "HTTP-EQUIV", "REFRESH",
                "CONTENT", metaRefreshTrans);
        transformAttr(context, tagNode, "URL", anchorUrlTrans);
    } else if (tagName.equals("OBJECT")) {
        transformAttr(context, tagNode, "CODEBASE", anchorUrlTrans);
        // OBJECT carries its resource URL in the "data" attribute.
        transformAttr(context, tagNode, "DATA", anchorUrlTrans);
        // Legacy lookup, kept so existing behavior is a strict superset.
        transformAttr(context, tagNode, "CDATA", anchorUrlTrans);
    } else if (tagName.equals("SCRIPT")) {
        transformAttr(context, tagNode, "SRC", jsUrlTrans);
    } else {
        return checkAllowTag(context, tagNode);
    }
    return true;
}
/**
 * Decides whether a tag may be written to the output. Only NOSCRIPT tags
 * are ever rejected, and only when the oracle policies for the capture
 * contain "force-noscript"; the tag is then dropped so that its contents
 * are included directly.
 *
 * @param context the current parse context
 * @param tagNode the tag being considered
 * @return false if the tag must be skipped, true otherwise
 */
protected boolean checkAllowTag(ParseContext context, TagNode tagNode)
{
    if (!tagNode.getTagName().equals("NOSCRIPT")) {
        return true;
    }
    String policies = getOraclePolicies(context);
    boolean forceNoScript =
            (policies != null) && policies.contains("force-noscript");
    return !forceNoScript;
}
/**
 * Looks up the oracle policy string stored in the parse context under
 * CustomPolicyOracleFilter.CAPTURE_ORACLE_POLICY.
 *
 * @param context the current parse context
 * @return the stored value, as returned by the context
 */
protected String getOraclePolicies(ParseContext context)
{
    String policies =
            context.getData(CustomPolicyOracleFilter.CAPTURE_ORACLE_POLICY);
    return policies;
}
//IK: changed private to protected
/**
 * Writes an optional prefix, an optional node and an optional suffix to the
 * context's output stream, encoded in the context's output charset. Does
 * nothing when the context has no output stream.
 *
 * @param context the current replay parse context
 * @param pre text written before the node, or null
 * @param node node written via toHtml(true), or null
 * @param post text written after the node, or null
 * @throws IOException if encoding or writing fails
 */
protected void emit(ReplayParseContext context, String pre, Node node,
        String post) throws IOException {
    OutputStream out = context.getOutputStream();
    if (out == null) {
        return;
    }
    String charset = context.getOutputCharset();
    if (pre != null) {
        byte[] prefix = pre.getBytes(charset);
        out.write(prefix);
    }
    if (node != null) {
        byte[] html = node.toHtml(true).getBytes(charset);
        out.write(html);
    }
    if (post != null) {
        byte[] suffix = post.getBytes(charset);
        out.write(suffix);
    }
}
/**
 * Transforms one attribute of a TagNode, provided the tag has a value for
 * that attribute AND another named attribute of the tag has a specific
 * value.
 *
 * @param context the ReplayParseContext
 * @param node the TagNode to be updated
 * @param attrName name of the attribute whose value gates the update
 * @param attrVal value the gating attribute must have, compared ignoring
 *        case
 * @param modAttr name of the attribute to rewrite
 * @param transformer the StringTransformer responsible for creating the
 *        new value based on the old one
 * @return true if the attribute was updated
 */
private boolean transformAttrWhere(ReplayParseContext context, TagNode node,
        String attrName, String attrVal, String modAttr,
        StringTransformer transformer) {
    String val = node.getAttribute(attrName);
    // equalsIgnoreCase is locale-independent, whereas toUpperCase() follows
    // the JVM default locale (under a Turkish locale "icon" upper-cases
    // with a dotted capital I and would never match "ICON").
    if (val == null || !val.equalsIgnoreCase(attrVal)) {
        return false;
    }
    return transformAttr(context, node, modAttr, transformer);
}
/**
 * Rewrites a single attribute of a TagNode, but only if the tag actually
 * carries that attribute.
 *
 * @param context the ReplayParseContext being transformed
 * @param node the TagNode to update in place
 * @param attr name of the attribute to rewrite
 * @param transformer the StringTransformer that produces the new value
 *        from the current one
 * @return true if the attribute was present and has been replaced
 */
private boolean transformAttr(ReplayParseContext context, TagNode node,
        String attr, StringTransformer transformer) {
    String current = node.getAttribute(attr);
    if (current == null) {
        return false;
    }
    String rewritten = transformer.transform(context, current);
    node.setAttribute(attr, rewritten);
    return true;
}
/**
 * Renders the end JSP, if one is configured, and appends its output to the
 * replayed document.
 *
 * @param pContext parse context; must be a ReplayParseContext
 * @throws IOException if encoding or writing the JSP output fails
 */
public void handleParseComplete(ParseContext pContext) throws IOException {
    if (endJsp != null) {
        writeJspOutput((ReplayParseContext) pContext, endJsp);
    }
}

/**
 * Renders the start JSP, if one is configured, and writes its output ahead
 * of the replayed document.
 *
 * @param pContext parse context; must be a ReplayParseContext
 * @throws IOException if encoding or writing the JSP output fails
 */
public void handleParseStart(ParseContext pContext) throws IOException {
    if (startJsp != null) {
        writeJspOutput((ReplayParseContext) pContext, startJsp);
    }
}

/**
 * Renders a JSP to a string and writes it to the context's output stream
 * in the context's output charset. A ServletException raised while
 * rendering is reported on stderr and nothing is written. Nothing is
 * written either when the context has no output stream, matching emit().
 *
 * @param context the current replay parse context
 * @param jspPath path of the JSP to render
 * @throws IOException if rendering, encoding or writing fails
 */
private void writeJspOutput(ReplayParseContext context, String jspPath)
        throws IOException {
    OutputStream out = context.getOutputStream();
    String rendered = null;
    try {
        rendered = context.getJspExec().jspToString(jspPath);
    } catch (ServletException e) {
        e.printStackTrace();
    }
    if (rendered != null && out != null) {
        String charset = context.getOutputCharset();
        out.write(rendered.getBytes(charset));
    }
}
/**
 * @return path of the JSP rendered as the disclaimer insert, or null if
 *         no insert is configured
 */
public String getJspInsertPath() {
    return this.jspInsertPath;
}
/**
 * @param jspInsertPath path of the JSP to render as the disclaimer insert;
 *        null disables the insert
 */
public void setJspInsertPath(final String jspInsertPath) {
    this.jspInsertPath = jspInsertPath;
}
/**
 * Legacy alias for {@link #getEndJsp()}.
 *
 * @return the path of the JSP rendered at the end of the document
 * @deprecated use getEndJsp()
 */
@Deprecated
public String getCommentJsp() {
    return getEndJsp();
}
/**
 * Legacy alias for {@link #setEndJsp(String)}.
 *
 * @param commentJsp the path of the JSP to render at the end of the
 *        document
 * @deprecated use setEndJsp()
 */
@Deprecated
public void setCommentJsp(String commentJsp) {
    setEndJsp(commentJsp);
}
/**
 * Misspelled accessor, kept so existing callers keep compiling.
 *
 * @return the path to the JSP to execute and include at the start of the
 *         document
 * @deprecated use getStartJsp()
 */
@Deprecated
public String getStartsp() {
    return startJsp;
}

/**
 * @return the path to the JSP to execute and include at the start of the
 *         document, or null if none is configured
 */
public String getStartJsp() {
    return startJsp;
}
/**
 * @param startJsp the path to the JSP to execute and include at the start
 *        of the document; null disables it
 */
public void setStartJsp(final String startJsp) {
    this.startJsp = startJsp;
}
/**
 * @return the path to the JSP executed and included at the end of the
 *         document, or null if none is configured
 */
public String getEndJsp() {
    return this.endJsp;
}
/**
 * @param endJsp the path to the JSP to execute and include at the end of
 *        the document; null disables it
 */
public void setEndJsp(final String endJsp) {
    this.endJsp = endJsp;
}
/**
 * @return the transformer applied to script text, comment text and
 *         onclick attributes
 */
public StringTransformer getJsBlockTrans() {
    return this.jsBlockTrans;
}
/**
 * Replaces the JS block transformer and hands the same instance to the
 * anchor URL transformer.
 *
 * @param jsBlockTrans the jsBlockTrans to set
 */
public void setJsBlockTrans(final StringTransformer jsBlockTrans) {
    this.jsBlockTrans = jsBlockTrans;
    this.anchorUrlTrans.setJsTransformer(jsBlockTrans);
}
/**
 * @return path of the JSP rendered as the head insert, or null if none is
 *         configured
 */
public String getHeadInsertJsp() {
    return this.headInsertJsp;
}
/**
 * @param headInsertJsp path of the JSP to render as the head insert; null
 *        disables the head insert
 */
public void setHeadInsertJsp(final String headInsertJsp) {
    this.headInsertJsp = headInsertJsp;
}
/**
 * Writes the given node together with the rendered head-insert JSP. When no
 * head-insert JSP is configured, only the node is written. When rendering
 * succeeds, FERRET_HEAD_INSERTED is recorded in the context; when it fails
 * with a ServletException, the error is printed and only the node is
 * written.
 *
 * @param context the current replay parse context
 * @param node the node to write alongside the insert, or null
 * @param postInsert true to write the insert after the node, false to
 *        write it before
 * @throws IOException if rendering or writing fails
 */
protected void emitHeadInsert(ReplayParseContext context, Node node, boolean postInsert)
        throws IOException {
    if (headInsertJsp == null) {
        this.emit(context, null, node, null);
        return;
    }

    String rendered = null;
    try {
        rendered = context.getJspExec().jspToString(headInsertJsp);
        context.putData(FERRET_HEAD_INSERTED, FERRET_HEAD_INSERTED);
    } catch (ServletException e) {
        e.printStackTrace();
    }

    String before = postInsert ? null : rendered;
    String after = postInsert ? rendered : null;
    this.emit(context, before, node, after);
}
}