package com.grendelscan.commons.html;
import java.util.ArrayList;
import org.apache.commons.lang.StringEscapeUtils;
import org.cobra_grendel.html.domimpl.NodeImpl;
import org.w3c.dom.Text;
import org.w3c.dom.html2.HTMLAnchorElement;
import org.w3c.dom.html2.HTMLAppletElement;
import org.w3c.dom.html2.HTMLBaseElement;
import org.w3c.dom.html2.HTMLBodyElement;
import org.w3c.dom.html2.HTMLButtonElement;
import org.w3c.dom.html2.HTMLDocument;
import org.w3c.dom.html2.HTMLFormElement;
import org.w3c.dom.html2.HTMLFrameElement;
import org.w3c.dom.html2.HTMLFrameSetElement;
import org.w3c.dom.html2.HTMLHeadElement;
import org.w3c.dom.html2.HTMLHtmlElement;
import org.w3c.dom.html2.HTMLIFrameElement;
import org.w3c.dom.html2.HTMLImageElement;
import org.w3c.dom.html2.HTMLInputElement;
import org.w3c.dom.html2.HTMLIsIndexElement;
import org.w3c.dom.html2.HTMLLinkElement;
import org.w3c.dom.html2.HTMLMetaElement;
import org.w3c.dom.html2.HTMLObjectElement;
import org.w3c.dom.html2.HTMLOptGroupElement;
import org.w3c.dom.html2.HTMLOptionElement;
import org.w3c.dom.html2.HTMLParamElement;
import org.w3c.dom.html2.HTMLScriptElement;
import org.w3c.dom.html2.HTMLSelectElement;
import org.w3c.dom.html2.HTMLTextAreaElement;
/**
 * Static helpers for pruning parsed HTML node trees, recognising DOM event handler attribute names and
 * building small HTML fragments.
 */
public class HtmlUtils
{
    /** Names accepted by {@link #isStandardDOMEvent(String)}; compared case-insensitively. */
    private static final String[] STANDARD_DOM_EVENTS = {
        "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmousemove",
        "onmouseout", "onkeypress", "onkeydown", "onkeyup", "onload", "onunload",
        "onabort", "onerror", "onresize", "onscroll", "onselect", "onchange",
        "onsubmit", "onreset", "onfocus", "onblur", "ondomfocusin", "ondomfocusout",
        "ondomactivate", "onsubtreemodified", "onnodeinserted", "onnoderemoved", "ondomnoderemovedfromdocument",
        "ondomnodeinsertedintodocument", "onattrmodified", "oncharacterdatamodified"
    };

    /**
     * Names accepted by {@link #isMicrosoftOnlyDOMEvent(String)}; compared case-insensitively. Note that
     * "ondragenter", "ondragover" and "oncontextmenu" are also present in the Mozilla table, so the two
     * tables are not mutually exclusive.
     */
    private static final String[] MICROSOFT_ONLY_DOM_EVENTS = {
        "oncut", "oncopy", "onpaste", "onbeforecut", "onbeforecopy", "onbeforepaste",
        "onafterupdate", "onbeforeupdate", "oncellchange", "ondataavailable", "ondatasetchanged",
        "ondatasetcomplete", "onerrorupdate", "onrowenter", "onrowexit", "onrowsdelete",
        "onrowinserted", "oncontextmenu", "ondrag", "ondragstart", "ondragenter",
        "ondragover", "ondragleave", "ondragend", "ondrop", "onselectstart", "onhelp",
        "onbeforeunload", "onstop", "onbeforeeditfocus", "onstart", "onfinish", "onbounce",
        "onbeforeprint", "onafterprint", "onpropertychange", "onfilterchange", "onreadystatechange",
        "onlosecapture"
    };

    /** Names accepted by {@link #isMozillaOnlyDOMEvent(String)}; compared case-insensitively. */
    private static final String[] MOZILLA_ONLY_DOM_EVENTS = {
        "DOMMouseScroll", "ondragdrop", "ondragenter", "ondragexit", "ondraggesture", "ondragover",
        "onclose", "oncommand", "oninput", "DOMMenuItemActive", "DOMMenuItemInactive",
        "oncontextmenu", "onoverflow", "onoverflowchanged", "onunderflow", "onpopuphidden",
        "onpopuphiding", "onpopupshowing", "onpopupshown", "onbroadcast", "oncommandupdate",
        "DOMContentLoaded"
    };

    /**
     * Clears the document reference of every node in the tree (via {@code clearDocument(false)}) and removes
     * every descendant except the following: a, applet, base, body, button, document, form, frame, frameset,
     * head, html, iframe, img, input, isindex, link, meta, object, optgroup, option, param, script, select
     * and textarea. Text nodes whose parent is a textarea are also kept, since they are used as input.<br>
     * <br>
     * A removed element is replaced, in place, by its own (already cleaned) children, so retained descendants
     * of a removed element survive. A node whose child list ends up empty has that list set to
     * {@code null}.<br>
     * <br>
     * The idea is that the retained tags are the only ones that cause some kind of request to the server.
     * There are two exceptions: style tags can cause files to be requested, but style tags aren't parsed by
     * Grendel-Scan anyway; event handlers are another big exception.<br>
     * <br>
     * This is primarily to let the ByHtmlElementCategorizer strip out unnecessary DOM elements, which saves a
     * ton of memory. One result is that ByHtmlElement tests cannot interact with some tags. If this is
     * necessary, they can run as a ByMimeType test and run through the entire DOM.<br>
     * <br>
     * Only the raw child lists and the document reference are modified; parent references are not touched by
     * this method.
     *
     * @param node
     *            root of the tree to prune; modified in place and must not be {@code null}
     */
    public static void CleanElement(final NodeImpl node)
    {
        pruneTree(node, false);
    }

    /**
     * Escapes the given text for inclusion in HTML by delegating to
     * {@link StringEscapeUtils#escapeHtml(String)}.
     *
     * @param text
     *            the text to escape
     * @return whatever {@code StringEscapeUtils.escapeHtml} returns for {@code text}
     */
    public static String escapeHTML(final String text)
    {
        return StringEscapeUtils.escapeHtml(text);
    }

    /**
     * Tells whether the name is recognised by any of {@link #isStandardDOMEvent(String)},
     * {@link #isMicrosoftOnlyDOMEvent(String)} or {@link #isMozillaOnlyDOMEvent(String)}.
     *
     * @param name
     *            the attribute or event name to test; may be {@code null}
     * @return {@code true} if one of the three checks accepts the name, {@code false} otherwise (including
     *         for {@code null})
     */
    public static boolean isDOMEvent(final String name)
    {
        return isStandardDOMEvent(name) || isMicrosoftOnlyDOMEvent(name) || isMozillaOnlyDOMEvent(name);
    }

    /**
     * Tells whether the name is in the Microsoft-specific event table, ignoring case.
     *
     * @param name
     *            the attribute or event name to test; may be {@code null}
     * @return {@code true} on a case-insensitive match, {@code false} otherwise (including for {@code null})
     */
    public static boolean isMicrosoftOnlyDOMEvent(final String name)
    {
        return matchesAnyIgnoreCase(MICROSOFT_ONLY_DOM_EVENTS, name);
    }

    /**
     * Tells whether the name is in the Mozilla-specific event table, ignoring case.
     *
     * @param name
     *            the attribute or event name to test; may be {@code null}
     * @return {@code true} on a case-insensitive match, {@code false} otherwise (including for {@code null})
     */
    public static boolean isMozillaOnlyDOMEvent(final String name)
    {
        return matchesAnyIgnoreCase(MOZILLA_ONLY_DOM_EVENTS, name);
    }

    /**
     * Tells whether the name is in the standard DOM event table, ignoring case.
     *
     * @param name
     *            the attribute or event name to test; may be {@code null}
     * @return {@code true} on a case-insensitive match, {@code false} otherwise (including for {@code null})
     */
    public static boolean isStandardDOMEvent(final String name)
    {
        return matchesAnyIgnoreCase(STANDARD_DOM_EVENTS, name);
    }

    /**
     * Builds an anchor tag that uses the URL itself as the link text.
     *
     * @param url
     *            the link target
     * @return the anchor markup, as produced by {@link #makeLink(String, String)} with no text
     */
    public static String makeLink(final String url)
    {
        return makeLink(url, null);
    }

    /**
     * Builds an anchor tag that opens the URL in a new window ({@code target="_blank"}).
     *
     * @param url
     *            the link target, inserted verbatim into the {@code href} attribute
     * @param text
     *            the link text, inserted verbatim; when {@code null} or empty the URL is used instead
     * @return the anchor markup
     */
    public static String makeLink(final String url, String text)
    {
        final String label = (text == null || text.length() == 0) ? url : text;
        // NOTE(review): neither url nor label is HTML-escaped here. Escaping was deliberately not added
        // because callers may pass pre-escaped values or markup as the label; confirm against the call
        // sites before passing untrusted data to this method.
        return "<a target=\"_blank\" href=\"" + url + "\">" + label + "</a>";
    }

    /**
     * Tells whether a node should be dropped by {@link #StripAllChildrenButInput(NodeImpl)}: anything that is
     * not a button, input, optgroup, option, select or textarea element, and not a text node whose parent is
     * a textarea.
     */
    private static boolean nonInputElement(final NodeImpl node)
    {
        final boolean formControl = node instanceof HTMLButtonElement
                || node instanceof HTMLInputElement
                || node instanceof HTMLOptGroupElement
                || node instanceof HTMLOptionElement
                || node instanceof HTMLSelectElement
                || node instanceof HTMLTextAreaElement;
        return !(formControl || isTextAreaContent(node));
    }

    /**
     * Same tree walk as {@link #CleanElement(NodeImpl)}, but the only descendants kept are button, input,
     * optgroup, option, select and textarea elements, plus text nodes whose parent is a textarea. Removed
     * elements are replaced in place by their own (already stripped) children, and a child list that ends up
     * empty is set to {@code null}.
     *
     * @param node
     *            root of the tree to prune; modified in place and must not be {@code null}
     */
    public static void StripAllChildrenButInput(final NodeImpl node)
    {
        pruneTree(node, true);
    }

    /**
     * Detaches a single node: clears its document reference (via {@code clearDocument(false)}), clears its
     * parent and sets its child list to {@code null}. Unlike the other strip methods this does not recurse.
     *
     * @param node
     *            the node to detach; must not be {@code null}
     */
    public static void StripAllFamily(final NodeImpl node)
    {
        node.clearDocument(false);
        node.clearParent();
        node.setRawNodeList(null);
    }

    /**
     * Tells whether a node should be dropped by {@link #CleanElement(NodeImpl)}: anything that is not one of
     * the element types listed in that method's documentation, and not a text node whose parent is a
     * textarea.
     */
    private static boolean unnecessaryElement(final NodeImpl node)
    {
        final boolean retainedElement = node instanceof HTMLAnchorElement
                || node instanceof HTMLAppletElement
                || node instanceof HTMLBaseElement
                || node instanceof HTMLBodyElement
                || node instanceof HTMLButtonElement
                || node instanceof HTMLDocument
                || node instanceof HTMLFormElement
                || node instanceof HTMLFrameElement
                || node instanceof HTMLFrameSetElement
                || node instanceof HTMLHeadElement
                || node instanceof HTMLHtmlElement
                || node instanceof HTMLIFrameElement
                || node instanceof HTMLImageElement
                || node instanceof HTMLInputElement
                || node instanceof HTMLIsIndexElement
                || node instanceof HTMLLinkElement
                || node instanceof HTMLMetaElement
                || node instanceof HTMLObjectElement
                || node instanceof HTMLOptGroupElement
                || node instanceof HTMLOptionElement
                || node instanceof HTMLParamElement
                || node instanceof HTMLScriptElement
                || node instanceof HTMLSelectElement
                || node instanceof HTMLTextAreaElement;
        return !(retainedElement || isTextAreaContent(node));
    }

    /** Tells whether the node is a text node whose parent is a textarea element. */
    private static boolean isTextAreaContent(final NodeImpl node)
    {
        return node instanceof Text && node.getParentNode() instanceof HTMLTextAreaElement;
    }

    /** Tells whether {@code name} equals, ignoring case, any entry of {@code candidates}; {@code null} never matches. */
    private static boolean matchesAnyIgnoreCase(final String[] candidates, final String name)
    {
        if (name == null)
        {
            return false;
        }
        for (String candidate : candidates)
        {
            if (candidate.equalsIgnoreCase(name))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Shared depth-first pruning walk behind {@link #CleanElement(NodeImpl)} and
     * {@link #StripAllChildrenButInput(NodeImpl)}.
     *
     * @param node
     *            root of the subtree to prune
     * @param inputElementsOnly
     *            {@code true} to drop children using {@link #nonInputElement(NodeImpl)}, {@code false} to
     *            drop them using {@link #unnecessaryElement(NodeImpl)}
     */
    private static void pruneTree(final NodeImpl node, final boolean inputElementsOnly)
    {
        node.clearDocument(false);
        if (node.getRawNodeList() == null)
        {
            return;
        }

        // Walk a snapshot: the live child list is edited while we iterate.
        final ArrayList<NodeImpl> snapshot = new ArrayList<NodeImpl>(node.getRawNodeList());
        for (NodeImpl child : snapshot)
        {
            // Children are pruned first so that anything hoisted below has already been cleaned.
            pruneTree(child, inputElementsOnly);

            final boolean discard = inputElementsOnly ? nonInputElement(child) : unnecessaryElement(child);
            if (!discard)
            {
                continue;
            }

            // Replace the discarded child by its surviving children, preserving document order.
            final int position = node.getRawNodeList().indexOf(child);
            node.getRawNodeList().remove(position);
            if (child.getRawNodeList() != null)
            {
                node.getRawNodeList().addAll(position, child.getRawNodeList());
            }
        }

        // If there are no more child nodes, kill the node list
        if (node.getRawNodeList().isEmpty())
        {
            node.setRawNodeList(null);
        }
    }
}