package com.smartandroid.sa.tag.safety;
import com.smartandroid.sa.tag.helper.Validate;
import com.smartandroid.sa.tag.nodes.Attribute;
import com.smartandroid.sa.tag.nodes.Attributes;
import com.smartandroid.sa.tag.nodes.Document;
import com.smartandroid.sa.tag.nodes.Element;
import com.smartandroid.sa.tag.nodes.Node;
import com.smartandroid.sa.tag.nodes.TextNode;
import com.smartandroid.sa.tag.parser.Tag;
import com.smartandroid.sa.tag.select.NodeTraversor;
import com.smartandroid.sa.tag.select.NodeVisitor;
/**
* The whitelist based HTML cleaner. Use to ensure that end-user provided HTML
* contains only the elements and attributes that you are expecting; no junk,
* and no cross-site scripting attacks!
* <p/>
* The HTML cleaner parses the input as HTML and then runs it through a
* white-list, so the output HTML can only contain HTML that is allowed by the
* whitelist.
* <p/>
* It is assumed that the input HTML is a body fragment; the clean methods only
* pull from the source's body, and the canned white-lists only allow body
* contained tags.
* <p/>
* Rather than interacting directly with a Cleaner object, generally see the
* {@code clean} methods in {@link org.SmartTag.Jsoup}.
*/
public class Cleaner {
    /** Whitelist consulted for every tag and attribute while copying nodes. */
    private final Whitelist whitelist;

    /**
     * Create a new cleaner, that sanitizes documents using the supplied
     * whitelist.
     *
     * @param whitelist
     *            white-list to clean with; must not be null
     */
    public Cleaner(Whitelist whitelist) {
        Validate.notNull(whitelist);
        this.whitelist = whitelist;
    }

    /**
     * Creates a new, clean document, from the original dirty document,
     * containing only elements allowed by the whitelist. The original document
     * is not modified. Only elements from the dirty document's
     * <code>body</code> are used.
     *
     * @param dirtyDocument
     *            Untrusted base document to clean; must not be null.
     * @return cleaned document. If the dirty document has no body (e.g. a
     *         frameset document), the returned document has an empty body.
     */
    public Document clean(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);
        Document clean = Document.createShell(dirtyDocument.baseUri());
        Element dirtyBody = dirtyDocument.body();
        // Frameset documents won't have a body; the clean doc then keeps its
        // empty body.
        if (dirtyBody != null) {
            copySafeNodes(dirtyBody, clean.body());
        }
        return clean;
    }

    /**
     * Determines if the input document is valid, against the whitelist. It is
     * considered valid if all the tags and attributes in the input HTML are
     * allowed by the whitelist.
     * <p/>
     * This method can be used as a validator for user input forms. An invalid
     * document will still be cleaned successfully using the
     * {@link #clean(Document)} method. If using as a validator, it is
     * recommended to still clean the document to ensure enforced attributes are
     * set correctly, and that the output is tidied.
     * <p/>
     * A document without a body (e.g. a frameset document) is reported as
     * invalid: {@link #clean(Document)} retains nothing from such a document,
     * so the validator fails closed instead of handing a null body to the
     * traversal.
     *
     * @param dirtyDocument
     *            document to test; must not be null
     * @return true if no tags or attributes need to be removed; false if they
     *         do, or if the document has no body
     */
    public boolean isValid(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);
        Element dirtyBody = dirtyDocument.body();
        if (dirtyBody == null) {
            // Mirror the guard in clean(): never traverse a missing body.
            return false;
        }
        Document clean = Document.createShell(dirtyDocument.baseUri());
        int numDiscarded = copySafeNodes(dirtyBody, clean.body());
        return numDiscarded == 0;
    }

    /**
     * Iterates the input and copies trusted nodes (tags, attributes, text) into
     * the destination, counting everything that was left out.
     */
    private final class CleaningVisitor implements NodeVisitor {
        /** Number of elements, attributes and other nodes that were not copied. */
        private int numDiscarded = 0;
        /** Node the traversal started from; never counted as discarded. */
        private final Element root;
        /** Current element to append nodes to. */
        private Element destination;

        private CleaningVisitor(Element root, Element destination) {
            this.root = root;
            this.destination = destination;
        }

        /**
         * Called when a source node is entered.
         * <ul>
         * <li>An element with a whitelisted tag is copied (safe attributes
         * only), appended to the current destination, and becomes the new
         * destination.</li>
         * <li>Any other element is not copied and, unless it is the root, is
         * counted as discarded. The destination is left unchanged, so nodes
         * visited afterwards are still appended to it.</li>
         * <li>A text node is copied as a new text node.</li>
         * <li>Every other node type is counted as discarded.</li>
         * </ul>
         *
         * @param source
         *            node being entered
         * @param depth
         *            traversal depth supplied by the traversor (unused)
         */
        public void head(Node source, int depth) {
            if (source instanceof Element) {
                Element sourceEl = (Element) source;
                if (whitelist.isSafeTag(sourceEl.tagName())) {
                    // Safe: clone the element and copy its safe attributes.
                    ElementMeta meta = createSafeElement(sourceEl);
                    Element destChild = meta.el;
                    destination.appendChild(destChild);
                    numDiscarded += meta.numAttribsDiscarded;
                    // Descend: later nodes are appended inside the copy.
                    destination = destChild;
                } else if (source != root) {
                    // Not a safe tag, so don't add. The root is not counted
                    // against the discarded total.
                    numDiscarded++;
                }
            } else if (source instanceof TextNode) {
                TextNode sourceText = (TextNode) source;
                TextNode destText = new TextNode(sourceText.getWholeText(),
                        source.baseUri());
                destination.appendChild(destText);
            } else {
                // Comments, XML processing instructions, etc. are dropped.
                numDiscarded++;
            }
        }

        /**
         * Called when a source node is left. If head descended into a copy of
         * this element (i.e. its tag is whitelisted), step the destination back
         * up to the copy's parent.
         *
         * @param source
         *            node being left
         * @param depth
         *            traversal depth supplied by the traversor (unused)
         */
        public void tail(Node source, int depth) {
            if (source instanceof Element
                    && whitelist.isSafeTag(source.nodeName())) {
                // Would have descended in head, so pop the destination stack.
                destination = destination.parent();
            }
        }
    }

    /**
     * Traverses {@code source} with a {@link CleaningVisitor}, appending the
     * whitelisted content to {@code dest}.
     *
     * @param source
     *            element to start the traversal from
     * @param dest
     *            element that receives the safe copies
     * @return number of elements, attributes and other nodes that were
     *         discarded
     */
    private int copySafeNodes(Element source, Element dest) {
        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
        NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
        traversor.traverse(source);
        return cleaningVisitor.numDiscarded;
    }

    /**
     * Builds a new element with the same tag and base URI as
     * {@code sourceEl}, carrying only the attributes the whitelist accepts,
     * plus any attributes the whitelist enforces for that tag.
     *
     * @param sourceEl
     *            element to copy
     * @return the new element together with the number of source attributes
     *         that were not copied
     */
    private ElementMeta createSafeElement(Element sourceEl) {
        String sourceTag = sourceEl.tagName();
        Attributes destAttrs = new Attributes();
        Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(),
                destAttrs);
        int numDiscarded = 0;

        Attributes sourceAttrs = sourceEl.attributes();
        for (Attribute sourceAttr : sourceAttrs) {
            if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) {
                destAttrs.put(sourceAttr);
            } else {
                numDiscarded++;
            }
        }
        // Enforced attributes are added after the copied ones.
        Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
        destAttrs.addAll(enforcedAttrs);

        return new ElementMeta(dest, numDiscarded);
    }

    /** Pairs a safe element copy with the count of attributes dropped from it. */
    private static class ElementMeta {
        final Element el;
        final int numAttribsDiscarded;

        ElementMeta(Element el, int numAttribsDiscarded) {
            this.el = el;
            this.numAttribsDiscarded = numAttribsDiscarded;
        }
    }
}