package com.smartandroid.sa.tag.nodes; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.ArrayList; import java.util.List; import com.smartandroid.sa.tag.helper.StringUtil; import com.smartandroid.sa.tag.helper.Validate; import com.smartandroid.sa.tag.parser.Tag; import com.smartandroid.sa.tag.select.Elements; /** * A HTML Document. * * @author Jonathan Hedley, jonathan@hedley.net */ public class Document extends Element { private OutputSettings outputSettings = new OutputSettings(); private QuirksMode quirksMode = QuirksMode.noQuirks; private String location; /** * Create a new, empty Document. * * @param baseUri * base URI of document * @see org.SmartTag.Jsoup#parse * @see #createShell */ public Document(String baseUri) { super(Tag.valueOf("#root"), baseUri); this.location = baseUri; } /** * Create a valid, empty shell of a document, suitable for adding more * elements to. * * @param baseUri * baseUri of document * @return document with html, head, and body elements. */ static public Document createShell(String baseUri) { Validate.notNull(baseUri); Document doc = new Document(baseUri); Element html = doc.appendElement("html"); html.appendElement("head"); html.appendElement("body"); return doc; } /** * Get the URL this Document was parsed from. If the starting URL is a * redirect, this will return the final URL from which the document was * served from. * * @return location */ public String location() { return location; } /** * Accessor to the document's {@code head} element. * * @return {@code head} */ public Element head() { return findFirstElementByTagName("head", this); } /** * Accessor to the document's {@code body} element. * * @return {@code body} */ public Element body() { return findFirstElementByTagName("body", this); } /** * Get the string contents of the document's {@code title} element. * * @return Trimmed title, or empty string if none set. */ public String title() { // title is a preserve whitespace tag (for document output), but // normalised here Element titleEl = getElementsByTag("title").first(); return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()) .trim() : ""; } /** * Set the document's {@code title} element. Updates the existing element, * or adds {@code title} to {@code head} if not present * * @param title * string to set as title */ public void title(String title) { Validate.notNull(title); Element titleEl = getElementsByTag("title").first(); if (titleEl == null) { // add to head head().appendElement("title").text(title); } else { titleEl.text(title); } } /** * Create a new Element, with this document's base uri. Does not make the * new element a child of this document. * * @param tagName * element tag name (e.g. {@code a}) * @return new element */ public Element createElement(String tagName) { return new Element(Tag.valueOf(tagName), this.baseUri()); } /** * Normalise the document. This happens after the parse phase so generally * does not need to be called. Moves any text content that is not in the * body element into the body. * * @return this document after normalisation */ public Document normalise() { Element htmlEl = findFirstElementByTagName("html", this); if (htmlEl == null) htmlEl = appendElement("html"); if (head() == null) htmlEl.prependElement("head"); if (body() == null) htmlEl.appendElement("body"); // pull text nodes out of root, html, and head els, and push into body. // non-text nodes are already taken care // of. do in inverse order to maintain text order. normaliseTextNodes(head()); normaliseTextNodes(htmlEl); normaliseTextNodes(this); normaliseStructure("head", htmlEl); normaliseStructure("body", htmlEl); return this; } // does not recurse. private void normaliseTextNodes(Element element) { List<Node> toMove = new ArrayList<Node>(); for (Node node : element.childNodes) { if (node instanceof TextNode) { TextNode tn = (TextNode) node; if (!tn.isBlank()) toMove.add(tn); } } for (int i = toMove.size() - 1; i >= 0; i--) { Node node = toMove.get(i); element.removeChild(node); body().prependChild(new TextNode(" ", "")); body().prependChild(node); } } // merge multiple <head> or <body> contents into one, delete the remainder, // and ensure they are owned by <html> private void normaliseStructure(String tag, Element htmlEl) { Elements elements = this.getElementsByTag(tag); Element master = elements.first(); // will always be available as // created above if not existent if (elements.size() > 1) { // dupes, move contents to master List<Node> toMove = new ArrayList<Node>(); for (int i = 1; i < elements.size(); i++) { Node dupe = elements.get(i); for (Node node : dupe.childNodes) toMove.add(node); dupe.remove(); } for (Node dupe : toMove) master.appendChild(dupe); } // ensure parented by <html> if (!master.parent().equals(htmlEl)) { htmlEl.appendChild(master); // includes remove() } } // fast method to get first by tag name, used for html, head, body finders private Element findFirstElementByTagName(String tag, Node node) { if (node.nodeName().equals(tag)) return (Element) node; else { for (Node child : node.childNodes) { Element found = findFirstElementByTagName(tag, child); if (found != null) return found; } } return null; } @Override public String outerHtml() { return super.html(); // no outer wrapper tag } /** * Set the text of the {@code body} of this document. Any existing nodes * within the body will be cleared. * * @param text * unencoded text * @return this document */ @Override public Element text(String text) { body().text(text); // overridden to not nuke doc structure return this; } @Override public String nodeName() { return "#document"; } @Override public Document clone() { Document clone = (Document) super.clone(); clone.outputSettings = this.outputSettings.clone(); return clone; } /** * A Document's output settings control the form of the text() and html() * methods. */ public static class OutputSettings implements Cloneable { /** * The output serialization syntax. */ public enum Syntax { html, xml } private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; private Charset charset = Charset.forName("UTF-8"); private CharsetEncoder charsetEncoder = charset.newEncoder(); private boolean prettyPrint = true; private boolean outline = false; private int indentAmount = 1; private Syntax syntax = Syntax.html; public OutputSettings() { } /** * Get the document's current HTML escape mode: <code>base</code>, which * provides a limited set of named HTML entities and escapes other * characters as numbered entities for maximum compatibility; or * <code>extended</code>, which uses the complete set of HTML named * entities. * <p> * The default escape mode is <code>base</code>. * * @return the document's current escape mode */ public Entities.EscapeMode escapeMode() { return escapeMode; } /** * Set the document's escape mode, which determines how characters are * escaped when the output character set does not support a given * character:- using either a named or a numbered escape. * * @param escapeMode * the new escape mode to use * @return the document's output settings, for chaining */ public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { this.escapeMode = escapeMode; return this; } /** * Get the document's current output charset, which is used to control * which characters are escaped when generating HTML (via the * <code>html()</code> methods), and which are kept intact. * <p> * Where possible (when parsing from a URL or File), the document's * output charset is automatically set to the input charset. Otherwise, * it defaults to UTF-8. * * @return the document's current charset. */ public Charset charset() { return charset; } /** * Update the document's output charset. * * @param charset * the new charset to use. * @return the document's output settings, for chaining */ public OutputSettings charset(Charset charset) { // todo: this should probably update the doc's meta charset this.charset = charset; charsetEncoder = charset.newEncoder(); return this; } /** * Update the document's output charset. * * @param charset * the new charset (by name) to use. * @return the document's output settings, for chaining */ public OutputSettings charset(String charset) { charset(Charset.forName(charset)); return this; } CharsetEncoder encoder() { return charsetEncoder; } /** * Get the document's current output syntax. * * @return current syntax */ public Syntax syntax() { return syntax; } /** * Set the document's output syntax. Either {@code html}, with empty * tags and boolean attributes (etc), or {@code xml}, with self-closing * tags. * * @param syntax * serialization syntax * @return the document's output settings, for chaining */ public OutputSettings syntax(Syntax syntax) { this.syntax = syntax; return this; } /** * Get if pretty printing is enabled. Default is true. If disabled, the * HTML output methods will not re-format the output, and the output * will generally look like the input. * * @return if pretty printing is enabled. */ public boolean prettyPrint() { return prettyPrint; } /** * Enable or disable pretty printing. * * @param pretty * new pretty print setting * @return this, for chaining */ public OutputSettings prettyPrint(boolean pretty) { prettyPrint = pretty; return this; } /** * Get if outline mode is enabled. Default is false. If enabled, the * HTML output methods will consider all tags as block. * * @return if outline mode is enabled. */ public boolean outline() { return outline; } /** * Enable or disable HTML outline mode. * * @param outlineMode * new outline setting * @return this, for chaining */ public OutputSettings outline(boolean outlineMode) { outline = outlineMode; return this; } /** * Get the current tag indent amount, used when pretty printing. * * @return the current indent amount */ public int indentAmount() { return indentAmount; } /** * Set the indent amount for pretty printing * * @param indentAmount * number of spaces to use for indenting each level. Must be * >= 0. * @return this, for chaining */ public OutputSettings indentAmount(int indentAmount) { Validate.isTrue(indentAmount >= 0); this.indentAmount = indentAmount; return this; } @Override public OutputSettings clone() { OutputSettings clone; try { clone = (OutputSettings) super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.charset(charset.name()); // new charset and charset encoder clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); // indentAmount, prettyPrint are primitives so object.clone() will // handle return clone; } } /** * Get the document's current output settings. * * @return the document's current output settings. */ public OutputSettings outputSettings() { return outputSettings; } /** * Set the document's output settings. * * @param outputSettings * new output settings. * @return this document, for chaining. */ public Document outputSettings(OutputSettings outputSettings) { Validate.notNull(outputSettings); this.outputSettings = outputSettings; return this; } public enum QuirksMode { noQuirks, quirks, limitedQuirks; } public QuirksMode quirksMode() { return quirksMode; } public Document quirksMode(QuirksMode quirksMode) { this.quirksMode = quirksMode; return this; } @Override public boolean equals(Object o) { return super.equals(o); } }