package com.smartandroid.sa.tag.parser;
import java.util.List;
import com.smartandroid.sa.tag.nodes.Document;
import com.smartandroid.sa.tag.nodes.Element;
import com.smartandroid.sa.tag.nodes.Node;
/**
* Parses HTML into a {@link Document}. It is generally best to use one of the
* more convenient parse methods in {@link org.SmartTag.Jsoup}.
*/
public class Parser {
    /** Default error limit; 0 means parse error tracking is disabled. */
    private static final int DEFAULT_MAX_ERRORS = 0;

    private TreeBuilder treeBuilder;
    private int maxErrors = DEFAULT_MAX_ERRORS;
    private ParseErrorList errors;

    /**
     * Create a new Parser, using the specified TreeBuilder
     *
     * @param treeBuilder
     *            TreeBuilder to use to parse input into Documents.
     */
    public Parser(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
        // Seed with a non-tracking list so getErrors() never hands back null
        // when it is called before the first parseInput().
        this.errors = ParseErrorList.noTracking();
    }

    /**
     * Parse HTML into a Document using the currently configured TreeBuilder.
     * The error list returned by {@link #getErrors()} is replaced on every
     * call: a tracking list limited to the configured maximum when error
     * tracking is enabled, otherwise a non-tracking list.
     *
     * @param html
     *            HTML to parse
     * @param baseUri
     *            base URI of document (i.e. original fetch location), for
     *            resolving relative URLs.
     * @return parsed Document
     */
    public Document parseInput(String html, String baseUri) {
        errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors)
                : ParseErrorList.noTracking();
        return treeBuilder.parse(html, baseUri, errors);
    }

    // gets & sets

    /**
     * Get the TreeBuilder currently in use.
     *
     * @return current TreeBuilder.
     */
    public TreeBuilder getTreeBuilder() {
        return treeBuilder;
    }

    /**
     * Update the TreeBuilder used when parsing content.
     *
     * @param treeBuilder
     *            current TreeBuilder
     * @return this, for chaining
     */
    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
        return this;
    }

    /**
     * Check if parse error tracking is enabled, i.e. whether the configured
     * maximum number of errors is greater than zero.
     *
     * @return current track error state.
     */
    public boolean isTrackErrors() {
        return maxErrors > 0;
    }

    /**
     * Enable or disable parse error tracking for the next parse.
     *
     * @param maxErrors
     *            the maximum number of errors to track. Set to 0 to disable.
     * @return this, for chaining
     */
    public Parser setTrackErrors(int maxErrors) {
        this.maxErrors = maxErrors;
        return this;
    }

    /**
     * Retrieve the parse errors, if any, from the last parse.
     *
     * @return list of parse errors, up to the size of the maximum errors
     *         tracked. Never {@code null}: before the first call to
     *         {@link #parseInput(String, String)} this is the non-tracking
     *         list created in the constructor.
     */
    public List<ParseError> getErrors() {
        return errors;
    }

    // static parse functions below

    /**
     * Parse HTML into a Document. Uses a fresh {@link HtmlTreeBuilder} and
     * does not track parse errors.
     *
     * @param html
     *            HTML to parse
     * @param baseUri
     *            base URI of document (i.e. original fetch location), for
     *            resolving relative URLs.
     *
     * @return parsed Document
     */
    public static Document parse(String html, String baseUri) {
        TreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking());
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if
     * supplied, supplies parsing context.
     *
     * @param fragmentHtml
     *            the fragment of HTML to parse
     * @param context
     *            (optional) the element that this HTML fragment is being parsed
     *            for (i.e. for inner HTML). This provides stack context (for
     *            implicit element creation).
     * @param baseUri
     *            base URI of document (i.e. original fetch location), for
     *            resolving relative URLs.
     *
     * @return list of nodes parsed from the input HTML. Note that the context
     *         element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml,
            Element context, String baseUri) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentHtml, context, baseUri,
                ParseErrorList.noTracking());
    }

    /**
     * Parse a fragment of XML into a list of nodes.
     *
     * @param fragmentXml
     *            the fragment of XML to parse
     * @param baseUri
     *            base URI of document (i.e. original fetch location), for
     *            resolving relative URLs.
     * @return list of nodes parsed from the input XML.
     */
    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentXml, baseUri,
                ParseErrorList.noTracking());
    }

    /**
     * Parse a fragment of HTML into the {@code body} of a Document.
     *
     * @param bodyHtml
     *            fragment of HTML
     * @param baseUri
     *            base URI of document (i.e. original fetch location), for
     *            resolving relative URLs.
     *
     * @return Document, with empty head, and HTML parsed into body
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        Document doc = Document.createShell(baseUri);
        Element body = doc.body();
        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
        // Iterate over an array snapshot: the node list gets modified when
        // its nodes are re-parented into the body.
        Node[] nodes = nodeList.toArray(new Node[nodeList.size()]);
        for (Node node : nodes) {
            body.appendChild(node);
        }
        return doc;
    }

    /**
     * Utility method to unescape HTML entities from a string
     *
     * @param string
     *            HTML escaped string
     * @param inAttribute
     *            if the string is to be unescaped in strict mode (as
     *            attributes are)
     * @return an unescaped string
     */
    public static String unescapeEntities(String string, boolean inAttribute) {
        Tokeniser tokeniser = new Tokeniser(new CharacterReader(string),
                ParseErrorList.noTracking());
        return tokeniser.unescapeEntities(inAttribute);
    }

    /**
     * Parse HTML into a Document. This simply delegates to
     * {@link #parse(String, String)}.
     *
     * @param bodyHtml
     *            HTML to parse
     * @param baseUri
     *            baseUri base URI of document (i.e. original fetch location),
     *            for resolving relative URLs.
     *
     * @return parsed Document
     * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment}
     *             instead.
     */
    @Deprecated
    public static Document parseBodyFragmentRelaxed(String bodyHtml,
            String baseUri) {
        return parse(bodyHtml, baseUri);
    }

    // builders

    /**
     * Create a new HTML parser. This parser treats input as HTML5, and enforces
     * the creation of a normalised document, based on a knowledge of the
     * semantics of the incoming tags.
     *
     * @return a new HTML parser.
     */
    public static Parser htmlParser() {
        return new Parser(new HtmlTreeBuilder());
    }

    /**
     * Create a new XML parser. This parser assumes no knowledge of the incoming
     * tags and does not treat it as HTML, rather creates a simple tree directly
     * from the input.
     *
     * @return a new simple XML parser.
     */
    public static Parser xmlParser() {
        return new Parser(new XmlTreeBuilder());
    }
}