package com.grendelscan.commons.html;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.List;
import org.cobra_grendel.html.domimpl.HTMLHtmlElementImpl;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.html2.HTMLElement;
/**
* The W3C DOM binding does not contain a way to convert
* nodes into the corresponding HTML text. This class
* performs that function. Based on the sample DOM writer in
* Xerces by Andy Clark.
*
* @author Andy Clark, David Byrne
*/
public class HtmlNodeWriter
{
//
// Constants
//
// NOTE(review): none of the parser defaults or feature ids below is referenced
// by any method of this class. They appear to be carried over from the Xerces
// DOM writer sample this class is based on. They are protected, so confirm that
// no subclass relies on them before removing any of them.
//
// default settings
/** Default canonical output (false). */
protected static final boolean DEFAULT_CANONICAL = false;
/** Default dynamic validation support (false). */
protected static final boolean DEFAULT_DYNAMIC_VALIDATION = false;
/**
* Default generate synthetic schema annotations
* (false).
*/
protected static final boolean DEFAULT_GENERATE_SYNTHETIC_ANNOTATIONS = false;
/** Default honour all schema locations (false). */
protected static final boolean DEFAULT_HONOUR_ALL_SCHEMA_LOCATIONS = false;
/** Default load external DTD (true). */
protected static final boolean DEFAULT_LOAD_EXTERNAL_DTD = true;
/** Default namespaces support (true). */
protected static final boolean DEFAULT_NAMESPACES = true;
/** Default parser name. */
protected static final String DEFAULT_PARSER_NAME = "dom.wrappers.Xerces";
/** Default Schema full checking support (false). */
protected static final boolean DEFAULT_SCHEMA_FULL_CHECKING = false;
/** Default Schema validation support (false). */
protected static final boolean DEFAULT_SCHEMA_VALIDATION = false;
/** Default validate schema annotations (false). */
protected static final boolean DEFAULT_VALIDATE_ANNOTATIONS = false;
/** Default validation support (false). */
protected static final boolean DEFAULT_VALIDATION = false;
/** Default XInclude processing support (false). */
protected static final boolean DEFAULT_XINCLUDE = false;
/** Default XInclude fixup base URIs support (true). */
protected static final boolean DEFAULT_XINCLUDE_FIXUP_BASE_URIS = true;
/** Default XInclude fixup language support (true). */
protected static final boolean DEFAULT_XINCLUDE_FIXUP_LANGUAGE = true;
// feature ids
/**
* Dynamic validation feature id
* (http://apache.org/xml/features/validation/dynamic).
*/
protected static final String DYNAMIC_VALIDATION_FEATURE_ID = "http://apache.org/xml/features/validation/dynamic";
/**
* Generate synthetic schema annotations feature id
* (http://apache.org/xml/features/generate-synthetic-annotations).
*/
protected static final String GENERATE_SYNTHETIC_ANNOTATIONS_ID = "http://apache.org/xml/features/generate-synthetic-annotations";
/**
* Honour all schema locations feature id
* (http://apache.org/xml/features/honour-all-schemaLocations).
*/
protected static final String HONOUR_ALL_SCHEMA_LOCATIONS_ID = "http://apache.org/xml/features/honour-all-schemaLocations";
/**
* Load external DTD feature id
* (http://apache.org/xml/features/nonvalidating/load-external-dtd).
*/
protected static final String LOAD_EXTERNAL_DTD_FEATURE_ID = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
/**
* Namespaces feature id
* (http://xml.org/sax/features/namespaces).
*/
protected static final String NAMESPACES_FEATURE_ID = "http://xml.org/sax/features/namespaces";
/**
* Schema full checking feature id
* (http://apache.org/xml/features/validation/schema-full-checking).
*/
protected static final String SCHEMA_FULL_CHECKING_FEATURE_ID = "http://apache.org/xml/features/validation/schema-full-checking";
/**
* Schema validation feature id
* (http://apache.org/xml/features/validation/schema).
*/
protected static final String SCHEMA_VALIDATION_FEATURE_ID = "http://apache.org/xml/features/validation/schema";
/**
* Validate schema annotations feature id
* (http://apache.org/xml/features/validate-annotations).
*/
protected static final String VALIDATE_ANNOTATIONS_ID = "http://apache.org/xml/features/validate-annotations";
/**
* Validation feature id
* (http://xml.org/sax/features/validation).
*/
protected static final String VALIDATION_FEATURE_ID = "http://xml.org/sax/features/validation";
/**
* XInclude feature id
* (http://apache.org/xml/features/xinclude).
*/
protected static final String XINCLUDE_FEATURE_ID = "http://apache.org/xml/features/xinclude";
/**
* XInclude fixup base URIs feature id
* (http://apache.org/xml/features/xinclude/fixup-base-uris).
*/
protected static final String XINCLUDE_FIXUP_BASE_URIS_FEATURE_ID = "http://apache.org/xml/features/xinclude/fixup-base-uris";
/**
* XInclude fixup language feature id
* (http://apache.org/xml/features/xinclude/fixup-language).
*/
protected static final String XINCLUDE_FIXUP_LANGUAGE_FEATURE_ID = "http://apache.org/xml/features/xinclude/fixup-language";
//
// Public methods
//
/**
 * Serializes a DOM node to markup text.
 * <p>
 * For a document node an XML declaration is written first, followed (when
 * {@code processChildNodes} is set) by the doctype and the document element.
 * If the document element is not an {@code HTMLHtmlElementImpl}, every child
 * of the document is written instead so that as much of the DOM as possible
 * is salvaged. Elements are always written with an explicit closing tag, and
 * their attributes are written in name order.
 * <p>
 * Canonical output is not selectable through this method; comments, CDATA
 * markers and entity references are therefore always written verbatim.
 *
 * @param node
 *            the node to serialize; {@code null} yields an empty string
 * @param processChildNodes
 *            whether to recurse into child nodes. Attributes of a printed
 *            element are written regardless of this setting.
 * @param nodesToPrint
 *            the node implementation classes to emit, matched against
 *            {@code node.getClass()} with {@code List.contains}; {@code null}
 *            emits every node. The filter applies to doctype, element,
 *            CDATA, text, processing-instruction and comment nodes. A
 *            filtered-out element contributes no tags, but its children are
 *            still visited when {@code processChildNodes} is set.
 * @return the serialized markup, never {@code null}
 */
public static String write(Node node, boolean processChildNodes, List<Class<? extends HTMLElement>> nodesToPrint)
{
    StringBuilder out = new StringBuilder();
    // Non-canonical output, and XML 1.0 until a document node says otherwise.
    appendNode(out, node, processChildNodes, nodesToPrint, false, false);
    return out.toString();
} // write(Node,boolean,List)

/**
 * Appends the markup for {@code node} to {@code out}. The {@code xml11} flag
 * is passed down the recursion so that the version detected on the document
 * governs character escaping for the whole subtree.
 */
private static void appendNode(StringBuilder out, Node node, boolean processChildNodes,
        List<Class<? extends HTMLElement>> nodesToPrint, boolean canonical, boolean xml11)
{
    // is there anything to do?
    if (node == null)
    {
        return;
    }
    switch (node.getNodeType())
    {
        case Node.DOCUMENT_NODE:
        {
            Document document = (Document) node;
            boolean documentIsXml11 = "1.1".equals(getVersion(document));
            Node writtenDoctype = null;
            if (!canonical)
            {
                out.append(documentIsXml11
                        ? "<?xml version=\"1.1\" encoding=\"UTF-8\"?>"
                        : "<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                if (processChildNodes)
                {
                    writtenDoctype = document.getDoctype();
                    appendNode(out, writtenDoctype, true, nodesToPrint, canonical, documentIsXml11);
                }
            }
            if (processChildNodes)
            {
                Element root = document.getDocumentElement();
                if (root instanceof HTMLHtmlElementImpl)
                {
                    appendNode(out, root, true, nodesToPrint, canonical, documentIsXml11);
                }
                else
                {
                    // This is in case there is no HTML tag. Some of the DOM can still be salvaged.
                    for (Node child = document.getFirstChild(); child != null; child = child.getNextSibling())
                    {
                        // The doctype was already emitted above; do not print it twice.
                        if (child != writtenDoctype)
                        {
                            appendNode(out, child, true, nodesToPrint, canonical, documentIsXml11);
                        }
                    }
                }
            }
            break;
        }
        case Node.DOCUMENT_TYPE_NODE:
        {
            if (isSelected(node, nodesToPrint))
            {
                DocumentType doctype = (DocumentType) node;
                out.append("<!DOCTYPE ").append(doctype.getName());
                String publicId = doctype.getPublicId();
                String systemId = doctype.getSystemId();
                if (publicId != null)
                {
                    out.append(" PUBLIC '").append(publicId).append('\'');
                    // HTML doctypes may carry a public id without a system id;
                    // leave it out rather than printing the text 'null'.
                    if (systemId != null)
                    {
                        out.append(" '").append(systemId).append('\'');
                    }
                }
                else if (systemId != null)
                {
                    out.append(" SYSTEM '").append(systemId).append('\'');
                }
                String internalSubset = doctype.getInternalSubset();
                if (internalSubset != null)
                {
                    out.append(" [\n").append(internalSubset).append(']');
                }
                out.append(">\n");
            }
            break;
        }
        case Node.ATTRIBUTE_NODE:
        {
            out.append(' ').append(node.getNodeName()).append("=\"");
            out.append(normalizeAndPrint(node.getNodeValue(), true, canonical, xml11));
            out.append('"');
            break;
        }
        case Node.ELEMENT_NODE:
        {
            boolean printTags = isSelected(node, nodesToPrint);
            if (printTags)
            {
                out.append('<').append(node.getNodeName());
                Attr[] attrs = sortAttributes(node.getAttributes());
                for (int i = 0; i < attrs.length; i++)
                {
                    appendNode(out, attrs[i], processChildNodes, nodesToPrint, canonical, xml11);
                }
                out.append('>');
            }
            if (processChildNodes)
            {
                appendChildren(out, node, nodesToPrint, canonical, xml11);
            }
            if (printTags)
            {
                out.append("</").append(node.getNodeName()).append('>');
            }
            break;
        }
        case Node.ENTITY_REFERENCE_NODE:
        {
            if (canonical)
            {
                // Canonical form expands the reference into its content.
                if (processChildNodes)
                {
                    appendChildren(out, node, nodesToPrint, canonical, xml11);
                }
            }
            else
            {
                out.append('&').append(node.getNodeName()).append(';');
            }
            break;
        }
        case Node.CDATA_SECTION_NODE:
        {
            if (isSelected(node, nodesToPrint))
            {
                if (canonical)
                {
                    out.append(normalizeAndPrint(node.getNodeValue(), false, canonical, xml11));
                }
                else
                {
                    out.append("<![CDATA[").append(node.getNodeValue()).append("]]>");
                }
            }
            break;
        }
        case Node.TEXT_NODE:
        {
            if (isSelected(node, nodesToPrint))
            {
                out.append(normalizeAndPrint(node.getNodeValue(), false, canonical, xml11));
            }
            break;
        }
        case Node.PROCESSING_INSTRUCTION_NODE:
        {
            if (isSelected(node, nodesToPrint))
            {
                out.append("<?").append(node.getNodeName());
                String data = node.getNodeValue();
                if ((data != null) && (data.length() > 0))
                {
                    out.append(' ').append(data);
                }
                out.append("?>");
            }
            break;
        }
        case Node.COMMENT_NODE:
        {
            if (!canonical && isSelected(node, nodesToPrint))
            {
                out.append("<!--");
                String comment = node.getNodeValue();
                if ((comment != null) && (comment.length() > 0))
                {
                    out.append(comment);
                }
                out.append("-->");
            }
            break;
        }
        default:
        {
            // Other node types produce no output.
            break;
        }
    }
}

/** Appends every child of {@code parent}, recursing into each child's own subtree. */
private static void appendChildren(StringBuilder out, Node parent,
        List<Class<? extends HTMLElement>> nodesToPrint, boolean canonical, boolean xml11)
{
    for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling())
    {
        appendNode(out, child, true, nodesToPrint, canonical, xml11);
    }
}

/** Returns whether {@code node} passes the {@code nodesToPrint} filter ({@code null} accepts every node). */
private static boolean isSelected(Node node, List<Class<? extends HTMLElement>> nodesToPrint)
{
    return nodesToPrint == null || nodesToPrint.contains(node.getClass());
}
/**
 * Extracts the XML version reported by a document.
 * <p>
 * {@code getXmlVersion()} only exists on DOM Level 3 documents, so it is
 * called reflectively. The method is looked up on the public
 * {@link Document} interface first: a lookup on the concrete class alone
 * fails with {@code IllegalAccessException} whenever that class is not
 * accessible to this code (a non-public or module-encapsulated
 * implementation), which silently hid the version. The concrete class is
 * still tried afterwards for the case where the {@code Document} interface
 * on the class path predates DOM Level 3.
 * <p>
 * This is a best-effort lookup: any reflection failure yields {@code null}.
 *
 * @param document
 *            the document to inspect; may be {@code null}
 * @return the version string reported by the document, or {@code null} if
 *         the document is {@code null}, has no such method, or the call fails
 */
protected static String getVersion(Document document)
{
    if (document == null)
    {
        return null;
    }
    String version = invokeGetXmlVersion(Document.class, document);
    if (version == null)
    {
        version = invokeGetXmlVersion(document.getClass(), document);
    }
    return version;
} // getVersion(Document)

/**
 * Looks up a public no-argument {@code getXmlVersion} method on {@code owner}
 * and invokes it on {@code document}; returns {@code null} on any failure.
 */
private static String invokeGetXmlVersion(Class<?> owner, Document document)
{
    try
    {
        Method getXmlVersion = owner.getMethod("getXmlVersion");
        return (String) getXmlVersion.invoke(document);
    }
    catch (NoSuchMethodException ignored)
    {
        // The type predates DOM Level 3; there is no version to report.
    }
    catch (IllegalAccessException ignored)
    {
        // The declaring class is not accessible from here.
    }
    catch (InvocationTargetException ignored)
    {
        // The implementation itself failed; treat the version as unknown.
    }
    catch (IllegalArgumentException ignored)
    {
        // The document is not an instance of the method's declaring type.
    }
    catch (SecurityException ignored)
    {
        // Reflection is not permitted in this environment.
    }
    return null;
}
//
// Protected methods
//
/** Normalizes and print the given character. */
protected static String normalizeAndPrint(char c, boolean isAttValue, boolean fCanonical, boolean fXML11)
{
String text = "";
switch (c)
{
case '<':
{
text += "<";
break;
}
case '>':
{
text += ">";
break;
}
case '&':
{
text += "&";
break;
}
case '"':
{
// A '"' that appears in character data
// does not need to be escaped.
if (isAttValue)
{
text += """;
}
else
{
text += "\"";
}
break;
}
case '\r':
{
// If CR is part of the document's content,
// it
// must not be printed as a literal
// otherwise
// it would be normalized to LF when the
// document
// is reparsed.
text += "
";
break;
}
case '\n':
{
if (fCanonical)
{
text += "
";
break;
}
// else, default print char
}
default:
{
// In XML 1.1, control chars in the ranges
// [#x1-#x1F, #x7F-#x9F] must be escaped.
//
// Escape space characters that would be
// normalized to #x20 in attribute values
// when the document is reparsed.
//
// Escape NEL (0x85) and LSEP (0x2028) that
// appear in content
// if the document is XML 1.1, since they
// would
// be normalized to LF
// when the document is reparsed.
if ((fXML11 && (((c >= 0x01) && (c <= 0x1F) && (c != 0x09) && (c != 0x0A)) || ((c >= 0x7F) && (c <= 0x9F)) || (c == 0x2028))) || (isAttValue && ((c == 0x09) || (c == 0x0A))))
{
text += "" + Integer.toHexString(c).toUpperCase() + ";";
}
else
{
text += c;
}
}
}
return text;
} // normalizeAndPrint(char,boolean)
/** Normalizes and prints the given string. */
protected static String normalizeAndPrint(String s, boolean isAttValue, boolean fCanonical, boolean fXML11)
{
String text = "";
int len = (s != null) ? s.length() : 0;
for (int i = 0; i < len; i++)
{
char c = s.charAt(i);
text += normalizeAndPrint(c, isAttValue, fCanonical, fXML11);
}
return text;
} // normalizeAndPrint(String,boolean)
/**
 * Copies the attributes of a map into an array ordered by ascending node
 * name, compared with {@link String#compareTo(String)}.
 *
 * @param attrs
 *            the attribute map to read; {@code null} is treated as empty
 * @return a new array holding the attributes in name order, never {@code null}
 */
protected static Attr[] sortAttributes(NamedNodeMap attrs)
{
    final int count = (attrs == null) ? 0 : attrs.getLength();
    final Attr[] sorted = new Attr[count];
    for (int pos = 0; pos < count; pos++)
    {
        sorted[pos] = (Attr) attrs.item(pos);
    }
    // Selection sort: move the smallest remaining name into each slot in turn.
    for (int slot = 0; slot + 1 < count; slot++)
    {
        int smallest = slot;
        for (int probe = slot + 1; probe < count; probe++)
        {
            String candidate = sorted[probe].getNodeName();
            if (candidate.compareTo(sorted[smallest].getNodeName()) < 0)
            {
                smallest = probe;
            }
        }
        if (smallest != slot)
        {
            Attr displaced = sorted[slot];
            sorted[slot] = sorted[smallest];
            sorted[smallest] = displaced;
        }
    }
    return sorted;
} // sortAttributes(NamedNodeMap):Attr[]
/**
 * Collects the text content found beneath a node, without any markup.
 * <p>
 * Document, element and entity-reference nodes are walked recursively and
 * the values of their text children are concatenated unescaped. Nodes named
 * {@code script} (case-insensitive) are skipped together with their content.
 * Comments contribute their value, surrounded by single spaces, only when
 * {@code includeComments} is set. Any other node type passed in directly
 * (including a bare text node) yields an empty string.
 * <p>
 * At most 1000 children are visited per parent node; any further siblings
 * are ignored.
 *
 * @param node
 *            the node to read; {@code null} yields an empty string
 * @param includeComments
 *            whether comment text is included in the result
 * @return the collected text, never {@code null}
 */
public static String writeTextOnly(Node node, boolean includeComments)
{
    StringBuilder out = new StringBuilder();
    appendTextOnly(out, node, includeComments);
    return out.toString();
}

/** Appends the text found beneath {@code node} to {@code out}; see {@link #writeTextOnly}. */
private static void appendTextOnly(StringBuilder out, Node node, boolean includeComments)
{
    // is there anything to do?
    if (node == null)
    {
        return;
    }
    switch (node.getNodeType())
    {
        case Node.DOCUMENT_NODE:
        case Node.ELEMENT_NODE:
        case Node.ENTITY_REFERENCE_NODE:
        {
            // Script bodies are code, not page text.
            if ("script".equalsIgnoreCase(node.getNodeName()))
            {
                break;
            }
            // Upper bound on the siblings visited under one parent.
            final int maxChildren = 1000;
            int visited = 0;
            for (Node child = node.getFirstChild(); child != null && visited < maxChildren; child = child.getNextSibling())
            {
                visited++;
                if (child.getNodeType() == Node.TEXT_NODE)
                {
                    out.append(child.getNodeValue());
                }
                else
                {
                    appendTextOnly(out, child, includeComments);
                }
            }
            break;
        }
        case Node.COMMENT_NODE:
        {
            if (includeComments)
            {
                out.append(' ');
                String comment = node.getNodeValue();
                if ((comment != null) && (comment.length() > 0))
                {
                    out.append(comment);
                }
                out.append(' ');
            }
            break;
        }
        default:
        {
            // Don't do anything
            break;
        }
    }
}
} // class Writer