/**
* (The MIT License)
*
* Copyright (c) 2008 - 2012:
*
* * {Aaron Patterson}[http://tenderlovemaking.com]
* * {Mike Dalessio}[http://mike.daless.io]
* * {Charles Nutter}[http://blog.headius.com]
* * {Sergio Arbeo}[http://www.serabe.com]
* * {Patrick Mahoney}[http://polycrystal.org]
* * {Yoko Harada}[http://yokolet.blogspot.com]
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package nokogiri.internals;
import static nokogiri.internals.NokogiriHelpers.canonicalizeWhitespce;
import static nokogiri.internals.NokogiriHelpers.encodeJavaString;
import static nokogiri.internals.NokogiriHelpers.isNamespace;
import static nokogiri.internals.NokogiriHelpers.isWhitespaceText;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.cyberneko.html.HTMLElements;
import org.w3c.dom.Attr;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Entity;
import org.w3c.dom.EntityReference;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.Notation;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
/**
* A class for serializing a document.
*
* @author sergio
* @author Patrick Mahoney <pat@polycrystal.org>
* @author Yoko Harada <yokolet@gmail.com>
*/
public class SaveContextVisitor {
private final StringBuffer buffer;
private final Stack<String> indentation;
private String encoding;
private final String indentString;
private boolean format;
private final boolean noDecl;
private final boolean noEmpty;
private final boolean noXhtml;
private final boolean asXhtml;
private boolean asXml;
private final boolean asHtml;
private final boolean asBuilder;
private boolean htmlDoc;
private final boolean fragment;
private final boolean canonical, incl_ns, with_comments;
private boolean subsets;
private boolean exclusive;
private final List<Node> c14nNodeList;
private final Deque<Attr[]> c14nNamespaceStack;
private final Deque<Attr[]> c14nAttrStack;
private List<String> c14nExclusiveInclusivePrefixes = null;
/*
* U can't touch this.
* http://www.youtube.com/watch?v=WJ2ZFVx6A4Q
*
* Taken from libxml save options.
*/
public static final int FORMAT = 1;
public static final int NO_DECL = 2;
public static final int NO_EMPTY = 4;
public static final int NO_XHTML = 8;
public static final int AS_XHTML = 16;
public static final int AS_XML = 32;
public static final int AS_HTML = 64;
public static final int AS_BUILDER = 128;
public static final int CANONICAL = 1;
public static final int INCL_NS = 2;
public static final int WITH_COMMENTS = 4;
public static final int SUBSETS = 8;
public static final int EXCLUSIVE = 16;
public SaveContextVisitor(int options, String indent, String encoding, boolean htmlDoc, boolean fragment, int canonicalOpts) {
buffer = new StringBuffer();
this.encoding = encoding;
indentation = new Stack<String>(); indentation.push("");
this.htmlDoc = htmlDoc;
this.fragment = fragment;
c14nNodeList = new ArrayList<Node>();
c14nNamespaceStack = new ArrayDeque<Attr[]>();
c14nAttrStack = new ArrayDeque<Attr[]>();
format = (options & FORMAT) == FORMAT;
noDecl = (options & NO_DECL) == NO_DECL;
noEmpty = (options & NO_EMPTY) == NO_EMPTY;
noXhtml = (options & NO_XHTML) == NO_XHTML;
asXhtml = (options & AS_XHTML) == AS_XHTML;
asXml = (options & AS_XML) == AS_XML;
asHtml = (options & AS_HTML) == AS_HTML;
asBuilder = (options & AS_BUILDER) == AS_BUILDER;
canonical = (canonicalOpts & CANONICAL) == CANONICAL;
incl_ns = (canonicalOpts & INCL_NS) == INCL_NS;
with_comments = (canonicalOpts & WITH_COMMENTS) == WITH_COMMENTS;
subsets = (canonicalOpts & SUBSETS) == SUBSETS;
if ((format && indent == null) || (format && indent.length() == 0)) indent = " "; // default, two spaces
if ((!format && indent != null) && indent.length() > 0) format = true;
if ((asBuilder && indent == null) || (asBuilder && indent.length() == 0)) indent = " "; // default, two spaces
indentString = indent;
if (!asXml && !asHtml && !asXhtml && !asBuilder) asXml = true;
}
@Override
public String toString() {
return (new String(buffer));
}
public void setHtmlDoc(boolean htmlDoc) {
this.htmlDoc = htmlDoc;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public List<Node> getC14nNodeList() {
return c14nNodeList;
}
public void setC14nExclusiveInclusivePrefixes(List<String> prefixes) {
c14nExclusiveInclusivePrefixes = prefixes;
}
public boolean enter(Node node) {
if (node instanceof Document) {
return enter((Document)node);
}
if (node instanceof Element) {
return enter((Element)node);
}
if (node instanceof Attr) {
return enter((Attr)node);
}
if (node instanceof Text) {
return enter((Text)node);
}
if (node instanceof CDATASection) {
return enter((CDATASection)node);
}
if (node instanceof Comment) {
return enter((Comment)node);
}
if (node instanceof DocumentType) {
return enter((DocumentType)node);
}
if (node instanceof Entity) {
return enter((Entity)node);
}
if (node instanceof EntityReference) {
return enter((EntityReference) node);
}
if (node instanceof Notation) {
return enter((Notation)node);
}
if (node instanceof ProcessingInstruction) {
return enter((ProcessingInstruction)node);
}
return false;
}
public void leave(Node node) {
if (node instanceof Document) {
leave((Document)node);
return;
}
if (node instanceof Element) {
leave((Element)node);
return;
}
if (node instanceof Attr) {
leave((Attr)node);
return;
}
if (node instanceof Text) {
return;
}
if (node instanceof CDATASection) {
leave((CDATASection)node);
return;
}
if (node instanceof Comment) {
leave((Comment)node);
return;
}
if (node instanceof DocumentType) {
leave((DocumentType)node);
return;
}
if (node instanceof Entity) {
leave((Entity)node);
return;
}
if (node instanceof EntityReference) {
leave((EntityReference) node);
return;
}
if (node instanceof Notation) {
leave((Notation)node);
return;
}
if (node instanceof ProcessingInstruction) {
leave((ProcessingInstruction)node);
return;
}
}
public boolean enter(String string) {
buffer.append(string);
return true;
}
public void leave(String string) {
// no-op
}
public boolean enter(Attr attr) {
String name = attr.getName();
buffer.append(name);
if (!asHtml || !isHtmlBooleanAttr(name)) {
buffer.append("=");
buffer.append("\"");
String value = replaceCharsetIfNecessary(attr);
buffer.append(serializeAttrTextContent(value, htmlDoc));
buffer.append("\"");
}
return true;
}
private static Pattern p =
Pattern.compile("charset(()|\\s+)=(()|\\s+)(\\w|\\_|\\.|\\-)+", Pattern.CASE_INSENSITIVE);
private String replaceCharsetIfNecessary(Attr attr) {
String value = attr.getValue();
if (encoding == null) return value; // unable to replace in any case
if (!"content".equals(attr.getName().toLowerCase())) return value; // must be content attr
if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) return value;
Matcher m = p.matcher(value);
if (!m.find()) return value;
if (value.contains(encoding)) return value; // no need to replace
return value.replace(m.group(), "charset=" + encoding);
}
public static final String[] HTML_BOOLEAN_ATTRS = {
"checked", "compact", "declare", "defer", "disabled", "ismap",
"multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
"selected"
};
private boolean isHtmlBooleanAttr(String name) {
for (String s : HTML_BOOLEAN_ATTRS) {
if (s.equals(name)) return true;
}
return false;
}
private String serializeAttrTextContent(String s, boolean htmlDoc) {
if (s == null) return "";
char[] c = s.toCharArray();
StringBuffer buffer = new StringBuffer(c.length);
for(int i = 0; i < c.length; i++) {
switch(c[i]){
case '\n': buffer.append("
"); break;
case '\r': buffer.append("
"); break;
case '\t': buffer.append(" "); break;
case '"': if (htmlDoc) buffer.append("%22");
else buffer.append(""");
break;
case '<': buffer.append("<"); break;
case '>': buffer.append(">"); break;
case '&': buffer.append("&"); break;
default: buffer.append(c[i]);
}
}
return buffer.toString();
}
public void leave(Attr attr) {
// no-op
}
public boolean enter(CDATASection cdata) {
buffer.append("<![CDATA[");
buffer.append(cdata.getData());
buffer.append("]]>");
return true;
}
public void leave(CDATASection cdata) {
// no-op
}
public boolean enter(Comment comment) {
if (canonical) {
c14nNodeList.add(comment);
if (!with_comments) return true;
}
buffer.append("<!--");
buffer.append(comment.getData());
buffer.append("-->");
return true;
}
public void leave(Comment comment) {
// no-op
}
public boolean enter(Document document) {
if (!noDecl) {
buffer.append("<?xml version=\"");
buffer.append(document.getXmlVersion());
buffer.append("\"");
if (encoding != null) {
buffer.append(" encoding=\"");
buffer.append(encoding);
buffer.append("\"");
}
buffer.append("?>\n");
}
return true;
}
public void leave(Document document) {
// no-op
}
public boolean enter(DocumentType docType) {
if (canonical) {
c14nNodeList.add(docType);
return true;
}
String name = docType.getName();
String pubId = docType.getPublicId();
String sysId = docType.getSystemId();
String internalSubset = docType.getInternalSubset();
if (docType.getPreviousSibling() != null) {
buffer.append("\n");
}
buffer.append("<!DOCTYPE " + name + " ");
if (pubId != null) {
buffer.append("PUBLIC \"" + pubId + "\"");
if (sysId != null) buffer.append(" \"" + sysId + "\"");
} else if (sysId != null) {
buffer.append("SYSTEM \"" + sysId + "\"");
}
if (internalSubset != null) {
buffer.append(" [");
buffer.append(internalSubset);
buffer.append("]");
}
buffer.append(">\n");
return true;
}
public void leave(DocumentType docType) {
// no-op
}
public boolean enter(Element element) {
if (canonical) {
c14nNodeList.add(element);
if (element == element.getOwnerDocument().getDocumentElement()) {
c14nNodeList.add(element.getOwnerDocument());
}
}
String current = indentation.peek();
buffer.append(current);
if (needIndent(element)) {
indentation.push(current + indentString);
}
String name = element.getTagName();
buffer.append("<" + name);
Attr[] attrs = getAttrsAndNamespaces(element);
for (Attr attr : attrs) {
if (attr.getSpecified()) {
buffer.append(" ");
enter(attr);
leave(attr);
}
}
if (element.hasChildNodes()) {
buffer.append(">");
if (needBreakInOpening(element)) buffer.append("\n");
return true;
}
// no child
if (asHtml) {
buffer.append(">");
} else if (asXml && noEmpty) {
buffer.append(">");
} else if (asXhtml) {
if (isEmpty(name)) {
buffer.append(" />"); // see http://www.w3.org/TR/xhtml1/#C_2
} else {
buffer.append(">");
}
} else {
buffer.append("/>");
}
if (needBreakInOpening(element)) {
buffer.append("\n");
}
return true;
}
private boolean needIndent(Element element) {
if (containsText(element)) return false;
if (fragment) return false; // a given option might be fragment and format. fragment matters
if (format || asBuilder) return true;
return false;
}
private boolean needBreakInOpening(Element element) {
if (containsText(element)) return false;
if (fragment) return false;
if (format) return true;
if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
if (format && element.getNextSibling() == null && element.hasChildNodes()) return true;
return false;
}
private boolean isEmpty(String name) {
HTMLElements.Element element = HTMLElements.getElement(name);
return element.isEmpty();
}
private Attr[] getAttrsAndNamespaces(Element element) {
NamedNodeMap attrs = element.getAttributes();
if (!canonical) {
if (attrs == null || attrs.getLength() == 0) return new Attr[0];
Attr[] attrsAndNamespaces = new Attr[attrs.getLength()];
for (int i=0; i<attrs.getLength(); i++) {
attrsAndNamespaces[i] = (Attr) attrs.item(i);
}
return attrsAndNamespaces;
} else {
List<Attr> namespaces = new ArrayList<Attr>();
List<Attr> attributes = new ArrayList<Attr>();
if (subsets) {
getAttrsOfAncestors(element.getParentNode(), namespaces, attributes);
Attr[] namespaceOfAncestors = getSortedArray(namespaces);
Attr[] attributeOfAncestors = getSortedArray(attributes);
c14nNamespaceStack.push(namespaceOfAncestors);
c14nAttrStack.push(attributeOfAncestors);
subsets = false; // namespace propagation should be done only once on top level node.
}
getNamespacesAndAttrs(element, namespaces, attributes);
Attr[] namespaceArray = getSortedArray(namespaces);
Attr[] attributeArray = getSortedArray(attributes);
Attr[] allAttrs = new Attr[namespaceArray.length + attributeArray.length];
for (int i=0; i<allAttrs.length; i++) {
if (i < namespaceArray.length) {
allAttrs[i] = namespaceArray[i];
} else {
allAttrs[i] = attributeArray[i-namespaceArray.length];
}
}
c14nNamespaceStack.push(namespaceArray);
c14nAttrStack.push(attributeArray);
return allAttrs;
}
}
private void getAttrsOfAncestors(Node parent, List<Attr> namespaces, List<Attr> attributes) {
if (parent == null) return;
NamedNodeMap attrs = parent.getAttributes();
if (attrs == null || attrs.getLength() == 0) return;
for (int i=0; i < attrs.getLength(); i++) {
Attr attr = (Attr)attrs.item(i);
if (isNamespace(attr.getNodeName())) namespaces.add(attr);
else attributes.add(attr);
}
getAttrsOfAncestors(parent.getParentNode(), namespaces, attributes);
}
private void getNamespacesAndAttrs(Node current, List<Attr> namespaces, List<Attr> attributes) {
NamedNodeMap attrs = current.getAttributes();
for (int i=0; i<attrs.getLength(); i++) {
Attr attr = (Attr)attrs.item(i);
if (isNamespace(attr.getNodeName())) {
getNamespacesWithPropagated(namespaces, attr);
} else {
getAttributesWithPropagated(attributes, attr);
}
if (exclusive) {
verifyXmlSpace(attributes, attrs);
}
}
}
private void getNamespacesWithPropagated(List<Attr> namespaces, Attr attr) {
boolean newNamespace = true;
Iterator<Attr[]> iter = c14nNamespaceStack.iterator();
while (iter.hasNext()) {
Attr[] parentNamespaces = iter.next();
for (int n=0; n < parentNamespaces.length; n++) {
if (parentNamespaces[n].getNodeName().equals(attr.getNodeName())) {
if (parentNamespaces[n].getNodeValue().equals(attr.getNodeValue())) {
// exactly the same namespace should not be added
newNamespace = false;
} else {
// in case of namespace url change, propagated namespace will be override
namespaces.remove(parentNamespaces[n]);
}
}
}
if (newNamespace && !namespaces.contains(attr)) namespaces.add(attr);
}
}
private void getAttributesWithPropagated(List<Attr> attributes, Attr attr) {
boolean newAttribute = true;
Iterator<Attr[]> iter = c14nAttrStack.iterator();
while (iter.hasNext()) {
Attr[] parentAttr = iter.next();
for (int n=0; n < parentAttr.length; n++) {
if (!parentAttr[n].getNodeName().startsWith("xml:")) continue;
if (parentAttr[n].getNodeName().equals(attr.getNodeName())) {
if (parentAttr[n].getNodeValue().equals(attr.getNodeValue())) {
// exactly the same attribute should not be added
newAttribute = false;
} else {
// in case of attribute value change, propagated attribute will be override
attributes.remove(parentAttr[n]);
}
}
}
if (newAttribute) attributes.add(attr);
}
}
private void verifyXmlSpace(List<Attr> attributes, NamedNodeMap attrs) {
Attr attr = (Attr) attrs.getNamedItem("xml:space");
if (attr == null) {
for (int i=0; i < attributes.size(); i++) {
if (attributes.get(i).getNodeName().equals("xml:space")) {
attributes.remove(i);
break;
}
}
}
}
private Attr[] getSortedArray(List<Attr> attrList) {
Attr[] attrArray = attrList.toArray(new Attr[0]);
Arrays.sort(attrArray, new Comparator<Attr>() {
@Override
public int compare(Attr attr0, Attr attr1) {
return attr0.getNodeName().compareTo(attr1.getNodeName());
}
});
return attrArray;
}
public void leave(Element element) {
if (canonical) {
c14nNamespaceStack.poll();
c14nAttrStack.poll();
}
String name = element.getTagName();
if (element.hasChildNodes()) {
if (needIndentInClosing(element)) {
indentation.pop();
buffer.append(indentation.peek());
} else if (asBuilder) {
if (!containsText(element)) indentation.pop();
}
buffer.append("</" + name + ">");
if (needBreakInClosing(element)) {
buffer.append("\n");
}
return;
}
// no child, but HTML might need a closing tag.
if (asHtml || noEmpty) {
if (!isEmpty(name) && noEmpty) {
buffer.append("</" + name + ">");
}
}
if (needBreakInClosing(element)) {
if (!containsText(element)) indentation.pop();
buffer.append("\n");
}
}
private boolean needIndentInClosing(Element element) {
if (containsText(element)) return false;
if (fragment) return false; // a given option might be fragment and format. fragment matters
if (format) return true;
if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
return false;
}
private boolean needBreakInClosing(Element element) {
if (fragment) return false;
if (format || asBuilder) return true;
return false;
}
private boolean containsText(Element element) {
return (element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.TEXT_NODE);
}
public boolean enter(Entity entity) {
String name = entity.getNodeName();
String pubId = entity.getPublicId();
String sysId = entity.getSystemId();
String notation = entity.getNotationName();
buffer.append("<!ENTITY ");
buffer.append(name);
if (pubId != null) {
buffer.append(" PUBLIC \"");
buffer.append(pubId);
buffer.append("\"");
}
if (sysId != null) {
buffer.append(" SYSTEM \"");
buffer.append(sysId);
buffer.append("\"");
}
if (notation != null) {
buffer.append(" NDATA ");
buffer.append(notation);
}
buffer.append(">");
return true;
}
public void leave(Entity entity) {
// no-op
}
public boolean enter(EntityReference entityRef) {
buffer.append("&" + entityRef.getNodeName() + ";");
return true;
}
public void leave(EntityReference entityRef) {
// no-op
}
public boolean enter(Notation notation) {
String name = notation.getNodeName();
String pubId = notation.getPublicId();
String sysId = notation.getSystemId();
buffer.append("<!NOTATION ");
buffer.append(name);
if (pubId != null) {
buffer.append(" PUBLIC \"");
buffer.append(pubId);
buffer.append("\"");
if (sysId != null) {
buffer.append(" \"");
buffer.append(sysId);
buffer.append("\"");
}
} else if (sysId != null) {
buffer.append(" SYSTEM \"");
buffer.append(sysId);
buffer.append("\"");
}
buffer.append(">");
return true;
}
public void leave(Notation notation) {
// no-op
}
public boolean enter(ProcessingInstruction pi) {
buffer.append("<?");
buffer.append(pi.getTarget());
buffer.append(" ");
buffer.append(pi.getData());
if (asHtml) buffer.append(">");
else buffer.append("?>");
buffer.append("\n");
if (canonical) c14nNodeList.add(pi);
return true;
}
public void leave(ProcessingInstruction pi) {
// no-op
}
private boolean isHtmlScript(Text text) {
return htmlDoc && text.getParentNode().getNodeName().equals("script");
}
private static char lineSeparator = '\n'; // System.getProperty("line.separator"); ?
public boolean enter(Text text) {
String textContent = text.getNodeValue();
if (canonical) {
c14nNodeList.add(text);
if (isWhitespaceText(textContent)) {
buffer.append(canonicalizeWhitespce(textContent));
return true;
}
}
if (NokogiriHelpers.shouldEncode(text) && !isHtmlScript(text)) {
textContent = encodeJavaString(textContent);
}
textContent = encodeStringToHtmlEntity(textContent);
buffer.append(textContent);
return true;
}
private String getEncoding(Text text) {
if (encoding != null) return encoding;
encoding = text.getOwnerDocument().getInputEncoding();
return encoding;
}
private String encodeStringToHtmlEntity(String text) {
if (encoding == null)
return text;
CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
int last = 126; // = U+007E. No need to encode under U+007E.
StringBuffer sb = new StringBuffer();
for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i);
if (encoder.canEncode(ch)) sb.append(ch);
else sb.append("" + Integer.toHexString(ch) + ";");
}
return new String(sb);
}
}