/* * An HTML5-only version of Validator.nu's HTML serializer. The original license * is reproduced below. * * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen * Copyright (c) 2006 Henri Sivonen * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package org.icij.extract.sax; import java.io.IOException; import java.io.Writer; import java.util.Arrays; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; /** * Serializes a sequence of SAX events representing an XHTML 1.0 Strict document * to a <code>Writer</code> as a UTF-8-encoded HTML 5 document. The SAX events * must represent a valid XHTML 1.0 document, except the namespace prefixes * don't matter and there may be <code>startElement</code> and <code>endElement</code> * calls for elements from other namespaces. The <code>startElement</code> and * <code>endElement</code> calls for non-XHTML elements are ignored. No * validity checking is performed. Hence, the emitter of the SAX events is * responsible for making sure the events represent a document that meets the * above requirements. The <code>Writer</code> is not closed when the end of * the document is seen. * * @since 1.0.0-beta */ public class HTML5Serializer implements ContentHandler { /** * The XHTML namespace URI */ private final static String XHTML_NS = "http://www.w3.org/1999/xhtml"; /** * HTML 4.01 elements which don't have an end tag */ private static final String[] emptyElements = { "area", "base", "basefont", "br", "col", "command", "frame", "hr", "img", "input", "isindex", "link", "meta", "param" }; /** * Minimized "boolean" HTML attributes */ private static final String[] booleanAttributes = { "active", "async", "autofocus", "autosubmit", "checked", "compact", "declare", "default", "defer", "disabled", "ismap", "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", "required", "selected" }; /** * The writer used for output */ private final Writer writer; /** * Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode * with the UTF-8 encoding and no charset meta. * * @param writer the writer to which the output is written */ public HTML5Serializer(final Writer writer) { this.writer = writer; } /** * Writes out characters. * * @param ch the source array * @param start the index of the first character to be written * @param length the number of characters to write * @throws SAXException if there are IO problems */ @Override public void characters(char[] ch, int start, int length) throws SAXException { try { for (int j = 0; j < length; j++) { char c = ch[start + j]; switch (c) { case '<': writer.write("<"); break; case '>': writer.write(">"); break; case '&': writer.write("&"); break; default: writer.write(c); } } } catch (IOException e) { throw (SAXException) new SAXException(e).initCause(e); } } /** * Must be called last. * * @throws SAXException if there are IO problems */ @Override public void endDocument() throws SAXException { try { writer.write('\n'); } catch (IOException e) { throw (SAXException) new SAXException(e).initCause(e); } } /** * Writes an end tag if the element is an XHTML element and is not an empty * element in HTML 4.01 Strict. * * @param namespaceURI the XML namespace * @param localName the element name in the namespace * @param qName ignored * @throws SAXException if there are IO problems */ @Override public void endElement(String namespaceURI, String localName, String qName) throws SAXException { try { if (XHTML_NS.equals(namespaceURI) && Arrays.binarySearch(emptyElements, localName) < 0) { writer.write("</"); writer.write(localName); writer.write('>'); } } catch (IOException e) { throw (SAXException) new SAXException(e).initCause(e); } } /** * Must be called first. */ @Override public void startDocument() throws SAXException { try { writer.write("<!DOCTYPE html>\n"); } catch (IOException e) { throw (SAXException) new SAXException(e).initCause(e); } } /** * Writes a start tag if the element is an XHTML element. * * @param namespaceURI the XML namespace * @param localName the element name in the namespace * @param qName ignored * @param atts the attribute list * @throws SAXException if there are IO problems */ @Override public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { try { if (XHTML_NS.equals(namespaceURI)) { if ("meta".equals(localName) && ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex( "", "httpequiv") != -1))) { return; } // start and element name writer.write('<'); writer.write(localName); // attributes int length = atts.getLength(); boolean langPrinted = false; for (int i = 0; i < length; i++) { String ns = atts.getURI(i); String name = null; if ("".equals(ns)) { name = atts.getLocalName(i); } else if ("http://www.w3.org/XML/1998/namespace".equals(ns) && "lang".equals(atts.getLocalName(i))) { name = "lang"; } if (name != null && !(langPrinted && "lang".equals(name))) { writer.write(' '); writer.write(name); if ("lang".equals(name)) { langPrinted = true; } if (Arrays.binarySearch(booleanAttributes, name) < 0) { // write value, escape certain characters writer.write("=\""); String value = atts.getValue(i); for (int j = 0; j < value.length(); j++) { char c = value.charAt(j); switch (c) { case '<': writer.write("<"); break; case '>': writer.write(">"); break; case '&': writer.write("&"); break; case '"': writer.write("""); break; default: writer.write(c); } } writer.write('"'); } } } // close writer.write('>'); if ("head".equals(localName)) { writer.write("<meta charset=\"UTF-8\">"); } } } catch (IOException e) { throw (SAXException) new SAXException(e).initCause(e); } } /** * This method does nothing. */ @Override public void endPrefixMapping(String str) throws SAXException { } /** * This method does nothing. */ @Override public void ignorableWhitespace(char[] values, int param, int param2) throws SAXException { } /** * This method does nothing. */ @Override public void processingInstruction(String str, String str1) throws SAXException { } /** * This method does nothing. */ @Override public void setDocumentLocator(Locator locator) { } /** * This method does nothing. */ @Override public void skippedEntity(String str) throws SAXException { } /** * This method does nothing. */ @Override public void startPrefixMapping(String str, String str1) throws SAXException { } }