/* * SimpleHTMLFragment.java * * Version: $Revision: 4695 $ * * Date: $Date: 2010-01-15 17:06:30 +0000 (Fri, 15 Jan 2010) $ * * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts * Institute of Technology. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the Hewlett-Packard Company nor the name of the * Massachusetts Institute of Technology nor the names of their * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.app.xmlui.wing.element; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.dspace.app.xmlui.wing.WingConstants; import org.dspace.app.xmlui.wing.WingContext; import org.dspace.app.xmlui.wing.WingException; import org.jdom.Attribute; import org.jdom.Content; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.Text; import org.jdom.input.SAXBuilder; import org.jdom.output.SAXOutputter; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.ext.LexicalHandler; import org.xml.sax.helpers.NamespaceSupport; /** * This class represents data that is translated from simple HTML or plain text. * * This class represents a simple HTML fragment. It allows for user supplied * HTML to be translated on the fly into DRI. * * At the present time it only supports the following tags: h1, h2, h3, h4, h5, * p, a, b, i, u, ol, li and img. Each are translated into their DRI equivelents, note * the "h" tags are translated into a paragraph of rend=heading. * * If the linkbreaks flag is set then line breaks are treated as paragraphs. This * allows plain text files to also be included and they will be mapped into DRI as * well. * * @author Scott Phillips * @author Jay Paz */ public class SimpleHTMLFragment extends AbstractWingElement { /** The HTML Fragment */ private String fragment; /** Determine if blank lines mark a new paragraph */ private boolean blankLines; /** * Construct a fragment object for translating into DRI. * * @param context * (Required) The context this element is contained in, such as * where to route SAX events and what i18n catalogue to use. * @param blankLines * (Required) Determine if blank lines should be treated as * paragraphs delimeters. * @param fragment * (Required) The HTML Fragment to be translated into DRI. * @throws WingException */ protected SimpleHTMLFragment(WingContext context, boolean blankLines, String fragment) throws WingException { super(context); this.blankLines = blankLines; this.fragment = fragment; } /** * Translate this element into SAX * * @param contentHandler * (Required) The registered contentHandler where SAX events * should be routed too. * @param lexicalHandler * (Required) The registered lexicalHandler where lexical events * (such as CDATA, DTD, etc) should be routed too. * @param namespaces * (Required) SAX Helper class to keep track of namespaces able * to determine the correct prefix for a given namespace URI. */ public void toSAX(ContentHandler contentHandler, LexicalHandler lexicalHandler, NamespaceSupport namespaces) throws SAXException { try { String xml = "<fragment>" + fragment + "</fragment>"; ByteArrayInputStream inputStream = new ByteArrayInputStream(xml .getBytes()); SAXBuilder builder = new SAXBuilder(); Document document = builder.build(inputStream); try { translate(document.getRootElement()); } catch (Throwable t) { throw new JDOMException( "Error translating HTML fragment into DRI", t); } SAXFilter filter = new SAXFilter(contentHandler, lexicalHandler, namespaces); SAXOutputter outputter = new SAXOutputter(); outputter.setContentHandler(filter); outputter.setLexicalHandler(filter); Element root = document.getRootElement(); @SuppressWarnings("unchecked") // This cast is correct List<Element> children = root.getChildren(); for (Element child : children) { outputter.output(child); } } catch (JDOMException e) { //If we are here, then a parsing error occurred within the XHTML fragment. We'll just assume // that this is not supposed to be XHTML and display the fragment as plain text within <dri:p> tags. startElement(contentHandler, namespaces, Para.E_PARA, null); sendCharacters(contentHandler, fragment); endElement(contentHandler, namespaces, Para.E_PARA); } catch (IOException ioe) { throw new SAXException(ioe); } } /** * dispose */ public void dispose() { super.dispose(); } /** * Remove the given content from the Element. * * If the content is an element then render it as text and include it's * children in the parent. * * @param content * The DOM Content to be removed. */ private void removeContent(Content content) { if (content instanceof Element) { // If it's an element replace the content with a text node. Element element = (Element) content; if (element.getContent().size() == 0) { // The element contains nothing, we can use shorthand notation // for it. String replacement = "<" + element.getName(); @SuppressWarnings("unchecked") // This cast is correct List<Attribute> attributes = element.getAttributes(); for (Attribute attribute : attributes) { replacement += " " + attribute.getName() + "=\"" + attribute.getValue() + "\""; } replacement += "/>"; Element parent = element.getParentElement(); int index = parent.indexOf(element); parent.setContent(index, new Text(replacement)); } else { // The element contains data String prepend = "<" + element.getName(); @SuppressWarnings("unchecked") // This cast is correct List<Attribute> attributes = element.getAttributes(); for (Attribute attribute : attributes) { prepend += " " + attribute.getName() + "=\"" + attribute.getValue() + "\""; } prepend += ">"; String postpend = "</" + element.getName() + ">"; Element parent = element.getParentElement(); int index = parent.indexOf(element); parent.addContent(index, new Text(postpend)); parent.addContent(index, element.removeContent()); parent.addContent(index, new Text(prepend)); parent.removeContent(element); } } else { // If it's not an element just remove the content from the document. Element parent = content.getParentElement(); parent.removeContent(content); } } /** * Wrap the given set of contents into a paragraph and place it at the * supplied index. * * This method will also check for trivial paragraphs, i.e. those that * contain nothing but white space. If they are found then they are removed. * * @param parent * The parent element to attach the wrapped paragraph too. * @param index * The index within the parent for where the content should be * attached. * @param contents * The contents that should be wrapped in a paragraph. * @return wheather a paragraph was actualy added. */ private boolean paragraphWrap(Element parent, int index, List<Content> contents) { if (contents == null || contents.size() <= 0) return false; boolean empty = true; for (Content content : contents) { if (empty == false) continue; if (content instanceof Text) { Text text = (Text) content; if (!"".equals(text.getTextNormalize())) empty = false; } else { empty = false; } } if (empty == true) return false; // May be usefull for debugging: // contents.add(0, new Text("("+index+") ")); Element para = new Element(Para.E_PARA); para.addContent(contents); if (index >= 0) parent.addContent(index, para); else parent.addContent(para); return true; } /** * Ensure that the given element only has the supplied attributes. Also * remove any possible namespaces on the attributes. * * @param element * The element to be checked. * @param names * A list of all allowed attribute names, all others will be * removed. */ private void limitAttributes(Element element, String... names) { Map<String, String> attributes = new HashMap<String, String>(); for (String name : names) { String value = element.getAttributeValue(name); if (value != null) attributes.put(name, value); } element.setAttributes(new ArrayList<Attributes>()); for (String name : attributes.keySet()) { String value = attributes.get(name); element.setAttribute(name, value); } } /** * Move the old attribute to a new attribute. * * @param element * The element * @param oldName * The old attribute's name. * @param newName * The new attribute's name. */ private void moveAttribute(Element element, String oldName, String newName) { Attribute attribute = element.getAttribute(oldName); if (attribute != null) attribute.setName(newName); } /** * Translate the given HTML fragment into a DRI document. * * The translation is broken up into two steps, 1) recurse through all * elements and either translate them into their DRI equivelents or remove * them from the document. * * The second step, 2) is to iterate over all top level elements and ensure * that they only consist of paragraphs. Also at this stage if linkBreaks is * true then \n are treated as paragraph breaks. * * @param parent * The Element to translate into DRI. */ private void translate(Element parent) { // Step 1: // Recurse through all elements and either // translate them or remove them. for (int i = 0; i < parent.getContentSize(); i++) { Content decedent = parent.getContent(i); if (decedent instanceof org.jdom.Text) { } else if (decedent instanceof Element) { Element element = (Element) decedent; String name = element.getName(); // First all the DRI elements, allow them to pass. if ("p".equals(name)) { // Paragraphs are tricky, it may be either an HTML // or DRI <p> element. However, while HTML will allow // <p> to nest DRI does not, thus first we need to // check if this is at the block level, if it is then // we need remove it. if (parent.isRootElement()) { // The paragraph is not nested, so translate it to // a DRI <p> moveAttribute(element, "class", "rend"); limitAttributes(element, "id", "n", "rend"); translate(element); } else { // The paragraph is nested which is not allowed in // DRI, so remove it. removeContent(element); } } else if ("h1".equals(name) || "h2".equals(name) || "h3".equals(name) || "h4".equals(name) || "h5".equals(name)) { // The HTML <H1> tag is translated into the DRI // <p rend="heading"> tag. if (parent.isRootElement()) { limitAttributes(element); element.setName("p"); element.setAttribute("rend", "heading"); translate(element); } else { // DRI paragraphs can not be nested. removeContent(element); } } else if ("a".equals(name)) { // The HTML <a> tag is translated into the DRI // <xref> tag. moveAttribute(element, "href", "target"); limitAttributes(element, "target"); element.setName("xref"); translate(element); } else if ("ol".equals(name)) { // the HTML tag <ol> its translated into the DRI // <list> tag // <list type="ordered" n="list_part_one" // id="css.submit.LicenseAgreement.list.list_part_one"> moveAttribute(element, "class", "rend"); limitAttributes(element, "id", "n", "rend"); element.setName("list"); element.setAttribute("type", "ordered"); translate(element); } else if ("li".equals(name)) { // the HTML tag <li> its translated into the DRI // <item> tag moveAttribute(element, "class", "rend"); limitAttributes(element, "id", "n", "rend"); element.setName("item"); translate(element); } else if ("b".equals(name)) { // The HTML <b> tag is translated to a highlight // element with a rend of bold. limitAttributes(element); element.setName("hi"); element.setAttribute("rend", "bold"); translate(element); } else if ("i".equals(name)) { // The HTML <i> tag is translated to a highlight // element with a rend of italic. limitAttributes(element); element.setName("hi"); element.setAttribute("rend", "italic"); translate(element); } else if ("u".equals(name)) { // The HTML <u> tag is translated to a highlight // element with a rend of underline. limitAttributes(element); element.setName("hi"); element.setAttribute("rend", "underline"); translate(element); } else if ("img".equals(name)) { // The HTML <img> element is translated into a DRI figure moveAttribute(element, "src", "source"); limitAttributes(element, "source"); element.setName("figure"); translate(element); } // Next all the DRI elements that we allow to pass through. else if ("hi".equals(name)) { limitAttributes(element, "rend"); translate(element); } else if ("xref".equals(name)) { limitAttributes(element, "target"); translate(element); } else if ("figure".equals(name)) { limitAttributes(element, "rend", "source", "target"); translate(element); } else { removeContent(decedent); } } else { removeContent(decedent); } } // Step 2: // Ensure that all top level elements are encapusalted inside // a block level element (i.e. a paragraph) if (parent.isRootElement()) { List<Content> removed = new ArrayList<Content>(); for (int i = 0; i < parent.getContentSize(); i++) { Content current = parent.getContent(i); if ((current instanceof Element) && ("p".equals(((Element) current).getName()))) { // A paragraph is being open, combine anything up to this // point into a paragraph. if (paragraphWrap(parent, i, removed)) { removed.clear(); i++; // account for the field added } } else if ((current instanceof Element) && ("list".equals(((Element) current).getName()))) { if (paragraphWrap(parent, i, removed)) { removed.clear(); i++; // account for the field added } } else { // If we break paragraphs based upon blank lines then we // need to check if // there are any in this text element. if (this.blankLines && current instanceof Text) { String rawText = ((Text) current).getText(); parent.removeContent(current); i--;// account text field removed. // Regular expressiot to split based upon blank lines. // FIXME: This may not work for windows people who // insist on using \r\n for line breaks. @SuppressWarnings("unchecked") // This cast is correct List<String> parts = new ArrayList(Arrays .asList(rawText.split("\n\\s*\n"))); if (parts.size() > 0) { String lastPart = parts.remove(parts.size()-1); for (String part : parts) { removed.add(new Text(part)); if (paragraphWrap(parent, i+1, removed)) { removed.clear(); i++;// account for the field added } } removed.add(new Text(lastPart)); } } else { removed.add(current); parent.removeContent(current); i--; // move back to account for the removed content. } } } // if anything is left, wrap it up in a para also. if (removed.size() > 0) { paragraphWrap(parent, -1, removed); removed.clear(); } } } /** * This is a simple SAX Handler that filters out start and end documents. * This class is needed for two reasons, 1) namespaces need to be corrected * from the originating HTML fragment, 2) to get around a JDOM bug where it * can not output SAX events for just a document fragment. Since it only * works with documents this class was created to filter out the events. * * As far as I can tell the first time the bug was identified is in the * following email, point #1: * * http://www.servlets.com/archive/servlet/ReadMsg?msgId=491592&listName=jdom-interest * * I, Scott Phillips, checked the JDOM CVS source tree on 3-8-2006 and the * bug had not been patch at that time. * */ public class SAXFilter implements ContentHandler, LexicalHandler { private final String URI = WingConstants.DRI.URI; private ContentHandler contentHandler; // private LexicalHandler lexicalHandler; may be used in the future private NamespaceSupport namespaces; public SAXFilter(ContentHandler contentHandler, LexicalHandler lexicalHandler, NamespaceSupport namespaces) { this.contentHandler = contentHandler; // this.lexicalHandler = lexicalHandler; this.namespaces = namespaces; } /** * Create the qName for the element with the given localName and * namespace prefix. * * @param localName * (Required) The element's local name. * @return */ private String qName(String localName) { String prefix = namespaces.getPrefix(URI); if (prefix == null || prefix.equals("")) return localName; else return prefix + ":" + localName; } /** ContentHandler methods: */ public void endDocument() { // Filter out endDocument events } public void startDocument() { // filter out startDocument events } public void characters(char[] ch, int start, int length) throws SAXException { contentHandler.characters(ch, start, length); } public void endElement(String uri, String localName, String qName) throws SAXException { contentHandler.endElement(URI, localName, qName(localName)); } public void endPrefixMapping(String prefix) throws SAXException { // No namespaces may be declared. } public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { contentHandler.ignorableWhitespace(ch, start, length); } public void processingInstruction(String target, String data) throws SAXException { // filter out processing instructions } public void setDocumentLocator(Locator locator) { // filter out document locators } public void skippedEntity(String name) throws SAXException { contentHandler.skippedEntity(name); } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { contentHandler.startElement(URI, localName, qName(localName), atts); } public void startPrefixMapping(String prefix, String uri) throws SAXException { // No namespaces can be declared. } /** Lexical Handler methods: */ public void startDTD(String name, String publicId, String systemId) throws SAXException { // filter out DTDs } public void endDTD() throws SAXException { // filter out DTDs } public void startEntity(String name) throws SAXException { // filter out Entities } public void endEntity(String name) throws SAXException { // filter out Entities } public void startCDATA() throws SAXException { // filter out CDATA } public void endCDATA() throws SAXException { // filter out CDATA } public void comment(char[] ch, int start, int length) throws SAXException { // filter out comments; } } }