/*
* SimpleHTMLFragment.java
*
* Version: $Revision: 4695 $
*
* Date: $Date: 2010-01-15 17:06:30 +0000 (Fri, 15 Jan 2010) $
*
* Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
* Institute of Technology. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the Hewlett-Packard Company nor the name of the
* Massachusetts Institute of Technology nor the names of their
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
package org.dspace.app.xmlui.wing.element;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.dspace.app.xmlui.wing.WingConstants;
import org.dspace.app.xmlui.wing.WingContext;
import org.dspace.app.xmlui.wing.WingException;
import org.jdom.Attribute;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Text;
import org.jdom.input.SAXBuilder;
import org.jdom.output.SAXOutputter;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.NamespaceSupport;
/**
* This class represents data that is translated from simple HTML or plain text.
*
* This class represents a simple HTML fragment. It allows for user supplied
* HTML to be translated on the fly into DRI.
*
* At the present time it only supports the following tags: h1, h2, h3, h4, h5,
* p, a, b, i, u, ol, li and img. Each is translated into its DRI equivalent; note
* that the "h" tags are translated into a paragraph with rend="heading".
*
* If the blankLines flag is set then blank lines are treated as paragraph breaks.
* This allows plain text files to also be included and they will be mapped into
* DRI as well.
*
* @author Scott Phillips
* @author Jay Paz
*/
public class SimpleHTMLFragment extends AbstractWingElement {
/** The HTML (or plain text) fragment to translate; set once at construction. */
private final String fragment;

/** Whether blank lines inside top level text mark the start of a new paragraph. */
private final boolean blankLines;

/**
 * Construct a fragment object for translating into DRI.
 *
 * @param context
 *            (Required) The context this element is contained in, such as
 *            where to route SAX events and what i18n catalogue to use.
 * @param blankLines
 *            (Required) Determines whether blank lines should be treated as
 *            paragraph delimiters.
 * @param fragment
 *            (Required) The HTML fragment to be translated into DRI.
 * @throws WingException
 *             Declared so that a failure raised by the parent constructor
 *             can propagate to the caller.
 */
protected SimpleHTMLFragment(WingContext context, boolean blankLines,
        String fragment) throws WingException {
    super(context);
    this.blankLines = blankLines;
    this.fragment = fragment;
}
/**
 * Translate this element into SAX.
 *
 * The fragment is wrapped in a synthetic root element, parsed as XML,
 * translated in place into DRI and then each resulting top level element
 * is replayed to the supplied content handler. If the fragment can not be
 * parsed or translated it is emitted verbatim as the text of a single DRI
 * paragraph instead.
 *
 * @param contentHandler
 *            (Required) The registered contentHandler where SAX events
 *            should be routed to.
 * @param lexicalHandler
 *            (Required) The registered lexicalHandler where lexical events
 *            (such as CDATA, DTD, etc) should be routed to.
 * @param namespaces
 *            (Required) SAX Helper class to keep track of namespaces able
 *            to determine the correct prefix for a given namespace URI.
 * @throws SAXException
 *             If the fragment can not be read, or if emitting the
 *             translated content fails.
 */
public void toSAX(ContentHandler contentHandler,
        LexicalHandler lexicalHandler, NamespaceSupport namespaces)
        throws SAXException {
    // Phase 1: parse and translate. Only failures in this phase trigger
    // the plain text fallback, because nothing has been emitted yet.
    Document document;
    try {
        String xml = "<fragment>" + fragment + "</fragment>";
        // The wrapped fragment carries no XML declaration, so the parser
        // will decode the bytes as UTF-8. Encode explicitly rather than
        // relying on the platform default charset, which would corrupt
        // non-ASCII text on non-UTF-8 platforms.
        ByteArrayInputStream inputStream = new ByteArrayInputStream(
                xml.getBytes("UTF-8"));
        SAXBuilder builder = new SAXBuilder();
        document = builder.build(inputStream);
        try {
            translate(document.getRootElement());
        } catch (Throwable t) {
            // Fold any translation failure into a JDOMException so that
            // it takes the same fallback path as a parsing failure.
            throw new JDOMException(
                    "Error translating HTML fragment into DRI", t);
        }
    } catch (JDOMException e) {
        // If we are here, then a parsing error occurred within the XHTML
        // fragment. We'll just assume that this is not supposed to be
        // XHTML and display the fragment as plain text within <dri:p> tags.
        startElement(contentHandler, namespaces, Para.E_PARA, null);
        sendCharacters(contentHandler, fragment);
        endElement(contentHandler, namespaces, Para.E_PARA);
        return;
    } catch (IOException ioe) {
        throw new SAXException(ioe);
    }

    // Phase 2: replay the translated content through the filter. A failure
    // here must not fall back to plain text: events may already have been
    // sent, and emitting another paragraph would produce broken output.
    SAXFilter filter = new SAXFilter(contentHandler, lexicalHandler,
            namespaces);
    SAXOutputter outputter = new SAXOutputter();
    outputter.setContentHandler(filter);
    outputter.setLexicalHandler(filter);
    try {
        @SuppressWarnings("unchecked")
        // This cast is correct
        List<Element> children = document.getRootElement().getChildren();
        for (Element child : children) {
            outputter.output(child);
        }
    } catch (JDOMException e) {
        // Surface the underlying SAX failure when there is one.
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new SAXException(e);
    }
}
/**
 * Dispose of this element by delegating to the parent class; this class
 * performs no additional clean up of its own.
 */
@Override
public void dispose() {
    super.dispose();
}
/**
 * Remove the given content from its parent element.
 *
 * Content that is not an element is simply detached. An element is instead
 * replaced by text nodes spelling out its tag (name and attributes), and
 * any children it had are moved up into the parent between the opening
 * and closing tag text.
 *
 * @param content
 *            The DOM Content to be removed.
 */
private void removeContent(Content content) {
    if (!(content instanceof Element)) {
        // Anything that is not an element is just dropped from the document.
        content.getParentElement().removeContent(content);
        return;
    }

    Element element = (Element) content;
    String tagName = element.getName();

    // Spell out "<name attr="value" ..." once; both cases below need it.
    StringBuilder openTag = new StringBuilder("<").append(tagName);
    @SuppressWarnings("unchecked")
    // This cast is correct
    List<Attribute> attributes = element.getAttributes();
    for (Attribute attribute : attributes) {
        openTag.append(" ").append(attribute.getName()).append("=\"")
                .append(attribute.getValue()).append("\"");
    }

    Element parent = element.getParentElement();
    int position = parent.indexOf(element);

    if (element.getContent().size() == 0) {
        // Nothing inside the element: swap it for a self closing tag.
        openTag.append("/>");
        parent.setContent(position, new Text(openTag.toString()));
        return;
    }

    // The element has content. Every insert happens at the same position,
    // so they are done in reverse: closing tag, then the children, then
    // the opening tag, which pushes the element itself to the end.
    openTag.append(">");
    parent.addContent(position, new Text("</" + tagName + ">"));
    parent.addContent(position, element.removeContent());
    parent.addContent(position, new Text(openTag.toString()));
    parent.removeContent(element);
}
/**
 * Wrap the given contents in a new paragraph element and attach it to the
 * parent at the supplied index.
 *
 * Trivial paragraphs are not created: if the contents are missing, empty,
 * or consist solely of text that normalizes to the empty string, nothing
 * is added.
 *
 * @param parent
 *            The parent element to attach the wrapped paragraph to.
 * @param index
 *            The index within the parent where the paragraph should be
 *            inserted; a negative index appends it to the end.
 * @param contents
 *            The contents that should be wrapped in a paragraph.
 * @return whether a paragraph was actually added.
 */
private boolean paragraphWrap(Element parent, int index,
        List<Content> contents) {
    if (contents == null || contents.isEmpty()) {
        return false;
    }

    // Look for the first piece that is something other than blank text.
    boolean hasSubstance = false;
    for (Content piece : contents) {
        boolean blankText = (piece instanceof Text)
                && "".equals(((Text) piece).getTextNormalize());
        if (!blankText) {
            hasSubstance = true;
            break;
        }
    }
    if (!hasSubstance) {
        return false;
    }

    Element paragraph = new Element(Para.E_PARA);
    paragraph.addContent(contents);
    if (index < 0) {
        parent.addContent(paragraph);
    } else {
        parent.addContent(index, paragraph);
    }
    return true;
}
/**
 * Ensure that the given element only has the supplied attributes. Also
 * remove any possible namespaces on the attributes.
 *
 * The values of the allowed attributes are looked up by plain name, all
 * attributes are then cleared, and the remembered values are set again as
 * plain attributes.
 *
 * @param element
 *            The element to be checked.
 * @param names
 *            A list of all allowed attribute names, all others will be
 *            removed. When no names are given every attribute is removed.
 */
private void limitAttributes(Element element, String... names) {
    // Remember the values of the attributes that are allowed to stay.
    Map<String, String> retained = new HashMap<String, String>();
    for (String name : names) {
        String value = element.getAttributeValue(name);
        if (value != null) {
            retained.put(name, value);
        }
    }

    // Clear every attribute. The empty list is typed with JDOM's Attribute,
    // not the unrelated SAX Attributes interface.
    element.setAttributes(new ArrayList<Attribute>());

    // Put the allowed attributes back.
    for (Map.Entry<String, String> entry : retained.entrySet()) {
        element.setAttribute(entry.getKey(), entry.getValue());
    }
}
/**
 * Move the old attribute to a new attribute.
 *
 * If the element has no attribute called oldName nothing happens. If it
 * already carries an attribute called newName, that attribute is removed
 * first so the moved value always wins. Otherwise the element would hold
 * two attributes with the same name and a later lookup by name would
 * depend on the order in which they were written (for example
 * &lt;a target="_blank" href="..."&gt; would lose its link).
 *
 * @param element
 *            The element
 * @param oldName
 *            The old attribute's name.
 * @param newName
 *            The new attribute's name.
 */
private void moveAttribute(Element element, String oldName, String newName) {
    Attribute attribute = element.getAttribute(oldName);
    if (attribute == null || oldName.equals(newName)) {
        return;
    }
    // Drop any clashing attribute before renaming.
    element.removeAttribute(newName);
    attribute.setName(newName);
}
/**
 * Translate the given HTML fragment into a DRI document.
 *
 * The translation is broken up into two steps. 1) Walk over all content of
 * the parent: text is left alone, elements are either translated into
 * their DRI equivalents (recursively) or replaced by their textual form,
 * and everything else is removed from the document.
 *
 * 2) For the root element only, iterate over all top level content and
 * ensure that it consists only of paragraphs and lists, wrapping anything
 * else into paragraphs. At this stage, if blankLines is true, blank lines
 * inside top level text are treated as paragraph breaks.
 *
 * @param parent
 *            The Element to translate into DRI.
 */
private void translate(Element parent) {
    // Step 1:
    // Recurse through all elements and either
    // translate them or remove them.
    for (int i = 0; i < parent.getContentSize(); i++) {
        Content decedent = parent.getContent(i);
        if (decedent instanceof Text) {
            // Plain text passes through untouched.
            continue;
        }
        if (decedent instanceof Element) {
            // A rejected element is replaced in place by text plus its own
            // children, so the index stays valid and those children are
            // visited by the following iterations.
            translateElement(parent, (Element) decedent);
        } else {
            // Other content is detached outright, which shifts what
            // follows down by one. Step back so the next node is not
            // skipped (otherwise an element right after a comment would
            // escape translation).
            removeContent(decedent);
            i--;
        }
    }

    // Step 2:
    // Ensure that all top level elements are encapsulated inside
    // a block level element (i.e. a paragraph)
    if (parent.isRootElement()) {
        wrapTopLevelContent(parent);
    }
}

/**
 * Translate a single child element into its DRI equivalent, or replace it
 * with its textual form when it is not supported at this position.
 *
 * @param parent
 *            The element that contains the child.
 * @param element
 *            The child element to translate.
 */
private void translateElement(Element parent, Element element) {
    String name = element.getName();

    // First the HTML elements that have a DRI equivalent.
    if ("p".equals(name)) {
        // Paragraphs are tricky, it may be either an HTML
        // or DRI <p> element. However, while HTML will allow
        // <p> to nest DRI does not, so it is only kept when
        // it sits directly below the root.
        if (parent.isRootElement()) {
            moveAttribute(element, "class", "rend");
            limitAttributes(element, "id", "n", "rend");
            translate(element);
        } else {
            // The paragraph is nested which is not allowed in
            // DRI, so remove it.
            removeContent(element);
        }
    } else if ("h1".equals(name) || "h2".equals(name)
            || "h3".equals(name) || "h4".equals(name)
            || "h5".equals(name)) {
        // The HTML heading tags are translated into the DRI
        // <p rend="heading"> tag.
        if (parent.isRootElement()) {
            limitAttributes(element);
            element.setName("p");
            element.setAttribute("rend", "heading");
            translate(element);
        } else {
            // DRI paragraphs can not be nested.
            removeContent(element);
        }
    } else if ("a".equals(name)) {
        // The HTML <a> tag is translated into the DRI <xref> tag.
        moveAttribute(element, "href", "target");
        limitAttributes(element, "target");
        element.setName("xref");
        translate(element);
    } else if ("ol".equals(name)) {
        // The HTML <ol> tag is translated into the DRI
        // <list type="ordered"> tag.
        moveAttribute(element, "class", "rend");
        limitAttributes(element, "id", "n", "rend");
        element.setName("list");
        element.setAttribute("type", "ordered");
        translate(element);
    } else if ("li".equals(name)) {
        // The HTML <li> tag is translated into the DRI <item> tag.
        moveAttribute(element, "class", "rend");
        limitAttributes(element, "id", "n", "rend");
        element.setName("item");
        translate(element);
    } else if ("b".equals(name)) {
        // The HTML <b> tag is translated to a highlight
        // element with a rend of bold.
        limitAttributes(element);
        element.setName("hi");
        element.setAttribute("rend", "bold");
        translate(element);
    } else if ("i".equals(name)) {
        // The HTML <i> tag is translated to a highlight
        // element with a rend of italic.
        limitAttributes(element);
        element.setName("hi");
        element.setAttribute("rend", "italic");
        translate(element);
    } else if ("u".equals(name)) {
        // The HTML <u> tag is translated to a highlight
        // element with a rend of underline.
        limitAttributes(element);
        element.setName("hi");
        element.setAttribute("rend", "underline");
        translate(element);
    } else if ("img".equals(name)) {
        // The HTML <img> element is translated into a DRI figure.
        moveAttribute(element, "src", "source");
        limitAttributes(element, "source");
        element.setName("figure");
        translate(element);
    }
    // Next all the DRI elements that we allow to pass through.
    else if ("hi".equals(name)) {
        limitAttributes(element, "rend");
        translate(element);
    } else if ("xref".equals(name)) {
        limitAttributes(element, "target");
        translate(element);
    } else if ("figure".equals(name)) {
        limitAttributes(element, "rend", "source", "target");
        translate(element);
    } else {
        // Anything else is not supported: show it as text.
        removeContent(element);
    }
}

/**
 * Ensure that the top level content of the root consists only of
 * paragraphs and lists, by collecting every other piece of content and
 * wrapping each collected run into a paragraph.
 *
 * @param parent
 *            The root element whose direct content is to be wrapped.
 */
private void wrapTopLevelContent(Element parent) {
    // Content taken out of the parent and waiting to be wrapped.
    List<Content> removed = new ArrayList<Content>();

    for (int i = 0; i < parent.getContentSize(); i++) {
        Content current = parent.getContent(i);

        boolean blockLevel = false;
        if (current instanceof Element) {
            String name = ((Element) current).getName();
            blockLevel = "p".equals(name) || "list".equals(name);
        }

        if (blockLevel) {
            // A paragraph or list is being opened, combine anything
            // collected up to this point into a paragraph placed just
            // before it.
            if (paragraphWrap(parent, i, removed)) {
                removed.clear();
                i++; // account for the paragraph added
            }
        } else if (this.blankLines && current instanceof Text) {
            // We break paragraphs based upon blank lines, so split this
            // text node and wrap every part but the last right away.
            String rawText = ((Text) current).getText();
            parent.removeContent(current);
            i--; // account for the text node removed

            // Split on a line break, optional white space, and another
            // line break. Because \s also matches \r this matches Windows
            // style blank lines too, although a trailing \r then stays on
            // the preceding part.
            List<String> parts = new ArrayList<String>(Arrays
                    .asList(rawText.split("\n\\s*\n")));
            if (parts.size() > 0) {
                String lastPart = parts.remove(parts.size() - 1);
                for (String part : parts) {
                    removed.add(new Text(part));
                    if (paragraphWrap(parent, i + 1, removed)) {
                        removed.clear();
                        i++; // account for the paragraph added
                    }
                }
                // The last part stays pending; it may be continued by the
                // inline content that follows.
                removed.add(new Text(lastPart));
            }
        } else {
            removed.add(current);
            parent.removeContent(current);
            i--; // move back to account for the removed content
        }
    }

    // If anything is left, wrap it up in a paragraph also.
    if (removed.size() > 0) {
        paragraphWrap(parent, -1, removed);
        removed.clear();
    }
}
/**
 * A SAX handler that sits between JDOM's SAX outputter and the real
 * content handler.
 *
 * It exists for two reasons: 1) every element coming from the HTML
 * fragment is re-reported in the DRI namespace, using the prefix currently
 * bound to it, and 2) it works around a JDOM bug where SAX events can not
 * be produced for just a document fragment. Since JDOM only works with
 * whole documents, the start and end document events are dropped here.
 *
 * Only element, character, ignorable white space and skipped entity events
 * are forwarded; all other events are discarded. The lexical handler given
 * to the constructor is currently not used.
 *
 * As far as I can tell the first time the bug was identified is in the
 * following email, point #1:
 *
 * http://www.servlets.com/archive/servlet/ReadMsg?msgId=491592&listName=jdom-interest
 *
 * I, Scott Phillips, checked the JDOM CVS source tree on 3-8-2006 and the
 * bug had not been patched at that time.
 */
public class SAXFilter implements ContentHandler, LexicalHandler {

    /** The namespace URI every forwarded element is reported in. */
    private final String driUri = WingConstants.DRI.URI;

    /** The handler that receives the filtered events. */
    private final ContentHandler contentHandler;

    /** Used to look up the prefix bound to the DRI namespace. */
    private final NamespaceSupport namespaces;

    /**
     * Create a filter in front of the given content handler.
     *
     * @param contentHandler
     *            The handler that filtered events are forwarded to.
     * @param lexicalHandler
     *            Accepted but not used at present.
     * @param namespaces
     *            The namespace support used to determine the DRI prefix.
     */
    public SAXFilter(ContentHandler contentHandler,
            LexicalHandler lexicalHandler, NamespaceSupport namespaces) {
        this.contentHandler = contentHandler;
        this.namespaces = namespaces;
    }

    /**
     * Create the qName for the element with the given localName, using
     * the prefix bound to the DRI namespace.
     *
     * @param localName
     *            (Required) The element's local name.
     * @return the local name alone when no (or an empty) prefix is bound,
     *         otherwise "prefix:localName".
     */
    private String qName(String localName) {
        String prefix = namespaces.getPrefix(driUri);
        boolean unprefixed = (prefix == null) || prefix.equals("");
        return unprefixed ? localName : prefix + ":" + localName;
    }

    // ----- ContentHandler: events that are forwarded -----

    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        // The incoming uri and qName are replaced by the DRI ones.
        contentHandler.startElement(driUri, localName, qName(localName), atts);
    }

    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        contentHandler.endElement(driUri, localName, qName(localName));
    }

    public void characters(char[] ch, int start, int length)
            throws SAXException {
        contentHandler.characters(ch, start, length);
    }

    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        contentHandler.ignorableWhitespace(ch, start, length);
    }

    public void skippedEntity(String name) throws SAXException {
        contentHandler.skippedEntity(name);
    }

    // ----- ContentHandler: events that are dropped -----

    public void startDocument() {
        // filter out startDocument events
    }

    public void endDocument() {
        // filter out endDocument events
    }

    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        // No namespaces can be declared.
    }

    public void endPrefixMapping(String prefix) throws SAXException {
        // No namespaces may be declared.
    }

    public void processingInstruction(String target, String data)
            throws SAXException {
        // filter out processing instructions
    }

    public void setDocumentLocator(Locator locator) {
        // filter out document locators
    }

    // ----- LexicalHandler: every event is dropped -----

    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
        // filter out DTDs
    }

    public void endDTD() throws SAXException {
        // filter out DTDs
    }

    public void startEntity(String name) throws SAXException {
        // filter out entities
    }

    public void endEntity(String name) throws SAXException {
        // filter out entities
    }

    public void startCDATA() throws SAXException {
        // filter out CDATA markers
    }

    public void endCDATA() throws SAXException {
        // filter out CDATA markers
    }

    public void comment(char[] ch, int start, int length)
            throws SAXException {
        // filter out comments
    }
}
}