/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.xmlui.wing.element;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.dspace.app.xmlui.wing.WingConstants;
import org.dspace.app.xmlui.wing.WingContext;
import org.dspace.app.xmlui.wing.WingException;
import org.jdom.Attribute;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Text;
import org.jdom.input.SAXBuilder;
import org.jdom.output.SAXOutputter;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.NamespaceSupport;
/**
* This class represents data that is translated from simple HTML or plain text.
*
* This class represents a simple HTML fragment. It allows for user-supplied
* HTML to be translated on the fly into DRI.
*
* At the present time it only supports the following tags: h1, h2, h3, h4, h5,
* p, a, b, i, u, ol, li and img. Each are translated into their DRI equivalents, note
* the "h" tags are translated into a paragraph of rend=heading.
*
* If the blankLines flag is set then blank lines inside top-level text are
* treated as paragraph breaks. This allows plain text files to also be
* included and they will be mapped into DRI as well.
*
* @author Scott Phillips
* @author Jay Paz
*/
public class SimpleHTMLFragment extends AbstractWingElement {
/** The raw HTML (or plain text) fragment exactly as supplied by the caller. */
private final String fragment;

/** When true, blank lines inside top-level text start a new paragraph. */
private final boolean blankLines;

/**
 * Create a fragment element; the actual HTML-to-DRI translation is
 * deferred until {@code toSAX} is called.
 *
 * @param context
 *            (Required) The context this element is contained in, such as
 *            where to route SAX events and what i18n catalogue to use.
 * @param blankLines
 *            (Required) Whether blank lines should be treated as
 *            paragraph delimiters.
 * @param fragment
 *            (Required) The HTML fragment to be translated into DRI.
 * @throws WingException
 *             if the superclass constructor rejects the context.
 */
protected SimpleHTMLFragment(WingContext context, boolean blankLines,
        String fragment) throws WingException {
    super(context);
    this.fragment = fragment;
    this.blankLines = blankLines;
}
/**
 * Translate this element into SAX.
 *
 * The fragment is wrapped in a synthetic root element, parsed as XML,
 * translated into DRI and then streamed child by child to the supplied
 * handler. If the fragment is not well-formed XML it is emitted verbatim
 * as the text of a single DRI paragraph instead.
 *
 * @param contentHandler
 *            (Required) The registered contentHandler where SAX events
 *            should be routed to.
 * @param lexicalHandler
 *            (Required) The registered lexicalHandler where lexical events
 *            (such as CDATA, DTD, etc) should be routed to.
 * @param namespaces
 *            (Required) SAX Helper class to keep track of namespaces able
 *            to determine the correct prefix for a given namespace URI.
 * @throws SAXException
 *             if the fragment cannot be read, or if streaming the
 *             translated content to the handler fails.
 */
public void toSAX(ContentHandler contentHandler,
        LexicalHandler lexicalHandler, NamespaceSupport namespaces)
        throws SAXException {
    // Phase 1: parse and translate. Only failures in this phase mean
    // "the fragment is not XHTML" and qualify for the plain-text fallback.
    Document document;
    try {
        String xml = "<fragment>" + fragment + "</fragment>";
        ByteArrayInputStream inputStream = new ByteArrayInputStream(xml
                .getBytes("UTF-8"));
        SAXBuilder builder = new SAXBuilder();
        document = builder.build(inputStream);
        translate(document.getRootElement());
    } catch (JDOMException e) {
        // A parsing error occurred within the fragment. Assume it is not
        // supposed to be XHTML and display it as plain text inside a
        // single DRI paragraph. Nothing has been sent to the handler yet.
        startElement(contentHandler, namespaces, Para.E_PARA, null);
        sendCharacters(contentHandler, fragment);
        endElement(contentHandler, namespaces, Para.E_PARA);
        return;
    } catch (IOException ioe) {
        throw new SAXException(ioe);
    }

    // Phase 2: stream the translated children. The synthetic <fragment>
    // root itself is never emitted.
    SAXFilter filter = new SAXFilter(contentHandler, lexicalHandler,
            namespaces);
    SAXOutputter outputter = new SAXOutputter();
    outputter.setContentHandler(filter);
    outputter.setLexicalHandler(filter);

    @SuppressWarnings("unchecked")
    // JDOM 1.x returns a raw List whose members are Elements.
    List<Element> children = document.getRootElement().getChildren();
    try {
        for (Element child : children) {
            outputter.output(child);
        }
    } catch (JDOMException e) {
        // output() is declared to throw JDOMException. At this point part
        // of the content may already have been sent, so falling back to
        // the plain-text paragraph would emit the fragment a second time
        // into a broken stream; report the failure to the caller instead.
        throw new SAXException(
                "Error sending translated HTML fragment as SAX events", e);
    }
}
/**
 * Take the given content out of the document structure.
 *
 * An element is not silently dropped: its tag is rendered as literal text
 * so that it stays visible in the output. An element without content
 * becomes a single text node in shorthand notation, while an element with
 * content is replaced by an opening-tag text node, its former children
 * (now owned by the parent) and a closing-tag text node. Anything that is
 * not an element is simply detached from its parent.
 *
 * @param content
 *            The DOM Content to be removed.
 */
private void removeContent(Content content) {
    if (!(content instanceof Element)) {
        // Not an element: just remove the content from the document.
        content.getParentElement().removeContent(content);
        return;
    }

    Element element = (Element) content;

    // Render the start of the tag, <name attr="value" ..., which is
    // identical for the shorthand and the open/close forms.
    StringBuilder openTag = new StringBuilder("<").append(element.getName());
    @SuppressWarnings("unchecked")
    // JDOM 1.x returns a raw List whose members are Attributes.
    List<Attribute> attributes = element.getAttributes();
    for (Attribute attr : attributes) {
        openTag.append(" ").append(attr.getName())
               .append("=\"").append(attr.getValue()).append("\"");
    }

    Element owner = element.getParentElement();
    int position = owner.indexOf(element);

    if (element.getContent().size() == 0) {
        // Nothing inside: swap the element for a shorthand tag as text.
        openTag.append("/>");
        owner.setContent(position, new Text(openTag.toString()));
    } else {
        openTag.append(">");
        String closeTag = "</" + element.getName() + ">";
        // Every insertion happens at the same index, so the pieces are
        // added in reverse: closing tag, then children, then opening tag.
        owner.addContent(position, new Text(closeTag));
        owner.addContent(position, element.removeContent());
        owner.addContent(position, new Text(openTag.toString()));
        owner.removeContent(element);
    }
}
/**
 * Bundle the given contents into a new paragraph element and attach it to
 * the parent.
 *
 * Trivial paragraphs are never created: if the contents are missing,
 * empty, or consist solely of text nodes that normalize to the empty
 * string, nothing is added.
 *
 * @param parent
 *            The parent element that receives the new paragraph.
 * @param index
 *            The position within the parent where the paragraph is
 *            inserted; a negative value appends it at the end.
 * @param contents
 *            The contents that should be wrapped in a paragraph.
 * @return whether a paragraph was actually added.
 */
private boolean paragraphWrap(Element parent, int index,
        List<Content> contents) {
    if (contents == null || contents.isEmpty()) {
        return false;
    }

    // Look for the first node that is more than whitespace-only text.
    boolean hasSubstance = false;
    for (Content node : contents) {
        boolean blankText = (node instanceof Text)
                && "".equals(((Text) node).getTextNormalize());
        if (!blankText) {
            hasSubstance = true;
            break;
        }
    }
    if (!hasSubstance) {
        return false;
    }

    Element para = new Element(Para.E_PARA);
    para.addContent(contents);
    if (index < 0) {
        parent.addContent(para);
    } else {
        parent.addContent(index, para);
    }
    return true;
}
/**
 * Ensure that the given element only has the supplied attributes.
 *
 * Only attributes without a namespace are looked up, so any namespaced
 * attribute is dropped together with every attribute whose name is not
 * listed. The surviving attributes are re-added in the order in which
 * their names are given, which keeps the output deterministic.
 *
 * @param element
 *            The element to be checked.
 * @param names
 *            A list of all allowed attribute names, all others will be
 *            removed. Passing no names removes every attribute.
 */
private void limitAttributes(Element element, String... names) {
    // Capture the values first; clearing the attribute list below would
    // otherwise lose them.
    String[] values = new String[names.length];
    for (int i = 0; i < names.length; i++) {
        values[i] = element.getAttributeValue(names[i]);
    }

    // Replace the attribute list with an empty one (of JDOM Attributes).
    element.setAttributes(new ArrayList<Attribute>());

    for (int i = 0; i < names.length; i++) {
        if (values[i] != null) {
            element.setAttribute(names[i], values[i]);
        }
    }
}
/**
 * Move the old attribute to a new attribute.
 *
 * If the element already carries an attribute with the new name, that
 * attribute is discarded so that the moved value wins. Without this the
 * element would end up with two attributes of the same name and a later
 * lookup could return the wrong one; for example
 * {@code <a target="_blank" href="...">} would lose its link address when
 * {@code href} is moved to {@code target}.
 *
 * Nothing happens when the element has no attribute called
 * {@code oldName}.
 *
 * @param element
 *            The element
 * @param oldName
 *            The old attribute's name.
 * @param newName
 *            The new attribute's name.
 */
private void moveAttribute(Element element, String oldName, String newName) {
    Attribute attribute = element.getAttribute(oldName);
    if (attribute == null || oldName.equals(newName)) {
        return;
    }
    // Make room: an existing attribute under the target name would
    // otherwise be duplicated by the rename.
    element.removeAttribute(newName);
    attribute.setName(newName);
}
/**
 * Translate the given HTML fragment into a DRI document.
 *
 * The translation is broken up into two steps. Step 1 recurses through
 * all content and either translates each element into its DRI equivalent
 * or removes it from the document (disallowed elements are rendered as
 * literal text, other non-text content is dropped).
 *
 * Step 2 only applies to the root element: it iterates over all top level
 * content and ensures that everything which is not already a paragraph or
 * a list ends up inside a paragraph. Also at this stage, if blankLines is
 * true, blank lines inside top level text are treated as paragraph
 * breaks.
 *
 * @param parent
 *            The Element to translate into DRI.
 */
private void translate(Element parent) {
    // Step 1:
    // Recurse through all elements and either
    // translate them or remove them.
    for (int i = 0; i < parent.getContentSize(); i++) {
        Content decedent = parent.getContent(i);

        if (decedent instanceof Text) {
            // Plain text is passed through untouched.
            continue;
        }

        if (!(decedent instanceof Element)) {
            // Comments, processing instructions and the like are dropped.
            // The removal shifts the following content down into slot i,
            // so step back to make sure that content is inspected too
            // instead of slipping through unfiltered.
            removeContent(decedent);
            i--;
            continue;
        }

        Element element = (Element) decedent;
        String name = element.getName();

        // First the HTML elements that have a DRI equivalent.
        if ("p".equals(name)) {
            // Paragraphs are tricky, it may be either an HTML
            // or DRI <p> element. However, while HTML will allow
            // <p> to nest DRI does not, thus only a paragraph at
            // the block level is kept.
            if (parent.isRootElement()) {
                // The paragraph is not nested, so translate it to
                // a DRI <p>
                moveAttribute(element, "class", "rend");
                limitAttributes(element, "id", "n", "rend");
                translate(element);
            } else {
                // The paragraph is nested which is not allowed in
                // DRI, so remove it.
                removeContent(element);
            }
        } else if ("h1".equals(name) || "h2".equals(name)
                || "h3".equals(name) || "h4".equals(name)
                || "h5".equals(name)) {
            // The HTML heading tags are translated into the DRI
            // <p rend="heading"> tag.
            if (parent.isRootElement()) {
                limitAttributes(element);
                element.setName("p");
                element.setAttribute("rend", "heading");
                translate(element);
            } else {
                // DRI paragraphs can not be nested.
                removeContent(element);
            }
        } else if ("a".equals(name)) {
            // The HTML <a> tag is translated into the DRI
            // <xref> tag.
            moveAttribute(element, "href", "target");
            limitAttributes(element, "target");
            element.setName("xref");
            translate(element);
        } else if ("ol".equals(name)) {
            // The HTML <ol> tag is translated into the DRI
            // <list type="ordered"> tag.
            moveAttribute(element, "class", "rend");
            limitAttributes(element, "id", "n", "rend");
            element.setName("list");
            element.setAttribute("type", "ordered");
            translate(element);
        } else if ("li".equals(name)) {
            // The HTML <li> tag is translated into the DRI
            // <item> tag.
            moveAttribute(element, "class", "rend");
            limitAttributes(element, "id", "n", "rend");
            element.setName("item");
            translate(element);
        } else if ("b".equals(name)) {
            // The HTML <b> tag is translated to a highlight
            // element with a rend of bold.
            limitAttributes(element);
            element.setName("hi");
            element.setAttribute("rend", "bold");
            translate(element);
        } else if ("i".equals(name)) {
            // The HTML <i> tag is translated to a highlight
            // element with a rend of italic.
            limitAttributes(element);
            element.setName("hi");
            element.setAttribute("rend", "italic");
            translate(element);
        } else if ("u".equals(name)) {
            // The HTML <u> tag is translated to a highlight
            // element with a rend of underline.
            limitAttributes(element);
            element.setName("hi");
            element.setAttribute("rend", "underline");
            translate(element);
        } else if ("img".equals(name)) {
            // The HTML <img> element is translated into a DRI figure
            moveAttribute(element, "src", "source");
            limitAttributes(element, "source");
            element.setName("figure");
            translate(element);
        }
        // Next all the DRI elements that we allow to pass through.
        else if ("hi".equals(name)) {
            limitAttributes(element, "rend");
            translate(element);
        } else if ("xref".equals(name)) {
            limitAttributes(element, "target");
            translate(element);
        } else if ("figure".equals(name)) {
            limitAttributes(element, "rend", "source", "target");
            translate(element);
        } else {
            // Anything else is not allowed. removeContent() puts a text
            // node into slot i and splices any former children in right
            // after it, so the normal i++ visits those children next.
            removeContent(element);
        }
    }

    // Step 2:
    // Ensure that all top level elements are encapsulated inside
    // a block level element (i.e. a paragraph)
    if (parent.isRootElement()) {
        // Inline content detached from the root and waiting to be
        // wrapped into the next paragraph.
        List<Content> removed = new ArrayList<Content>();

        for (int i = 0; i < parent.getContentSize(); i++) {
            Content current = parent.getContent(i);

            boolean blockLevel = false;
            if (current instanceof Element) {
                String name = ((Element) current).getName();
                blockLevel = "p".equals(name) || "list".equals(name);
            }

            if (blockLevel) {
                // A paragraph or list is being opened, combine anything
                // collected up to this point into a paragraph placed
                // directly in front of it.
                if (paragraphWrap(parent, i, removed)) {
                    removed.clear();
                    i++; // account for the paragraph added
                }
            } else if (this.blankLines && current instanceof Text) {
                // Paragraphs are broken on blank lines, so check whether
                // there are any in this text node.
                String rawText = ((Text) current).getText();
                parent.removeContent(current);
                i--; // account for the text node removed.

                // Split on a line break, optional whitespace and another
                // line break.
                String[] parts = rawText.split("\n\\s*\n");
                if (parts.length > 0) {
                    // Every part except the last is closed off by a blank
                    // line and therefore completes a paragraph.
                    for (int partIdx = 0; partIdx < parts.length - 1; partIdx++) {
                        removed.add(new Text(parts[partIdx]));
                        if (paragraphWrap(parent, i + 1, removed)) {
                            removed.clear();
                            i++; // account for the paragraph added
                        }
                    }
                    // The last part stays open so that following inline
                    // content can join it.
                    removed.add(new Text(parts[parts.length - 1]));
                }
            } else {
                removed.add(current);
                parent.removeContent(current);
                i--; // move back to account for the removed content.
            }
        }

        // if anything is left, wrap it up in a para also.
        if (removed.size() > 0) {
            paragraphWrap(parent, -1, removed);
            removed.clear();
        }
    }
}
/**
 * This is a simple SAX Handler that filters out start and end documents.
 * This class is needed for two reasons, 1) namespaces need to be corrected
 * from the originating HTML fragment, 2) to get around a JDOM bug where it
 * can not output SAX events for just a document fragment. Since it only
 * works with documents this class was created to filter out the events.
 *
 * Element, character, ignorable whitespace and skipped entity events are
 * forwarded to the wrapped content handler; every other event is
 * swallowed. Forwarded elements are always reported in the DRI namespace.
 *
 * As far as I can tell, the first time the bug was identified is in the
 * following email, point #1:
 *
 * http://www.servlets.com/archive/servlet/ReadMsg?msgId=491592&listName=jdom-interest
 *
 * I, Scott Phillips, checked the JDOM CVS source tree on 3-8-2006 and the
 * bug had not been patched at that time.
 *
 */
public static class SAXFilter implements ContentHandler, LexicalHandler {

    /** Namespace URI stamped onto every element that is forwarded. */
    private final String driNamespace = WingConstants.DRI.URI;

    /** Receiver of the events that survive filtering. */
    private final ContentHandler contentHandler;

    /** Used to look up the prefix currently bound to the DRI namespace. */
    private final NamespaceSupport namespaces;

    /**
     * Create a filter in front of the given content handler.
     *
     * @param contentHandler
     *            The handler that forwarded events are sent to.
     * @param lexicalHandler
     *            Accepted but currently not used; it may be used in the
     *            future.
     * @param namespaces
     *            Helper used to determine the DRI namespace prefix.
     */
    public SAXFilter(ContentHandler contentHandler,
            LexicalHandler lexicalHandler, NamespaceSupport namespaces) {
        this.contentHandler = contentHandler;
        this.namespaces = namespaces;
    }

    /**
     * Create the qName for the element with the given localName, using
     * the prefix that is bound to the DRI namespace.
     *
     * @param localName
     *            (Required) The element's local name.
     * @return the bare local name when no (or an empty) prefix is bound,
     *         otherwise "prefix:localName".
     */
    private String qName(String localName) {
        String prefix = namespaces.getPrefix(driNamespace);
        boolean unprefixed = (prefix == null) || prefix.equals("");
        return unprefixed ? localName : prefix + ":" + localName;
    }

    /* ---------------- ContentHandler methods ---------------- */

    public void endDocument() {
        // Swallowed: only a fragment is being emitted.
    }

    public void startDocument() {
        // Swallowed: only a fragment is being emitted.
    }

    public void characters(char[] ch, int start, int length)
            throws SAXException {
        contentHandler.characters(ch, start, length);
    }

    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        // The incoming uri and qName are replaced by their DRI versions.
        contentHandler.endElement(driNamespace, localName, qName(localName));
    }

    public void endPrefixMapping(String prefix) throws SAXException {
        // Swallowed: no namespaces may be declared.
    }

    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        contentHandler.ignorableWhitespace(ch, start, length);
    }

    public void processingInstruction(String target, String data)
            throws SAXException {
        // Swallowed: processing instructions are filtered out.
    }

    public void setDocumentLocator(Locator locator) {
        // Swallowed: document locators are filtered out.
    }

    public void skippedEntity(String name) throws SAXException {
        contentHandler.skippedEntity(name);
    }

    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        // The incoming uri and qName are replaced by their DRI versions;
        // the attributes are handed on as they are.
        contentHandler.startElement(driNamespace, localName,
                qName(localName), atts);
    }

    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        // Swallowed: no namespaces can be declared.
    }

    /* ---------------- LexicalHandler methods ---------------- */

    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
        // Swallowed: DTDs are filtered out.
    }

    public void endDTD() throws SAXException {
        // Swallowed: DTDs are filtered out.
    }

    public void startEntity(String name) throws SAXException {
        // Swallowed: entities are filtered out.
    }

    public void endEntity(String name) throws SAXException {
        // Swallowed: entities are filtered out.
    }

    public void startCDATA() throws SAXException {
        // Swallowed: CDATA markers are filtered out.
    }

    public void endCDATA() throws SAXException {
        // Swallowed: CDATA markers are filtered out.
    }

    public void comment(char[] ch, int start, int length)
            throws SAXException {
        // Swallowed: comments are filtered out.
    }
}
}