package org.meaningfulweb.util;

import java.io.IOException;
import java.io.StringWriter;
import java.util.List;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.jdom.Comment;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Text;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

/**
 * Static helpers for serializing JDOM documents and elements to XML, HTML and
 * plain-text strings, and for cleaning up character data.
 */
public class XMLUtils {

  /**
   * Recursively collects the text of {@code node} into {@code builder}.
   * Elements are descended into, text nodes contribute their
   * whitespace-normalized, XML-unescaped value followed by a single space,
   * and comments (as well as every other node type) contribute nothing.
   *
   * @param node The node to read text from.
   * @param builder The buffer the text is appended to.
   */
  private static void getText(Content node, StringBuilder builder) {

    if (node instanceof Comment) {
      // comments never contribute to the extracted text
      return;
    }

    if (node instanceof Element) {
      List<Content> children = ((Element)node).getContent();
      if (children != null) {
        for (Content child : children) {
          getText(child, builder);
        }
      }
      return;
    }

    if (node instanceof Text) {
      String textVal = StringUtils.trim(((Text)node).getTextNormalize());
      if (StringUtils.isNotBlank(textVal)) {
        // the trailing space keeps adjacent text nodes from running together
        builder.append(StringEscapeUtils.unescapeXml(textVal)).append(' ');
      }
    }
  }

  /**
   * Replaces every character above code point 255 with a decimal numeric
   * character reference ({@code &#NNN;}). Characters up to and including 255
   * are copied unchanged, so despite the method name the result may still
   * contain characters in the 128-255 range.
   *
   * Characters outside the Basic Multilingual Plane are encoded as a single
   * reference to the full code point rather than as two references to the
   * individual surrogates, which would not be valid character references.
   *
   * @param notAscii The string to change. Must not be null.
   *
   * @return The converted string.
   *
   * @throws NullPointerException if {@code notAscii} is null.
   */
  public static String toAscii(String notAscii) {

    StringBuilder builder = new StringBuilder(notAscii.length());
    int length = notAscii.length();
    for (int i = 0; i < length;) {
      int codePoint = notAscii.codePointAt(i);
      if (codePoint > 255) {
        builder.append("&#").append(codePoint).append(';');
      }
      else {
        builder.append((char)codePoint);
      }
      // advance by one or two chars depending on whether a surrogate pair
      // was consumed
      i += Character.charCount(codePoint);
    }
    return builder.toString();
  }

  /**
   * This method ensures that the output String has only valid XML unicode
   * characters as specified by the XML 1.0 standard. For reference, please see
   * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the
   * standard</a>. This method will return an empty String if the input is null
   * or empty.
   *
   * The input is examined one code point at a time, so valid characters in
   * the range 0x10000-0x10FFFF (stored as surrogate pairs) are kept, while
   * unpaired surrogates are removed.
   *
   * @param in The String whose non-valid characters we want to remove.
   * @return The in String, stripped of non-valid characters.
   */
  public static String stripNonValidXMLCharacters(String in) {

    if (in == null || in.length() == 0) {
      return "";
    }

    StringBuilder out = new StringBuilder(in.length());
    int length = in.length();
    for (int i = 0; i < length;) {
      int codePoint = in.codePointAt(i);
      if (isValidXmlCodePoint(codePoint)) {
        out.appendCodePoint(codePoint);
      }
      i += Character.charCount(codePoint);
    }
    return out.toString();
  }

  /**
   * Tells whether a code point matches the XML 1.0 {@code Char} production.
   */
  private static boolean isValidXmlCodePoint(int codePoint) {
    return codePoint == 0x9
      || codePoint == 0xA
      || codePoint == 0xD
      || (codePoint >= 0x20 && codePoint <= 0xD7FF)
      || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
      || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
  }

  /**
   * Builds the output format shared by all serializers in this class: pretty
   * printed, empty elements expanded, no XML declaration, given encoding.
   */
  private static Format newOutputFormat(String encoding) {
    Format format = Format.getPrettyFormat();
    format.setExpandEmptyElements(true);
    format.setOmitDeclaration(true);
    format.setEncoding(encoding);
    return format;
  }

  /**
   * Converts XML entity references and then HTML entity references in the
   * serialized markup back into literal characters.
   *
   * NOTE(review): because references such as {@code &lt;} and {@code &amp;}
   * are unescaped too, the result is not guaranteed to be well-formed markup.
   */
  private static String unescapeEntities(String markup) {
    String output = StringEscapeUtils.unescapeXml(markup);
    return StringEscapeUtils.unescapeHtml(output);
  }

  /**
   * Serializes a document to a string using UTF-8 as the declared encoding.
   *
   * @param doc The document to serialize.
   * @return The serialized document, or null if writing failed.
   */
  public static String toXml(Document doc) {
    return toXml(doc, "UTF-8");
  }

  /**
   * Serializes a document to a pretty-printed string without an XML
   * declaration, then unescapes XML and HTML entity references in the result.
   *
   * @param doc The document to serialize.
   * @param encoding The encoding name set on the output format.
   * @return The serialized document, or null if writing failed.
   */
  public static String toXml(Document doc, String encoding) {

    String htmlstr = null;
    try {
      // write out the xml to a string
      StringWriter writer = new StringWriter();
      XMLOutputter out = new XMLOutputter(newOutputFormat(encoding));
      out.output(doc, writer);

      // xml serialization escapes characters that are legal in html, so the
      // entity references are turned back into literal characters here
      htmlstr = unescapeEntities(writer.toString());
      writer.close();
    }
    catch (IOException ignored) {
      // Not expected when writing to an in-memory StringWriter. Callers have
      // always received null on failure, so that contract is preserved.
    }
    return htmlstr;
  }

  /**
   * Converts a document to HTML using UTF-8 as the declared encoding.
   *
   * @param doc The document to serialize.
   * @return The HTML string, or null if writing failed.
   */
  public static String toHtml(Document doc) {
    return toHtml(doc, "UTF-8");
  }

  /**
   * Converts an XML Document object to HTML. The document is pretty printed
   * without an XML declaration through the HTMLOutputter (which, per the
   * original comments, is what adds the HTML doctype), and XML and HTML
   * entity references in the result are then unescaped.
   *
   * @param doc The document to serialize.
   * @param encoding The encoding name set on the output format.
   * @return The HTML string, or null if writing failed.
   */
  public static String toHtml(Document doc, String encoding) {

    String htmlstr = null;
    try {
      // write out the xml to a string, without the xml declaration and use the
      // HTML outputter to add in an html doctype
      StringWriter writer = new StringWriter();
      HTMLOutputter out = new HTMLOutputter(newOutputFormat(encoding));
      out.output(doc, writer);

      // xml serialization escapes characters that are legal in html, so the
      // entity references are turned back into literal characters here
      htmlstr = unescapeEntities(writer.toString());
      writer.close();
    }
    catch (IOException ignored) {
      // Not expected when writing to an in-memory StringWriter. Callers have
      // always received null on failure, so that contract is preserved.
    }
    return htmlstr;
  }

  /**
   * Serializes a single element using UTF-8 as the declared encoding.
   *
   * @param elem The element to serialize.
   * @return The serialized element, or null if writing failed.
   */
  public static String toHtml(Element elem) {
    return toHtml(elem, "UTF-8");
  }

  /**
   * Serializes a single element (and its content) to a pretty-printed string
   * with the plain XML outputter, then unescapes XML and HTML entity
   * references in the result. No doctype is involved for element output.
   *
   * @param elem The element to serialize.
   * @param encoding The encoding name set on the output format.
   * @return The serialized element, or null if writing failed.
   */
  public static String toHtml(Element elem, String encoding) {

    String htmlstr = null;
    try {
      // write out the xml to a string
      StringWriter writer = new StringWriter();
      XMLOutputter out = new XMLOutputter(newOutputFormat(encoding));
      out.output(elem, writer);

      // xml serialization escapes characters that are legal in html, so the
      // entity references are turned back into literal characters here
      htmlstr = unescapeEntities(writer.toString());
      writer.close();
    }
    catch (IOException ignored) {
      // Not expected when writing to an in-memory StringWriter. Callers have
      // always received null on failure, so that contract is preserved.
    }
    return htmlstr;
  }

  /**
   * Converts an XML Document object to text by extracting the text of its
   * root element.
   *
   * @param doc The document to read; it must have a root element.
   * @return The extracted text.
   */
  public static String toText(Document doc) {
    Element rootElem = doc.getRootElement();
    return toText(rootElem);
  }

  /**
   * Converts an XML Element to text. Only the text nodes below the element
   * are collected; the collected text is then passed through the
   * HtmlExtractUtils newline, tag and contiguous-whitespace removal helpers,
   * unescaped as XML and as HTML, and trimmed.
   *
   * @param elem The element whose text content is wanted.
   * @return The extracted text.
   */
  public static String toText(Element elem) {

    // get only the text nodes from the dom
    StringBuilder builder = new StringBuilder();
    List<Content> contents = elem.getContent();
    for (Content child : contents) {
      getText(child, builder);
    }

    String text = builder.toString();
    text = HtmlExtractUtils.removeNewlines(text);
    text = HtmlExtractUtils.removeTags(text);
    text = HtmlExtractUtils.removeContiguousWhitespace(text);
    text = StringEscapeUtils.unescapeXml(text);
    text = StringUtils.trim(StringEscapeUtils.unescapeHtml(text));

    return text;
  }
}