/* * Copyright (c) 2008, SQL Power Group Inc. * * This file is part of SQL Power Library. * * SQL Power Library is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * SQL Power Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package ca.sqlpower.xml; import java.io.PrintWriter; /** * XMLHelper is a simple utility for outputting indented XML markup. It escapes * all illegal XML characters using a custom (application-level) escaping * mechanism. The rationale behind this is to allow us to save such characters * (like from binary data in a project) into a project file and read them back * in again when loading a project. * <p> * Note that <i>all</i> data that passes through the methods in this class will * be escaped. There is no check as to whether or not the illegal characters fall * in a CDATA section, a tag name, an attribute name or value, and so on. We * consider this to be a reasonable simplification of the problem because illegal * XML characters are just that: Illegal. If a tag name or attribute value contained * an illegal character, the resulting file would not be well-formed XML. The fact * that this class escapes everything that goes through it won't break what would * otherwise be well-formed XML. It will only further mangle malformed XML! Of * course, if the illegal characters fall in a CDATA section or attribute value, * the escaping will have saved the day. The results will be well-formed XML. * <p> * NB: When loading back in the data that was written using the XMLHelper, the * escaped characters will have to be 'unescaped' again after going through the * XML parser. See {@link #escape(String)} for a description of the escape * format. * <p> */ public class XMLHelper { public int indent; /** * Creates a new XMLHelper with an initial indentation amount of 0. */ public XMLHelper() { super(); } /** * Prints to the output writer {@link #out} indentation spaces * (according to {@link #indent}) followed by the given text. * @param out */ public void print(PrintWriter out, String text) { for (int i = 0; i < indent; i++) { out.print(" "); } out.print(escape(text)); } /** * Prints <code>text</code> to the output writer {@link #out} (no * indentation). */ public void niprint(PrintWriter out, String text) { out.print(escape(text)); } /** * Prints <code>text</code> followed by newline to the output * writer {@link #out} (no indentation). */ public void niprintln(PrintWriter out, String text) { out.println(escape(text)); } /** * Prints to the output writer {@link #out} indentation spaces * (according to {@link #indent}) followed by the given text * followed by a newline. */ public void println(PrintWriter out, String text) { for (int i = 0; i < indent; i++) { out.print(" "); } out.println(escape(text)); } /** * Takes a String argument and returns a string that escapes characters that * are illegal in an XML document according to the XML specification. The * set of valid XML characters is taken from the <a * href="http://www.w3.org/TR/REC-xml/">XML 1.0 specification</a>, section * 2.2. Additionally, the backslash character will be considered illegal if * it appears immediately before a lowercase u in the input string. * <p> * Illegal characters will be represented in the output in the "escaped * form," the string <tt>\\uNNNN</tt> where NNNN is the four-digit * hexadecimal value of the character. There will always be exactly four * characters following the \\u, and each of those four characters will be a * hex digit. * <p> * This escaping mechanism is not standard XML markup; it's * application-level data. No generic XML processor will unescape it on the * way in, so the job of unescaping lies with any application program that * wants to consume the XML data. The Architect handles this by wrapping a * SAX parser with a layer that detects and unescapes the \\u sequences. * <p> * * @param text * The input string that we want to check for illegal characters * @return Returns a string identical to the input string, except any * character values that fall outside the range of legal XML * characters will appear in the 6-character escaped form described * above. */ static String escape(String text) { if (text.equals("")) return ""; // arbitrary amount of extra space StringBuilder sb = new StringBuilder(text.length()+10); for (int i = 0, n = text.length(); i < n; i++) { char ch = text.charAt(i); char nextch; if (i == n - 1) { nextch = 0; } else { nextch = text.charAt(i + 1); } if (ch == 0x09 || ch == 0x0a || ch == 0x0d || (ch >= 0x20 && ch <= 0xd7ff && ch != '\\') || (ch >= 0xe000 && ch <=0xfffd) || (ch == '\\' && nextch != 'u')) { sb.append(ch); } else { sb.append(String.format("\\u%04x", (int)ch)); } } return sb.toString(); } /** * Unescapes the String text according to the format described above in escape(String text) * * @param text The String to escape. If the String is null, then we return null. * @return The unescaped version of the input string. If the string is null, return null */ static String unescape(String text) { if (text == null) return null; StringBuilder unescapedText = new StringBuilder(text.length()); for (int i = 0, n = text.length(); i < n ; ) { char ch = text.charAt(i); char nextch; if (i == n - 1) { nextch = 0; } else { nextch = text.charAt(i + 1); } if (ch == '\\' && nextch == 'u') { int charVal = Integer.parseInt(text.substring(i+2, i+6), 16); char unescapedChar = (char)charVal; unescapedText.append(unescapedChar); i += 6; } else { unescapedText.append(ch); i++; } } return unescapedText.toString(); } }