/* * ALMA - Atacama Large Millimiter Array * (c) European Southern Observatory, 2002 * Copyright by ESO (in the framework of the ALMA collaboration), * All rights reserved * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package alma.acs.util; /** * Normalizes XML data by escaping special characters such as '&' or ' <'. * * @author hsommer created Jul 25, 2003 10:14:34 AM */ public class XmlNormalizer { /** * Normalizes a string to not conflict with XML markup characters. * Only allocates memory if the string <code>s</code> does contain such markup. * Otherwise <code>s</code> is returned. */ private static String normalize(String s, boolean normalizeXMLEmbeddedTextOnly) { if (s == null) return s; StringBuffer str = null; int begin = 0; boolean currentlyInsideText = false; boolean currentlyInsideCharacterContent = false; for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); String trans = null; if (normalizeXMLEmbeddedTextOnly && !currentlyInsideText) { if (ch == '<') { currentlyInsideCharacterContent = false; continue; } else if (ch == '>') { currentlyInsideCharacterContent = true; continue; } } if (ch == '\"') { currentlyInsideText = !currentlyInsideText; // don't translate the quote char if we expect entire XML including valid markup if (!normalizeXMLEmbeddedTextOnly) { trans = translate(ch); } } else if (!normalizeXMLEmbeddedTextOnly || currentlyInsideText || currentlyInsideCharacterContent) { trans = translate(ch); } if (trans != null) { // we found a special char if (str == null) { // memory allocation only if character escapes are necessary str = new StringBuffer(s.length() + 16); } // copy all chars from the last special char to the current special char at once -- // hopefully with slightly better performance than 1 char at a time str.append(s.substring(begin, i)).append(trans); begin = i + 1; } } if (str != null) { str.append(s.substring(begin)); return str.toString(); } // nothing had to be translated return s; } /** * Normalizes a string to not conflict with XML markup characters. * Only allocates memory if the string <code>s</code> does contain such markup. * Otherwise <code>s</code> is returned. */ public static String normalize(String s) { return normalize(s, false); } /** * Normalizes any text in between quotes ("text") or text given as character data of mixed-content elements, * all inside <code>xmlString</code>. Other text is left untouched. * This method can be used to fix illegal text in an unparsable XML document, * without destroying the XML markup itself. * <p> * Note that some rather simple parsing is used, and that there may be cases in which the result * is undesirable. Thus this method should only be used as a fallback solution when the XML string can not be parsed with a * regular XML parser. * * @param xmlString * @return */ public static String normalizeXMLEmbeddedTextOnly(String xmlString) { return normalize(xmlString, true); } /** * Translates the given character <code>ch</code> to its masked XML representation * if this character is one of { <, >, &, ", ' }. * Otherwise returns <code>null</code>. * @param ch * @return The corresponding XML representation such as "&lt;" for a '<'. */ public static String translate(char ch) { switch (ch) { case '<': return "<"; case '>': return ">"; case '&': return "&"; case '"': return """; case '\'': return "'"; default: return null; } } }