/** * Copyright 2004-2016 Riccardo Solmi. All rights reserved. * This file is part of the Whole Platform. * * The Whole Platform is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The Whole Platform is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the Whole Platform. If not, see <http://www.gnu.org/licenses/>. */ package org.whole.lang.xml.util; import java.io.InputStream; import java.io.Reader; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.Charset; import java.util.Arrays; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.whole.lang.util.StringUtils; /** * @author Enrico Persiani */ public class XmlUtils { public static final String IGNORABLE_WHITESPACE_CHARS = " \t\n\r"; /* see "Extensible Markup Language (XML) 1.0 (Second Edition)" */ public static final String CHARS = "\t\n\r" + "\u0020-\uD7FF" + // U+20-U+D7FF "\uE000-\uFFFD" + // U+E000-U+FFFD "\uD800\uDC00-\uDBFF\uDFFF"; // U+10000-U+10FFFF /* see "Namespaces in XML 1.0 (Second Edition)" */ public static final String NAME_START_CHARS = "\\:a-z_A-Z" + "\u00C0-\u00D6" + // U+C0-U+D6 "\u00D8-\u00F6" + // U+D8-U+F6 "\u00F8-\u02FF" + // U+F8-U+2FF "\u0370-\u037D" + // U+370-U+37D "\u037F-\u1FFF" + // U+37F-U+1FFF "\u200C-\u200D" + // U+200C-U+200D "\u2070-\u218F" + // U+2070-U+218F "\u2C00-\u2FEF" + // U+2C00-U+2FEF "\u3001-\uD7FF" + // U+3001-U+D7FF "\uF900-\uFDCF" + // U+F900-U+FDCF "\uFDF0-\uFFFD" + // U+FDF0-U+FFFD "\uD800\uDC00-\uDB7F\uDFFF"; // U+10000-U+EFFFF public static final String NAME_CHARS = ".\\-0-9" + "\u00B7" + // U+B7 "\u0300-\u036F" + // U+300-U+36F "\u203F-\u2040" + // U+203F-U+2040 NAME_START_CHARS; public static final String IGNORABLE_WHITESPACE_RE = "["+IGNORABLE_WHITESPACE_CHARS+"]*+"; public static final String STRING_RE = "["+CHARS+"]*+"; public static final String NORMALIZED_STRING_RE = "["+CHARS+"&&[^\t\n\r]]*+"; public static final String TOKEN_RE = "["+CHARS+"&&[^ \t\n\r]]*+( ["+CHARS+"&&[^ \t\n\r]]++)*+"; public static final String LANGUAGE_RE = "[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*+"; public static final String NAME_RE = "["+NAME_START_CHARS+"]["+NAME_CHARS+"]*+"; public static final String NMTOKEN_RE = "["+NAME_CHARS+"]++"; public static final String NCNAME_RE = "["+NAME_START_CHARS+"&&[^:]]["+NAME_CHARS+"&&[^:]]*+"; public static final String QNAME_RE = "("+NCNAME_RE+"):("+NCNAME_RE+")"; public static final Pattern IGNORABLE_WHITESPACE_PATTERN = Pattern.compile(IGNORABLE_WHITESPACE_RE); public static final Pattern STRING_PATTERN = Pattern.compile(STRING_RE); public static final Pattern NORMALIZED_STRING_PATTERN = Pattern.compile(NORMALIZED_STRING_RE); public static final Pattern TOKEN_PATTERN = Pattern.compile(TOKEN_RE); public static final Pattern LANGUAGE_PATTERN = Pattern.compile(LANGUAGE_RE); public static final Pattern NAME_PATTERN = Pattern.compile(NAME_RE); public static final Pattern NMTOKEN_PATTERN = Pattern.compile(NMTOKEN_RE); public static final Pattern NCNAME_PATTERN = Pattern.compile(NCNAME_RE); public static final Pattern QNAME_PATTERN = Pattern.compile(QNAME_RE); public static boolean isIgnorableWhitespace(String input) { return IGNORABLE_WHITESPACE_PATTERN.matcher(input).matches(); } public static boolean isString(String input) { return STRING_PATTERN.matcher(input).matches(); } public static boolean isNormalizedString(String input) { return NORMALIZED_STRING_PATTERN.matcher(input).matches(); } public static boolean isToken(String input) { return TOKEN_PATTERN.matcher(input).matches(); } public static boolean isLanguage(String input) { return LANGUAGE_PATTERN.matcher(input).matches(); } public static boolean isName(String input) { return NAME_PATTERN.matcher(input).matches(); } public static boolean isNMToken(String input) { return NMTOKEN_PATTERN.matcher(input).matches(); } public static boolean isNCName(String input) { return NCNAME_PATTERN.matcher(input).matches(); } public static String toNCName(String input) { input = input.replaceAll("[:[^"+NAME_CHARS+"]]", "_"); return isNCName(input) ? input : "_"+input; } public static boolean isQName(String input) { return QNAME_PATTERN.matcher(input).matches(); } public static String getPrefix(String input) { Matcher matcher = QNAME_PATTERN.matcher(input); return matcher.matches() ? matcher.group(1) : null; } public static String getLocalPart(String input) { Matcher matcher = QNAME_PATTERN.matcher(input); return matcher.matches() ? matcher.group(2) : null; } public static final String DECIMAL_RE = "[-+]?\\d+(\\.\\d*)?"; public static final Pattern DECIMAL_PATTERN = Pattern.compile(DECIMAL_RE); public static boolean isDecimal(String input) { return DECIMAL_PATTERN.matcher(input).matches(); } public static URI parseURI(String uri) { try { int length = uri.length(); StringBuilder sb = new StringBuilder(length*2); for (int i=0; i<length; i++) { char ch = uri.charAt(i); switch (ch) { case '<': case '>': case '{': case '}': case '"': case '|': case '^': case '`': case '\\': break; default: if (Character.isISOControl(ch) || Character.isWhitespace(ch)) break; sb.append(ch); continue; } sb.append('%'); sb.append(StringUtils.byteToHex((byte) (ch & 0xff))); } return new URI(sb.toString()); } catch (URISyntaxException e) { throw new IllegalArgumentException(e); } } public static String encodingFromXmlDeclaration(int magic) { // if there's no byte order mark, try to guess // using XML delcaration's first characters "<?xm" switch (magic) { case 0x0000003C: return "UTF-32BE"; case 0x3C000000: return "UTF-32LE"; case 0x003C003F: return "UTF-16BE"; case 0x3C003F00: return "UTF-16LE"; case 0x3C3F786D: return "UTF-8"; case 0x4C6FA794: return "EBCDIC"; } return null; } private static final Pattern ENCODING_PATTERN = Pattern.compile("encoding\\s*=\\s*[\"']([^\"']+)", Pattern.CASE_INSENSITIVE); public static String guessEncoding(InputStream is, String defaultEncoding) { String encoding = null; try { // both the BOM and the xml declaration heuristic // algorithms need at least the first four bytes byte[] bytes = StringUtils.peekBytes(is, 4, true); if (bytes != null) { int magic = 0; for (int i=0; i<bytes.length; i++) magic |= (bytes[i] & 0xFF) << (8*(3-i)); encoding = StringUtils.encodingFromBOM(magic); // skip bom length bytes is.read(bytes, 0, StringUtils.bomLength(magic)); if (encoding == null) { encoding = XmlUtils.encodingFromXmlDeclaration(magic); if (encoding != null && !encoding.startsWith("UTF-32")) { String xmlDecl = new String(StringUtils.peekBytes(is, 1024, false), encoding); int endIndex = xmlDecl.indexOf("?>"); xmlDecl = endIndex != -1 ? xmlDecl.substring(0, endIndex) : xmlDecl; java.util.regex.Matcher matcher = ENCODING_PATTERN.matcher(xmlDecl); if (matcher.find()) encoding = Charset.forName(matcher.group(1)).name(); } } } } catch (Exception e) { encoding = null; } return encoding != null ? encoding : defaultEncoding; } private static final String XML_DECL_PREFIX = "<?xml"; public static boolean hasXmlDecl(Reader reader, boolean defaultValue) { try { char[] prefixChars = StringUtils.peekChars(reader, XML_DECL_PREFIX.length()); String prefixString = new String(prefixChars); return XML_DECL_PREFIX.equals(prefixString); } catch (Exception e) { return defaultValue; } } public static boolean hasXmlDecl(InputStream is, String encoding, boolean defaultValue) { try { byte[] bytes = XML_DECL_PREFIX.getBytes(encoding); return Arrays.equals(bytes, StringUtils.peekBytes(is, bytes.length, false)); } catch (Exception e) { return defaultValue; } } }