XmlUtils.java example

Explorer
whole-master
/**
 * Copyright 2004-2016 Riccardo Solmi. All rights reserved.
 * This file is part of the Whole Platform.
 *
 * The Whole Platform is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * The Whole Platform is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with the Whole Platform. If not, see <http://www.gnu.org/licenses/>.
 */
package org.whole.lang.xml.util;

import java.io.InputStream;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.whole.lang.util.StringUtils;

/**
 * Static helpers for working with XML text: regular expressions (and the
 * matching predicates) for XML names and a few simple lexical types,
 * escaping of characters that are illegal in a URI, and sniffing of the
 * character encoding and of the XML declaration at the start of a document.
 *
 * @author Enrico Persiani
 */
public class XmlUtils {
	/** Characters treated as ignorable whitespace: space, tab, line feed and carriage return. */
	public static final String IGNORABLE_WHITESPACE_CHARS =
		" \t\n\r";

	/* see "Extensible Markup Language (XML) 1.0 (Second Edition)" */
	/** Content of a regex character class matching the XML {@code Char} production. */
	public static final String CHARS =
		"\t\n\r" +
		"\u0020-\uD7FF" +				// U+20-U+D7FF
		"\uE000-\uFFFD" +				// U+E000-U+FFFD
		"\uD800\uDC00-\uDBFF\uDFFF"; 	// U+10000-U+10FFFF

	/* see "Namespaces in XML 1.0 (Second Edition)" */
	/** Content of a regex character class matching the first character of an XML name (colon included). */
	public static final String NAME_START_CHARS =
		"\\:a-z_A-Z" +
		"\u00C0-\u00D6" +				// U+C0-U+D6
		"\u00D8-\u00F6" +				// U+D8-U+F6
		"\u00F8-\u02FF" +				// U+F8-U+2FF
		"\u0370-\u037D" +				// U+370-U+37D
		"\u037F-\u1FFF" +				// U+37F-U+1FFF
		"\u200C-\u200D" +				// U+200C-U+200D
		"\u2070-\u218F" +				// U+2070-U+218F
		"\u2C00-\u2FEF" +				// U+2C00-U+2FEF
		"\u3001-\uD7FF" +				// U+3001-U+D7FF
		"\uF900-\uFDCF" +				// U+F900-U+FDCF
		"\uFDF0-\uFFFD" +				// U+FDF0-U+FFFD
		"\uD800\uDC00-\uDB7F\uDFFF"; 	// U+10000-U+EFFFF

	/** Content of a regex character class matching any character of an XML name after the first. */
	public static final String NAME_CHARS =
		".\\-0-9" +
		"\u00B7" +						// U+B7
		"\u0300-\u036F" +				// U+300-U+36F
		"\u203F-\u2040" +				// U+203F-U+2040
		NAME_START_CHARS;

	/** Any (possibly empty) run of ignorable whitespace characters. */
	public static final String IGNORABLE_WHITESPACE_RE = "["+IGNORABLE_WHITESPACE_CHARS+"]*+";

	/* a single character of CHARS that is not a space, tab, line feed or carriage return */
	private static final String NON_WHITESPACE_CHAR_RE = "["+CHARS+"&&[^ \t\n\r]]";

	/** Any (possibly empty) run of characters allowed by {@link #CHARS}. */
	public static final String STRING_RE = "["+CHARS+"]*+";
	/** Like {@link #STRING_RE} but without tab, line feed and carriage return. */
	public static final String NORMALIZED_STRING_RE = "["+CHARS+"&&[^\t\n\r]]*+";
	/**
	 * Either the empty string or runs of non-whitespace characters separated
	 * by exactly one space, with no leading or trailing space.
	 * The first run must be non-empty: allowing it to be empty would let a
	 * value starting with a space match.
	 */
	public static final String TOKEN_RE = "(?:"+NON_WHITESPACE_CHAR_RE+"++( "+NON_WHITESPACE_CHAR_RE+"++)*+)?";
	/** One to eight ASCII letters followed by any number of "-" separated subtags of one to eight ASCII letters or digits. */
	public static final String LANGUAGE_RE = "[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*+";

	/** A name start character followed by any number of name characters. */
	public static final String NAME_RE = "["+NAME_START_CHARS+"]["+NAME_CHARS+"]*+";
	/** One or more name characters. */
	public static final String NMTOKEN_RE = "["+NAME_CHARS+"]++";
	/** Like {@link #NAME_RE} but with the colon excluded everywhere. */
	public static final String NCNAME_RE = "["+NAME_START_CHARS+"&&[^:]]["+NAME_CHARS+"&&[^:]]*+";
	/**
	 * Two NCNames separated by a colon; group 1 is the prefix and group 2 the local part.
	 * NOTE(review): a name without prefix does not match this expression,
	 * although the Namespaces specification also allows unprefixed qualified names.
	 */
	public static final String QNAME_RE = "("+NCNAME_RE+"):("+NCNAME_RE+")";

	public static final Pattern IGNORABLE_WHITESPACE_PATTERN = Pattern.compile(IGNORABLE_WHITESPACE_RE);

	public static final Pattern STRING_PATTERN = Pattern.compile(STRING_RE);
	public static final Pattern NORMALIZED_STRING_PATTERN = Pattern.compile(NORMALIZED_STRING_RE);
	public static final Pattern TOKEN_PATTERN = Pattern.compile(TOKEN_RE);
	public static final Pattern LANGUAGE_PATTERN = Pattern.compile(LANGUAGE_RE);

	public static final Pattern NAME_PATTERN = Pattern.compile(NAME_RE);
	public static final Pattern NMTOKEN_PATTERN = Pattern.compile(NMTOKEN_RE);
	public static final Pattern NCNAME_PATTERN = Pattern.compile(NCNAME_RE);
	public static final Pattern QNAME_PATTERN = Pattern.compile(QNAME_RE);


	/**
	 * @param input the text to test
	 * @return true if the whole input (the empty string included) is made of ignorable whitespace
	 */
	public static boolean isIgnorableWhitespace(String input) {
		return IGNORABLE_WHITESPACE_PATTERN.matcher(input).matches();
	}

	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #STRING_RE}
	 */
	public static boolean isString(String input) {
		return STRING_PATTERN.matcher(input).matches();
	}
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #NORMALIZED_STRING_RE}
	 */
	public static boolean isNormalizedString(String input) {
		return NORMALIZED_STRING_PATTERN.matcher(input).matches();
	}
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #TOKEN_RE}
	 */
	public static boolean isToken(String input) {
		return TOKEN_PATTERN.matcher(input).matches();
	}
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #LANGUAGE_RE}
	 */
	public static boolean isLanguage(String input) {
		return LANGUAGE_PATTERN.matcher(input).matches();
	}

	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #NAME_RE}
	 */
	public static boolean isName(String input) {
		return NAME_PATTERN.matcher(input).matches();
	}
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #NMTOKEN_RE}
	 */
	public static boolean isNMToken(String input) {
		return NMTOKEN_PATTERN.matcher(input).matches();
	}
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #NCNAME_RE}
	 */
	public static boolean isNCName(String input) {
		return NCNAME_PATTERN.matcher(input).matches();
	}

	/**
	 * Turns an arbitrary string into an NCName.
	 * Every colon and every character outside {@link #NAME_CHARS} is replaced
	 * by an underscore; if the result is still not an NCName (it is empty or
	 * its first character is not a valid start character) an underscore is
	 * prepended.
	 *
	 * @param input the text to convert
	 * @return the converted text
	 */
	public static String toNCName(String input) {
		input = input.replaceAll("[:[^"+NAME_CHARS+"]]", "_");
		return isNCName(input) ? input : "_"+input;
	}
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #QNAME_RE}, that is a prefixed name
	 */
	public static boolean isQName(String input) {
		return QNAME_PATTERN.matcher(input).matches();
	}
	/**
	 * @param input a qualified name
	 * @return the part before the colon, or null if input does not match {@link #QNAME_RE}
	 */
	public static String getPrefix(String input) {
		Matcher matcher = QNAME_PATTERN.matcher(input);
		return matcher.matches() ?
				matcher.group(1) : null;
	}
	/**
	 * @param input a qualified name
	 * @return the part after the colon, or null if input does not match {@link #QNAME_RE}
	 */
	public static String getLocalPart(String input) {
		Matcher matcher = QNAME_PATTERN.matcher(input);
		return matcher.matches() ?
				matcher.group(2) : null;
	}

	/**
	 * An optional sign followed by either digits with an optional fractional
	 * part ("12", "12.", "12.5") or a fractional part alone (".5").
	 * Group 1, when present, is the fractional part of the first form.
	 */
	public static final String DECIMAL_RE = "[-+]?(?:\\d+(\\.\\d*)?|\\.\\d+)";
	public static final Pattern DECIMAL_PATTERN = Pattern.compile(DECIMAL_RE);
	/**
	 * @param input the text to test
	 * @return true if the whole input matches {@link #DECIMAL_RE}
	 */
	public static boolean isDecimal(String input) {
		return DECIMAL_PATTERN.matcher(input).matches();
	}

	/* charset used to turn an escaped character into percent-encoded octets */
	private static final Charset URI_ESCAPE_CHARSET = Charset.forName("UTF-8");

	/**
	 * Builds a URI from a string, percent-encoding the characters
	 * {@code < > { } " | ^ ` \} as well as every ISO control and whitespace
	 * character. All other characters are passed through unchanged.
	 * Escaped characters are encoded as the octets of their UTF-8 form, so a
	 * character above U+007F yields more than one escape sequence.
	 *
	 * @param uri the text to parse
	 * @return the parsed URI
	 * @throws IllegalArgumentException if the escaped text is still not a valid URI
	 */
	public static URI parseURI(String uri) {
		try {
			int length = uri.length();
			StringBuilder sb = new StringBuilder(length*2);
			for (int i=0; i<length; i++) {
				char ch = uri.charAt(i);
				if (!needsUriEscaping(ch)) {
					sb.append(ch);
					continue;
				}
				// none of the escaped characters is a surrogate, so the
				// single char can be encoded on its own
				for (byte octet : String.valueOf(ch).getBytes(URI_ESCAPE_CHARSET)) {
					sb.append('%');
					sb.append(StringUtils.byteToHex(octet));
				}
			}
			return new URI(sb.toString());
		} catch (URISyntaxException e) {
			throw new IllegalArgumentException(e);
		}
	}

	/* tells whether parseURI must percent-encode the given character */
	private static boolean needsUriEscaping(char ch) {
		switch (ch) {
		case '<': case '>': case '{': case '}':
		case '"': case '|': case '^': case '`': case '\\':
			return true;
		default:
			return Character.isISOControl(ch) || Character.isWhitespace(ch);
		}
	}

	/**
	 * Guesses the encoding family of a document without byte order mark from
	 * its first four bytes, assumed to be the beginning of an XML declaration.
	 *
	 * @param magic the first four bytes of the document packed into an int,
	 *              first byte in the most significant position
	 * @return "UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8" or
	 *         "EBCDIC", or null if the bytes match none of the known patterns
	 */
	public static String encodingFromXmlDeclaration(int magic) {
		// if there's no byte order mark, try to guess
		// using XML declaration's first characters "<?xm"
		switch (magic) {
		case 0x0000003C:
			return "UTF-32BE";
		case 0x3C000000:
			return "UTF-32LE";
		case 0x003C003F:
			return "UTF-16BE";
		case 0x3C003F00:
			return "UTF-16LE";
		case 0x3C3F786D:
			return "UTF-8";
		case 0x4C6FA794:
			return "EBCDIC";
		}
		return null;
	}

	/* captures the value of the encoding pseudo-attribute, quotes excluded */
	private static final Pattern ENCODING_PATTERN = Pattern.compile("encoding\\s*=\\s*[\"']([^\"']+)", Pattern.CASE_INSENSITIVE);
	/**
	 * Guesses the character encoding of an XML document.
	 * The byte order mark is tried first (through StringUtils.encodingFromBOM)
	 * and the number of bytes reported by StringUtils.bomLength is read from
	 * the stream. Without a recognized byte order mark the encoding family is
	 * guessed with {@link #encodingFromXmlDeclaration(int)} and, unless it is
	 * a UTF-32 one, the beginning of the document is decoded with it to look
	 * for the encoding declared in the XML declaration, which then wins.
	 * Any failure while guessing results in the default encoding.
	 *
	 * @param is the stream to inspect; presumably it must support peeking
	 *           as required by StringUtils.peekBytes - TODO confirm
	 * @param defaultEncoding the value returned when no encoding can be determined
	 * @return the guessed encoding name or defaultEncoding
	 */
	public static String guessEncoding(InputStream is, String defaultEncoding) {
		String encoding = null;
		try {
			// both the BOM and the xml declaration heuristic
			// algorithms need at least the first four bytes
			byte[] bytes = StringUtils.peekBytes(is, 4, true);

			if (bytes != null) {
				// pack the bytes, first byte in the most significant position
				int magic = 0;
				for (int i=0; i<bytes.length; i++)
					magic |= (bytes[i] & 0xFF) << (8*(3-i));

				encoding = StringUtils.encodingFromBOM(magic);

				// skip bom length bytes
				is.read(bytes, 0, StringUtils.bomLength(magic));

				if (encoding == null) {
					encoding = XmlUtils.encodingFromXmlDeclaration(magic);

					// NOTE(review): "EBCDIC" is not a charset name guaranteed by the
					// Java platform; where it is unsupported the decoding below throws
					// and defaultEncoding is returned - confirm this is intended
					if (encoding != null && !encoding.startsWith("UTF-32")) {
						String xmlDecl = new String(StringUtils.peekBytes(is, 1024, false), encoding);
						// restrict the search to the declaration when its end is in sight
						int endIndex = xmlDecl.indexOf("?>");
						xmlDecl = endIndex != -1 ? xmlDecl.substring(0, endIndex) : xmlDecl;
						Matcher matcher = ENCODING_PATTERN.matcher(xmlDecl);
						if (matcher.find())
							encoding = Charset.forName(matcher.group(1)).name();
					}
				}
			}
		} catch (Exception e) {
			// best effort: fall back to the default encoding
			encoding = null;
		}
		return encoding != null ? encoding : defaultEncoding;
	}

	private static final String XML_DECL_PREFIX = "<?xml";
	/**
	 * Tells whether the text provided by a reader begins with {@code <?xml}.
	 *
	 * @param reader the reader to peek from
	 * @param defaultValue the value returned if peeking fails with an exception
	 * @return true if the first characters are exactly {@code <?xml}
	 */
	public static boolean hasXmlDecl(Reader reader, boolean defaultValue) {
		try {
			char[] prefixChars = StringUtils.peekChars(reader, XML_DECL_PREFIX.length());
			String prefixString = new String(prefixChars);
			return XML_DECL_PREFIX.equals(prefixString);
		} catch (Exception e) {
			return defaultValue;
		}
	}
	/**
	 * Tells whether the bytes provided by a stream begin with {@code <?xml}
	 * encoded with the given encoding.
	 *
	 * @param is the stream to peek from
	 * @param encoding the name of the charset used to encode the expected prefix
	 * @param defaultValue the value returned if encoding or peeking fails with an exception
	 * @return true if the first bytes are exactly the encoded prefix
	 */
	public static boolean hasXmlDecl(InputStream is, String encoding, boolean defaultValue) {
		try {
			byte[] bytes = XML_DECL_PREFIX.getBytes(encoding);
			return Arrays.equals(bytes, StringUtils.peekBytes(is, bytes.length, false));
		} catch (Exception e) {
			return defaultValue;
		}
	}
}