/*
* Copyright (c) 2011+, HL7, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of HL7 nor the names of its contributors may be used to
* endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.hl7.fhir.utilities.xhtml;
/*
* #%L
* HAPI FHIR - Core Library
* %%
* Copyright (C) 2014 - 2017 University Health Network
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.*;
import java.util.*;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.Comment;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.hl7.fhir.exceptions.FHIRException;
import org.hl7.fhir.exceptions.FHIRFormatError;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
/**
 * Hand-written parser that builds {@code XhtmlNode} / {@code XhtmlDocument} trees from XHTML text
 * (strings or streams), from DOM elements, or from a StAX event reader.
 *
 * <p>All parse state (reader, look-ahead cache, line/column counters) is held in instance fields,
 * so a single instance should not be shared by concurrent parses.
 */
public class XhtmlParser {
/** Namespace URI written into an {@code xmlns} attribute by {@code parseHtmlNode} when the result has no namespace declaration. */
public static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
// Attribute names accepted regardless of policy: either a bare name ("title") or "element.attribute" ("a.href"). Filled by the constructor.
private Set<String> attributes = new HashSet<String>();
// Look-ahead buffer of characters that were peeked or pushed back but not consumed yet.
private String cache = "";
// Column counter reported by descLoc(); reset to 0 when readChar() counts a line break.
private int col = 0;
// Element names accepted regardless of policy. Filled by the constructor.
private Set<String> elements = new HashSet<String>();
// Last line-break character seen by readChar(); lets "\r\n" count as a single line break.
private char lastChar;
// Most recent non-empty text added by addTextNode(); quoted in some error messages.
private String lastText = "";
// Line counter reported by descLoc(); starts at 1.
private int line = 1;
// When true, mismatched end tags and premature end of input raise FHIRFormatError.
private boolean mustBeWellFormed = true;
// What to do with elements/attributes that are not in the allow-lists (see attributeIsOk / elementIsOk).
private ParserSecurityPolicy policy;
// Character source of the current text parse; replaced by every public parse/parseFragment call.
private Reader rdr;
// When true, text nodes are trimmed and whitespace after tags/comments is skipped.
private boolean trimWhitespace;
// Node that lenient end-tag recovery is unwinding to; parseElementInner stops once it reaches this node or one of its descendants.
private XhtmlNode unwindPoint;
// When true, allow-list checks are bypassed and namespace declarations are tracked (see checkNamespaces / checkNS).
private boolean validatorMode;
/**
 * Creates a parser with the {@code Accept} policy and the default allow-lists of
 * XHTML element and attribute names.
 */
public XhtmlParser() {
  super();
  policy = ParserSecurityPolicy.Accept; // for general parsing

  // Elements that are accepted whatever the policy is.
  Collections.addAll(elements,
      "p", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "a", "span",
      "b", "em", "i", "strong", "small", "big", "tt", "dfn", "q", "var",
      "abbr", "acronym", "cite", "blockquote", "hr", "address", "bdo", "kbd",
      "sub", "sup", "ul", "ol", "li", "dl", "dt", "dd", "pre",
      "table", "caption", "colgroup", "col", "thead", "tr", "tfoot", "tbody", "th", "td",
      "code", "samp", "img", "map", "area");

  // Attributes accepted on any element.
  Collections.addAll(attributes,
      "title", "style", "class", "id", "lang", "xml:lang", "dir", "accesskey", "tabindex");

  // Table-related attributes, also accepted on any element.
  Collections.addAll(attributes,
      "span", "width", "align", "valign", "char", "charoff", "abbr", "axis",
      "headers", "scope", "rowspan", "colspan");

  // Attributes accepted only on a specific element ("element.attribute").
  Collections.addAll(attributes,
      "div.xmlns", "blockquote.cite", "q.cite",
      "a.href", "a.name", "a.charset", "a.type", "a.hreflang", "a.rel", "a.rev", "a.shape", "a.coords",
      "img.src", "img.border", "img.alt", "img.longdesc", "img.height", "img.width", "img.usemap", "img.ismap",
      "map.name",
      "area.shape", "area.coords", "area.href", "area.nohref", "area.alt",
      "table.summary", "table.width", "table.border", "table.frame", "table.rules",
      "table.cellspacing", "table.cellpadding");
}
/**
 * Flushes the pending text buffer into {@code node} as a text child.
 *
 * <p>When the (optionally trimmed) text is empty nothing is added and the buffer is left as it is.
 *
 * @param node element that receives the text
 * @param s    accumulated text; emptied only when a text node was actually added
 */
private void addTextNode(XhtmlNode node, StringBuilder s) {
  String text = s.toString();
  if (isTrimWhitespace()) {
    text = text.trim();
  }
  if (text.length() == 0) {
    return;
  }
  lastText = text;
  node.addText(text);
  s.setLength(0);
}
/**
 * Decides whether an attribute may be copied into the result.
 *
 * @param elem  name of the owning element
 * @param attr  attribute name
 * @param value attribute value
 * @return {@code true} to keep the attribute, {@code false} to drop it
 * @throws FHIRFormatError when the attribute is not allow-listed and the policy is {@code Reject}
 */
private boolean attributeIsOk(String elem, String attr, String value) throws FHIRFormatError {
  if (validatorMode) {
    return true;
  }
  String qualified = elem + "." + attr;
  if (attributes.contains(attr) || attributes.contains(qualified)) {
    return true;
  }
  switch (policy) {
    case Accept:
      return true;
    case Drop:
      return false;
    case Reject:
      throw new FHIRFormatError("Illegal HTML attribute " + qualified);
  }
  // NOTE(review): the switch above returns or throws for every policy constant, so the image
  // reference check below never runs. It looks as if it was meant to vet allow-listed img.src
  // values; enabling it would change which documents are accepted, so confirm intent first.
  if (qualified.equals("img.src") && !(value.startsWith("#") || value.startsWith("http:") || value.startsWith("https:"))) {
    switch (policy) {
      case Accept:
        return true;
      case Drop:
        return false;
      case Reject:
        throw new FHIRFormatError("Illegal Image Reference " + value);
    }
  }
  return false;
}
/**
 * Validator-mode namespace bookkeeping for an element read by the text parser.
 *
 * <p>Namespace declarations ({@code xmlns} and {@code xmlns:prefix}) are moved out of the node's
 * attributes into a copy of the inherited namespace map. The element's own namespace is then
 * stated explicitly as an {@code xmlns} attribute when it differs from the default in scope, or,
 * for an unprefixed root, when a default namespace is known.
 *
 * @param n    qualified name of the element as written in the source
 * @param node element whose attributes are inspected and rewritten
 * @param nsm  namespace map inherited from the parent, may be {@code null}
 * @param root whether {@code node} is the document root
 * @return the namespace map in scope for the element's children, or {@code null} outside validator mode
 */
private NSMap checkNamespaces(QName n, XhtmlNode node, NSMap nsm, boolean root) {
  if (!validatorMode) {
    return null; // namespaces are only tracked when validating
  }
  NSMap scope = new NSMap(nsm);

  // Collect first, remove afterwards: the attribute map must not change while it is iterated.
  List<String> declarations = new ArrayList<String>();
  for (String attrName : node.getAttributes().keySet()) {
    if (attrName.equals("xmlns")) {
      scope.def(node.getAttribute(attrName));
      declarations.add(attrName);
    } else if (attrName.startsWith("xmlns:")) {
      scope.ns(attrName.substring(6), node.getAttribute(attrName));
      declarations.add(attrName);
    }
  }
  for (String declaration : declarations) {
    node.getAttributes().remove(declaration);
  }

  if (!n.hasNs()) {
    if (root && scope.hasDef()) {
      node.getAttributes().put("xmlns", scope.def());
    }
    return scope;
  }
  String elementNs = scope.get(n.getNs());
  if (!elementNs.equals(scope.def())) {
    node.getAttributes().put("xmlns", elementNs);
    scope.def(elementNs);
  }
  return scope;
}
/**
 * Validator-mode namespace bookkeeping for an element read from a DOM tree.
 *
 * @param res       node being built; receives an {@code xmlns} attribute when the namespace changes
 * @param node      DOM element being converted
 * @param defaultNS namespace in scope from the parent, may be {@code null}
 * @return the namespace in scope for the children; {@code null} outside validator mode or when
 *         the DOM element has no namespace
 */
private String checkNS(XhtmlNode res, Element node, String defaultNS) {
  if (!validatorMode) {
    return null;
  }
  String elementNs = node.getNamespaceURI();
  if (elementNs == null) {
    return null;
  }
  if (elementNs.equals(defaultNS)) {
    return defaultNS;
  }
  res.getAttributes().put("xmlns", elementNs);
  return elementNs;
}
/** Returns the current reader position as a suffix for error messages. */
private String descLoc() {
  StringBuilder location = new StringBuilder(" at line ");
  location.append(line).append(" column ").append(col);
  return location.toString();
}
/**
 * Decides whether an element may be copied into the result.
 *
 * @param name local name of the element
 * @return {@code true} to keep the element, {@code false} to drop it
 * @throws FHIRFormatError when the element is not allow-listed and the policy is {@code Reject}
 */
private boolean elementIsOk(String name) throws FHIRFormatError {
  if (validatorMode || elements.contains(name)) {
    return true;
  }
  switch (policy) {
    case Accept:
      return true;
    case Reject:
      throw new FHIRFormatError("Illegal HTML element " + name);
    case Drop:
    default:
      return false;
  }
}
/** @return the policy applied to elements and attributes that are not allow-listed */
public ParserSecurityPolicy getPolicy() {
  return this.policy;
}
/**
 * Tells whether {@code s} can be parsed by {@link Integer#parseInt(String, int)}.
 *
 * @param s    candidate text
 * @param base radix to parse with
 */
private boolean isInteger(String s, int base) {
  boolean parsed;
  try {
    Integer.parseInt(s, base);
    parsed = true;
  } catch (Exception ignored) {
    // any failure simply means "not an integer in this radix"
    parsed = false;
  }
  return parsed;
}
/** @return whether malformed input (mismatched end tags, premature end of input) raises an error */
public boolean isMustBeWellFormed() {
  return this.mustBeWellFormed;
}
/** Tells whether {@code ch} may appear in an element or attribute name as read by this parser. */
private boolean isNameChar(char ch) {
  if (Character.isLetterOrDigit(ch)) {
    return true;
  }
  switch (ch) {
    case '_':
    case '-':
    case ':':
      return true;
    default:
      return false;
  }
}
/** @return whether text nodes are trimmed and whitespace following tags is skipped */
public boolean isTrimWhitespace() {
  return this.trimWhitespace;
}
/** @return whether the parser runs in validator mode (allow-lists bypassed, namespaces tracked) */
public boolean isValidatorMode() {
  return this.validatorMode;
}
/**
 * Parses a complete XHTML document from a UTF-8 encoded stream.
 *
 * @param input     source bytes
 * @param entryName required name of the root element, or {@code null} to accept any
 * @return the parsed document
 * @throws FHIRFormatError if the content cannot be parsed
 * @throws IOException     if reading fails
 */
public XhtmlDocument parse(InputStream input, String entryName) throws FHIRFormatError, IOException {
  Reader utf8 = new InputStreamReader(input, "UTF-8");
  this.rdr = utf8;
  return parse(entryName);
}
/**
 * Parses a document from the current reader.
 *
 * <p>Leading whitespace, comments, doctype and processing instructions are recorded on the
 * document; the root tag name is lower-cased before it is compared with {@code entryName}.
 *
 * @param entryName required (unprefixed) name of the root element, or {@code null} to accept any
 * @return the parsed document
 * @throws FHIRFormatError if the input does not start with a tag, the root has the wrong name,
 *                         or the content is malformed
 * @throws IOException     if reading fails
 */
private XhtmlDocument parse(String entryName) throws FHIRFormatError, IOException {
  XhtmlDocument result = new XhtmlDocument();
  skipWhiteSpaceAndComments(result);
  if (peekChar() != '<')
    throw new FHIRFormatError("Unable to Parse HTML - does not start with tag. Found " + peekChar() + descLoc());
  readChar();
  // Locale.ROOT: the default-locale variant would turn "DIV" into "d\u0131v" under a Turkish locale.
  QName n = new QName(readName().toLowerCase(Locale.ROOT));
  // Spell the qualified name out for messages instead of relying on QName's string conversion.
  String shown = n.hasNs() ? n.getNs() + ":" + n.getName() : n.getName();
  if ((entryName != null) && !n.getName().equals(entryName))
    throw new FHIRFormatError("Unable to Parse HTML - starts with '" + shown + "' not '" + entryName + "'" + descLoc());
  XhtmlNode root = result.addTag(n.getName());
  parseAttributes(root);
  NSMap nsm = checkNamespaces(n, root, null, true);
  if (readChar() == '/') {
    // self-closing root element
    if (peekChar() != '>')
      throw new FHIRFormatError("unexpected non-end of element " + shown + " " + descLoc());
    readChar();
  } else {
    unwindPoint = null;
    List<XhtmlNode> p = new ArrayList<XhtmlNode>();
    parseElementInner(root, p, nsm);
  }
  return result;
}
/**
 * Parses a complete XHTML document from a string.
 *
 * @param source    XHTML text
 * @param entryName required name of the root element, or {@code null} to accept any
 * @return the parsed document
 * @throws FHIRFormatError if the content cannot be parsed
 * @throws IOException     if reading fails
 */
public XhtmlDocument parse(String source, String entryName) throws FHIRFormatError, IOException {
  this.rdr = new StringReader(source);
  XhtmlDocument document = parse(entryName);
  return document;
}
/**
 * Reads the attributes of the tag currently being parsed, up to (not including) the closing
 * {@code >} or {@code /}. Attributes without a value are stored with a {@code null} value.
 *
 * @param node element that receives the attributes
 * @throws FHIRFormatError if an attribute name or its value separator cannot be read
 * @throws IOException     if reading fails
 */
private void parseAttributes(XhtmlNode node) throws FHIRFormatError, IOException {
  skipAttributeWhitespace();
  while (peekChar() != '>' && peekChar() != '/' && peekChar() != '\0') {
    String name = readName();
    if (name.length() == 0) {
      throw new FHIRFormatError("Unable to read attribute on <" + node.getName() + ">" + descLoc());
    }
    skipAttributeWhitespace();

    char next = peekChar();
    if (next == '=') {
      readChar();
      skipAttributeWhitespace();
      char opener = peekChar();
      boolean quoted = opener == '"' || opener == '\'';
      // A quoted value ends at the matching quote; an unquoted one is read with no terminator.
      char terminator = quoted ? readChar() : '\0';
      node.getAttributes().put(name, parseAttributeValue(terminator));
    } else if (isNameChar(next) || next == '>' || next == '/') {
      node.getAttributes().put(name, null);
    } else {
      throw new FHIRFormatError("Unable to read attribute '" + name + "' value on <" + node.getName() + ">" + descLoc());
    }
    skipAttributeWhitespace();
  }
}

/** Consumes any run of whitespace at the current position. */
private void skipAttributeWhitespace() throws IOException {
  while (Character.isWhitespace(peekChar())) {
    readChar();
  }
}
/**
 * Reads an attribute value, resolving character references on the way.
 *
 * @param term closing quote character, or {@code '\0'} for an unquoted value (which also stops at {@code /})
 * @return the decoded value
 * @throws IOException     if reading fails
 * @throws FHIRFormatError if a character reference cannot be resolved
 */
private String parseAttributeValue(char term) throws IOException, FHIRFormatError {
  StringBuilder value = new StringBuilder();
  while (true) {
    char next = peekChar();
    if (next == '\0' || next == '>' || next == term) {
      break;
    }
    if (term == '\0' && next == '/') {
      break;
    }
    if (next == '&') {
      parseLiteral(value);
    } else {
      value.append(readChar());
    }
  }
  if (peekChar() == term) {
    readChar(); // swallow the closing quote
  }
  return value.toString();
}
/**
 * Parses one child element; the reader is positioned just after the opening {@code <}.
 *
 * @param parent  element that receives the new child
 * @param parents ancestors of {@code parent}, outermost first
 * @param nsm     namespace map in scope, may be {@code null}
 * @throws IOException     if reading fails
 * @throws FHIRFormatError if the element is malformed
 */
private void parseElement(XhtmlNode parent, List<XhtmlNode> parents, NSMap nsm) throws IOException, FHIRFormatError {
  QName name = new QName(readName());
  XhtmlNode node = parent.addTag(name.getName());
  // The child's ancestor chain is the parent's chain plus the parent itself.
  List<XhtmlNode> newParents = new ArrayList<XhtmlNode>();
  newParents.addAll(parents);
  newParents.add(parent);
  parseAttributes(node);
  nsm = checkNamespaces(name, node, nsm, false);
  if (readChar() == '/') {
    if (peekChar() != '>') {
      // Spell the qualified name out; concatenating the QName object itself is not readable.
      String shown = name.hasNs() ? name.getNs() + ":" + name.getName() : name.getName();
      throw new FHIRFormatError("unexpected non-end of element " + shown + " " + descLoc());
    }
    readChar();
  } else {
    parseElementInner(node, newParents, nsm);
  }
}
/**
 * Parses the content of an element whose start tag has been consumed, up to and including its end tag.
 *
 * <p>With {@code mustBeWellFormed} switched off, an end tag that names an enclosing element
 * instead of the current one triggers recovery: the children of every element between the
 * current one and that ancestor are moved up one level, and parsing resumes after the ancestor.
 * An end tag that matches no open element is ignored.
 *
 * @param node    element whose content is read
 * @param parents ancestors of {@code node}, outermost first
 * @param nsm     namespace map in scope, may be {@code null}
 * @throws FHIRFormatError on a DOCTYPE declaration, unexpected markup, or (in well-formed mode) a mismatched end tag
 * @throws IOException     if reading fails
 */
private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NSMap nsm) throws FHIRFormatError, IOException {
  StringBuilder s = new StringBuilder();
  while (peekChar() != '\0' && !parents.contains(unwindPoint) && !(node == unwindPoint)) {
    if (peekChar() == '<') {
      addTextNode(node, s);
      readChar();
      if (peekChar() == '!') {
        String sc = readToCommentEnd();
        if (sc.startsWith("DOCTYPE"))
          throw new FHIRFormatError("Malformed XHTML: Found a DocType declaration, and these are not allowed (XXE security vulnerability protection)");
        node.addComment(sc);
      } else if (peekChar() == '?')
        node.addComment(readToTagEnd());
      else if (peekChar() == '/') {
        readChar();
        // trim(): whitespace is allowed between the name and '>' in an end tag ("</div >")
        QName n = new QName(readToTagEnd().trim());
        if (node.getName().equals(n.getName())) {
          return;
        }
        if (mustBeWellFormed)
          throw new FHIRFormatError("Malformed XHTML: Found \"</" + n.getName() + ">\" expecting \"</" + node.getName() + ">\"" + descLoc());
        // Find the nearest open ancestor this end tag closes. Names are compared as strings;
        // comparing against the QName object itself can never match.
        XhtmlNode target = null;
        for (int i = parents.size() - 1; i >= 0; i--) {
          if (parents.get(i).getName().equals(n.getName())) {
            target = parents.get(i);
            break;
          }
        }
        if (target != null) {
          unwindPoint = target;
          for (int i = parents.size(); i > 0; i--) {
            if (i < parents.size() && parents.get(i) == unwindPoint)
              return;
            if (i == parents.size()) {
              parents.get(i - 1).getChildNodes().addAll(node.getChildNodes());
              node.getChildNodes().clear();
            } else {
              parents.get(i - 1).getChildNodes().addAll(parents.get(i).getChildNodes());
              parents.get(i).getChildNodes().clear();
            }
          }
        }
      } else if (Character.isLetterOrDigit(peekChar())) {
        parseElement(node, parents, nsm);
      } else
        throw new FHIRFormatError("Unable to Parse HTML - node '" + node.getName() + "' has unexpected content '" + peekChar() + "' (last text = '" + lastText + "'" + descLoc());
    } else if (peekChar() == '&') {
      parseLiteral(s);
    } else
      s.append(readChar());
  }
  addTextNode(node, s);
}
/**
 * Parses a fragment from the current reader: an optional processing instruction followed by one element.
 *
 * <p>The rest of the root start tag after its name is skipped, and a namespace prefix on the
 * root name is removed.
 *
 * @return the root element of the fragment
 * @throws IOException   if reading fails
 * @throws FHIRException if the input does not start with a tag or is malformed
 */
private XhtmlNode parseFragment() throws IOException, FHIRException {
  skipWhiteSpace();
  if (peekChar() != '<')
    throw new FHIRException("Unable to Parse HTML - does not start with tag. Found " + peekChar() + descLoc());
  readChar();
  if (peekChar() == '?') {
    readToTagEnd();
    skipWhiteSpace();
    if (peekChar() != '<')
      throw new FHIRException("Unable to Parse HTML - does not start with tag after processing instruction. Found " + peekChar() + descLoc());
    readChar();
  }
  // Locale.ROOT keeps the lower-casing independent of the JVM's default locale (e.g. Turkish dotless i).
  String n = readName().toLowerCase(Locale.ROOT);
  readToTagEnd();
  XhtmlNode result = new XhtmlNode(NodeType.Element);
  int colonIndex = n.indexOf(':');
  if (colonIndex != -1) {
    n = n.substring(colonIndex + 1);
  }
  result.setName(n);
  unwindPoint = null;
  List<XhtmlNode> p = new ArrayList<XhtmlNode>();
  parseElementInner(result, p, null);
  return result;
}
/**
 * Parses an XHTML fragment from a UTF-8 encoded stream.
 *
 * @param input source bytes
 * @return the root element of the fragment
 * @throws IOException   if reading fails
 * @throws FHIRException if the content cannot be parsed
 */
public XhtmlNode parseFragment(InputStream input) throws IOException, FHIRException {
  // Decode explicitly as UTF-8, like parse(InputStream, String); without a charset the
  // reader falls back to the platform default and the result depends on the host.
  rdr = new InputStreamReader(input, "UTF-8");
  return parseFragment();
}
/**
 * Parses an XHTML fragment from a string.
 *
 * @param source XHTML text
 * @return the root element of the fragment
 * @throws IOException   if reading fails
 * @throws FHIRException if the content cannot be parsed
 */
public XhtmlNode parseFragment(String source) throws IOException, FHIRException {
  this.rdr = new StringReader(source);
  XhtmlNode fragment = parseFragment();
  return fragment;
}
/**
 * Converts a DOM element into an {@code XhtmlNode} tree with no inherited default namespace.
 *
 * @param node DOM element to convert
 * @return the converted tree
 * @throws FHIRFormatError if the content is rejected
 */
public XhtmlNode parseHtmlNode(Element node) throws FHIRFormatError {
  String noDefaultNamespace = null;
  return parseHtmlNode(node, noDefaultNamespace);
}
/**
 * Converts a DOM element into an {@code XhtmlNode} tree and makes sure the root declares a namespace.
 *
 * @param node      DOM element to convert
 * @param defaultNS namespace assumed to be in scope, may be {@code null}
 * @return the converted tree
 * @throws FHIRFormatError if the content is rejected
 */
public XhtmlNode parseHtmlNode(Element node, String defaultNS) throws FHIRFormatError {
  XhtmlNode converted = parseNode(node, defaultNS);
  boolean undeclared = converted.getNsDecl() == null;
  if (undeclared) {
    converted.getAttributes().put("xmlns", XHTML_NS);
  }
  return converted;
}
/**
 * Reads one element from a StAX event reader and makes sure the root declares a namespace.
 *
 * @param xpp event reader positioned before the element's start event
 * @return the converted tree
 * @throws IOException        declared for callers; see {@code parseNode}
 * @throws FHIRFormatError    if the content is rejected
 * @throws XMLStreamException if the underlying reader fails
 */
public XhtmlNode parseHtmlNode(XMLEventReader xpp) throws IOException, FHIRFormatError, XMLStreamException {
  XhtmlNode converted = parseNode(xpp);
  boolean undeclared = converted.getNsDecl() == null;
  if (undeclared) {
    converted.getAttributes().put("xmlns", XHTML_NS);
  }
  return converted;
}
/** Named character references understood by {@code parseLiteral}, except {@code nbsp} which is handled separately. */
private static final Map<String, Character> NAMED_ENTITIES = createNamedEntities();

/** Registers one named entity; {@code codePoint} must lie in the Basic Multilingual Plane. */
private static void entity(Map<String, Character> table, String name, int codePoint) {
  table.put(name, Character.valueOf((char) codePoint));
}

/** Builds the table of supported named entities. */
private static Map<String, Character> createNamedEntities() {
  Map<String, Character> t = new HashMap<String, Character>();
  // XML predefined entities and a few common Latin-1 / punctuation names
  entity(t, "apos", '\'');
  entity(t, "quot", '"');
  entity(t, "amp", '&');
  entity(t, "gt", '>');
  entity(t, "lt", '<');
  entity(t, "rsquo", 0x2019);
  entity(t, "copy", 169);
  entity(t, "reg", 174);
  entity(t, "sect", 0xA7);
  entity(t, "fnof", 402);
  // Greek capital letters
  entity(t, "Alpha", 913);
  entity(t, "Beta", 914);
  entity(t, "Gamma", 915);
  entity(t, "Delta", 916);
  entity(t, "Epsilon", 917);
  entity(t, "Zeta", 918);
  entity(t, "Eta", 919);
  entity(t, "Theta", 920);
  entity(t, "Iota", 921);
  entity(t, "Kappa", 922);
  entity(t, "Lambda", 923);
  entity(t, "Mu", 924);
  entity(t, "Nu", 925);
  entity(t, "Xi", 926);
  entity(t, "Omicron", 927);
  entity(t, "Pi", 928);
  entity(t, "Rho", 929);
  entity(t, "Sigma", 931);
  entity(t, "Tau", 932);
  entity(t, "Upsilon", 933);
  entity(t, "Phi", 934);
  entity(t, "Chi", 935);
  entity(t, "Psi", 936);
  entity(t, "Omega", 937);
  // Greek small letters and symbols
  entity(t, "alpha", 945);
  entity(t, "beta", 946);
  entity(t, "gamma", 947);
  entity(t, "delta", 948);
  entity(t, "epsilon", 949);
  entity(t, "zeta", 950);
  entity(t, "eta", 951);
  entity(t, "theta", 952);
  entity(t, "iota", 953);
  entity(t, "kappa", 954);
  entity(t, "lambda", 955);
  entity(t, "mu", 956);
  entity(t, "nu", 957);
  entity(t, "xi", 958);
  entity(t, "omicron", 959);
  entity(t, "pi", 960);
  entity(t, "rho", 961);
  entity(t, "sigmaf", 962);
  entity(t, "sigma", 963);
  entity(t, "tau", 964);
  entity(t, "upsilon", 965);
  entity(t, "phi", 966);
  entity(t, "chi", 967);
  entity(t, "psi", 968);
  entity(t, "omega", 969);
  entity(t, "thetasym", 977);
  entity(t, "upsih", 978);
  entity(t, "piv", 982);
  // General punctuation and letter-like symbols
  entity(t, "bull", 8226);
  entity(t, "hellip", 8230);
  entity(t, "prime", 8242);
  entity(t, "Prime", 8243);
  entity(t, "oline", 8254);
  entity(t, "frasl", 8260);
  entity(t, "weierp", 8472);
  entity(t, "image", 8465);
  entity(t, "real", 8476);
  entity(t, "trade", 8482);
  entity(t, "alefsym", 8501);
  // Arrows
  entity(t, "larr", 8592);
  entity(t, "uarr", 8593);
  entity(t, "rarr", 8594);
  entity(t, "darr", 8595);
  entity(t, "harr", 8596);
  entity(t, "crarr", 8629);
  entity(t, "lArr", 8656);
  entity(t, "uArr", 8657);
  entity(t, "rArr", 8658);
  entity(t, "dArr", 8659);
  entity(t, "hArr", 8660);
  // Mathematical operators
  entity(t, "forall", 8704);
  entity(t, "part", 8706);
  entity(t, "exist", 8707);
  entity(t, "empty", 8709);
  entity(t, "nabla", 8711);
  entity(t, "isin", 8712);
  entity(t, "notin", 8713);
  entity(t, "ni", 8715);
  entity(t, "prod", 8719);
  entity(t, "sum", 8721);
  entity(t, "minus", 8722);
  entity(t, "lowast", 8727);
  entity(t, "radic", 8730);
  entity(t, "prop", 8733);
  entity(t, "infin", 8734);
  entity(t, "ang", 8736);
  entity(t, "and", 8743);
  entity(t, "or", 8744);
  entity(t, "cap", 8745);
  entity(t, "cup", 8746);
  entity(t, "int", 8747);
  entity(t, "there4", 8756);
  entity(t, "sim", 8764);
  entity(t, "cong", 8773);
  entity(t, "asymp", 8776);
  entity(t, "ne", 8800);
  entity(t, "equiv", 8801);
  entity(t, "le", 8804);
  entity(t, "ge", 8805);
  entity(t, "sub", 8834);
  entity(t, "sup", 8835);
  entity(t, "nsub", 8836);
  entity(t, "sube", 8838);
  entity(t, "supe", 8839);
  entity(t, "oplus", 8853);
  entity(t, "otimes", 8855);
  entity(t, "perp", 8869);
  entity(t, "sdot", 8901);
  // Miscellaneous technical, geometric shapes, card suits
  entity(t, "lceil", 8968);
  entity(t, "rceil", 8969);
  entity(t, "lfloor", 8970);
  entity(t, "rfloor", 8971);
  entity(t, "lang", 9001);
  entity(t, "rang", 9002);
  entity(t, "loz", 9674);
  entity(t, "spades", 9824);
  entity(t, "clubs", 9827);
  entity(t, "hearts", 9829);
  entity(t, "diams", 9830);
  return t;
}

/**
 * Reads a character reference ({@code &name;}, {@code &#ddd;} or {@code &#xhh;}) at the current
 * position and appends the character it stands for.
 *
 * @param s buffer that receives the decoded character
 * @throws IOException     if reading fails
 * @throws FHIRFormatError if the reference is empty or names an unknown entity
 */
private void parseLiteral(StringBuilder s) throws IOException, FHIRFormatError {
  readChar(); // the '&'
  String c = readUntil(';');
  if (c.equals("nbsp")) {
    s.append(XhtmlNode.NBSP);
    return;
  }
  if (c.length() > 0 && c.charAt(0) == '#') {
    appendNumericReference(s, c.substring(1));
    return;
  }
  // An empty reference ("&;") ends up here as well and is reported like any unknown name.
  Character named = NAMED_ENTITIES.get(c);
  if (named == null)
    throw new FHIRFormatError("unable to parse character reference '" + c + "'' (last text = '" + lastText + "'" + descLoc());
  s.append(named.charValue());
}

/**
 * Appends the character for a numeric reference body (the text between {@code &#} and {@code ;}).
 * Accepts decimal digits and {@code x}/{@code X}-prefixed hexadecimal digits; bare hexadecimal
 * digits are still accepted for input that relied on the earlier behaviour. A body that
 * cannot be decoded contributes nothing, as before.
 */
private void appendNumericReference(StringBuilder s, String body) {
  int codePoint;
  if (isInteger(body, 10)) {
    codePoint = Integer.parseInt(body);
  } else if (body.length() > 1 && (body.charAt(0) == 'x' || body.charAt(0) == 'X') && isInteger(body.substring(1), 16)) {
    codePoint = Integer.parseInt(body.substring(1), 16);
  } else if (isInteger(body, 16)) {
    codePoint = Integer.parseInt(body, 16);
  } else {
    return;
  }
  // appendCodePoint emits a surrogate pair above U+FFFF, where a plain char cast would truncate.
  if (Character.isValidCodePoint(codePoint)) {
    s.appendCodePoint(codePoint);
  }
}
/**
 * Recursively converts a DOM element into an {@code XhtmlNode}.
 *
 * <p>Attributes are filtered through {@code attributeIsOk}, and namespace declarations are
 * left out. Child elements are filtered through {@code elementIsOk}; text and comments are copied.
 *
 * @param node      DOM element to convert
 * @param defaultNS namespace in scope from the parent, may be {@code null}
 * @return the converted node
 * @throws FHIRFormatError if the policy rejects content or an unsupported DOM node type is met
 */
private XhtmlNode parseNode(Element node, String defaultNS) throws FHIRFormatError {
  XhtmlNode res = new XhtmlNode(NodeType.Element);
  res.setName(node.getLocalName());
  defaultNS = checkNS(res, node, defaultNS);

  int attributeCount = node.getAttributes().getLength();
  for (int i = 0; i < attributeCount; i++) {
    Attr attr = (Attr) node.getAttributes().item(i);
    boolean accepted = attributeIsOk(res.getName(), attr.getName(), attr.getValue());
    if (accepted && !attr.getLocalName().startsWith("xmlns")) {
      res.getAttributes().put(attr.getName(), attr.getValue());
    }
  }

  for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
    switch (child.getNodeType()) {
      case Node.TEXT_NODE:
        res.addText(child.getTextContent());
        break;
      case Node.COMMENT_NODE:
        res.addComment(child.getTextContent());
        break;
      case Node.ELEMENT_NODE:
        if (elementIsOk(child.getLocalName())) {
          res.getChildNodes().add(parseNode((Element) child, defaultNS));
        }
        break;
      default:
        throw new FHIRFormatError("Unhandled XHTML feature: " + Integer.toString(child.getNodeType()) + descLoc());
    }
  }
  return res;
}
/**
 * Reads one element, including its subtree, from a StAX event reader.
 *
 * <p>On entry the next event must be the element's start event; on return the matching end
 * event has been consumed and nothing beyond it. Child elements are examined with
 * {@code peek()} so that the recursive call sees their start event; children that
 * {@code elementIsOk} drops are skipped together with their content.
 *
 * @param xpp event reader positioned before a start element
 * @return the converted node; an empty element node if the reader has no more events
 * @throws IOException        declared for callers; not raised by this implementation
 * @throws FHIRFormatError    if the next event is not a start element, the policy rejects
 *                            content, or an unsupported event type is met
 * @throws XMLStreamException if the underlying reader fails
 */
private XhtmlNode parseNode(XMLEventReader xpp) throws IOException, FHIRFormatError, XMLStreamException {
  XhtmlNode res = new XhtmlNode(NodeType.Element);
  if (!xpp.hasNext()) {
    return res;
  }
  XMLEvent first = xpp.nextEvent();
  if (!first.isStartElement()) {
    throw new FHIRFormatError("Unable to Parse HTML - expected a start element but found event type " + Integer.toString(first.getEventType()));
  }
  StartElement firstEvent = first.asStartElement();
  // The element name comes from getName(); getSchemaType() is optional type information and is normally null.
  String elementName = firstEvent.getName().getLocalPart();
  res.setName(elementName);
  for (Iterator<?> attrIter = firstEvent.getAttributes(); attrIter.hasNext();) {
    Attribute nextAttr = (Attribute) attrIter.next();
    if (attributeIsOk(elementName, nextAttr.getName().getLocalPart(), nextAttr.getValue()))
      res.getAttributes().put(nextAttr.getName().getLocalPart(), nextAttr.getValue());
  }
  while (xpp.hasNext()) {
    XMLEvent upcoming = xpp.peek();
    if (upcoming.isStartElement()) {
      // Leave the start event in the reader: the recursive call (or the skip) consumes it.
      if (elementIsOk(upcoming.asStartElement().getName().getLocalPart())) {
        res.getChildNodes().add(parseNode(xpp));
      } else {
        skipElement(xpp);
      }
      continue;
    }
    XMLEvent nextEvent = xpp.nextEvent();
    if (nextEvent.isEndElement()) {
      break; // end of this element
    }
    if (nextEvent.isCharacters()) {
      res.addText(nextEvent.asCharacters().getData());
    } else if (nextEvent.getEventType() == XMLEvent.COMMENT) {
      res.addComment(((Comment) nextEvent).getText());
    } else {
      throw new FHIRFormatError("Unhandled XHTML feature: " + Integer.toString(nextEvent.getEventType()) + descLoc());
    }
  }
  return res;
}

/** Consumes one complete element (start event through matching end event) without building anything. */
private void skipElement(XMLEventReader xpp) throws XMLStreamException {
  int depth = 0;
  while (xpp.hasNext()) {
    XMLEvent event = xpp.nextEvent();
    if (event.isStartElement()) {
      depth++;
    } else if (event.isEndElement()) {
      depth--;
      if (depth == 0) {
        return;
      }
    }
  }
}
/**
 * Returns the next character without consuming it.
 *
 * @return the next character, or {@code '\0'} at end of input
 * @throws IOException if reading fails
 */
private char peekChar() throws IOException {
  if (cache.length() > 0)
    return cache.charAt(0);
  // End of input is detected from read() returning -1. Reader.ready() only says whether a read
  // would block, so using it here could cut the input short on a slow stream.
  int next = rdr.read();
  if (next < 0)
    return '\0';
  char c = (char) next;
  cache = Character.toString(c);
  return c;
}
/** Puts {@code ch} back in front of the look-ahead buffer so that it is the next character read. */
private void pushChar(char ch) {
  StringBuilder rebuilt = new StringBuilder(cache.length() + 1);
  rebuilt.append(ch).append(cache);
  cache = rebuilt.toString();
}
/**
 * Consumes and returns the next character, updating the line/column counters.
 *
 * @return the character read, or {@code '\0'} at end of input
 * @throws IOException if reading fails
 */
private char readChar() throws IOException {
  char c;
  if (cache.length() > 0) {
    c = cache.charAt(0);
    cache = cache.length() == 1 ? "" : cache.substring(1);
  } else {
    // read() == -1 is the only reliable end-of-input signal; Reader.ready() is not.
    int next = rdr.read();
    c = next < 0 ? '\0' : (char) next;
  }
  if (c == '\r' || c == '\n') {
    // "\r\n" is one line break: a '\n' directly after '\r' does not count again.
    if (c == '\r' || lastChar != '\r') {
      line++;
      col = 0;
    }
  }
  // Remember every character, so that only an immediately preceding '\r' suppresses a '\n'.
  lastChar = c;
  col++;
  return c;
}
/**
 * Reads the longest run of name characters at the current position.
 *
 * @return the name, possibly empty
 * @throws IOException if reading fails
 */
private String readName() throws IOException {
  StringBuilder name = new StringBuilder();
  for (char next = peekChar(); isNameChar(next); next = peekChar()) {
    name.append(readChar());
  }
  return name.toString();
}
/**
 * Reads a {@code <! ... >} construct and returns its content.
 *
 * <p>If the content starts with {@code --} it is a comment and ends at {@code -->}; otherwise
 * (for example a DOCTYPE) it is "simple" and ends at the first {@code >}. A leading {@code !}
 * is consumed if present.
 *
 * @return the content without the delimiters
 * @throws IOException     if reading fails
 * @throws FHIRFormatError if the input ends first and the parser requires well-formed input
 */
private String readToCommentEnd() throws IOException, FHIRFormatError {
  if (peekChar() == '!')
    readChar();
  StringBuilder s = new StringBuilder();
  boolean simple = true;
  if (peekChar() == '-') {
    readChar();
    simple = peekChar() != '-';
    if (simple)
      s.append('-');
    else
      readChar();
  }
  boolean done = false;
  while (!done) {
    char c = peekChar();
    if (c == '-') {
      readChar();
      if (peekChar() == '-') {
        readChar();
        if (peekChar() == '>') {
          done = true;
        } else
          s.append("--");
      } else
        s.append('-');
    } else if (simple && peekChar() == '>') {
      done = true;
    } else if (c != '\0')
      s.append(readChar());
    else if (mustBeWellFormed)
      throw new FHIRFormatError("Unexpected termination of html source" + descLoc());
    else
      done = true; // lenient mode: end of input closes the construct; otherwise this loop would never exit
  }
  if (peekChar() != '\0') {
    readChar();
    skipWhiteSpace();
  }
  return s.toString();
}
/**
 * Reads up to the next {@code >}, consumes it, and then skips whitespace as configured.
 *
 * @return the text before the {@code >}
 * @throws IOException     if reading fails
 * @throws FHIRFormatError if the input ends first and the parser requires well-formed input
 */
private String readToTagEnd() throws IOException, FHIRFormatError {
  StringBuilder content = new StringBuilder();
  for (char next = peekChar(); next != '>' && next != '\0'; next = peekChar()) {
    content.append(readChar());
  }
  if (peekChar() == '\0') {
    if (mustBeWellFormed) {
      throw new FHIRFormatError("Unexpected termination of html source" + descLoc());
    }
  } else {
    readChar();
    skipWhiteSpace();
  }
  return content.toString();
}
/**
 * Reads up to {@code ch} (or end of input) and consumes the terminator.
 *
 * @param ch terminating character
 * @return the text before the terminator
 * @throws IOException if reading fails
 */
private String readUntil(char ch) throws IOException {
  StringBuilder content = new StringBuilder();
  for (char next = peekChar(); next != 0 && next != ch; next = peekChar()) {
    content.append(readChar());
  }
  readChar(); // the terminator itself
  return content.toString();
}
/**
 * @param mustBeWellFormed {@code true} to raise errors on mismatched end tags and premature end
 *                         of input, {@code false} to recover where possible
 */
public void setMustBeWellFormed(boolean mustBeWellFormed) {
  boolean requested = mustBeWellFormed;
  this.mustBeWellFormed = requested;
}
/** @param policy what to do with elements and attributes that are not allow-listed */
public void setPolicy(ParserSecurityPolicy policy) {
  ParserSecurityPolicy requested = policy;
  this.policy = requested;
}
/** @param trimWhitespace {@code true} to trim text nodes and skip whitespace that follows tags */
public void setTrimWhitespace(boolean trimWhitespace) {
  boolean requested = trimWhitespace;
  this.trimWhitespace = requested;
}
/**
 * Switches validator mode on or off.
 *
 * @param validatorMode {@code true} to bypass the allow-lists and track namespaces
 * @return this parser, to allow chaining
 */
public XhtmlParser setValidatorMode(boolean validatorMode) {
  XhtmlParser self = this;
  self.validatorMode = validatorMode;
  return self;
}
/**
 * Skips whitespace and byte-order marks at the current position, but only when whitespace
 * trimming is switched on.
 *
 * @throws IOException if reading fails
 */
private void skipWhiteSpace() throws IOException {
  if (!trimWhitespace) {
    return;
  }
  char next = peekChar();
  while (Character.isWhitespace(next) || next == 0xfeff) {
    readChar();
    next = peekChar();
  }
}
/**
 * Consumes everything that may precede the root element (whitespace, byte-order marks,
 * comments, a doctype, processing instructions) and records it on {@code focus}.
 * On return the reader is positioned at the {@code <} of the first ordinary tag, if any.
 *
 * @param focus node that receives the comments, doctype and instructions
 * @throws IOException     if reading fails
 * @throws FHIRFormatError on {@code <!-} not followed by a second dash, or on premature end of
 *                         input when well-formed input is required
 */
private void skipWhiteSpaceAndComments(XhtmlNode focus) throws IOException, FHIRFormatError {
  while (Character.isWhitespace(peekChar()) || (peekChar() == 0xfeff))
    readChar();
  if (peekChar() != '<')
    return;
  char ch = readChar();
  if (peekChar() == '!') {
    readChar();
    if (peekChar() == '-') {
      readChar();
      if (peekChar() != '-')
        throw new FHIRFormatError("unrecognised element type <!" + peekChar() + descLoc());
      // Hand the first dash back so that readToCommentEnd sees the whole "--" opener and
      // scans for "-->". Entered without it, it would stop at the first '>' in the comment.
      pushChar('-');
      String comment = readToCommentEnd();
      if (comment.startsWith(" "))
        comment = comment.substring(1); // one leading blank after "<!--" is not part of the text
      focus.addComment(comment);
    } else
      focus.addDocType(readToCommentEnd());
    skipWhiteSpaceAndComments(focus);
  } else if (peekChar() == '?') {
    String r = readToTagEnd();
    // r is "?target ... ?": strip the two question marks; a degenerate "<?>" has nothing inside.
    focus.addInstruction(r.length() >= 2 ? r.substring(1, r.length() - 1) : "");
    skipWhiteSpaceAndComments(focus);
  } else
    pushChar(ch); // an ordinary tag: give the '<' back to the caller
}
/**
 * Namespace declarations in scope at some point of the document: prefix to namespace URI,
 * with the default namespace stored under the empty prefix.
 */
public class NSMap {
  private Map<String, String> nslist = new HashMap<String, String>();

  /**
   * Creates a map that starts out as a copy of {@code nsm}.
   *
   * @param nsm declarations to inherit, or {@code null} for an empty map
   */
  public NSMap(NSMap nsm) {
    if (nsm == null) {
      return;
    }
    nslist.putAll(nsm.nslist);
  }

  /** @return the default namespace, or {@code null} if none is recorded */
  public String def() {
    return get0("");
  }

  /** Records {@code ns} as the default namespace. */
  public void def(String ns) {
    ns("", ns);
  }

  /**
   * @param abbrev namespace prefix
   * @return the namespace bound to the prefix, or a placeholder error URI when the prefix is unknown
   */
  public String get(String abbrev) {
    if (!nslist.containsKey(abbrev)) {
      return "http://error/undefined-namespace";
    }
    return get0(abbrev);
  }

  /** @return whether a default namespace is recorded */
  public boolean hasDef() {
    return nslist.containsKey("");
  }

  /** Binds the prefix {@code abbrev} to the namespace {@code ns}. */
  public void ns(String abbrev, String ns) {
    nslist.put(abbrev, ns);
  }

  /** Plain lookup without the unknown-prefix placeholder. */
  private String get0(String key) {
    return nslist.get(key);
  }
}
/**
 * What the parser does with an element or attribute that is not in its allow-lists
 * (see {@code elementIsOk} and {@code attributeIsOk}; not consulted in validator mode).
 */
public enum ParserSecurityPolicy {
// Accept: keep the item.  Drop: leave the item out of the result.  Reject: fail with a FHIRFormatError.
Accept, Drop, Reject
}
/**
 * A tag name split at its first colon into an optional namespace prefix and a local name.
 */
public class QName {
  private String name;
  private String ns;

  /**
   * @param src tag name as written, e.g. {@code "div"} or {@code "xhtml:div"}; everything
   *            after the first colon is the local name
   */
  public QName(String src) {
    int colon = src.indexOf(":");
    if (colon >= 0) {
      ns = src.substring(0, colon);
      name = src.substring(colon + 1);
    } else {
      ns = null;
      name = src;
    }
  }

  /** @return the local name (the part after the first colon, or the whole source if there is none) */
  public String getName() {
    return name;
  }

  /** @return the namespace prefix, or {@code null} if the source had no colon */
  public String getNs() {
    return ns;
  }

  /** @return whether the source carried a namespace prefix */
  public boolean hasNs() {
    return ns != null;
  }

  /**
   * Renders the name as it was written ({@code prefix:name} or {@code name}). Instances are
   * concatenated into parser error messages, where the default identity string is useless.
   */
  @Override
  public String toString() {
    return ns == null ? name : ns + ":" + name;
  }
}
}