package net.varkhan.base.conversion.formats;
import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
/**
* <b>XML serialization and deserialization utilities</b>.
* <p/>
*
* @author varkhan
* @date 3/18/12
* @time 2:38 PM
*/
public class Xml {
protected Xml() { }
/**********************************************************************************
** XML writing
**/
/**
* Writes an element, its attributes and text content to an {@link Appendable}.
* <p/>
* The element name must be an alpha-numeric character sequence (see {@link #isValidElmtName(CharSequence)}).
* <p/>
* The text content entities are escaped (see {@link #writeText(Appendable, CharSequence)}).
* <p/>
* The attribute arrays must each have an even length, and contain { name, value } pairs,
* where each name (an even-index element) is a {@link CharSequence}, and each value is
* obtained by calling the {@link Object#toString()} method on the value object (the
* following odd-index element). Attributes whose value object is {@literal null} are
* ignored, and only the attribute name is written for values that resolve to an empty String.
* The character entities in each String value are escaped.
*
* @param out the output Appendable
* @param tag the element name
* @param txt the text content
* @param atr the attributes arrays
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if the element name or an attribute name is {@literal null}
* @throws IllegalArgumentException if the element name is not a valid element (see
* {@link #isValidElmtName(CharSequence)}), the attribute arrays are not in the expected
* { name, value } pair format, an attribute name is not a CharSequence, or is not
* a valid attribute (see {@link #isValidAttrName(CharSequence)})
*/
public static <A extends Appendable> A writeElmt(A out, String tag, CharSequence txt, Object[]... atr) throws IOException, NullPointerException, IllegalArgumentException {
if(tag==null) throw new NullPointerException("Element names must not be null");
if(!isValidElmtName(tag))
throw new IllegalArgumentException("Element names must contain only alphanumeric characters");
out.append('<').append(tag);
writeAttr(out, atr);
if(txt==null) out.append('/').append('>');
else {
out.append('>');
writeText(out, txt);
out.append('<').append('/').append(tag).append('>');
}
return out;
}
/**
* Writes an element, its attributes and text content to an {@link Appendable}.
* <p/>
* The element name must be an alpha-numeric character sequence (see {@link #isValidElmtName(CharSequence)}).
* <p/>
* The text content entities are escaped (see {@link #writeText(Appendable, CharSequence)}).
* <p/>
* Attributes whose value object is {@literal null} are ignored, and only
* the attribute name is written for values that resolve to an empty String.
* The character entities in each String value are escaped (see {@link #writeText(Appendable, CharSequence)}).
*
* @param out the output Appendable
* @param tag the element name
* @param txt the text content
* @param atr the attributes arrays
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if the element name or an attribute name is {@literal null}
* @throws IllegalArgumentException if the element name is not a valid element (see
* {@link #isValidElmtName(CharSequence)}), the attribute arrays are not in the expected
* { name, value } pair format, an attribute name is not a CharSequence, or is not
* a valid attribute (see {@link #isValidAttrName(CharSequence)})
*/
public static <A extends Appendable> A writeElmt(A out, String tag, CharSequence txt, Map<CharSequence, Object> atr) throws IOException, NullPointerException, IllegalArgumentException {
if(tag==null) throw new NullPointerException("Element names must not be null");
if(!isValidElmtName(tag))
throw new IllegalArgumentException("Element names must contain only alphanumeric characters");
out.append('<').append(tag);
writeAttr(out, atr);
if(txt==null) out.append('/').append('>');
else {
out.append('>');
writeText(out, txt);
out.append('<').append('/').append(tag).append('>');
}
return out;
}
/**
* Writes an element's opening tag and attributes to an {@link Appendable}.
* <p/>
* The element name must be an alpha-numeric character sequence (see {@link #isValidElmtName(CharSequence)}).
* <p/>
* The attribute arrays must each have an even length, and contain { name, value } pairs,
* where each name (an even-index element) is a {@link CharSequence}, and each value is
* obtained by calling the {@link Object#toString()} method on the value object (the
* following odd-index element). Attributes whose value object is {@literal null} are
* ignored, and only the attribute name is written for values that resolve to an empty String.
* The character entities in each String value are escaped (see {@link #writeText(Appendable, CharSequence)}).
*
* @param out the output Appendable
* @param tag the element name
* @param atr the attributes arrays
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if the element name or an attribute name is {@literal null}
* @throws IllegalArgumentException if the element name is not a valid element (see
* {@link #isValidElmtName(CharSequence)}), the attribute arrays are not in the expected
* { name, value } pair format, an attribute name is not a CharSequence, or is not
* a valid attribute (see {@link #isValidAttrName(CharSequence)})
*/
public static <A extends Appendable> A writeElmtOpen(A out, String tag, Object[]... atr) throws IOException, NullPointerException, IllegalArgumentException {
if(tag==null) throw new NullPointerException("Element names must not be null");
if(!isValidElmtName(tag))
throw new IllegalArgumentException("Element names must contain only alphanumeric characters");
out.append('<').append(tag);
writeAttr(out, atr);
out.append('>');
return out;
}
/**
* Writes an element's opening tag and attributes to an {@link Appendable}.
* <p/>
* The element name must be an alpha-numeric character sequence (see {@link #isValidElmtName(CharSequence)}).
* <p/>
* Attributes whose value object is {@literal null} are ignored, and only
* the attribute name is written for values that resolve to an empty String.
* The character entities in each String value are escaped (see {@link #writeText(Appendable, CharSequence)}).
*
* @param out the output Appendable
* @param tag the element name
* @param atr the attributes arrays
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if the element name or an attribute name is {@literal null}
* @throws IllegalArgumentException if the element name is not a valid element (see
* {@link #isValidElmtName(CharSequence)}), the attribute arrays are not in the expected
* { name, value } pair format, an attribute name is not a CharSequence, or is not
* a valid attribute (see {@link #isValidAttrName(CharSequence)})
*/
public static <A extends Appendable> A writeElmtOpen(A out, String tag, Map<CharSequence, Object> atr) throws IOException, NullPointerException, IllegalArgumentException {
if(tag==null) throw new NullPointerException("Element names must not be null");
if(!isValidElmtName(tag))
throw new IllegalArgumentException("Element names must contain only alphanumeric characters");
out.append('<').append(tag);
writeAttr(out, atr);
out.append('>');
return out;
}
/**
* Writes an element's closing tag to an {@link Appendable}.
* <p/>
* The element name must be an alpha-numeric character sequence (see {@link #isValidElmtName(CharSequence)}).
*
* @param out the output Appendable
* @param tag the element name
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if the element name is {@literal null}
* @throws IllegalArgumentException if the element name is not a valid element (see
* {@link #isValidElmtName(CharSequence)})
*/
public static <A extends Appendable> A writeElmtClose(A out, String tag) throws IOException, NullPointerException, IllegalArgumentException {
if(tag==null) throw new NullPointerException("Element names must not be null");
if(!isValidElmtName(tag))
throw new IllegalArgumentException("Element names must contain only alphanumeric characters");
out.append('<').append('/').append(tag).append('>');
return out;
}
/**
* Writes the attributes of an element to an {@link Appendable}.
* <p/>
* The attribute arrays must each have an even length, and contain { name, value } pairs,
* where each name (an even-index element) is a {@link CharSequence}, and each value is
* obtained by calling the {@link Object#toString()} method on the value object (the
* following odd-index element). Attributes whose value object is {@literal null} are
* ignored, and only the attribute name is written for values that resolve to an empty String.
* The character entities in each String value are escaped (see {@link #writeText(Appendable, CharSequence)}).
*
* @param out the output Appendable
* @param atr the attribute arrays
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if an attribute name is {@literal null}
* @throws IllegalArgumentException if the attribute arrays are not in the expected
* { name, value } pair format, an attribute name is not a CharSequence, or is not
* a valid attribute (see {@link #isValidAttrName(CharSequence)})
*/
public static <A extends Appendable> A writeAttr(A out, Object[]... atr) throws IOException, NullPointerException, IllegalArgumentException {
if(atr!=null) for(Object[] at : atr) {
if(at!=null) {
// Check that we have matched name = value pairs
if((at.length&1)!=0)
throw new IllegalArgumentException("Attribute list must contain an even number of elements, as { name, value } pairs");
for(int i=0;i+1<at.length;i+=2) {
Object atn=at[i];
// Specifically trap the null case
if(atn==null) throw new NullPointerException("Attribute names must not be null");
// Must be a CharSequence (this also traps the null case)
if(!(atn instanceof CharSequence))
throw new IllegalArgumentException("Attribute names must be assignable to CharSequence");
if(!isValidAttrName((CharSequence) atn))
throw new IllegalArgumentException("Attribute names must contain only alphanumeric characters");
// Suppress attributes with null values
Object ato=at[i+1];
if(ato!=null) {
out.append(' ').append((CharSequence) atn);
String atv=ato.toString();
// Suppress equal sign for empty attributes
if(atv!=null&&!atv.isEmpty()) {
out.append('=').append('"');
// Escape character entities
writeText(out, atv);
out.append('"');
}
}
}
}
}
return out;
}
/**
* Writes the attributes of an element to an {@link Appendable}.
* <p/>
* Attributes whose value object is {@literal null} are ignored, and only
* the attribute name is written for values that resolve to an empty String.
* The character entities in each String value are escaped (see {@link #writeText(Appendable, CharSequence)}).
*
* @param out the output Appendable
* @param atr the attribute map
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
* @throws NullPointerException if an attribute name is {@literal null}
* @throws IllegalArgumentException if the attribute arrays are not in the expected
* { name, value } pair format, an attribute name is not a CharSequence, or is not
* a valid attribute (see {@link #isValidAttrName(CharSequence)})
*/
public static <A extends Appendable> A writeAttr(A out, Map<CharSequence, Object> atr) throws IOException, NullPointerException, IllegalArgumentException {
if(atr!=null) for(Map.Entry<CharSequence,Object> at : atr.entrySet()) {
CharSequence atn = at.getKey();
// Specifically trap the null case
if(atn==null) throw new NullPointerException("Attribute names must not be null");
if(!isValidAttrName(atn))
throw new IllegalArgumentException("Attribute names must contain only alphanumeric characters");
// Suppress attributes with null values
Object ato=at.getValue();
if(ato!=null) {
out.append(' ').append(atn);
String atv=ato.toString();
// Suppress equal sign for empty attributes
if(atv!=null&&!atv.isEmpty()) {
out.append('=').append('"');
// Escape character entities
writeText(out, atv);
out.append('"');
}
}
}
return out;
}
/**
* Writes an XML comment to an {@link Appendable}, escaping double-hyphen ("--").
*
* @param out the output Appendable
* @param txt the text of the comment
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
*/
public static <A extends Appendable> A writeComm(A out, CharSequence txt) throws IOException {
out.append("<!--");
// Replace double-hyphen delimiters to something safe
if(txt!=null) repl(out, txt, "--", "- -");
out.append("-->");
return out;
}
/**
* Writes an XML comment to an {@link Appendable}, escaping double-hyphen ("--").
*
* @param out the output Appendable
* @param txt the lines of text of the comment
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
*/
public static <A extends Appendable> A writeComm(A out, CharSequence[]... txt) throws IOException {
out.append("<!--");
if(txt!=null) for(CharSequence[] tt : txt) {
if(tt!=null) for(CharSequence t : tt) {
// Replace double-hyphen delimiters to something safe
if(t!=null) {
repl(out, t, "--", "- -");
out.append('\n');
}
}
}
out.append("-->");
return out;
}
/**
* Writes an XML meta-tag to an {@link Appendable}.
*
* @param out the output Appendable
* @param tag the meta name
* @param txt the text of the meta tag
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
*/
public static <A extends Appendable> A writeMeta(A out, CharSequence tag, CharSequence txt) throws IOException {
out.append("<!").append(tag);
if(txt!=null) { out.append(' '); writeText(out, txt); }
out.append(">");
return out;
}
public static final char[] XML_ENTITIES_CHARS=new char[] { '&', '<', '>', '\"', '\'' };
public static final String[] XML_ENTITIES_NAMES=new String[] { "&", "<", ">", """, "'" };
/**
* Writes text to an {@link Appendable}, escaping character entities.
* <p/>
* The character entities '&', '<', '>', '"' are replaced,
* respectively, by the strings "&","<", ">", """
*
* @param out the output Appendable
* @param txt the text to escape
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
*/
public static <A extends Appendable> A writeText(A out, CharSequence txt) throws IOException {
// Replace common entities
if(txt!=null) repl(out, txt, XML_ENTITIES_CHARS, XML_ENTITIES_NAMES);
return out;
}
/**
* Writes lines of text to an {@link Appendable}, escaping character entities.
* <p/>
* The character entities '&', '<', '>', '"' are replaced,
* respectively, by the strings "&","<", ">", """
*
* @param out the output Appendable
* @param txt the lines of text to escape
* @param <A> the Appendable type
*
* @return the output Appendable (to facilitate chaining)
*
* @throws java.io.IOException if the output Appendable generated an exception
*/
public static <A extends Appendable> A writeText(A out, CharSequence[]... txt) throws IOException {
if(txt!=null) for(CharSequence[] tt : txt) {
if(tt!=null) for(CharSequence t : tt) {
// Replace common entities
if(t!=null) {
repl(out, t, XML_ENTITIES_CHARS, XML_ENTITIES_NAMES);
out.append('\n');
}
}
}
return out;
}
public static <A extends Appendable> A write(A out, Node node) throws IOException {
if(node==null) return out;
switch(node.type()) {
case TEXT:
writeText(out,node.text());
break;
case ELEM:
if(node.iterator().hasNext()) {
writeElmtOpen(out, node.name(), node.attrs());
for(Node n : node) {
write(out, n);
}
writeElmtClose(out, node.name());
}
else {
writeElmt(out, node.name(), null, node.attrs());
}
break;
case COMM:
writeComm(out,node.text());
break;
case DATA:
// writeCData(out,node.data());
break;
}
return out;
}
/**********************************************************************************
** XML reading
**/
/**
* State-aware wrapper for a reader and current XML tag
*/
public static class Parser implements Closeable {
private final Reader in;
private int st = ' ';
private int ln = 0;
private int cn = 0;
protected Parser(Reader in) {
this.in=in;
}
/**
* The last character read.
* @return the last character read, or -1 if EOS has been reached
*/
public int last() { return st; }
/**
* Reads one character from the stream.
* @return the character read from the stream, or -1 if EOS has been reached
* @throws IOException if an I/O error occurred while reading from the stream
*/
public int next() throws IOException {
st = in.read();
// System.err.println("XML "+ln+":"+cn+"'"+((char)st)+"'");
if(st=='\n') { ln++; cn=0; }
else if(st>=0) cn ++;
return st;
}
/**
* Reads and discards all whitespace characters until a non-whitespace character is reached
* @return the last (non-whitespace) character read from the stream, or -1 if EOS has been reached
* @throws IOException if an I/O error occurred while reading from the stream
*/
public int skipWhitespace() throws IOException {
while(st>=0 && isWhiteSpace(st)) { next(); }
return st;
}
/**
* Reads into a buffer all characters until a non-text or whitespace character is found
* @return the last (non-text or whitespace) character read from the stream, or -1 if EOS has been reached
* @throws IOException if an I/O error occurred while reading from the stream
*/
public int readName(Appendable buf) throws IOException {
while(st>=0 && !isWhiteSpace(st) && st!='=' && st!='>' && st!='/' && st!='&') {
buf.append((char) st);
next();
}
return st;
}
public int readString(Appendable buf) throws IOException, FormatException {
// Acquire and skip delimiter char
int d = st;
next();
StringBuilder cc = null; //new StringBuilder();
while(st!=d) {
if(st<0) throw exception("Unterminated delimited string sequence");
// Escape sequences
/** ":" ':' <:< >:> &:& */
if(st=='&') {
if(cc==null) cc = new StringBuilder();
else cc.setLength(0);
cc.append('&');
next();
if(st<0) throw exception("Unterminated escape sequence");
while(st!=';') {
cc.append((char)st);
next();
if(st<0) throw exception("Unterminated escape sequence");
}
cc.append(';');
int c=decodeEntity(cc.toString());
if(c>=0) buf.append((char)c);
else throw exception("Illegal escape sequence '"+cc.toString()+"'");
}
else buf.append((char) st);
next();
}
return st;
}
/**
* Reads into a buffer all characters until a tag delimiter is found
* @return the last (non-text) character read from the stream, or -1 if EOS has been reached
* @throws IOException if an I/O error occurred while reading from the stream
*/
public int readText(Appendable buf) throws IOException {
StringBuilder cc = null; //new StringBuilder();
while(st!='<' && st>=0) {
if(st=='&') {
if(cc==null) cc = new StringBuilder();
else cc.setLength(0);
cc.append('&');
next();
if(st<0) throw exception("Unterminated escape sequence");
while(st!=';') {
cc.append((char)st);
next();
if(st<0) throw exception("Unterminated escape sequence");
}
cc.append(';');
int c=decodeEntity(cc.toString());
if(c>=0) buf.append((char)c);
else throw exception("Illegal escape sequence '"+cc.toString()+"'");
}
else buf.append((char) st);
next();
}
return st;
}
public int decodeEntity(String ent) {
for(int i=0;i<XML_ENTITIES_NAMES.length;i++) {
String e=XML_ENTITIES_NAMES[i];
if(e.equals(ent)) return XML_ENTITIES_CHARS[i];
}
if(ent.length()>3 && ent.charAt(1)=='#') try {
if(ent.length()>=6 && ent.charAt(2)=='x') {
return Integer.parseInt(ent.substring(3,ent.length()-1),16);
}
else if(ent.length()>=5) {
return Integer.parseInt(ent.substring(2,ent.length()-1),10);
}
}
catch(NumberFormatException e) { return -1; }
return -1;
}
protected int readElemAttr(Map<CharSequence,Object> atr) throws IOException, IllegalArgumentException {
StringBuilder name = null;
StringBuilder value = null;
// One turn of loop for each attr
while(st>=0) {
// Skip whitespace
skipWhitespace();
// End of seq/tag? done
if(st<0 || st=='>' || st=='/') return st;
// First character of name -> find end
if(name==null) name=new StringBuilder();
else name.setLength(0);
readName(name);
if(!isValidAttrName(name)) throw exception("Attribute names must contain only alphanumeric characters");
skipWhitespace();
// Empty attribute: next attribute with no intervening =, or EOL reached
if(st!='=') {
if(atr!=null) atr.put(name.toString(), null);
continue;
}
next();
skipWhitespace();
if(value==null) value = new StringBuilder();
else value.setLength(0);
if(st=='\"' || st=='\'') {
readString(value);
}
else {
readName(value);
}
if(atr!=null) atr.put(name.toString(), value.toString());
}
return st;
}
public Event readEvent() throws IOException, FormatException {
// End of stream?
if(st<0) return null;
skipWhitespace();
// Element open/close?
if(st=='<') {
next();
if(st=='!') {
next();
if(st=='-') {
next();
if(st!='-') throw exception("Malformed element opening");
StringBuilder buf = new StringBuilder();
while(st>=0 && (buf.length()<2 || buf.charAt(buf.length()-1) !='-' || buf.charAt(buf.length()-2) !='-')) {
while(st!='>'&&st>=0) {
buf.append((char) st);
next();
}
}
if(st<0) throw exception("Malformed comment block",buf);
buf.setLength(buf.length()-2);
return new Event(Event.Phase.Inline, Node.Type.COMM, null, null, buf.toString(), null);
}
else {
StringBuilder name = new StringBuilder();
readName(name);
if(!isValidElmtName(name)) throw exception("Declaration names must contain only alphanumeric characters",name);
StringBuilder buf = new StringBuilder();
while(st>=0 && st!='<' && st!='>') {
buf.append((char) st);
next();
}
next();
return new Event(Event.Phase.Inline, Node.Type.META, name.toString(), null, buf.toString(), null);
}
}
else if(st=='/') {
next();
skipWhitespace();
StringBuilder name = new StringBuilder();
readName(name);
if(!isValidElmtName(name)) throw exception("Element names must contain only alphanumeric characters",name);
skipWhitespace();
if(st!='>') throw exception("Malformed element closing");
next();
return new Event(Event.Phase.Close, Node.Type.ELEM, name.toString(), null, null, null);
}
else if(st=='?') {
next();
StringBuilder name = new StringBuilder();
readName(name);
if(!"xml".equalsIgnoreCase(name.toString())) throw exception("Invalid processing instruction target \""+name+"\"",name);
StringBuilder buf = new StringBuilder();
while(st>=0 && st!='<' && st!='>' && (buf.length()<2 || buf.charAt(buf.length()-1) !='?')) {
buf.append((char) st);
next();
}
if(st<0) throw exception("Malformed processing instruction",buf);
buf.setLength(buf.length()-1);
return new Event(Event.Phase.Inline, Node.Type.META, "?xml", null, buf.toString(), null);
}
skipWhitespace();
// Name extraction loop
StringBuilder name = new StringBuilder();
readName(name);
if(!isValidElmtName(name)) throw exception("Element names must contain only alphanumeric characters",name);
Map<CharSequence,Object> attrs = new LinkedHashMap<CharSequence,Object>();
readElemAttr(attrs);
if(st<0) throw exception("Malformed element opening");
if(st=='/') {
next();
if(st!='>') throw exception("Malformed element closing");
next();
return new Event(Event.Phase.Inline, Node.Type.ELEM, name.toString(), attrs, null, null);
}
if(st!='>') throw exception("Malformed element closing");
next();
return new Event(Event.Phase.Open, Node.Type.ELEM, name.toString(), attrs, null, null);
}
// End of stream?
else if(st<0) return null;
// Free text
else {
StringBuilder buf = new StringBuilder();
readText(buf);
return new Event(Event.Phase.Inline, Node.Type.TEXT, null, null, buf.toString(), null);
}
}
protected boolean readElemNodes(String name, List<Node> nodes) throws IOException, FormatException {
skipWhitespace();
while(st>=0) {
Node n = readNode(name);
if(n==null) return true;
nodes.add(n);
}
return false;
}
public Node readNode(String root) throws IOException, FormatException {
skipWhitespace();
if(st=='<') {
next();
if(st=='!') {
next();
if(st=='-') {
next();
if(st!='-') throw exception("Malformed element opening");
StringBuilder buf = new StringBuilder();
readText(buf);
if(!buf.toString().endsWith("--")) throw exception("Malformed comment block");
buf.setLength(buf.length()-2);
return new Comm(buf.toString());
}
else {
StringBuilder name = new StringBuilder();
readName(name);
if(!isValidElmtName(name)) throw exception("Element names must contain only alphanumeric characters");
StringBuilder buf = new StringBuilder();
while(st>=0 && st!='<' && st!='>') {
buf.append((char) st);
next();
}
return new Meta(name.toString(), buf.toString());
}
}
else if(st=='/') {
skipWhitespace();
StringBuilder name = new StringBuilder();
readName(name);
if(!isValidElmtName(name)) throw exception("Element names must contain only alphanumeric characters");
skipWhitespace();
if(st!='>') throw exception("Malformed element closing");
if(root!=null) {
if(root.equals(name.toString())) return null;
}
throw exception("Mismatched closing element",name);
}
else if(st=='?') {
StringBuilder name = new StringBuilder();
readName(name);
if(!"xml".equalsIgnoreCase(name.toString())) throw exception("Invalid processing instruction target \""+name+"\"",name);
StringBuilder buf = new StringBuilder();
while(st>=0 && st!='<' && st!='>' && (buf.length()<2 || buf.charAt(buf.length()-1) !='?')) {
buf.append((char) st);
next();
}
if(st<0) throw exception("Malformed processing instruction",buf);
buf.setLength(buf.length()-1);
return new Meta("?xml", buf.toString());
}
skipWhitespace();
// Name extraction loop
StringBuilder name = new StringBuilder();
readName(name);
if(!isValidElmtName(name)) throw exception("Element names must contain only alphanumeric characters");
Map<CharSequence,Object> attrs = new LinkedHashMap<CharSequence,Object>();
readElemAttr(attrs);
if(st<0) throw exception("Malformed element opening");
if(st=='/') {
next();
if(st!='>') throw exception("Malformed element closing");
return new Elem(name.toString(), attrs);
}
List<Node> nodes = new ArrayList<Node>();
if(!readElemNodes(name.toString(), nodes)) throw exception("Unclosed element <"+name.toString()+">");
return new Elem(name.toString(), attrs,nodes);
}
else {
StringBuilder buf = new StringBuilder();
readText(buf);
return new Text(buf.toString());
}
}
/**
* Create a new format exception.
*
* @return a FormatException indicating line and column numbers
*/
public FormatException exception(String msg) {
return new FormatException(msg + " at ln:"+ln+",cn:"+cn+" near '"+(char)st+"'", ln, cn, null);
}
/**
* Create a new format exception.
*
* @return a FormatException indicating line and column numbers
*/
public FormatException exception(String msg, CharSequence ctx) {
return new FormatException(msg + " at ln:"+ln+",cn:"+cn+" near '"+(char)st+"': "+ctx, ln, cn, ctx.toString());
}
/**
* Create a new format exception.
*
* @return a FormatException indicating line and column numbers
*/
public FormatException exception(String msg, CharSequence ctx, Throwable exc) {
return new FormatException(msg + " at ln:"+ln+",cn:"+cn+" near '"+(char)st+"': "+ctx, ln, cn, ctx.toString(), exc);
}
@Override
public void close() throws IOException {
in.close();
}
}
public static Node read(Reader in) throws IOException, FormatException {
Parser p = new Parser(in);
return p.readNode(null);
}
public static Node read(InputStream in) throws IOException, FormatException {
Map<String,String> par = readPrologue(in);
String cn = par.get("encoding");
Charset cs = cn==null?Charset.defaultCharset():Charset.forName(cn);
Parser p = new Parser(new InputStreamReader(in, cs));
return p.readNode(null);
}
public static Parser parse(Reader in) throws IOException, FormatException {
return new Parser(in);
}
public static Parser parse(InputStream in) throws IOException, FormatException {
Map<String,String> par = readPrologue(in);
String cn = par.get("encoding");
Charset cs = cn==null?Charset.defaultCharset():Charset.forName(cn);
return new Parser(new InputStreamReader(in, cs));
}
/**
* Read the XML prologue form an input stream.
* <p/>
* This method does not rely on a Reader because this is where the XML
* encoding is specified, which is needed to create the right Reader.
* <p/>
* It reads character by character, thus at the end of a successful invocation,
* the stream will be left pointing at the beginning of the document, or
* !DOCTYPE declaration if present.
*
* @param in an input stream
* @return a parameter map
* @throws IOException
*/
public static Map<String,String> readPrologue(InputStream in) throws IOException {
// We need this method not to rely on a parser because this is where we detect file encoding,
// which is needed to create the right Reader
// '<?xml' version="1.0" encoding="UTF-8"? SDDecl? S? '?>'
Map<String,String> atr = new HashMap<String,String>();
int st = in.read();
if(st!='<') throw new FormatException("Invalid prologue",0,0,"");
st = in.read();
if(st!='?') throw new FormatException("Invalid prologue",0,0,"");
st = in.read();
if(st!='x') throw new FormatException("Invalid prologue",0,0,"");
st = in.read();
if(st!='m') throw new FormatException("Invalid prologue",0,0,"");
st = in.read();
if(st!='l') throw new FormatException("Invalid prologue",0,0,"");
st = in.read();
while(st>=0) {
// Skip whitespace
while(st>=0 && isWhiteSpace(st)) { st = in.read(); }
// End of seq/tag? done
if(st<0) throw new FormatException("Unterminated prologue",0,0,"");
if(st=='?') {
st = in.read();
if(st<0) throw new FormatException("Unterminated prologue",0,0,"");
if(st!='>') throw new FormatException("Unterminated prologue",0,0,"");
break;
}
// First character of name -> find end
StringBuilder name = new StringBuilder();
while(st>=0 && !isWhiteSpace(st) && st!='=' && st!='>' && st!='?') {
name.append((char) st);
st = in.read();
}
if(!isValidPrologue(name)) throw new FormatException("Invalid prologue declaration",0,0,name.toString());
while(st>=0 && isWhiteSpace(st)) { st = in.read(); }
// Empty attribute: next attribute with no intervening =, or EOL reached
if(st!='=') throw new FormatException("Invalid prologue declaration",0,0,name.toString());
st = in.read();
while(st>=0 && isWhiteSpace(st)) { st = in.read(); }
StringBuilder buf = new StringBuilder();
if(st=='\"' || st=='\'') {
int d = st;
st = in.read();
while(st!=d) {
if(st<0) throw new FormatException("Unterminated prologue",0,0,"");
// Escape sequences
if(st=='\\') {
st = in.read();
if(st<0) throw new FormatException("Unterminated prologue",0,0,"");
}
buf.append((char) st);
st = in.read();
}
st = in.read();
}
else {
while(st>=0 && !isWhiteSpace(st) && st!='=' && st!='>' && st!='/') {
buf.append((char) st);
st = in.read();
}
}
atr.put(name.toString(), buf.toString());
}
return atr;
}
/**********************************************************************************
** XML syntax checks
**/
/**
* Checks whether a CharSequence is a valid XML attribute name.
* <p/>
* A valid name is a sequence of the characters [-+._A-Za-z].
*
* @param name the name to check
*
* @return {@literal true} if the name is valid
*/
public static boolean isValidAttrName(CharSequence name) {
final int len=name.length();
for(int i=0;i<len;i++) {
char c=name.charAt(i);
if(c=='_'||c=='+'||c=='-'||c=='.') continue;
if('0'<=c||c<='9') continue;
if('A'<=c||c<='Z') continue;
if('a'<=c||c<='z') continue;
return false;
}
return true;
}
/**
* Checks whether a character is legal in a XML attribute name.
* <p/>
* A legal character is in [-+._A-Za-z].
*
* @param c the character to check
* @return {@literal true} if the character is legal
*/
public static boolean isValidAttrChar(int c) {
if(c=='_'||c=='+'||c=='-'||c=='.') return true;
if('0'<=c&&c<='9') return true;
if('A'<=c&&c<='Z') return true;
if('a'<=c&&c<='z') return true;
return false;
}
public static boolean isValidPrologue(CharSequence name) {
return "version".contentEquals(name)||"encoding".contentEquals(name)||"standalone".contentEquals(name);
}
/**
* Checks whether a CharSequence is a valid XML element name.
* <p/>
* A valid name is a sequence of the characters [-+._A-Za-z].
*
* @param name the name to check
*
* @return {@literal true} if the name is valid
*/
public static boolean isValidElmtName(CharSequence name) {
final int len=name.length();
for(int i=0;i<len;i++) {
char c=name.charAt(i);
if(c=='_'||c=='+'||c=='-'||c=='.') continue;
if('0'<=c||c<='9') continue;
if('A'<=c||c<='Z') continue;
if('a'<=c||c<='z') continue;
return false;
}
return true;
}
/**
* Checks whether a character is legal in a XML element name.
* <p/>
* A legal character is in [-+._A-Za-z].
*
* @param c the character to check
* @return {@literal true} if the character is legal
*/
public static boolean isValidElmtChar(int c) {
if(c=='_'||c=='+'||c=='-'||c=='.') return true;
if('0'<=c&&c<='9') return true;
if('A'<=c&&c<='Z') return true;
if('a'<=c&&c<='z') return true;
return false;
}
/**
* Checks whether a character is white-space
* <p/>
*
* @param c the character to check
* @return {@literal true} if the character is whitespace
*/
public static boolean isWhiteSpace(int c) {
return c==' '||c=='\n'||c=='\r'||c=='\t';
}
protected static <A extends Appendable> A repl(A buf, CharSequence str, CharSequence pat, CharSequence rep) throws IOException {
final int lp=pat.length();
final int ls=str.length();
// Pattern finding loop
find:
for(int i=0;i<ls;) {
// Look for a local match starting at i
for(int j=0;j<lp;j++) {
if(i+j>=ls||str.charAt(i+j)!=pat.charAt(j)) {
// No match => add current char, restart match
buf.append(str.charAt(i++));
continue find;
}
}
// Match => add replacement
if(rep!=null) buf.append(rep);
i+=lp;
}
return buf;
}
protected static <A extends Appendable> A repl(A buf, CharSequence str, CharSequence[] pat, CharSequence[] rep) throws IOException {
final int np=pat.length;
final int ls=str.length();
// Pattern finding loop
find:
for(int i=0;i<ls;) {
match:
for(int k=0;k<np;k++) {
CharSequence sp=pat[k];
final int lp=sp.length();
// Look for a local match starting at i
for(int j=0;j<lp;j++) {
if(i+j>=ls||str.charAt(i+j)!=sp.charAt(j)) {
// No match for pattern => go to next pattern
continue match;
}
}
// Match => add replacement, skip pattern
if(rep!=null) buf.append(rep[k]);
i+=lp;
continue find;
}
// No match => add current char, restart match
buf.append(str.charAt(i++));
}
return buf;
}
protected static <A extends Appendable> A repl(A buf, CharSequence str, char[] pat, CharSequence[] rep) throws IOException {
final int np=pat.length;
final int ls=str.length();
// Pattern finding loop
find:
for(int i=0;i<ls;i++) {
char c = str.charAt(i);
for(int k=0;k<np;k++) {
if(pat[k]==c) {
// Match => add replacement, skip pattern
if(rep!=null) buf.append(rep[k]);
continue find;
}
}
// No match => add current char, restart match
buf.append(c);
}
return buf;
}
/**********************************************************************************
** XML document object model
**/
public static class Event {
public static enum Phase { Inline, Open, Close }
protected final Event.Phase phase;
protected final Node.Type type;
protected final String name;
protected final String text;
protected final byte[] data;
protected final Map<CharSequence,Object> attr;
public Event(Phase phase, Node.Type type, String name, Map<CharSequence,Object> attr, String text, byte[] data) {
this.phase=phase;
this.type=type;
this.text=text;
this.name=name;
this.data=data;
this.attr=attr;
}
public Event.Phase phase() { return phase; }
public Node.Type type() { return type; }
public String name() { return name; }
public Map<CharSequence, Object> attr() { return attr; }
public String text() { return text; }
public byte[] data() { return data; }
public String toString() {
StringBuilder buf=new StringBuilder();
switch(type) {
case TEXT:
try { writeText(buf, text); } catch(IOException e) { /* ignore */ }
break;
case ELEM:
switch(phase) {
case Inline:
try { writeElmt(buf,name,null,attr); } catch(IOException e) { /* ignore */ }
break;
case Open:
try { writeElmtOpen(buf, name, attr); } catch(IOException e) { /* ignore */ }
break;
case Close:
try { writeElmtClose(buf, name); } catch(IOException e) { /* ignore */ }
break;
}
break;
case COMM:
try { writeComm(buf, text); } catch(IOException e) { /* ignore */ }
break;
case META:
try { writeMeta(buf, name, text); } catch(IOException e) { /* ignore */ }
break;
case DATA:
buf.append("<![CDATA["+"]]>");
// try { writeCData(buf,text); } catch(IOException e) { /* ignore */ }
break;
}
return buf.toString();
}
}
public static abstract class Node implements Iterable<Node> {
public static enum Type {
TEXT, ELEM, COMM, META, DATA
}
protected final Type type;
protected Node(Type type) { this.type=type; }
public Type type() { return type; }
public String name() { return null; }
public String text() { return null; }
public byte[] data() { return null; }
public Map<CharSequence, Object> attrs() { return null; }
public Iterator<Node> iterator() {
return new Iterator<Node>() {
public boolean hasNext() { return false; }
public Node next() { throw new NoSuchElementException(); }
public void remove() { throw new UnsupportedOperationException(); }
};
}
}
public static class Text extends Node {
protected final String text;
public Text(String text) {
super(Type.TEXT);
this.text=text;
}
public String text() { return text; }
}
public static class Comm extends Node {
protected final String text;
public Comm(String text) {
super(Type.TEXT);
this.text=text;
}
public String text() { return text; }
}
public static class Elem extends Node {
protected final String name;
protected final List<Node> nodes;
protected final Map<CharSequence,Object> attrs;
public Elem(String name, Map<CharSequence,Object> attrs) {
super(Type.ELEM);
this.name=name;
this.attrs=attrs;
this.nodes=null;
}
public Elem(String name, Map<CharSequence,Object> attrs, List<Node> nodes) {
super(Type.ELEM);
this.name=name;
this.attrs=attrs;
this.nodes=nodes;
}
public String name() { return name; }
public Iterator<Node> iterator() {
return nodes==null?super.iterator():nodes.iterator();
}
}
public static class Meta extends Node {
protected final String name;
protected final String text;
protected final List<Meta> nodes;
public Meta(String name, String text) {
super(Type.META);
this.name=name;
this.text=text;
this.nodes=null;
}
}
}