/*
GeoGebra - Dynamic Mathematics for Everyone
http://www.geogebra.org

This file is part of GeoGebra.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation.

*/

/*
 * Quick and dirty XML parser. Java Tip 128
 * http://www.javaworld.com/javaworld/javatips/jw-javatip128.html
 *
 * Some optimizations by Markus Hohenwarter, 19.11.2004
 */

package org.geogebra.common.io;

import java.io.Reader;
import java.util.LinkedHashMap;
import java.util.Stack;

import org.geogebra.common.util.StringUtil;

/**
 * Quick and Dirty xml parser. This parser is, like the SAX parser, an event
 * based parser, but with much less functionality.
 *
 * <p>
 * The parser is a character-driven state machine: the current state is kept
 * in a local {@code mode} variable and the state to resume after a nested
 * construct (tag, entity, comment, ...) is kept on a small stack. Instances
 * reuse their internal buffers between runs, so a single instance must not be
 * used for two parses at the same time.
 */
public class QDParser {
	/** Attributes of the tag currently being read; reused for every tag. */
	private LinkedHashMap<String, String> attrs;
	/** Modes to return to once the current nested construct is finished. */
	private Stack<Integer> stack;
	/** sb collects text, names and values; etag collects an entity name. */
	private StringBuilder sb, etag;

	/**
	 * Creates new parser
	 */
	public QDParser() {
		attrs = new LinkedHashMap<String, String>();
		stack = new Stack<Integer>();
		sb = new StringBuilder();
		etag = new StringBuilder();
	}

	/**
	 * Resets the parser: forgets all collected attributes and pending modes
	 * and replaces the internal buffers with fresh ones.
	 */
	public void reset() {
		attrs.clear();
		stack.clear();
		sb = new StringBuilder();
		etag = new StringBuilder();
	}

	/**
	 * @param st
	 *            stack of saved modes
	 * @return the most recently saved mode, or PRE if nothing was saved
	 */
	private static int popMode(Stack<Integer> st) {
		if (!st.empty()) {
			return st.pop().intValue();
		}
		return PRE;
	}

	private final static int TEXT = 1, ENTITY = 2, OPEN_TAG = 3, CLOSE_TAG = 4,
			START_TAG = 5, ATTRIBUTE_LVALUE = 6, ATTRIBUTE_EQUAL = 9,
			ATTRIBUTE_RVALUE = 10, QUOTE = 7, IN_TAG = 8, SINGLE_TAG = 12,
			COMMENT = 13, DONE = 11, DOCTYPE = 14, PRE = 15, CDATA = 16;

	/**
	 * Reads the whole document from the reader and reports it to the handler
	 * as a sequence of events. Parsing stops as soon as the root element has
	 * been closed; anything after it is not read.
	 *
	 * <p>
	 * The attribute map handed to {@code startElement} is reused and cleared
	 * right after the call returns, so the handler has to copy whatever it
	 * wants to keep.
	 *
	 * @param doc
	 *            handler that receives document events
	 * @param r
	 *            source of XML data
	 * @throws Exception
	 *             if XML is not valid
	 */
	final public void parse(DocHandler doc, Reader r) throws Exception {
		stack.clear();
		sb.setLength(0);
		etag.setLength(0);
		attrs.clear();

		int depth = 0;
		int mode = PRE;
		int c;
		int quotec = '"';
		String tagName = null;
		String lvalue = null;
		int line = 1, col = 0;
		boolean eol = false;

		doc.startDocument();

		while ((c = r.read()) != -1) {

			// We need to map \r, \r\n, and \n to \n
			// See XML spec section 2.11
			if (eol) {
				eol = false;
				if (c == '\n') {
					// second half of \r\n, already reported as \n
					continue;
				}
			}
			// A character following a lone \r is normalised and counted like
			// any other, so "\r\r" yields two line breaks.
			if (c == '\n') {
				line++;
				col = 0;
			} else if (c == '\r') {
				eol = true;
				c = '\n';
				line++;
				col = 0;
			} else {
				col++;
			}

			switch (mode) {
			case DONE:
				doc.endDocument();
				return;

			// We are between tags collecting text.
			case TEXT:
				switch (c) {
				case '<':
					stack.push(Integer.valueOf(mode));
					mode = START_TAG;
					if (sb.length() > 0) {
						doc.text(sb.toString());
						sb.setLength(0);
					}
					break;
				case '&':
					stack.push(Integer.valueOf(mode));
					mode = ENTITY;
					etag.setLength(0);
					break;
				default:
					sb.append((char) c);
				}
				break;

			// we are processing a closing tag: e.g. </foo>
			case CLOSE_TAG:
				if (c == '>') {
					mode = popMode(stack);
					tagName = sb.toString();
					sb.setLength(0);
					depth--;
					if (depth == 0) {
						mode = DONE;
					}
					doc.endElement(tagName);
				} else {
					sb.append((char) c);
				}
				break;

			// we are processing CDATA
			case CDATA:
				if (c == '>' && endsWith(sb, "]]")) {
					sb.setLength(sb.length() - 2);
					doc.text(sb.toString());
					sb.setLength(0);
					mode = popMode(stack);
				} else {
					sb.append((char) c);
				}
				break;

			// we are processing a comment. We are inside
			// the <!-- .... --> looking for the -->.
			case COMMENT:
				if (c == '>' && endsWith(sb, "--")) {
					sb.setLength(0);
					mode = popMode(stack);
				} else {
					sb.append((char) c);
				}
				break;

			// We are outside the root tag element
			case PRE:
				if (c == '<') {
					stack.push(Integer.valueOf(TEXT));
					mode = START_TAG;
				}
				break;

			// We are inside one of these <? ... ?>
			// or one of these <!DOCTYPE ... >
			case DOCTYPE:
				if (c == '>') {
					mode = popMode(stack);
					// Before the root element there is no text to collect.
					// Inside an element we go back to collecting text so
					// that nothing after a processing instruction is lost.
					if (mode == TEXT && depth == 0) {
						mode = PRE;
					}
				}
				break;

			// we have just seen a < and
			// are wondering what we are looking at
			// <foo>, </foo>, <!-- ... --->, etc.
			case START_TAG:
				mode = popMode(stack);
				// Every branch saves the mode to resume afterwards.
				stack.push(Integer.valueOf(mode));
				switch (c) {
				case '/':
					mode = CLOSE_TAG;
					break;
				case '?':
					mode = DOCTYPE;
					break;
				default:
					mode = OPEN_TAG;
					tagName = null;
					sb.append((char) c);
				}
				break;

			// we are processing an entity, e.g. &lt;, &#187;, etc.
			case ENTITY:
				if (c == ';') {
					mode = popMode(stack);
					String cent = etag.toString();
					etag.setLength(0);
					appendEntity(sb, cent, line, col);
				} else {
					etag.append((char) c);
				}
				break;

			// we have just seen something like this:
			// <foo a="b"/
			// and are looking for the final >.
			case SINGLE_TAG:
				if (tagName == null) {
					tagName = sb.toString();
				}
				if (c != '>') {
					exc("Expected > for tag: <" + tagName + "/>", line, col);
				}
				doc.startElement(tagName, attrs);
				doc.endElement(tagName);
				if (depth == 0) {
					// an empty root element is the whole document
					doc.endDocument();
					return;
				}
				sb.setLength(0);
				attrs.clear();
				tagName = null;
				mode = popMode(stack);
				break;

			// we are processing something
			// like this <foo ... >. It could
			// still be a <!-- ... --> or something.
			case OPEN_TAG:
				if (c == '>') {
					if (tagName == null) {
						tagName = sb.toString();
					}
					sb.setLength(0);
					depth++;
					doc.startElement(tagName, attrs);
					tagName = null;
					attrs.clear();
					mode = popMode(stack);
				} else if (c == '/') {
					mode = SINGLE_TAG;
				} else if (c == '-' && sb.toString().equals("!-")) {
					mode = COMMENT;
				} else if (c == '[' && sb.toString().equals("![CDATA")) {
					mode = CDATA;
					sb.setLength(0);
				} else if (c == 'E' && sb.toString().equals("!DOCTYP")) {
					sb.setLength(0);
					mode = DOCTYPE;
				} else if (StringUtil.isWhitespace((char) c)) {
					tagName = sb.toString();
					sb.setLength(0);
					mode = IN_TAG;
				} else {
					// Includes '-', '[' and 'E' that did not complete one of
					// the markers above: they are ordinary name characters.
					sb.append((char) c);
				}
				break;

			// We are processing the quoted right-hand side
			// of an element's attribute.
			case QUOTE:
				if (c == quotec) {
					attrs.put(lvalue, sb.toString());
					sb.setLength(0);
					mode = IN_TAG;
				} else if (c == '&') {
					stack.push(Integer.valueOf(mode));
					mode = ENTITY;
					etag.setLength(0);
				} else {
					// Whitespace is deliberately not normalised (XML spec
					// section 3.3.3): all characters within quotes are
					// kept, including newlines.
					sb.append((char) c);
				}
				break;

			// we have seen name= and are waiting for the opening quote
			case ATTRIBUTE_RVALUE:
				if (c == '"' || c == '\'') {
					quotec = c;
					mode = QUOTE;
				} else if (!StringUtil.isWhitespace((char) c)) {
					exc("Error in attribute processing", line, col);
				}
				break;

			// we are reading an attribute name
			case ATTRIBUTE_LVALUE:
				if (StringUtil.isWhitespace((char) c)) {
					lvalue = sb.toString();
					sb.setLength(0);
					mode = ATTRIBUTE_EQUAL;
				} else if (c == '=') {
					lvalue = sb.toString();
					sb.setLength(0);
					mode = ATTRIBUTE_RVALUE;
				} else {
					sb.append((char) c);
				}
				break;

			// attribute name is complete, we are waiting for the =
			case ATTRIBUTE_EQUAL:
				if (c == '=') {
					mode = ATTRIBUTE_RVALUE;
				} else if (!StringUtil.isWhitespace((char) c)) {
					exc("Error in attribute processing.", line, col);
				}
				break;

			// we are inside a tag, after its name or after an attribute
			case IN_TAG:
				if (c == '>') {
					mode = popMode(stack);
					doc.startElement(tagName, attrs);
					depth++;
					tagName = null;
					attrs.clear();
				} else if (c == '/') {
					mode = SINGLE_TAG;
				} else if (!StringUtil.isWhitespace((char) c)) {
					mode = ATTRIBUTE_LVALUE;
					sb.append((char) c);
				}
				break;

			default:
				break;
			}
		}

		if (mode == DONE) {
			doc.endDocument();
		} else {
			exc("missing end tag", line, col);
		}
	}

	/**
	 * Appends the replacement for an entity or character reference.
	 *
	 * @param target
	 *            buffer that receives the replacement text
	 * @param name
	 *            text found between &amp; and ; (may be empty)
	 * @param line
	 *            current line, for error messages
	 * @param col
	 *            current column, for error messages
	 * @throws Exception
	 *             if the entity is unknown or the reference is malformed
	 */
	private static void appendEntity(StringBuilder target, String name,
			int line, int col) throws Exception {
		if ("lt".equals(name)) {
			target.append('<');
		} else if ("gt".equals(name)) {
			target.append('>');
		} else if ("amp".equals(name)) {
			target.append('&');
		} else if ("quot".equals(name)) {
			target.append('"');
		} else if ("apos".equals(name)) {
			target.append('\'');
		} else if (name.startsWith("#")) {
			appendCharacterReference(target, name, line, col);
			// Insert custom entity definitions here
		} else {
			// also reached for the empty name in "&;"
			exc("Unknown entity: &" + name + ";", line, col);
		}
	}

	/**
	 * Appends the character denoted by a numeric reference such as #187 or
	 * #xBB. Code points above U+FFFF are appended as a surrogate pair.
	 *
	 * @param target
	 *            buffer that receives the character
	 * @param name
	 *            reference without &amp; and ;, starting with #
	 * @param line
	 *            current line, for error messages
	 * @param col
	 *            current column, for error messages
	 * @throws Exception
	 *             if the digits are malformed or not a valid code point
	 */
	private static void appendCharacterReference(StringBuilder target,
			String name, int line, int col) throws Exception {
		boolean hex = name.startsWith("#x");
		String digits = name.substring(hex ? 2 : 1);
		int codePoint;
		try {
			codePoint = Integer.parseInt(digits, hex ? 16 : 10);
		} catch (NumberFormatException e) {
			throw new Exception(
					at("Invalid character reference: &" + name + ";", line,
							col),
					e);
		}
		if (!Character.isValidCodePoint(codePoint)) {
			exc("Invalid character reference: &" + name + ";", line, col);
		}
		target.append(Character.toChars(codePoint));
	}

	/**
	 * @param buffer
	 *            text to inspect
	 * @param suffix
	 *            expected ending
	 * @return whether the buffer ends with the suffix; does not copy the
	 *         buffer
	 */
	private static boolean endsWith(StringBuilder buffer, String suffix) {
		int offset = buffer.length() - suffix.length();
		if (offset < 0) {
			return false;
		}
		for (int i = 0; i < suffix.length(); i++) {
			if (buffer.charAt(offset + i) != suffix.charAt(i)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * @param s
	 *            error description
	 * @param line
	 *            line of the error
	 * @param col
	 *            column of the error
	 * @return description followed by the position
	 */
	private static String at(String s, int line, int col) {
		return s + " near line " + line + ", column " + col;
	}

	/**
	 * Always throws an exception describing a parse error and its position.
	 */
	private static void exc(String s, int line, int col) throws Exception {
		throw new Exception(at(s, line, col));
	}
}