QDParser.java example

Explorer
geogebra-master
/* 
 GeoGebra - Dynamic Mathematics for Everyone
 http://www.geogebra.org

 This file is part of GeoGebra.

 This program is free software; you can redistribute it and/or modify it 
 under the terms of the GNU General Public License as published by 
 the Free Software Foundation.
 
 */

/*
 * Quick and dirty XML parser. Java Tip 128
 * http://www.javaworld.com/javaworld/javatips/jw-javatip128.html
 * 
 * Some optimizations by Markus Hohenwarter, 19.11.2004
 */

package org.geogebra.common.io;

import java.io.Reader;
import java.util.LinkedHashMap;
import java.util.Stack;

import org.geogebra.common.util.StringUtil;

/**
 * Quick and dirty XML parser. Like a SAX parser it is event based: characters
 * are read one at a time from a {@link Reader} and document start/end, element
 * start/end and text events are reported to a {@link DocHandler}. It
 * understands far less of XML than a SAX parser does.
 * <p>
 * The internal buffers, the mode stack and the attribute map are fields that
 * every call to {@link #parse(DocHandler, Reader)} on the same instance
 * reuses. The attribute map handed to {@code DocHandler.startElement} is that
 * shared map and is cleared right after the callback returns, so a handler
 * that needs the attributes later has to copy them.
 */
public class QDParser {

	// Parser states. The numeric values carry no meaning beyond identity.
	private static final int TEXT = 1;
	private static final int ENTITY = 2;
	private static final int OPEN_TAG = 3;
	private static final int CLOSE_TAG = 4;
	private static final int START_TAG = 5;
	private static final int ATTRIBUTE_LVALUE = 6;
	private static final int QUOTE = 7;
	private static final int IN_TAG = 8;
	private static final int ATTRIBUTE_EQUAL = 9;
	private static final int ATTRIBUTE_RVALUE = 10;
	private static final int DONE = 11;
	private static final int SINGLE_TAG = 12;
	private static final int COMMENT = 13;
	private static final int DOCTYPE = 14;
	private static final int PRE = 15;
	private static final int CDATA = 16;

	/** Largest valid Unicode code point. */
	private static final int MAX_CODE_POINT = 0x10FFFF;

	/** Attributes of the tag currently being read; reused for every tag. */
	private final LinkedHashMap<String, String> attrs;
	/** Modes to return to once the current construct is finished. */
	private final Stack<Integer> stack;
	/** Collects text, tag names, attribute names and attribute values. */
	private StringBuilder sb;
	/** Collects the name of the entity currently being read. */
	private StringBuilder etag;

	/**
	 * Creates new parser
	 */
	public QDParser() {
		attrs = new LinkedHashMap<String, String>();
		stack = new Stack<Integer>();
		sb = new StringBuilder();
		etag = new StringBuilder();
	}

	/**
	 * Resets the parser: empties the attribute map and the mode stack and
	 * replaces both character buffers with fresh ones.
	 */
	public void reset() {
		attrs.clear();
		stack.clear();
		sb = new StringBuilder();
		etag = new StringBuilder();
	}

	/**
	 * @param st
	 *            mode stack
	 * @return the mode popped from the stack, or PRE if the stack is empty
	 */
	private static int popMode(Stack<Integer> st) {
		if (!st.empty()) {
			return st.pop().intValue();
		}
		return PRE;
	}

	/**
	 * Parses the XML read from {@code r} and reports it to {@code doc}.
	 * Parsing stops after the root element has been closed; anything that
	 * follows it in the reader is not processed.
	 * 
	 * @param doc
	 *            handler that receives document events
	 * @param r
	 *            source of XML data
	 * @throws Exception
	 *             if XML is not valid
	 */
	public final void parse(DocHandler doc, Reader r) throws Exception {
		stack.clear();
		sb.setLength(0);
		etag.setLength(0);
		attrs.clear();

		int depth = 0;
		int mode = PRE;
		int c;
		int quotec = '"';
		String tagName = null;
		String lvalue = null;

		doc.startDocument();
		int line = 1;
		int col = 0;
		// true when the previous character was a \r (already reported as \n)
		boolean eol = false;
		while ((c = r.read()) != -1) {

			// We need to map \r, \r\n, and \n to \n
			// See XML spec section 2.11
			if (eol) {
				eol = false;
				if (c == '\n') {
					// second half of a \r\n pair
					continue;
				}
			}
			if (c == '\n') {
				line++;
				col = 0;
			} else if (c == '\r') {
				eol = true;
				c = '\n';
				line++;
				col = 0;
			} else {
				col++;
			}

			switch (mode) {
			// The root element was closed and there is trailing input.
			case DONE:
				doc.endDocument();
				return;

			// We are between tags collecting text.
			case TEXT:
				switch (c) {
				case '<':
					stack.push(Integer.valueOf(mode));
					mode = START_TAG;
					if (sb.length() > 0) {
						doc.text(sb.toString());
						sb.setLength(0);
					}
					break;
				case '&':
					stack.push(Integer.valueOf(mode));
					mode = ENTITY;
					etag.setLength(0);
					break;
				default:
					sb.append((char) c);
				}
				break;

			// we are processing a closing tag: e.g. </foo>
			case CLOSE_TAG:
				switch (c) {
				case '>':
					mode = popMode(stack);
					tagName = sb.toString();
					sb.setLength(0);
					depth--;
					if (depth == 0) {
						mode = DONE;
					}
					doc.endElement(tagName);
					break;
				default:
					sb.append((char) c);
				}
				break;

			// we are processing CDATA
			case CDATA:
				if (c == '>' && sb.toString().endsWith("]]")) {
					sb.setLength(sb.length() - 2);
					doc.text(sb.toString());
					sb.setLength(0);
					mode = popMode(stack);
				} else {
					sb.append((char) c);
				}
				break;

			// we are processing a comment. We are inside
			// the <!-- .... --> looking for the -->.
			case COMMENT:
				if (c == '>' && sb.toString().endsWith("--")) {
					sb.setLength(0);
					mode = popMode(stack);
				} else {
					sb.append((char) c);
				}
				break;

			// We are outside the root tag element
			case PRE:
				if (c == '<') {
					stack.push(Integer.valueOf(TEXT));
					mode = START_TAG;
				}
				break;

			// We are inside one of these <? ... ?>
			// or one of these <!DOCTYPE ... >
			case DOCTYPE:
				if (c == '>') {
					mode = popMode(stack);
					if (mode == TEXT) {
						mode = PRE;
					}
				}
				break;

			// we have just seen a < and
			// are wondering what we are looking at
			// <foo>, </foo>, <!-- ... --->, etc.
			case START_TAG:
				mode = popMode(stack);
				switch (c) {
				case '/':
					stack.push(Integer.valueOf(mode));
					mode = CLOSE_TAG;
					break;
				case '?':
					mode = DOCTYPE;
					break;
				default:
					stack.push(Integer.valueOf(mode));
					mode = OPEN_TAG;
					tagName = null;
					sb.append((char) c);
				}
				break;

			// we are processing an entity, e.g. &lt; or &#187;
			case ENTITY:
				if (c == ';') {
					mode = popMode(stack);
					String cent = etag.toString();
					etag.setLength(0);
					if ("lt".equals(cent)) {
						sb.append('<');
					} else if ("gt".equals(cent)) {
						sb.append('>');
					} else if ("amp".equals(cent)) {
						sb.append('&');
					} else if ("quot".equals(cent)) {
						sb.append('"');
					} else if ("apos".equals(cent)) {
						sb.append('\'');
					} else if (cent.startsWith("#")) {
						// appendCodePoint keeps characters above U+FFFF
						// intact instead of truncating them to one char
						sb.appendCodePoint(parseCharRef(cent, line, col));
					// Insert custom entity definitions here
					} else {
						// also reached for the empty entity "&;"
						exc("Unknown entity: &" + cent + ";", line, col);
					}
				} else {
					etag.append((char) c);
				}
				break;

			// we have just seen something like this:
			// <foo a="b"/
			// and are looking for the final >.
			case SINGLE_TAG:
				if (tagName == null) {
					tagName = sb.toString();
				}
				if (c != '>') {
					exc("Expected > for tag: <" + tagName + "/>", line, col);
				}
				doc.startElement(tagName, attrs);
				doc.endElement(tagName);
				if (depth == 0) {
					// the empty element was the root element
					doc.endDocument();
					return;
				}
				sb.setLength(0);
				attrs.clear();
				tagName = null;
				mode = popMode(stack);
				break;

			// we are processing something
			// like this <foo ... >. It could
			// still be a <!-- ... --> or something.
			case OPEN_TAG:
				switch (c) {
				case '>':
					if (tagName == null) {
						tagName = sb.toString();
					}
					sb.setLength(0);
					depth++;
					doc.startElement(tagName, attrs);
					tagName = null;
					attrs.clear();
					mode = popMode(stack);
					break;

				case '/':
					mode = SINGLE_TAG;
					break;

				case '-':
					if (sb.toString().equals("!-")) {
						mode = COMMENT;
					} else {
						sb.append((char) c);
					}
					break;

				case '[':
					if (sb.toString().equals("![CDATA")) {
						mode = CDATA;
						sb.setLength(0);
					} else {
						// not the end of "<![CDATA[" yet: keep the character,
						// otherwise the first '[' is lost and CDATA is never
						// recognised
						sb.append((char) c);
					}
					break;

				case 'E':
					if (sb.toString().equals("!DOCTYP")) {
						sb.setLength(0);
						mode = DOCTYPE;
					} else {
						// an ordinary 'E' inside a tag name
						sb.append((char) c);
					}
					break;

				default:
					if (StringUtil.isWhitespace((char) c)) {
						tagName = sb.toString();
						sb.setLength(0);
						mode = IN_TAG;
					} else {
						sb.append((char) c);
					}
				}
				break;

			// We are processing the quoted right-hand side
			// of an element's attribute.
			case QUOTE:
				if (c == quotec) {
					attrs.put(lvalue, sb.toString());
					sb.setLength(0);
					mode = IN_TAG;
				} else if (c == '&') {
					stack.push(Integer.valueOf(mode));
					mode = ENTITY;
					etag.setLength(0);
				} else {
					// Whitespace (including newlines) inside quotes is kept
					// as is; the normalization of XML spec section 3.3.3 is
					// deliberately not applied.
					sb.append((char) c);
				}
				break;

			// We have seen the = of an attribute and wait for the
			// opening quote of its value.
			case ATTRIBUTE_RVALUE:
				if (c == '"' || c == '\'') {
					quotec = c;
					mode = QUOTE;
				} else if (!StringUtil.isWhitespace((char) c)) {
					exc("Error in attribute processing", line, col);
				}
				break;

			// We are collecting an attribute name.
			case ATTRIBUTE_LVALUE:
				if (StringUtil.isWhitespace((char) c)) {
					lvalue = sb.toString();
					sb.setLength(0);
					mode = ATTRIBUTE_EQUAL;
				} else if (c == '=') {
					lvalue = sb.toString();
					sb.setLength(0);
					mode = ATTRIBUTE_RVALUE;
				} else {
					sb.append((char) c);
				}
				break;

			// The attribute name was followed by whitespace; only more
			// whitespace or the = may follow.
			case ATTRIBUTE_EQUAL:
				if (c == '=') {
					mode = ATTRIBUTE_RVALUE;
				} else if (!StringUtil.isWhitespace((char) c)) {
					exc("Error in attribute processing.", line, col);
				}
				break;

			// We are inside a start tag, after the tag name and between
			// attributes.
			case IN_TAG:
				switch (c) {
				case '>':
					mode = popMode(stack);
					doc.startElement(tagName, attrs);
					depth++;
					tagName = null;
					attrs.clear();
					break;

				case '/':
					mode = SINGLE_TAG;
					break;

				default:
					if (!StringUtil.isWhitespace((char) c)) {
						mode = ATTRIBUTE_LVALUE;
						sb.append((char) c);
					}
				}
				break;
			}
		}

		if (mode == DONE) {
			doc.endDocument();
		} else {
			exc("missing end tag", line, col);
		}

	}

	/**
	 * Decodes a numeric character reference.
	 * 
	 * @param ref
	 *            entity name starting with '#', e.g. "#65" or "#x41"
	 * @param line
	 *            current line, used for the error message
	 * @param col
	 *            current column, used for the error message
	 * @return the referenced code point
	 * @throws Exception
	 *             if the reference is not a number or lies outside the
	 *             Unicode range
	 */
	private static int parseCharRef(String ref, int line, int col)
			throws Exception {
		int codePoint;
		try {
			if (ref.startsWith("#x")) {
				codePoint = Integer.parseInt(ref.substring(2), 16);
			} else {
				codePoint = Integer.parseInt(ref.substring(1));
			}
		} catch (NumberFormatException e) {
			throw new Exception("Invalid character reference: &" + ref + ";"
					+ " near line " + line + ", column " + col, e);
		}
		if (codePoint < 0 || codePoint > MAX_CODE_POINT) {
			exc("Invalid character reference: &" + ref + ";", line, col);
		}
		return codePoint;
	}

	/**
	 * Always throws: reports a parse error together with its position.
	 * 
	 * @param s
	 *            error description
	 * @param line
	 *            line of the error
	 * @param col
	 *            column of the error
	 * @throws Exception
	 *             always
	 */
	private static void exc(String s, int line, int col) throws Exception {
		throw new Exception(s + " near line " + line + ", column " + col);
	}
}