package org.rr.commons.xml; /* Package Tigase XMPP/Jabber Server * Copyright (C) 2001, 2002, 2003, 2004, 2005 * "Artur Hefczyc" <artur.hefczyc@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * $Rev$ * $Author$ * $Date$ */ import java.util.Arrays; /** * <code>SimpleParser</code> - implementation of <em>SAX</em> parser. * This is very basic implementation of <em>XML</em> parser designed especially * to be light and parse <em>XML</em> streams like jabber <em>XML</em> stream. * It is very efficient, capable of parsing parts of <em>XML</em> document * received from the network connection as well as handling a few <em>XML</em> * documents in one buffer. This is especially useful when parsing data * received from the network. Packets received from the network can contain * non-comlete * <em>XML</em> document as well as a few complete <em>XML</em> documents. It * doesn't support <em>XML</em> comments, processing instructions, document * inclussions. Actually it supports only: * <ul> * <li>Start element event (with all attributes found).</li> * <li>End element even.</li> * <li>Character data event.</li> * <li>'OtherXML' data event - everything between '<' and '>' if after * < is '?' or '!'. So it can 'catch' doctype declaration, processing * instructions but it can't process correctly commented blocks.</li> * </ul> Although very simple this imlementation is sufficient for Jabber * protocol needs and is even used by some other packages of this server like * implementation of <code>UserRepository</code> based on <em>XML</em> file or * server configuration. * <p>It is worth to note also that this class is fully thread safe. It means that * one instance of this class can be simultanously used by many threads. This * is to improve resources usage when processing many client connections at * the same time.</p> * <p> * Created: Fri Oct 1 23:02:15 2004 * </p> * @author <a href="mailto:artur.hefczyc@gmail.com">Artur Hefczyc</a> * @version $Rev$ */ class SimpleParser { /** * Variable constant <code>MAX_ATTRIBS_NUMBER</code> keeps value of * maximum possible attributes number. Real XML parser shouldn't have * such limit but in most cases XML elements don't have too many attributes. * For efficiency it is better to use fixed number of attributes and * operate on arrays than on lists. */ public static int MAX_ATTRIBS_NUMBER = 6; private static enum State { START, OPEN_BRACKET, ELEMENT_NAME, END_ELEMENT_NAME, ATTRIB_NAME, END_OF_ATTR_NAME, ATTRIB_VALUE, ELEMENT_CDATA, OTHER_XML, ERROR, CLOSE_ELEMENT }; private static final char OPEN_BRACKET = '<'; private static final char CLOSE_BRACKET = '>'; private static final char QUESTION_MARK = '?'; private static final char EXCLAMATION_MARK = '!'; private static final char SLASH = '/'; private static final char SPACE = ' '; private static final char TAB = '\t'; private static final char LF = '\n'; private static final char CR = '\r'; private static final char EQUALS = '='; private static final char SINGLE_QUOTE = '\''; private static final char DOUBLE_QUOTE = '"'; private static final char[] WHITE_CHARS = { SPACE, TAB, LF, CR }; private static final char[] ERR_NAME_CHARS = { OPEN_BRACKET, QUESTION_MARK }; static { Arrays.sort(WHITE_CHARS); } private StringBuilder[] initArray(int size) { StringBuilder[] array = new StringBuilder[size]; Arrays.fill(array, null); return array; } private StringBuilder[] resizeArray(StringBuilder[] src, int size) { StringBuilder[] array = new StringBuilder[size]; System.arraycopy(src, 0, array, 0, src.length); Arrays.fill(array, src.length, array.length, null); return array; } public final void parse(SimpleHandler handler, char[] data, int off, int len) { ParserState parser_state = (ParserState)handler.restoreParserState(); if (parser_state == null) { parser_state = new ParserState(); } // end of if (parser_state == null) for (int index = off; index < len; index++) { char chr = data[index]; switch (parser_state.state) { case START: if (chr == OPEN_BRACKET) { parser_state.state = State.OPEN_BRACKET; parser_state.slash_found = false; } // end of if (chr == OPEN_BRACKET) // Skip everything up to open bracket break; case OPEN_BRACKET: switch (chr) { case QUESTION_MARK: case EXCLAMATION_MARK: parser_state.state = State.OTHER_XML; parser_state.element_cdata = new StringBuilder(); parser_state.element_cdata.append(chr); break; case SLASH: parser_state.state = State.CLOSE_ELEMENT; parser_state.element_name = new StringBuilder(); parser_state.slash_found = true; break; default: if (Arrays.binarySearch(WHITE_CHARS, chr) < 0) { parser_state.state = State.ELEMENT_NAME; parser_state.element_name = new StringBuilder(); parser_state.element_name.append(chr); } // end of if () break; } // end of switch (chr) break; case ELEMENT_NAME: if (Arrays.binarySearch(WHITE_CHARS, chr) >= 0) { parser_state.state = State.END_ELEMENT_NAME; break; } // end of if () if (chr == SLASH) { parser_state.slash_found = true; break; } // end of if (chr == SLASH) if (chr == CLOSE_BRACKET) { parser_state.state = State.ELEMENT_CDATA; handler.startElement(parser_state.element_name, null, null); if (parser_state.slash_found) { handler.endElement(parser_state.element_name); parser_state.state = State.START; } // end of if (slash_found) parser_state.element_name = null; break; } // end of if () if (chr == ERR_NAME_CHARS[0] || chr == ERR_NAME_CHARS[1]) { parser_state.state = State.ERROR; break; } // end of if () parser_state.element_name.append(chr); break; case CLOSE_ELEMENT: if (Arrays.binarySearch(WHITE_CHARS, chr) >= 0) { break; } // end of if () if (chr == SLASH) { parser_state.state = State.ERROR; break; } // end of if (chr == SLASH) if (chr == CLOSE_BRACKET) { parser_state.state = State.START; handler.endElement(parser_state.element_name); parser_state.element_name = null; break; } // end of if () if (chr == ERR_NAME_CHARS[0] || chr == ERR_NAME_CHARS[1]) { parser_state.state = State.ERROR; break; } // end of if () parser_state.element_name.append(chr); break; case END_ELEMENT_NAME: if (chr == SLASH) { parser_state.slash_found = true; break; } // end of if (chr == SLASH) if (chr == CLOSE_BRACKET) { parser_state.state = State.ELEMENT_CDATA; handler.startElement(parser_state.element_name, parser_state.attrib_names, parser_state.attrib_values); if (parser_state.slash_found) { handler.endElement(parser_state.element_name); parser_state.state = State.START; } // end of if (slash_found) parser_state.element_name = null; parser_state.attrib_names = null; parser_state.attrib_values = null; parser_state.current_attr = -1; break; } // end of if () if (Arrays.binarySearch(WHITE_CHARS, chr) < 0) { parser_state.state = State.ATTRIB_NAME; if (parser_state.attrib_names == null) { parser_state.attrib_names = initArray(MAX_ATTRIBS_NUMBER); parser_state.attrib_values = initArray(MAX_ATTRIBS_NUMBER); } else { if (parser_state.current_attr == parser_state.attrib_names.length - 1) { int new_size = parser_state.attrib_names.length + MAX_ATTRIBS_NUMBER; parser_state.attrib_names = resizeArray(parser_state.attrib_names, new_size); parser_state.attrib_values = resizeArray(parser_state.attrib_values, new_size); } } // end of else parser_state.attrib_names[++parser_state.current_attr] = new StringBuilder(); parser_state.attrib_names[parser_state.current_attr].append(chr); break; } // end of if () // do nothing, skip white chars break; case ATTRIB_NAME: if (Arrays.binarySearch(WHITE_CHARS, chr) >= 0 || chr == EQUALS) { parser_state.state = State.END_OF_ATTR_NAME; break; } // end of if () parser_state.attrib_names[parser_state.current_attr].append(chr); break; case END_OF_ATTR_NAME: if (chr == SINGLE_QUOTE || chr == DOUBLE_QUOTE) { parser_state.state = State.ATTRIB_VALUE; parser_state.attrib_values[parser_state.current_attr] = new StringBuilder(); } // end of if (chr == SINGLE_QUOTE || chr == DOUBLE_QUOTE) // Skip white characters and actually everything except quotes break; case ATTRIB_VALUE: if (chr == SINGLE_QUOTE || chr == DOUBLE_QUOTE) { parser_state.state = State.END_ELEMENT_NAME; break; } // end of if (chr == SINGLE_QUOTE || chr == DOUBLE_QUOTE) parser_state.attrib_values[parser_state.current_attr].append(chr); break; case ELEMENT_CDATA: if (chr == OPEN_BRACKET) { parser_state.state = State.OPEN_BRACKET; parser_state.slash_found = false; if (parser_state.element_cdata != null) { rtrim(parser_state.element_cdata); handler.elementCData(parser_state.element_cdata); parser_state.element_cdata = null; } // end of if (parser_state.element_cdata != null) break; } // end of if (chr == OPEN_BRACKET) if (parser_state.element_cdata == null) { // Skip leading white characters if (Arrays.binarySearch(WHITE_CHARS, chr) < 0) { parser_state.element_cdata = new StringBuilder(); parser_state.element_cdata.append(chr); } // end of if (Arrays.binarySearch(WHITE_CHARS, chr) < 0) } // end of if (parser_state.element_cdata == null) else { parser_state.element_cdata.append(chr); } // end of if (parser_state.element_cdata == null) else break; case OTHER_XML: if (chr == CLOSE_BRACKET) { parser_state.state = State.START; handler.otherXML(parser_state.element_cdata); parser_state.element_cdata = null; break; } // end of if (chr == CLOSE_BRACKET) parser_state.element_cdata.append(chr); break; case ERROR: parser_state = null; handler.error(); return; default: assert false : "Unknown SimpleParser state: "+parser_state.state; break; } // end of switch (state) } // end of for () handler.saveParserState(parser_state); } private static void rtrim(StringBuilder value) { int newSize = value.length(); for (int i = value.length(); i > 0; i--) { char chr = value.charAt(i-1); if (Arrays.binarySearch(WHITE_CHARS, chr) >= 0) { newSize--; } else { break; } } if(newSize < value.length()) { value.delete(newSize, value.length()); } } private class ParserState { StringBuilder element_name = null; StringBuilder[] attrib_names = null; StringBuilder[] attrib_values = null; StringBuilder element_cdata = null; int current_attr = -1; boolean slash_found = false; State state = State.START; } }// SimpleParser