// Copyright 2009 Google Inc. All Rights Reserved. package org.waveprotocol.wave.model.document.indexed; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; /** * Simple XML pull-parser. * * Only handles elements, text, minimal entities. * No CDATA, comments, etc. * * @author danilatos@google.com (Daniel Danilatos) * */ public class SimpleXmlParser { /** * Supported item types */ public enum ItemType { /** Start tag, including attributes */ START_ELEMENT, /** End tag */ END_ELEMENT, /** Non-empty text data */ TEXT, /** At end of document */ END } private final String attributeQuoteCharacter; private final String[] parts; private int position = -1; private ItemType currentType = null; private Map<String, String> attributes = new HashMap<String, String>(); private String text = null; private String tagName = null; private boolean isSelfClosing = false; /** * NOTE(user): This attribute string is only here in case clients * need to get at the original string unescaped. Can be removed when no * longer needed, as it feels unnecessary to this simple interface. */ private String attributeString = null; /** * @param content XML to be parsed */ public SimpleXmlParser(String content) { this(content, "\""); } /** * Optional constructor to allow alternative attribute quote delimeters to be * used. * * @param content XML to be parsed * @param attributeQuoteCharacter character class used to determine attribute * boundaries. */ public SimpleXmlParser(String content, String attributeQuoteCharacter) { this.parts = content.split("[<>]", -1); this.attributeQuoteCharacter = attributeQuoteCharacter; } /** * Move to the next token * @return The type of item we are at */ public ItemType next() { if (currentType == ItemType.START_ELEMENT && isSelfClosing) { currentType = ItemType.END_ELEMENT; return currentType; } position++; if (position >= parts.length) { currentType = ItemType.END; return currentType; } else if (position % 2 != 0) { String elementString = parts[position]; if (elementString.startsWith("/")) { currentType = ItemType.END_ELEMENT; if (elementString.length() < 2) { throw new RuntimeException("invalid XML: missing element name"); } tagName = elementString.substring(1); } else if (elementString.startsWith("?")) { throw new UnsupportedOperationException("XML processing instructions are not supported"); } else { currentType = ItemType.START_ELEMENT; isSelfClosing = elementString.endsWith("/"); if (isSelfClosing) { elementString = elementString.substring(0, elementString.length() - 1); } if (elementString.length() < 1) { throw new RuntimeException("invalid XML: missing element name"); } String[] elementParts = elementString.split("\\s", 2); tagName = elementParts[0]; if (elementParts.length == 2) { attributeString = elementParts[1]; attributes = parseAttributes(elementParts[1], attributeQuoteCharacter); } else { attributeString = ""; attributes = Collections.<String, String>emptyMap(); } } } else { if (parts[position].length() == 0) { return next(); } currentType = ItemType.TEXT; text = unescape(parts[position]); } return currentType; } /** * @return Type of item at current position */ public ItemType getCurrentType() { return currentType; } /** * @return character data at current position; only valid * when we are at a text item */ public String getText() { if (currentType != ItemType.TEXT) { throw new IllegalStateException("Not at text"); } return text; } /** * @return tag name for current start element; not valid over * other items */ public String getTagName() { if (currentType != ItemType.START_ELEMENT && currentType != ItemType.END_ELEMENT) { throw new IllegalStateException("Not at start/end element"); } return tagName; } /** * Returns whether an element is self-closing (ie <tag/&rt;). An exception * is thrown if this check is attempted on an element that is not a start * element. * * @return true if the element is self closing, false otherwise. */ public boolean isSelfClosing() { if (currentType != ItemType.START_ELEMENT) { throw new IllegalStateException("Not at start element"); } return isSelfClosing; } /** * @return attributes for current start element; not valid over * other items */ public Map<String, String> getAttributes() { if (currentType != ItemType.START_ELEMENT) { throw new IllegalStateException("Not at start element"); } return attributes; } /** * @return the original unescaped and unparsed attribute string. */ public String getOriginalAttributeString() { if (currentType != ItemType.START_ELEMENT) { throw new IllegalStateException("Not at start element"); } return attributeString; } /** * Split a |str| at occurrences of a separator |c|. The difference between this and * String.split(String) is that this will return empty strings between sequences of * the separator string, whereas String.split(String) will skip the whole sequence * as if it were one atomic separator. * * @param str the string to split * @param c the string to split by * @return an array of the split parts */ public static String[] split(String str, String c) { ArrayList<String> elms = new ArrayList<String>(); int lastIndex = 0; while (true) { int nextIndex = str.indexOf(c, lastIndex); if (nextIndex == -1) { break; } elms.add(str.substring(lastIndex, nextIndex)); lastIndex = nextIndex + c.length(); } if (lastIndex < str.length()) { elms.add(str.substring(lastIndex, str.length())); } return elms.toArray(new String[elms.size()]); } /** * Parses the attributes of an element. * * @param attributesString The string containing the attributes to parse. * @param quote The regex classed used to determine a quote or attribute * delimeter. * @return A map mapping attribute names to attribute values. */ private static Map<String,String> parseAttributes(String attributesString, String quote) { Map<String,String> attributes = new TreeMap<String,String>(); String[] parts = split(attributesString, quote); for (int i = 0; i < parts.length - 1;) { String name = parts[i++]; name = name.substring(0, name.indexOf('=')).trim(); String value = unescape(parts[i++]); attributes.put(name, value); } return attributes; } /** * Unescapes the XML entity references in a string. * * @param escapedString The string to unescape. * @return The unescaped string. */ private static String unescape(String escapedString) { // NOTE(user): This does not handle numeric character references. return escapedString.replaceAll("<", "<").replaceAll(">", ">") .replaceAll("'", "'").replaceAll(""", "\"").replaceAll("&", "&"); } }