SimpleXmlParser.java example

Explorer
fedone-master
// Copyright 2009 Google Inc. All Rights Reserved.

package org.waveprotocol.wave.model.document.indexed;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

/**
 * Simple XML pull-parser.
 *
 * Only handles elements, text, minimal entities.
 * No CDATA, comments, etc.
 *
 * @author danilatos@google.com (Daniel Danilatos)
 *
 */
public class SimpleXmlParser {

  /**
   * Supported item types
   */
  public enum ItemType {
    /** Start tag, including attributes */
    START_ELEMENT,
    /** End tag */
    END_ELEMENT,
    /** Non-empty text data */
    TEXT,
    /** At end of document */
    END
  }

  /** Literal string that opens and closes every attribute value. */
  private final String attributeQuoteCharacter;

  /**
   * The input split at every '<' and '>'. Even indices hold text runs
   * (possibly empty), odd indices hold the inside of a tag.
   */
  private final String[] parts;

  /** Index into {@link #parts} of the item most recently read; -1 before the first call. */
  private int position = -1;
  private ItemType currentType = null;
  private Map<String, String> attributes = new HashMap<String, String>();
  private String text = null;
  private String tagName = null;
  private boolean isSelfClosing = false;

  /**
   * NOTE(user): This attribute string is only here in case clients
   * need to get at the original string unescaped. Can be removed when no
   * longer needed, as it feels unnecessary to this simple interface.
   */
  private String attributeString = null;

  /**
   * Creates a parser that expects attribute values to be delimited by double
   * quotes.
   *
   * @param content XML to be parsed
   */
  public SimpleXmlParser(String content) {
    this(content, "\"");
  }

  /**
   * Optional constructor to allow an alternative attribute quote delimiter to
   * be used.
   *
   * @param content XML to be parsed
   * @param attributeQuoteCharacter literal string (not a regular expression)
   *     that delimits attribute values.
   */
  public SimpleXmlParser(String content, String attributeQuoteCharacter) {
    // The -1 limit keeps trailing empty strings so that the text/tag
    // alternation of the array indices holds all the way to the end.
    this.parts = content.split("[<>]", -1);
    this.attributeQuoteCharacter = attributeQuoteCharacter;
  }

  /**
   * Move to the next token. Empty text runs are skipped. A self-closing start
   * tag is reported as a START_ELEMENT followed by a synthetic END_ELEMENT
   * carrying the same tag name.
   *
   * @return The type of item we are at
   * @throws RuntimeException if a tag has no element name or an attribute is
   *     malformed
   * @throws UnsupportedOperationException if a processing instruction is
   *     encountered
   */
  public ItemType next() {
    // Emit the synthetic end tag for <tag/> without consuming any input.
    if (currentType == ItemType.START_ELEMENT && isSelfClosing) {
      currentType = ItemType.END_ELEMENT;
      return currentType;
    }

    position++;

    if (position >= parts.length) {
      currentType = ItemType.END;
      return currentType;
    }

    String part = parts[position];

    if (position % 2 != 0) {
      // Odd slots are the contents of a tag.
      if (part.startsWith("/")) {
        readEndElement(part);
      } else if (part.startsWith("?")) {
        throw new UnsupportedOperationException("XML processing instructions are not supported");
      } else {
        readStartElement(part);
      }
      return currentType;
    }

    // Even slots are text. An empty slot just means two tags were adjacent
    // (or the document starts/ends with a tag), so move on to the next item.
    if (part.length() == 0) {
      return next();
    }

    currentType = ItemType.TEXT;
    text = unescape(part);
    return currentType;
  }

  /**
   * Records an end tag as the current item.
   *
   * @param elementString the tag contents, starting with '/'
   */
  private void readEndElement(String elementString) {
    currentType = ItemType.END_ELEMENT;
    // Trim so that "</a >" reports the same name as "<a >" does.
    String name = elementString.substring(1).trim();
    if (name.length() < 1) {
      throw new RuntimeException("invalid XML: missing element name");
    }
    tagName = name;
  }

  /**
   * Records a start tag (optionally self-closing) and its attributes as the
   * current item.
   *
   * @param elementString the tag contents, without the angle brackets
   */
  private void readStartElement(String elementString) {
    currentType = ItemType.START_ELEMENT;
    isSelfClosing = elementString.endsWith("/");
    if (isSelfClosing) {
      elementString = elementString.substring(0, elementString.length() - 1);
    }
    if (elementString.length() < 1) {
      throw new RuntimeException("invalid XML: missing element name");
    }

    // Everything up to the first whitespace is the name; the rest, if any,
    // is the raw attribute list.
    String[] elementParts = elementString.split("\\s", 2);
    tagName = elementParts[0];
    if (elementParts.length == 2) {
      attributeString = elementParts[1];
      attributes = parseAttributes(elementParts[1], attributeQuoteCharacter);
    } else {
      attributeString = "";
      attributes = Collections.<String, String>emptyMap();
    }
  }

  /**
   * @return Type of item at current position
   */
  public ItemType getCurrentType() {
    return currentType;
  }

  /**
   * @return character data at current position, with entity references
   *   unescaped; only valid when we are at a text item
   * @throws IllegalStateException if the current item is not text
   */
  public String getText() {
    if (currentType != ItemType.TEXT) {
      throw new IllegalStateException("Not at text");
    }

    return text;
  }

  /**
   * @return tag name for current start or end element; not valid over
   *   other items
   * @throws IllegalStateException if the current item is not a start or end
   *   element
   */
  public String getTagName() {
    if (currentType != ItemType.START_ELEMENT && currentType != ItemType.END_ELEMENT) {
      throw new IllegalStateException("Not at start/end element");
    }

    return tagName;
  }

  /**
   * Returns whether an element is self-closing (i.e. {@code <tag/>}). An
   * exception is thrown if this check is attempted on an element that is not
   * a start element.
   *
   * @return true if the element is self closing, false otherwise.
   * @throws IllegalStateException if the current item is not a start element
   */
  public boolean isSelfClosing() {
    if (currentType != ItemType.START_ELEMENT) {
      throw new IllegalStateException("Not at start element");
    }
    return isSelfClosing;
  }

  /**
   * @return attributes for current start element, with values unescaped; not
   *   valid over other items
   * @throws IllegalStateException if the current item is not a start element
   */
  public Map<String, String> getAttributes() {
    if (currentType != ItemType.START_ELEMENT) {
      throw new IllegalStateException("Not at start element");
    }

    return attributes;
  }

  /**
   * @return the original unescaped and unparsed attribute string; empty if
   *   the element has no attributes.
   * @throws IllegalStateException if the current item is not a start element
   */
  public String getOriginalAttributeString() {
    if (currentType != ItemType.START_ELEMENT) {
      throw new IllegalStateException("Not at start element");
    }

    return attributeString;
  }

  /**
   * Split a |str| at occurrences of a separator |c|.  The difference between this and
   * String.split(String) is that the separator is a literal string rather than a
   * regular expression, and empty strings between adjacent separators are returned.
   * A trailing empty string (when |str| ends with the separator) is not returned.
   *
   * @param str the string to split
   * @param c the literal string to split by; must not be empty
   * @return an array of the split parts
   * @throws IllegalArgumentException if |c| is empty
   */
  public static String[] split(String str, String c) {
    if (c.length() == 0) {
      // indexOf("") matches at every position without advancing, which would
      // otherwise loop forever appending empty strings.
      throw new IllegalArgumentException("separator must not be empty");
    }

    ArrayList<String> elms = new ArrayList<String>();
    int lastIndex = 0;
    int nextIndex = str.indexOf(c, lastIndex);
    while (nextIndex != -1) {
      elms.add(str.substring(lastIndex, nextIndex));
      lastIndex = nextIndex + c.length();
      nextIndex = str.indexOf(c, lastIndex);
    }
    if (lastIndex < str.length()) {
      elms.add(str.substring(lastIndex));
    }
    return elms.toArray(new String[elms.size()]);
  }

  /**
   * Parses the attributes of an element.
   *
   * @param attributesString The string containing the attributes to parse.
   * @param quote The literal string that delimits attribute values.
   * @return A map mapping attribute names to unescaped attribute values.
   * @throws RuntimeException if an attribute has no '=' before its value
   */
  private static Map<String,String> parseAttributes(String attributesString, String quote) {
    Map<String,String> attributes = new TreeMap<String,String>();

    // Splitting on the quote yields alternating chunks: "name=", value,
    // " name=", value, ... Anything left over after the last complete pair
    // (e.g. trailing whitespace) is ignored.
    String[] parts = split(attributesString, quote);
    for (int i = 0; i + 1 < parts.length; i += 2) {
      String nameChunk = parts[i];
      int equalsIndex = nameChunk.indexOf('=');
      if (equalsIndex < 0) {
        throw new RuntimeException(
            "invalid XML: attribute is missing '=': " + nameChunk.trim());
      }
      String name = nameChunk.substring(0, equalsIndex).trim();
      String value = unescape(parts[i + 1]);
      attributes.put(name, value);
    }
    return attributes;
  }

  /**
   * Unescapes the five predefined XML entity references in a string.
   *
   * @param escapedString The string to unescape.
   * @return The unescaped string.
   */
  private static String unescape(String escapedString) {
    // NOTE(user): This does not handle numeric character references.
    // "&amp;" must be decoded last: otherwise "&amp;lt;" would first become
    // "&lt;" and then be wrongly decoded a second time into "<".
    return escapedString
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&apos;", "'")
        .replace("&quot;", "\"")
        .replace("&amp;", "&");
  }

}