/** * Copyright 2011 The Buzz Media, LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.thebuzzmedia.sjxp; import com.thebuzzmedia.sjxp.rule.IRule; import org.xmlpull.v1.XmlPullParser; import org.xmlpull.v1.XmlPullParserException; import org.xmlpull.v1.XmlPullParserFactory; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * Class used to define a parser that makes parsing using the performance of an * XML Pull Parser with the ease of XPath-like expressions possible. * * <h3>Thread Safety</h3> This class is not thread-safe, however instances of * {@link com.thebuzzmedia.sjxp.XMLParser} can safely be re-used to parse multiple files once the * previous parse operation is done. * * @param <T> * The class type of any user-supplied object that the caller wishes * to be passed through from one of the {@link com.thebuzzmedia.sjxp.XMLParser}'s * <code>parse</code> methods directly to the handler when an * {@link com.thebuzzmedia.sjxp.rule.IRule} matches. This is typically a data storage mechanism * like a DAO or cache used to store the parsed value in some * valuable way, but it can ultimately be anything. If you do not * need to make use of the user object, there is no need to * parameterize the class. * * @author Riyad Kalla (software@thebuzzmedia.com) */ public class XMLParser<T> { /** * Flag used to indicate if debugging output has been enabled by setting the * "sjxp.debug" system property to <code>true</code>. This value will be * <code>false</code> if the "sjxp.debug" system property is undefined or * set to <code>false</code>. * <p/> * This system property can be set on startup with:<br/> * <code> * -Dsjxp.debug=true * </code> or by calling {@link System#setProperty(String, String)} before * this class is loaded. * <p/> * This is <code>false</code> by default. */ public static Boolean DEBUG = Boolean.getBoolean("sjxp.debug"); /** * Flag used to indicate if this parser should be namespace-aware by setting * the "sjxp.namespaces" system property to <code>true</code>. This value * will be <code>true</code> if the "sjxp.namespaces" system property is * undefined. Namespace awareness can only be disabled by setting this * system property to <code>false</code>. * <p/> * <strong>NOTE</strong>: If you intentionally disable namespace awareness, * any {@link com.thebuzzmedia.sjxp.rule.IRule} you provide that uses namespace qualified values (e.g. * [http://w3.org/text]book) will fail to match as the parser can no longer * see namespace URIs. * <p/> * This system property can be set on startup with:<br/> * <code> * -Dsjxp.namespaces=true * </code> or by calling {@link System#setProperty(String, String)} before * this class is loaded. * <p/> * This is <code>true</code> by default. */ public static final Boolean ENABLE_NAMESPACES = (System .getProperty("sjxp.namespaces") == null ? Boolean.TRUE : Boolean .getBoolean("sjxp.namespaces")); /** * Flag used to indicate if this parser should validate the parsed XML * against the references DTD or XML Schema by setting the "sjxp.validation" * system property to <code>true</code>. This value will be * <code>false</code> if the "sjxp.validation" system property is undefined * or set to <code>false</code>. * <p/> * This system property can be set on startup with:<br/> * <code> * -Dsjxp.validation=true * </code> or by calling {@link System#setProperty(String, String)} before * this class is loaded. * <p/> * This is <code>false</code> by default. */ public static final Boolean ENABLE_VALIDATION = Boolean .getBoolean("sjxp.validation"); /** * Prefix to every log message this library logs. Using a well-defined * prefix helps make it easier both visually and programmatically to scan * log files for messages produced by this library. * <p/> * The value is "[sjxp] " (including the space). */ public static final String LOG_MESSAGE_PREFIX = "[sjxp] "; /** * Singleton {@link org.xmlpull.v1.XmlPullParserFactory} instance used to create new * underlying {@link org.xmlpull.v1.XmlPullParser} instances for each instance of * {@link com.thebuzzmedia.sjxp.XMLParser}. */ public static final XmlPullParserFactory XPP_FACTORY; /** * Static initializer used to init the {@link org.xmlpull.v1.XmlPullParserFactory} with the * configured namespace and validation settings. */ static { if (DEBUG) log("Debug output ENABLED"); try { XPP_FACTORY = XmlPullParserFactory.newInstance(); // Configure pull parser features XPP_FACTORY.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, ENABLE_NAMESPACES); XPP_FACTORY.setFeature(XmlPullParser.FEATURE_VALIDATION, ENABLE_VALIDATION); if (DEBUG) log("XmlPullParserFactory configured [namespaces=%s, validation=%s]", ENABLE_NAMESPACES, ENABLE_VALIDATION); } catch (XmlPullParserException e) { throw new RuntimeException( "An exception occurred while calling XmlPullParserFactory.newInstance(). A library providing the impl of the XML Pull Parser spec (e.g. XPP3 or Android SDK) must be available at runtime.", e); } } /** * Helper method used to ensure a message is loggable before it is logged * and then pre-pend a universal prefix to all log messages generated by * this library to make the log entries easy to parse visually or * programmatically. * <p/> * If a message cannot be logged (logging is disabled) then this method * returns immediately. * <p/> * <strong>NOTE</strong>: Because Java will auto-box primitive arguments * into Objects when building out the <code>params</code> array, care should * be taken not to call this method with primitive values unless * {@link #DEBUG} is <code>true</code>; otherwise the VM will be spending * time performing unnecessary auto-boxing calculations. * * @param message * The log message in <a href= * "http://download.oracle.com/javase/6/docs/api/java/util/Formatter.html#syntax" * >format string syntax</a> that will be logged. * @param params * The parameters that will be swapped into all the place holders * in the original messages before being logged. * * @see #LOG_MESSAGE_PREFIX */ protected static void log(String message, Object... params) { if (DEBUG) System.out.printf(LOG_MESSAGE_PREFIX + message + '\n', params); } private String toStringCache; private boolean continueParsing = true; private Location location; private XmlPullParser xpp; private Map<Integer, List<IRule<T>>> tagRuleMap; private Map<Integer, List<IRule<T>>> attrRuleMap; private Map<Integer, List<IRule<T>>> charRuleMap; /** * Create a new parser that uses the given {@link IRule}s when parsing any * XML content. * * @param rules * The rules applied to any parsed content. * * @throws IllegalArgumentException * if <code>rules</code> is <code>null</code> or empty. * @throws com.thebuzzmedia.sjxp.XMLParserException * if the {@link #XPP_FACTORY} is unable to create a new * {@link org.xmlpull.v1.XmlPullParser} instance and throws an exception. */ public XMLParser(IRule<T>... rules) throws IllegalArgumentException, XMLParserException { if (rules == null || rules.length == 0) throw new IllegalArgumentException( "rules cannot be null or empty, you must provide at least 1 rule to execute otherwise parsing will do nothing."); location = new Location(); try { xpp = XPP_FACTORY.newPullParser(); } catch (XmlPullParserException e) { throw new XMLParserException( "An exception occurred while trying to create a new XmlPullParser instance using the XmlPullParserFactory.", e); } // Load all the rules initRules(rules); } /** * Overridden to provide a nicely formatted representation of the parser for * easy debugging. * <p/> * As an added bonus, since {@link com.thebuzzmedia.sjxp.XMLParser}s are intended to be immutable, * the result of <code>toString</code> is cached on the first call and the * cache returned every time to avoid re-computing the completed * {@link String}. * * @return a nicely formatted representation of the parser for easy * debugging. */ @Override public synchronized String toString() { if (toStringCache == null) { toStringCache = this.getClass().getName() + "[attributeRules=" + attrRuleMap + ", characterRules=" + charRuleMap + "]"; } return toStringCache; } /** * Used to indicate to the parser that you would like it to stop parsing. * <p/> * Internally the parser uses a simple <code>boolean</code> to indicate if * it should keep parsing. A call to this method sets the boolean value to * <code>false</code> which the parser checks at the next parse event and * then stops. * <p/> * This is a safe operation that simply flips a flag to tell the underlying * {@link org.xmlpull.v1.XmlPullParser} to stop working after it's done with its current * parse event and return from whichever <code>parse</code> method was * called. */ public void stop() { continueParsing = false; } /** * Parse the XML out of the given stream matching the {@link IRule}s * provided when the {@link com.thebuzzmedia.sjxp.XMLParser} was instantiated. * <p/> * The underlying {@link org.xmlpull.v1.XmlPullParser} will attempt to determine the * stream's encoding based on the pull parser spec or fall back to a default * of UTF-8. * <p/> * This class will make no attempt at closing the given {@link java.io.InputStream}, * the caller must take care to clean up that resource. * <h3>Stopping Parsing</h3> * Parsing can be safely stopped by calling {@link #stop()}. This allows * {@link IRule} implementations control over stopping parsing, for example, * if an arbitrary threshold is hit. A followup call to any of the * <code>parse</code> methods will reset the stopped state. * * @param source * The stream that XML content will be read out of. * * @throws IllegalArgumentException * if <code>source</code> is <code>null</code>. * @throws com.thebuzzmedia.sjxp.XMLParserException * if any error occurs with the underlying stream during parsing * of if the XML content itself is malformed and the underlying * pull parser cannot parse it. */ public void parse(InputStream source) throws IllegalArgumentException, XMLParserException { try { parse(source, null, null); } catch (UnsupportedEncodingException e) { // no-op, this should never happen as null is a valid encoding. } } /** * Parse the XML out of the given stream matching the {@link IRule}s * provided when the {@link com.thebuzzmedia.sjxp.XMLParser} was instantiated. * <p/> * The underlying {@link org.xmlpull.v1.XmlPullParser} will attempt to determine the * stream's encoding based on the pull parser spec or fall back to a default * of UTF-8. * <p/> * This class will make no attempt at closing the given {@link java.io.InputStream}, * the caller must take care to clean up that resource. * <h3>Stopping Parsing</h3> * Parsing can be safely stopped by calling {@link #stop()}. This allows * {@link IRule} implementations control over stopping parsing, for example, * if an arbitrary threshold is hit. A followup call to any of the * <code>parse</code> methods will reset the stopped state. * * @param source * The stream that XML content will be read out of. * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. * * @throws IllegalArgumentException * if <code>source</code> is <code>null</code>. * @throws com.thebuzzmedia.sjxp.XMLParserException * if any error occurs with the underlying stream during parsing * of if the XML content itself is malformed and the underlying * pull parser cannot parse it. */ public void parse(InputStream source, T userObject) throws IllegalArgumentException, XMLParserException { try { parse(source, null, userObject); } catch (UnsupportedEncodingException e) { // no-op, this should never happen as null is a valid encoding. } } /** * Parse the XML out of the given stream (producing content matching the * given encoding) matching the {@link IRule}s provided when the * {@link com.thebuzzmedia.sjxp.XMLParser} was instantiated. * <p/> * This class will make no attempt at closing the given {@link java.io.InputStream}, * the caller must take care to clean up that resource. * <h3>Stopping Parsing</h3> * Parsing can be safely stopped by calling {@link #stop()}. This allows * {@link IRule} implementations control over stopping parsing, for example, * if an arbitrary threshold is hit. A followup call to any of the * <code>parse</code> methods will reset the stopped state. * * @param source * The stream that XML content will be read out of. * @param encoding * The character encoding (e.g. "UTF-8") of the data from the * given stream. If the encoding is not known, passing * <code>null</code> or calling {@link #parse(java.io.InputStream)} * instead will allow the underlying {@link org.xmlpull.v1.XmlPullParser} to try * and automatically determine the encoding. * * @throws IllegalArgumentException * if <code>source</code> is <code>null</code>. * @throws java.io.UnsupportedEncodingException * if <code>encoding</code> represents an encoding name that is * not recognized by {@link java.nio.charset.Charset#isSupported(String)} * @throws com.thebuzzmedia.sjxp.XMLParserException * if any error occurs with the underlying stream during parsing * of if the XML content itself is malformed and the underlying * pull parser cannot parse it. */ public void parse(InputStream source, String encoding) throws IllegalArgumentException, UnsupportedEncodingException, XMLParserException { parse(source, encoding, null); } /** * Parse the XML out of the given stream (producing content matching the * given encoding) matching the {@link IRule}s provided when the * {@link com.thebuzzmedia.sjxp.XMLParser} was instantiated. * <p/> * This class will make no attempt at closing the given {@link java.io.InputStream}, * the caller must take care to clean up that resource. * <h3>Stopping Parsing</h3> * Parsing can be safely stopped by calling {@link #stop()}. This allows * {@link IRule} implementations control over stopping parsing, for example, * if an arbitrary threshold is hit. A followup call to any of the * <code>parse</code> methods will reset the stopped state. * * @param source * The stream that XML content will be read out of. * @param encoding * The character encoding (e.g. "UTF-8") of the data from the * given stream. If the encoding is not known, passing * <code>null</code> or calling {@link #parse(java.io.InputStream)} * instead will allow the underlying {@link org.xmlpull.v1.XmlPullParser} to try * and automatically determine the encoding. * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. * * @throws IllegalArgumentException * if <code>source</code> is <code>null</code>. * @throws java.io.UnsupportedEncodingException * if <code>encoding</code> represents an encoding name that is * not recognized by {@link java.nio.charset.Charset#isSupported(String)} * @throws com.thebuzzmedia.sjxp.XMLParserException * if any error occurs with the underlying stream during parsing * of if the XML content itself is malformed and the underlying * pull parser cannot parse it. */ public void parse(InputStream source, String encoding, T userObject) throws IllegalArgumentException, UnsupportedEncodingException, XMLParserException { if (source == null) throw new IllegalArgumentException("source cannot be null"); if (encoding != null) { // If empty, ensure it is null so XPP gets encoding from XML header if (encoding.trim().length() == 0) encoding = null; // Extra-safe, make sure the provided encoding is valid else if (!Charset.isSupported(encoding)) throw new UnsupportedEncodingException( "Encoding [" + encoding + "] is not a valid charset encoding in this runtime according to Charset.isSupported(encoding)."); } try { xpp.setInput(source, encoding); if (DEBUG) log("Underlying XmlPullParser input set [type=InputStream, encoding=%s (null is OK), userObject=%s]", xpp.getInputEncoding(), (userObject == null ? "" : userObject)); } catch (XmlPullParserException e) { throw new XMLParserException( "Unable to set the given InputStream (with an optional encoding of '" + encoding + "') as input for the underlying XmlPullParser.", e); } try { doParse(userObject); } catch (IOException e) { throw new XMLParserException( "An exception occurred while parsing the given source, the XML document may be malformed.", e); } catch (XmlPullParserException e) { throw new XMLParserException( "An error with the underlying data stream being parsed occurred.", e); } } protected void initRules(IRule<T>... rules) { // calculate a rough optimal size for the rule maps int optSize = (rules.length > 64 ? rules.length * 2 : 64); // init the rule maps tagRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize); attrRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize); charRuleMap = new HashMap<Integer, List<IRule<T>>>(optSize); // init the rules List<IRule<T>> ruleList = null; for (int i = 0, length = rules.length; i < length; i++) { IRule<T> rule = rules[i]; switch (rule.getType()) { case TAG: // Get the rule list for this path ruleList = tagRuleMap.get(rule.getLocationPath()); // If there wasn't already a rule list, create and add it if (ruleList == null) { ruleList = new ArrayList<IRule<T>>(3); tagRuleMap.put(rule.getLocationPath().hashCode(), ruleList); } break; case ATTRIBUTE: // Get the rule list for this path ruleList = attrRuleMap.get(rule.getLocationPath()); // If there wasn't already a rule list, create and add it if (ruleList == null) { ruleList = new ArrayList<IRule<T>>(3); attrRuleMap .put(rule.getLocationPath().hashCode(), ruleList); } break; case CHARACTER: // Get the rule list for this path ruleList = charRuleMap.get(rule.getLocationPath()); // If there wasn't already a rule list, create and add it if (ruleList == null) { ruleList = new ArrayList<IRule<T>>(3); charRuleMap .put(rule.getLocationPath().hashCode(), ruleList); } break; } // Add the rule to the list for the given path ruleList.add(rule); } if (DEBUG) log("Initialized %d TAG rules, %d ATTRIBUTE rules and %d CHARACTER rules.", tagRuleMap.size(), attrRuleMap.size(), charRuleMap.size()); } /** * Uses the underlying {@link org.xmlpull.v1.XmlPullParser} to begin parsing through the * XML content from the given stream. This method's implementation is * simple, acting like a traffic-cop responding to * {@link org.xmlpull.v1.XmlPullParser#START_TAG}, {@link org.xmlpull.v1.XmlPullParser#TEXT}, * {@link org.xmlpull.v1.XmlPullParser#END_TAG} and {@link org.xmlpull.v1.XmlPullParser#END_DOCUMENT} * events by calling the appropriate <code>doXXX</code> methods. * <p/> * Developers creating a subclass of {@link com.thebuzzmedia.sjxp.XMLParser} are meant to override * one of the {@link #doStartTag(Object)}, {@link #doText(Object)}, * {@link #doEndTag(Object)} and {@link #doEndDocument(Object)} methods to * add custom behavior and not necessarily override this central method. * <h3>Stopping Parsing</h3> * Parsing can be safely stopped by calling {@link #stop()}. This allows * {@link IRule} implementations control over stopping parsing, for example, * if an arbitrary threshold is hit. A followup call to any of the * <code>parse</code> methods will reset the stopped state. * * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. * * @throws java.io.IOException * if an error occurs with reading from the underlying * {@link java.io.InputStream} given to one of the public * <code>parse</code> methods. * @throws org.xmlpull.v1.XmlPullParserException * if an error occurs while parsing the XML content from the * underlying stream; typically resulting from malformed or * invalid XML. */ protected void doParse(T userObject) throws IOException, XmlPullParserException { location.clear(); continueParsing = true; if (DEBUG) log("Parsing starting..."); long startTime = System.currentTimeMillis(); while (continueParsing) { switch (xpp.next()) { case XmlPullParser.START_TAG: doStartTag(userObject); break; case XmlPullParser.TEXT: doText(userObject); break; case XmlPullParser.END_TAG: doEndTag(userObject); break; case XmlPullParser.END_DOCUMENT: continueParsing = false; doEndDocument(userObject); break; } } if (DEBUG) { long duration = System.currentTimeMillis() - startTime; log("Parse COMPLETE, elapsed time: %dms (approx %f seconds)", duration, (double) duration / (double) 1000); } } /** * Used to process a {@link org.xmlpull.v1.XmlPullParser#START_TAG} event. * <p/> * By default this updates the internal location state of the parser, * processes all {@link IRule}s of type {@link Type#TAG} and processes all * {@link IRule}s of type {@link Type#ATTRIBUTE} that match the parser's * current location. * * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. */ protected void doStartTag(T userObject) { // Update parser location location.push(xpp.getName(), xpp.getNamespace()); if (DEBUG) log("START_TAG: %s %s:%s", location, xpp.getNamespace(), xpp.getName()); // Get the rules for the current path List<IRule<T>> tagRuleList = tagRuleMap.get(location .getCachedHashCode()); List<IRule<T>> attrRuleList = attrRuleMap.get(location .getCachedHashCode()); // If there are no rules for the current path, then we are done. if ((tagRuleList == null || tagRuleList.isEmpty()) && (attrRuleList == null || attrRuleList.isEmpty())) return; if (DEBUG) log("\t%d TAG rules and %d ATTR rules found for START_TAG...", (tagRuleList == null ? 0 : tagRuleList.size()), (attrRuleList == null ? 0 : attrRuleList.size())); // Process the TAG rules if (tagRuleList != null) { for (int i = 0, size = tagRuleList.size(); i < size; i++) { IRule<T> rule = tagRuleList.get(i); if (DEBUG) log("\t\tRunning TAG Rule: %s", rule); rule.handleTag(this, true, userObject); } } // Process the ATTR rules if (attrRuleList != null) { for (int i = 0, size = attrRuleList.size(); i < size; i++) { IRule<T> rule = attrRuleList.get(i); if (DEBUG) log("\t\tRunning ATTR Rule: %s", rule); String[] attrNames = rule.getAttributeNames(); // Be safe, jump to the next rule if this one has no name // entries if (attrNames == null || attrNames.length == 0) continue; /* * PERFORMANCE: Generating the substrings is the fastest way to * parse out the matching rules as it shares the same underlying * char[] used to represent the entire location path or * attribute name and just creates a new simple String instance * with modified index/offset values that is GC'ed quickly and * easily (uses a special package-protected String constructor). * * Using regexp to match, splitting the rule or just about any * other approach would have been magnitudes more expensive both * in memory and CPU requirements than doing a simple substring. */ for (int j = 0; j < attrNames.length; j++) { String attrName = attrNames[j]; String localName = null; String namespaceURI = null; // Parse the namespaceURI out of the name if necessary if (attrName.charAt(0) == '[') { int endIndex = attrName.indexOf(']'); /* * Make sure the rule is valid so we avoid out of bounds * and keep the caller informed when their rules are * busted by failing fast. */ if (endIndex <= 2) throw new XMLParserException( "namespace URI for rule looks to be incomplete or empty for IRule: " + rule); namespaceURI = attrName.substring(1, endIndex); } int startIndex = (namespaceURI == null ? 0 : namespaceURI .length() + 2); /* * Make sure the rule is valid so we avoid out of bounds and * keep the caller informed when their rules are busted by * failing fast. */ if (attrName.length() - startIndex <= 1) throw new XMLParserException( "local name for rule looks to be missing for IRule: " + rule); // Parse the local name localName = attrName.substring(startIndex, attrName.length()); // Give the parsed attribute value to the matching rule rule.handleParsedAttribute(this, j, xpp.getAttributeValue(namespaceURI, localName), userObject); } } } } /** * Used to process a {@link org.xmlpull.v1.XmlPullParser#TEXT} event. * <p/> * By default this processes all {@link IRule}s of type * {@link Type#CHARACTER} that match the parser's current location. * * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. */ protected void doText(T userObject) { if (DEBUG) log("TEXT: %s", location); // Get the rules for the current path List<IRule<T>> ruleList = charRuleMap.get(location.getCachedHashCode()); // If there are no rules for the current path, then we are done. if (ruleList == null || ruleList.isEmpty()) return; if (DEBUG) log("\t%d rules found for TEXT...", ruleList.size()); String text = xpp.getText(); // Give the parsed text to all matching IRules for this path for (int i = 0, size = ruleList.size(); i < size; i++) { IRule<T> rule = ruleList.get(i); if (DEBUG) log("\t\tRunning Rule: %s", rule); rule.handleParsedCharacters(this, text, userObject); } } /** * Used to process a {@link org.xmlpull.v1.XmlPullParser#END_TAG} event. * * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. */ protected void doEndTag(T userObject) { // Get the rules for the current path List<IRule<T>> tagRuleList = tagRuleMap.get(location .getCachedHashCode()); // If there are no rules for the current path, then we are done. if (tagRuleList != null && !tagRuleList.isEmpty()) { if (DEBUG) log("\t%d TAG rules found for END_TAG...", tagRuleList.size()); // Process the TAG rules for (int i = 0, size = tagRuleList.size(); i < size; i++) { IRule<T> rule = tagRuleList.get(i); if (DEBUG) log("\t\tRunning TAG Rule: %s", rule); rule.handleTag(this, false, userObject); } } // Update parser location location.pop(); if (DEBUG) log("END_TAG: %s", location); } /** * Used to process a {@link org.xmlpull.v1.XmlPullParser#END_DOCUMENT} event. * <p/> * By default this method simply logs a debug statement if debugging is * enabled, but this stub is provided to make overriding the default * behavior easier if desired. * * @param userObject * The user-supplied object passed through from this parse method * to the matching {@link IRule}'s <code>handleXXX</code> method * when a match is found, or <code>null</code> if no user object * is needed. Passing through a user-object is just meant as a * convenience for giving the handler methods on the * {@link IRule}'s access to objects like DAOs that can be used * to persist or process parsed data easily. */ protected void doEndDocument(T userObject) { if (DEBUG) log("END_DOCUMENT, Parsing COMPLETE"); } /** * Simple and fast class used to mock the behavior of a stack in the form of * a string for the purposes of "pushing" and "popping" the parser's current * location within an XML document as it processes START and END_TAG events. * <p/> * Performance is optimized by using a {@link StringBuilder} who's length is * chopped (which just adjusts an <code>int</code> value) to simulate a * "pop" off the top. * <h3>Performance</h3> * As of SJXP 2.0 String object creation and char[] duplication (e.g. * {@link System#arraycopy(Object, int, Object, int, int)}) has been * completely removed and replaced with using simple integer hash codes. * <p/> * The performance improvement is huge over the original toString-based * method of matching {@link IRule}'s <code>locationPath</code>s against the * parser's current location. * * @author Riyad Kalla (software@thebuzzmedia.com) */ class Location { private static final int HASH_CODE_CACHE_SIZE = 512; private int hashCode; private Integer[] hashCodeCache; private StringBuilder path; private List<Integer> lengthList; /** * Creates a new empty location. */ public Location() { hashCode = 0; hashCodeCache = new Integer[HASH_CODE_CACHE_SIZE]; path = new StringBuilder(256); lengthList = new ArrayList<Integer>(16); } /** * Overridden to calculate the hash code of this location using the * exact same hash code calculation that {@link String#hashCode()} uses. * This allows us to say a <code>String</code> with the content * "/library/book/title" is equal to an instance of this class * representing the same location when doing lookups in a {@link java.util.Map}. * <p/> * This method calculates the hash code and then caches it, followup * calls to {@link #push(String, String)} or {@link #pop()} invalidate * the cached hash code allowing it to be recalculated again on the next * call. */ @Override public int hashCode() { /* * If the hash code is already 0 and our path is empty, there is * nothing to compute so the hash code stays 0. Otherwise we drop * into the for-loop and calculate the String-equivalent hash code. */ if (hashCode == 0 && path.length() > 0) { for (int i = 0, length = path.length(); i < length; i++) { hashCode = 31 * hashCode + path.charAt(i); } } return hashCode; } /** * Used to get a cached {@link Integer} version of the <code>int</code> * {@link #hashCode()} return value. * <p/> * To avoid unnecessary {@link Integer} allocations, this method caches * up to a certain number of {@link Integer} instances, re-using them * every time the same hash code value comes back up and creating new * instances when it doesn't. * <p/> * If a larger number of {@link Integer} instances are created than the * underlying cache can hold, then a new instance will be created and * returned like normal. * <h3>Design</h3> * The reason this works so well for parsing XML is because of the * nested, tag-matching structure of XML. When considering unique paths * inside of an XML doc (e.g. "/library", "/library/book", etc.) there * are typically not that many; maybe 20, 50 or less than a 100 in most * cases. * <p/> * Once the hash code {@link Integer} values for these unique paths is * created and cached, once we re-encounter that path again and again, * we don't need to recreate that hash code {@link Integer}, we can just * use the one from the previous occurrence. * * @return a cached {@link Integer} version of the <code>int</code> * {@link #hashCode()} return value. */ public Integer getCachedHashCode() { // Recalculate the hash code hashCode(); // Figure out the index, in our cache, where this value WOULD be. int index = hashCode % hashCodeCache.length; // Absolute value only if (index < 0) index = -index; // Get the Integer we think represents our value. Integer value = hashCodeCache[index]; // If we haven't created an Integer for this value yet, do it now. if (value == null) hashCodeCache[index] = (value = Integer.valueOf(hashCode)); /* * If a collision has occurred and we have filled up our cache * already and the Integer we grabbed doesn't represent our int * value, forget the cache and just create a new Integer the old * fashion way and return it. * * The hope is that the cache is always large enough that we only * ever hit it and have no misses like this. */ else if (hashCode != value.intValue()) value = Integer.valueOf(hashCode); return value; } /** * Used to clear all the internal state of the location. */ public void clear() { hashCode = 0; hashCodeCache = new Integer[HASH_CODE_CACHE_SIZE]; path.setLength(0); lengthList.clear(); } /** * "Pushes" a new local name and optional namespace URI onto the "stack" * by appending it to the current location path that represents the * parser's location inside of the XML doc. * * @param localName * The local name of the tag (e.g. "title"). * @param namespaceURI * Optionally, the full qualifying namespace URI for this * tag. */ public void push(String localName, String namespaceURI) { // Clear the hash code cache first to be safe. hashCode = 0; // Remember the length before we inserted this last entry lengthList.add(path.length()); // Add separator path.append('/'); // Add the namespace URI if there is one. if (namespaceURI != null && namespaceURI.length() > 0) path.append('[').append(namespaceURI).append(']'); // Append the local name path.append(localName); } /** * "Pops" the last pushed path element off the "stack" by re-adjusting * the {@link StringBuilder}'s length to what it was before the last * element was appended. * <p/> * This effectively chops the last element off the path without doing a * more costly {@link StringBuilder#delete(int, int)} operation that * would incur a call to * {@link System#arraycopy(Object, int, Object, int, int)} by simply * adjusting a single <code>int</code> counter inside of * {@link StringBuilder}. */ public void pop() { // Clear the hash code cache first to be safe. hashCode = 0; // Get the length before the last insertion Integer lastLength = lengthList.remove(lengthList.size() - 1); // 'Pop' the last insertion by cropping the length to exclude it. path.setLength(lastLength); } } }