XmlReader.java example

Explorer
TotalCrossSDK-master
- TotalCrossSDK
  - docs
    - companion_resources
      - listings
        HelloWorld.java
        TestConcurrent.java
  - src
    - tc
    - totalcross
/*********************************************************************************
 *  TotalCross Software Development Kit                                          *
 *  Copyright (C) 2003-2004 Pierre G. Richard                                    *
 *  Copyright (C) 2003-2012 SuperWaba Ltda.                                      *
 *  All Rights Reserved                                                          *
 *                                                                               *
 *  This library and virtual machine is distributed in the hope that it will     *
 *  be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of    *
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                         *
 *                                                                               *
 *  This file is covered by the GNU LESSER GENERAL PUBLIC LICENSE VERSION 3.0    *
 *  A copy of this license is located in file license.txt at the root of this    *
 *  SDK or can be downloaded here:                                               *
 *  http://www.gnu.org/licenses/lgpl-3.0.txt                                     *
 *                                                                               *
 *********************************************************************************/



package totalcross.xml;

import totalcross.io.Stream;

   /**
    * Used to read HTML or XML documents, reporting events to handlers (for
    * example, {@link ContentHandler}).
    * <P>
    * <I><B>Note:</B> While in the SAX 2.0 spirit, this implementation is not
    * fully compliant.  Speed and footprint took precedence over what the
    * author judged being details.</I>
    * </P>
    * <P>
    * Unlike SAX, reporting tag names, like in
    * {@link ContentHandler#startElement}, passes an integral
    * <code><B>tag code</B></code> rather than the name itself.  This
    * is, again, for performance reasons.  Comparing integers vs. strings is
    * notably more efficient and tag name comparison is heavily used for XML
    * applications.
    * <P>
    * The <code>tag code</code> must uniquely identify the name of the
    * tag.  The default implementation — see {@link #getTagCode} in
    * this class — simply hashes the tag name.  It can be
    * overridden to suit specific needs.
    * <P>
    * Tag names should be translated to tag codes as soon as they are known,
    * when reading the DTD for instance, or computed in advance and saved into a
    * static correspondence table. 
    *
    * @author Pierre G. Richard
    */
public class XmlReader extends XmlTokenizer
{
   // Reader states.  They record what the previous tokenizer event was, so
   // that the next event knows what is still pending to be reported.

   // Nothing pending: set by the parse methods before tokenizing, after a
   // comment or an empty tag, and when character data left the buffer empty.
   private static final int     SPRING_STATE    = 0;
   // A start tag name was found; its startElement event has not been sent yet
   // (attributes may still follow).  The next event reports it.
   private static final int     START_TAG_STATE = 1;
   // An end tag was just reported.
   private static final int     END_TAG_STATE   = 2;
   // The pcdata buffer holds character data not yet reported.
   private static final int     PCDATA_STATE    = 3;

   /**
    * hash ID of current tag name, set by <code>foundStartTagName</code> or
    * <code>foundEndTagName</code>
    */
   protected int                tagNameHashId;

   // Accumulates character data until reportData hands it to the handler.
   private StringBuffer         pcdata;                                                //CKC
   // Current state: one of the *_STATE constants above.
   private int                  state;
   // Nesting counter driven by setNewlineSignificant; while > 0, storeData
   // keeps white spaces verbatim instead of coalescing them.
   private int                  newlineSignificant;
   // Attributes of the tag being tokenized; passed to startElement, then cleared.
   private AttributeList        attList;
   // Name of an attribute whose value has not been seen yet, or null if none.
   private String               attrName;
   // Registered content handler; may be null (the parse methods that
   // tokenize directly then do nothing).
   private ContentHandler       cntHandler;

   /**
    * Creates a reader with an empty attribute list and an empty character
    * data buffer (initial capacity of 1000 characters).  No content handler
    * is registered at this point.
    */
   public XmlReader()
   {
      this.attList = new AttributeList();
      this.pcdata = new StringBuffer(1000);
   }

   /**
    * Stores the case-sensitivity flag on this reader's AttributeList.
    *
    * @param caseInsensitive
    *           true if the get/set methods of the AttributeList must be case
    *           insensitive
    */
   public void setCaseInsensitive(boolean caseInsensitive) // guich@tc113_29
   {
      this.attList.caseInsensitive = caseInsensitive;
   }
   
    /**
     * Registers the content handler that receives the events of this reader.
     * <p>
     * While no handler is registered, the <code>parse</code> methods taking
     * a Stream or a byte array return without tokenizing their input.
     * </p>
     * <p>
     * The handler is looked up each time an event is reported, so a handler
     * registered in the middle of a parse is used for the following events.
     * </p>
     * <p>
     * Passing <code>null</code> is accepted and un-registers the current
     * handler; no exception is thrown by this method.  The event callbacks
     * do not check for null, so do not un-register the handler while a parse
     * is in progress.
     * </p>
     *
     * @param cntHandler
     *           the content handler, or null to un-register the current one
     * @see #getContentHandler
     */
   public void setContentHandler(ContentHandler cntHandler)
   {
      this.cntHandler = cntHandler;
   }

    /**
     * Installs the filter applied to the attributes entered in this reader's
     * AttributeList.  The call is delegated to
     * <code>AttributeList.setFilter</code>.
     *
     * @param filter
     *           the AttributeList.Filter to install, or null to remove the
     *           current one
     * @return the filter that was previously installed, or null if none was set
     */
   public AttributeList.Filter setAttributeListFilter(AttributeList.Filter filter)
   {
      final AttributeList.Filter previous = attList.setFilter(filter);
      return previous;
   }

    /**
     * Gives access to the content handler currently registered.
     *
     * @return the registered content handler, or null when none has been
     *         registered
     * @see #setContentHandler
     */
   public ContentHandler getContentHandler()
   {
      return this.cntHandler;
   }

    /**
     * Parses the XML document read from a Stream.
     * <p>
     * General contract shared by all <code>parse</code> methods:
     * <ul>
     * <li>a parse must not be started while another one is in progress on
     * the same reader (use a new XmlReader for each nested document); once a
     * parse is complete the reader may be reused, possibly with a different
     * input source;</li>
     * <li>the document is reported through the registered content handler;</li>
     * <li>the call is synchronous: it returns when parsing has ended.  A
     * client that wants to stop early should throw an exception.</li>
     * </ul>
     * <p>
     * The reader state and the newline-significance counter are reset before
     * parsing.  If no content handler is registered, nothing is read.
     *
     * @param input
     *           the input source for the top-level XML document
     * @exception SyntaxException
     * @throws totalcross.io.IOException
     * @see #setContentHandler
     */
   public final void parse(Stream input) throws SyntaxException, totalcross.io.IOException
   {
      state = SPRING_STATE;
      newlineSignificant = 0;
      if (cntHandler == null)
         return; // nobody to report to
      tokenize(input);
   }

    /**
     * Parses an XML document from a stream whose first bytes have already
     * been read into a buffer.
     * <P>
     * Typical use: the document is embedded in an HTTP stream and part of it
     * was buffered while reading the headers.
     * <P>
     * See the general contract of {@link XmlReader#parse(Stream)}.  As for
     * that method, the reader is reset first and nothing is read when no
     * content handler is registered.
     *
     * @param input
     *           stream to parse
     * @param buffer
     *           buffer, already filled with bytes read from the input stream
     * @param start
     *           starting position in the buffer
     * @param end
     *           ending position in the buffer
     * @param pos
     *           read position of the byte at offset 0 in the buffer
     * @exception SyntaxException
     * @throws totalcross.io.IOException
     */
   public final void parse(Stream input, byte[] buffer, int start, int end, int pos) throws SyntaxException,
         totalcross.io.IOException
   {
      state = SPRING_STATE;
      newlineSignificant = 0;
      if (cntHandler == null)
         return; // nobody to report to
      tokenize(input, buffer, start, end, pos);
   }

    /**
     * Parses an XmlReadable by asking it to read itself through this reader.
     * <P>
     * Impl. Note: this is only a convenience.  Writing
     * <code>rdr.parse(doc)</code> reads more naturally than
     * <code>doc.readXml(rdr)</code>, which is what this method calls.
     *
     * @param input
     *           the input source for the top-level XML document
     * @exception SyntaxException
     * @throws totalcross.io.IOException
     */
   public final void parse(XmlReadable input) throws SyntaxException, totalcross.io.IOException
   {
      final XmlReader self = this;
      input.readXml(self);
   }

    /**
     * Parses XML data held in a region of a byte array.
     * <P>
     * Only the <code>count</code> bytes starting at <code>offset</code> are
     * tokenized.  (Earlier revisions ignored both arguments and always
     * tokenized the whole array.)
     * <P>
     * See the general contract of {@link XmlReader#parse(Stream)}.  The
     * reader state and the newline-significance counter are reset first, and
     * nothing is tokenized when no content handler is registered.
     *
     * @param input
     *           byte array to parse
     * @param offset
     *           position of the first byte in the array
     * @param count
     *           number of bytes to parse
     * @exception SyntaxException
     */
   public final void parse(byte[] input, int offset, int count) throws SyntaxException
   {
      newlineSignificant = 0;
      state = SPRING_STATE;
      if (cntHandler != null)
         tokenize(input, offset, count); // honour the requested region, not the whole array
   }

    /**
     * Enables or disables the coalescing of white spaces, according to HTML
     * rules.
     * <P>
     * White spaces are any character less or equal to the ascii space (0x20).
     * <P>
     * This method allows processing the contents of pre-formatted lines,
     * such as the contents of the &lt;PRE&gt; tag.  The <code>parse</code>
     * methods reset the setting, so newlines are not significant when parsing
     * starts: setNewlineSignificant must be called <b>after</b> the parsing
     * has started.  For example, to make all newlines significant:
     *
     * <PRE>
     * class MyXmlReader extends XmlReader
     * {
     *    public void foundStartOfInput(byte input[], int offset, int count)
     *    {
     *       setNewlineSignificant(true);
     *    }
     * }
     * </PRE>
     *
     * <P>
     * <U>Note:</U> this is a "stacked" call: <code>true</code> increments a
     * counter, <code>false</code> decrements it, and newlines are significant
     * while the counter is greater than zero.
     *
     * <PRE>
     * setNewlineSignificant(true);  // newlines are significant - stack is 1
     * setNewlineSignificant(true);  // newlines are significant - stack is 2
     * setNewlineSignificant(false); // newlines are still significant - stack is 1
     * setNewlineSignificant(false); // newlines are no longer significant - stack is 0
     * </PRE>
     *
     * @param val
     *           true if newline characters must be significant, false if they
     *           must be collapsed according to HTML rules.
     */
   public void setNewlineSignificant(boolean val)
   {
      if (val)
         ++newlineSignificant;
      else
         --newlineSignificant;
   }

   /**
    * Computes the tag code that identifies a tag name.
    * <P>
    * The returned value is what ContentHandlers receive in place of the tag
    * name.  Derived classes may override this method.
    * <P>
    * The default code is a case-insensitive hash: each byte is upper-cased,
    * then folded as <code>code = 31 * code + byte</code>.
    * <P>
    * Impl Note: upper-casing relies on the bytes being in the range
    * [0-9A-Za-z], so that (byte >= 'a') means "is a lower case letter".
    * The count is also known to be &gt; 0: the first byte is always read.
    *
    * @param b
    *           byte array containing the bytes to be hashed
    * @param offset
    *           position of the first byte in the array
    * @param count
    *           number of bytes to be hashed
    * @return the corresponding hash code
    */
   protected int getTagCode(byte b[], int offset, int count)
   {
      final int caseShift = 'a' - 'A'; // distance between a lower case letter and its upper case
      int code = b[offset];
      if (code >= 'a')
         code -= caseShift;
      for (int k = 1; k < count; k++)
      {
         int c = b[offset + k];
         if (c >= 'a')
            c -= caseShift;
         code = 31 * code + c;
      }
      return code;
   }

   /**
    * Override of XmlTokenizer: the name of a start tag has been found.
    * <P>
    * Whatever was pending (a previous start tag, or character data) is
    * reported first.  The new tag is only remembered here; it is reported by
    * a later event, once its attributes have been collected.
    */
   public void foundStartTagName(byte buffer[], int offset, int count)
   {
      if (state == START_TAG_STATE)
         reportStartTag();
      else if (state == PCDATA_STATE)
         reportData(false);

      tagNameHashId = getTagCode(buffer, offset, count);
      state = START_TAG_STATE;
   }

   /**
    * Override of XmlTokenizer: the name of an end tag has been found.
    * <P>
    * Whatever was pending (a start tag, or character data) is reported
    * first, then the end of the element is reported with the code of the
    * tag name.
    */
   public void foundEndTagName(byte buffer[], int offset, int count)
   {
      if (state == START_TAG_STATE)
      {
         reportStartTag();
      }
      else if (state == PCDATA_STATE)
      {
         // Trailing spaces are deliberately kept.  Stripping them would be
         // more correct, but the (badly marked up)
         // "once <i>upon </i>a time" would then lose the space between
         // "upon" and "a".
         reportData(false);
      }

      tagNameHashId = getTagCode(buffer, offset, count);
      cntHandler.endElement(tagNameHashId);
      state = END_TAG_STATE;
   }

   /**
    * Override of XmlTokenizer: an empty tag has been closed.
    * <P>
    * The pending start tag is reported, immediately followed by the end of
    * the same element: &lt;BR/&gt; is reported like &lt;BR&gt;&lt;/BR&gt;.
    */
   public final void foundEndEmptyTag()
   {
      this.reportStartTag();
      this.cntHandler.endElement(this.tagNameHashId);
      this.state = SPRING_STATE;
   }

   /**
    * Override of XmlTokenizer: a run of character data has been found.
    * <P>
    * A pending start tag is reported first.  The data is then stored (not
    * reported yet); leading white spaces are stripped when the data directly
    * follows a start tag or when nothing was pending.
    */
   public final void foundCharacterData(byte buffer[], int offset, int count)
   {
      if (state == START_TAG_STATE)
         reportStartTag();

      final boolean stripLeading;
      switch (state)
      {
         case START_TAG_STATE:
         case SPRING_STATE:
            stripLeading = true;
            break;
         default:
            stripLeading = false;
            break;
      }
      storeData(buffer, offset, count, stripLeading);

      // stay in the "nothing pending" state if no data was actually kept
      if (pcdata.length() > 0)
         state = PCDATA_STATE;
      else
         state = SPRING_STATE;
   }

   /**
    * Override of XmlTokenizer: a single character has been found.
    * <P>
    * A pending start tag is reported first; the character is then appended
    * as is to the pending character data.
    * <P>
    * Impl Note: this assumes the found character is encoded in ISO 8859-1;
    * later, the appropriate encoder will be needed.
    */
   public final void foundCharacter(char charFound)
   {
      final boolean startTagPending = (state == START_TAG_STATE);
      if (startTagPending)
         reportStartTag();
      state = PCDATA_STATE;
      pcdata.append(charFound); // kcchan@554_39
   }

   /**
    * Override of XmlTokenizer: an attribute name has been found.
    * <P>
    * If the previous attribute name is still waiting for a value, it is
    * first entered in the attribute list with an empty value.  The new name
    * is then remembered until its value (or the next event) shows up.
    */
   public final void foundAttributeName(byte buffer[], int offset, int count)
   {
      flushAttribute();
      final String name = new String(buffer, offset, count);
      attrName = name;
   }

   /**
    * Override of XmlTokenizer: the value of the current attribute has been
    * found.
    * <P>
    * The remembered attribute name is entered in the attribute list together
    * with this value and its delimiter; no attribute name is pending
    * afterwards.
    */
   public final void foundAttributeValue(byte buffer[], int offset, int count, byte dlm)
   {
      final String value = new String(buffer, offset, count);
      attList.addAttribute(attrName, value, dlm);
      attrName = null;
   }

   /**
    * Override of XmlTokenizer: a comment has been found.
    * <P>
    * Whatever was pending (a start tag, or character data) is reported
    * first, then the text of the comment is reported to the handler.
    */
   public final void foundComment(byte buffer[], int offset, int count)
   {
      if (state == START_TAG_STATE)
         reportStartTag();
      else if (state == PCDATA_STATE)
         reportData(false);

      final String text = new String(buffer, offset, count);
      cntHandler.comment(text);
      state = SPRING_STATE;
   }

   /**
    * Override of XmlTokenizer: the end of the input has been reached.
    * <P>
    * Reports what is still pending: either the last start tag, or the last
    * character data (this time asking for its trailing spaces to be
    * stripped).
    */
   public final void foundEndOfInput(int count)
   {
      if (state == START_TAG_STATE)
         reportStartTag();
      else if (state == PCDATA_STATE)
         reportData(true);
   }
   
   /**
    * Override of XmlTokenizer: a declaration has been found.
    * <P>
    * Pending events are settled first, exactly as for a comment: a start
    * tag that was not reported yet is reported, or pending character data is
    * reported, so that the handler sees the events in document order.  The
    * start tag is reported only when one is actually pending, and only once
    * (earlier revisions reported it unconditionally and left it pending, so
    * it was reported a second time by the next event).
    * <P>
    * If the declaration starts with <code>[CDATA[</code>, the bytes between
    * that prefix and the last two bytes of the declaration (presumably the
    * closing <code>]]</code> - to be confirmed against XmlTokenizer) are
    * reported through <code>ContentHandler.cdata</code> along with the code
    * of the current tag.  Other declarations are not reported.
    *
    * @param input
    *           byte array containing the declaration
    * @param offset
    *           position of the first byte of the declaration in the array
    * @param count
    *           number of bytes of the declaration
    * @since TotalCross 1.27
    */
   protected void foundDeclaration(byte[] input, int offset, int count)
   {
      switch (state)
      {
         case START_TAG_STATE:
            reportStartTag();
            break;
         case PCDATA_STATE:
            reportData(false);
            break;
      }

      // "[CDATA[" takes 7 bytes and 2 more are dropped at the end: a shorter
      // declaration cannot be a CDATA section (and would yield a negative length)
      if (count >= 9 && new String(totalcross.sys.Convert.charConverter.bytes2chars(input, offset, 7)).equals("[CDATA["))
         cntHandler.cdata(tagNameHashId,
               new String(totalcross.sys.Convert.charConverter.bytes2chars(input, offset + 7, count - 9)));

      // nothing is pending anymore (same as after a comment)
      state = SPRING_STATE;
   }

    /**
     * Appends character data to the pending PCDATA buffer.
     * <P>
     * When newlines are significant (see setNewlineSignificant), the bytes
     * are appended verbatim, except that leading white spaces are skipped if
     * <code>stripLeadingSpaces</code> is true.
     * <P>
     * Otherwise white spaces are coalesced according to HTML rules: each run
     * of white spaces (any byte less or equal to ' ', which covers \r, \n and
     * \t) is replaced by exactly one space.  A leading run is dropped if
     * <code>stripLeadingSpaces</code> is true; a trailing run is kept (as one
     * space).  Data made only of white spaces therefore stores nothing if
     * <code>stripLeadingSpaces</code> is true, and exactly one space if it is
     * false.
     *
     * <PRE>
     * Examples, with stripLeadingSpaces set to false:
     *    " AB\r\nC " -> " AB C "
     *    "A\tB CD"   -> "A B CD"
     * and with stripLeadingSpaces set to true:
     *    " AB\r\nC " -> "AB C "
     * </PRE>
     *
     * @param input
     *           the byte array that must be ws-coalesced
     * @param offset
     *           position of the first byte to ws-coalesce in the array
     * @param count
     *           number of bytes to ws-coalesce
     * @param stripLeadingSpaces
     *           true if leading white spaces must be dropped
     */
   private void storeData(byte input[], int offset, int count, boolean stripLeadingSpaces)
   {
      final int end = offset + count;
      int pos = offset;

      if (newlineSignificant > 0)
      {
         // verbatim mode: at most the leading white spaces are skipped
         if (stripLeadingSpaces)
            while (pos < end && (input[pos] & 0xFF) <= ' ')
               ++pos;
         if (pos < end)
            pcdata.append(totalcross.sys.Convert.charConverter.bytes2chars(input, pos, end - pos)); // kcchan@554_39
         return;
      }

      // coalescing mode: a kept leading run of white spaces becomes one space
      if (!stripLeadingSpaces && count > 0 && (input[offset] & 0xFF) <= ' ')
         pcdata.append(' '); // kcchan@554_39

      while (pos < end)
      {
         if ((input[pos] & 0xFF) <= ' ')
         {
            ++pos; // white space: already accounted for
            continue;
         }
         // copy one run of non-space bytes
         final int wordStart = pos;
         do
            ++pos;
         while (pos < end && (input[pos] & 0xFF) > ' ');
         pcdata.append(totalcross.sys.Convert.charConverter.bytes2chars(input, wordStart, pos - wordStart)); // kcchan@554_39
         // the run stopped on a white space: the whole following run counts for one space
         if (pos < end)
            pcdata.append(' '); // kcchan@554_39
      }
   }

   /**
    * Called when an entire chunk of PCDATA has been recognized: reports the
    * pending character data to the handler and empties the buffer.
    * <P>
    * Nothing is reported if the buffer is empty, or if it only holds white
    * spaces that are stripped.  Only trailing white spaces (characters less
    * or equal to ' ') are ever stripped; leading ones are left untouched,
    * since storeData already decided whether to keep them.
    *
    * @param stripTrailingSpaces
    *           if false, the data will be followed by a space had the source data ended with one or more white spaces;
    *           if true, the data ends at the last non-space character.
    */
   private void reportData(boolean stripTrailingSpaces)
   {
      if (pcdata.length() == 0) // kcchan@554_39
         return;

      String s = pcdata.toString();
      if (stripTrailingSpaces)
      {
         // not String.trim(): it would strip the leading white spaces as well
         int end = s.length();
         while (end > 0 && s.charAt(end - 1) <= ' ')
            --end;
         s = s.substring(0, end);
      }
      if (s.length() > 0)
         cntHandler.characters(s);
      pcdata.setLength(0);
   }

   /**
    * Called when an attribute had no value: enters the pending attribute
    * name, if any, in the attribute list with an empty value and a zero
    * delimiter.
    * <P>
    * Note that an attribute name not followed by a value assignment (Ex:
    * <code>&lt;dl compact&gt;</code>) is perfectly legal HTML/SGML.
    */
   private void flushAttribute()
   {
      if (attrName == null)
         return; // no attribute is waiting for its value

      attList.addAttribute(attrName, "", (byte) 0);
      attrName = null;
   }

   /**
    * Called when a start tag has been completely tokenized.
    * <P>
    * A last attribute without value is entered in the attribute list, the
    * start of the element is reported to the handler with the current tag
    * code and the collected attributes, and the attribute list is emptied
    * for the next tag.
    */
   private void reportStartTag()
   {
      this.flushAttribute();
      this.cntHandler.startElement(this.tagNameHashId, this.attList);
      this.attList.clear();
   }
}