/*********************************************************************************
* TotalCross Software Development Kit *
* Copyright (C) 2003-2004 Pierre G. Richard *
* Copyright (C) 2003-2012 SuperWaba Ltda. *
* All Rights Reserved *
* *
* This library and virtual machine is distributed in the hope that it will *
* be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* *
* This file is covered by the GNU LESSER GENERAL PUBLIC LICENSE VERSION 3.0 *
* A copy of this license is located in file license.txt at the root of this *
* SDK or can be downloaded here: *
* http://www.gnu.org/licenses/lgpl-3.0.txt *
* *
*********************************************************************************/
package totalcross.xml;
import totalcross.io.Stream;
/**
 * Used to read HTML or XML documents, reporting events to handlers (for
 * example, {@link ContentHandler}).
 * <P>
 * <I><B>Note:</B> While in the SAX 2.0 spirit, this implementation is not
 * fully compliant. Speed and footprint took precedence over what the
 * author judged being details.</I>
 * </P>
 * <P>
 * Unlike SAX, reporting tag names, like in
 * {@link ContentHandler#startElement}, passes an integral
 * <code><B>tag code</B></code> rather than the name itself. This
 * is, again, for performance reasons. Comparing integers vs. strings is
 * notably more efficient and tag name comparison is heavily used for XML
 * applications.
 * <P>
 * The <code>tag code</code> must uniquely identify the name of the
 * tag. The default implementation (see {@link #getTagCode} in
 * this class) simply hashes the upper-cased tag name. It can be
 * overridden to suit specific needs.
 * <P>
 * Tag names should be translated to tag codes as soon as they are known,
 * when reading the DTD for instance, or computed in advance and saved into a
 * static correspondence table.
 * <P>
 * Implementation overview: the tokenizer callbacks (<code>foundXxx</code>)
 * drive a small state machine. A start tag is not reported as soon as its
 * name is seen, because its attributes are still to come; it is reported
 * when the next event arrives. Likewise, character data is accumulated and
 * reported as one chunk when the next non-text event arrives.
 *
 * @author Pierre G. Richard
 */
public class XmlReader extends XmlTokenizer
{
   /**
    * Initial state (set by every <code>parse</code> method); also entered after
    * a comment or a declaration, and when character data produced no output.
    * Character data found in this state has its leading white space stripped.
    */
   private static final int SPRING_STATE = 0;
   /**
    * A start tag name has been seen but <code>startElement</code> has not been
    * reported yet (attributes may still be collected).
    */
   private static final int START_TAG_STATE = 1;
   /**
    * An end tag (or an empty tag) has just been reported. Character data found
    * in this state keeps its leading white space.
    */
   private static final int END_TAG_STATE = 2;
   /** Character data is buffered in <code>pcdata</code> and not yet reported. */
   private static final int PCDATA_STATE = 3;

   /**
    * hash ID of current tag name, set by <code>foundStartTagName</code> or
    * <code>foundEndTagName</code>
    */
   protected int tagNameHashId;

   /** Accumulates character data until it is reported by <code>reportData</code>. */
   private final StringBuffer pcdata; //CKC
   /** Current state, one of the <code>*_STATE</code> constants. */
   private int state;
   /** "Stacked" counter: white space is kept untouched while it is greater than 0. */
   private int newlineSignificant;
   /** Attributes collected for the start tag currently being tokenized. */
   private final AttributeList attList;
   /** Name of an attribute whose value has not been seen yet, or null. */
   private String attrName;
   /** Receiver of the parsing events; may be null, in which case nothing is parsed. */
   private ContentHandler cntHandler;

   /** Creates a reader with an empty attribute list and no content handler. */
   public XmlReader()
   {
      attList = new AttributeList();
      pcdata = new StringBuffer(1000);
   }

   /** Set to true if you want the get/set methods of the AttributeList to be case insensitive. */
   public void setCaseInsensitive(boolean caseInsensitive) // guich@tc113_29
   {
      attList.caseInsensitive = caseInsensitive;
   }

   /**
    * Allow an application to register a content event handler.
    * <p>
    * If the application does not register a content handler (or registers
    * null), the <code>parse</code> methods that take a stream or a byte array
    * return without tokenizing their input.
    * </p>
    * <p>
    * Applications may register a new or different handler in the middle
    * of a parse; the new handler is used for all subsequent events.
    * </p>
    *
    * @param cntHandler
    *           The content handler, or null to remove the current one.
    * @see #getContentHandler
    */
   public void setContentHandler(ContentHandler cntHandler)
   {
      this.cntHandler = cntHandler;
   }

   /**
    * Set an AttributeList.Filter to filter the attribute entered in the
    * AttributeList
    *
    * @param filter
    *           AttributeList.Filter to set, or null if the current
    *           AttributeList filter must be removed
    * @return The previous AttributeList.Filter or null if none was set
    */
   public AttributeList.Filter setAttributeListFilter(AttributeList.Filter filter)
   {
      return attList.setFilter(filter);
   }

   /**
    * Return the current content handler.
    *
    * @return The current content handler, or null if none has been
    *         registered.
    * @see #setContentHandler
    */
   public ContentHandler getContentHandler()
   {
      return cntHandler;
   }

   /**
    * Parse an XML document from a Stream.
    * <p>
    * The application can use this method to instruct the XML reader to begin
    * parsing an XML document from reading a Stream.
    * <p>
    * Here is the general contract for all <code>parse</code> methods.
    * <p>
    * Applications may not invoke this method while a parse is in progress
    * (they should create a new XmlReader instead for each nested XML
    * document). Once a parse is complete, an application may reuse the same
    * XmlReader object, possibly with a different input source.
    * </p>
    * <p>
    * During the parse, the XmlReader will provide information about the XML
    * document through the registered event handlers. If no content handler is
    * registered, the input is not tokenized at all.
    * </p>
    * <p>
    * This method is synchronous: it will not return until the parsing has ended.
    * If a client application wants to terminate the parsing early, it should
    * throw an exception.
    * </p>
    *
    * @param input
    *           The input source for the top-level XML document.
    * @exception SyntaxException
    *            propagated from the tokenizer
    * @throws totalcross.io.IOException
    *            propagated from the tokenizer
    * @see #setContentHandler
    */
   public final void parse(Stream input) throws SyntaxException, totalcross.io.IOException
   {
      resetParseState();
      if (cntHandler != null)
      {
         tokenize(input);
      }
   }

   /**
    * Parse an XML document from an already buffered stream.
    * <P>
    * Unlike the general method above, this method requires more arguments.
    * It should be used when the HTML document is embedded within an HTTP
    * stream.
    * <P>
    * See the general contract of {@link XmlReader#parse(Stream)}.
    *
    * @param input
    *           stream to parse
    * @param buffer
    *           buffer, already filled with bytes read from the input stream
    * @param start
    *           starting position in the buffer
    * @param end
    *           ending position in the buffer
    * @param pos
    *           read position of the byte at offset 0 in the buffer
    * @exception SyntaxException
    *            propagated from the tokenizer
    * @throws totalcross.io.IOException
    *            propagated from the tokenizer
    */
   public final void parse(Stream input, byte[] buffer, int start, int end, int pos) throws SyntaxException,
         totalcross.io.IOException
   {
      resetParseState();
      if (cntHandler != null)
      {
         tokenize(input, buffer, start, end, pos);
      }
   }

   /**
    * Parse an XmlReadable
    * Impl. Note: This is just for convenience. It is more natural to write:
    * rdr.parse(doc) than doc.readXml(rdr)
    *
    * @param input
    *           The input source for the top-level XML document.
    * @exception SyntaxException
    *            propagated from <code>input.readXml</code>
    * @throws totalcross.io.IOException
    *            propagated from <code>input.readXml</code>
    */
   public final void parse(XmlReadable input) throws SyntaxException, totalcross.io.IOException
   {
      input.readXml(this);
   }

   /**
    * Parse XML data from an array of bytes, offset and count.
    * <P>
    * See the general contract of {@link XmlReader#parse(Stream)}.
    *
    * @param input
    *           byte array to parse
    * @param offset
    *           position of the first byte in the array
    * @param count
    *           number of bytes to parse
    * @exception SyntaxException
    *            propagated from the tokenizer
    */
   public final void parse(byte[] input, int offset, int count) throws SyntaxException
   {
      resetParseState();
      if (cntHandler != null)
      {
         // Honor the requested window: only bytes [offset, offset + count) are parsed.
         tokenize(input, offset, count);
      }
   }

   /**
    * Enable or disable coalescing white spaces, according to HTML rules.
    * <P>
    * White spaces are any character less or equal to the ascii space (0x20).
    * <P>
    * This method allows to process the contents of pre-formatted lines, such
    * as the contents of the &lt;PRE&gt; tag. When the parsing process starts,
    * newlines are not significant (every <code>parse</code> method resets the
    * counter). Hence, setNewlineSignificant must be called <b>after</b> the
    * parsing has started. For example, to make all newlines significant:
    *
    * <PRE>
    * class MyXmlReader extends XmlReader
    * {
    *    public void foundStartOfInput(byte input[], int offset, int count)
    *    {
    *       setNewlineSignificant(true);
    *    }
    * }
    * </PRE>
    *
    * <P>
    * <U>Note:</U> this is a "stacked" call.
    *
    * <PRE>
    * setNewlineSignificant(true); // newlines are significant - stack is 1
    * setNewlineSignificant(true); // newlines are significant - stack is 2
    * setNewlineSignificant(false); // newlines are still significant - stack is 1
    * setNewlineSignificant(false); // newlines are no more significant again - stack is 0
    * </PRE>
    *
    * @param val
    *           true if newline characters must be significant, false if they
    *           must be collapsed according to HTML rules.
    */
   public void setNewlineSignificant(boolean val)
   {
      newlineSignificant += (val ? 1 : -1);
   }

   /**
    * Method to compute the tag code identifying a tag name.
    * <P>
    * This is the value which is passed to ContentHandler's for reporting a
    * tag name. Derived class may override it.
    * Impl Note: Transforming to uppercase takes into account that the bytes are
    * in the range [0-9A-Za-z]: (ch >= 'a') means "ch is a lower case letter".
    * Also, we *do* know that the count is > 0.
    *
    * @param b
    *           byte array containing the bytes to be hashed
    * @param offset
    *           position of the first byte in the array
    * @param count
    *           number of bytes to be hashed; must be greater than 0
    * @return the corresponding hash code
    */
   protected int getTagCode(byte b[], int offset, int count)
   {
      int i = b[offset];
      if ('a' <= i)
      {
         i -= ('a' - 'A'); // fast toUpper
      }
      while (--count > 0)
      {
         byte ch = b[++offset];
         if ('a' <= ch)
         {
            ch -= ('a' - 'A'); // fast toUpper
         }
         i = (i << 5) - i + ch; // i * 31 + ch
      }
      return i;
   }

   /**
    * Override of XmlTokenizer: a start tag name has been found.
    * Reports whatever was pending, records the new tag code and defers the
    * <code>startElement</code> report until the next event.
    */
   public void foundStartTagName(byte buffer[], int offset, int count)
   {
      flushPending(false);
      tagNameHashId = getTagCode(buffer, offset, count);
      state = START_TAG_STATE;
   }

   /**
    * Override of XmlTokenizer: an end tag name has been found.
    * Reports whatever was pending, then reports <code>endElement</code>.
    */
   public void foundEndTagName(byte buffer[], int offset, int count)
   {
      // Trailing spaces should ideally be stripped before an end tag. Alas: in the
      // (badly marked up) "once <i>upon </i>a time", stripping would lose the space
      // between "upon" and "a". Hence the data is reported with its trailing space.
      flushPending(false);
      tagNameHashId = getTagCode(buffer, offset, count);
      cntHandler.endElement(tagNameHashId);
      state = END_TAG_STATE;
   }

   /**
    * Override of XmlTokenizer: the end of an empty tag has been found.
    * Reports the start tag immediately followed by the matching end tag.
    */
   public final void foundEndEmptyTag()
   {
      reportStartTag();
      cntHandler.endElement(tagNameHashId); // <BR/> is like "<BR></BR>"
      // Same resulting state as a real end tag, so that white space following an
      // inline empty element is kept, exactly as it is after "</BR>".
      state = END_TAG_STATE;
   }

   /**
    * Override of XmlTokenizer: a run of character data has been found.
    * The data is appended to the pending character data; leading white space is
    * stripped when the data directly follows a start tag or when the reader is
    * in its initial state.
    */
   public final void foundCharacterData(byte buffer[], int offset, int count)
   {
      if (state == START_TAG_STATE)
      {
         reportStartTag();
      }
      // reportStartTag() does not modify the state, so START_TAG_STATE is still visible here.
      storeData(buffer, offset, count, (state == START_TAG_STATE) || (state == SPRING_STATE));
      state = (pcdata.length() > 0) ? PCDATA_STATE : SPRING_STATE;
   }

   /**
    * Override of XmlTokenizer: a single character has been found.
    * The character is appended as is to the pending character data.
    * Impl Note: this assumes the found character is encoded in ISO 8859-1
    * later, we will need the appropriate encoder
    */
   public final void foundCharacter(char charFound)
   {
      if (state == START_TAG_STATE)
      {
         reportStartTag();
      }
      pcdata.append(charFound); // kcchan@554_39
      state = PCDATA_STATE;
   }

   /**
    * Override of XmlTokenizer: an attribute name has been found.
    * A previous attribute still waiting for a value is stored with an empty value.
    */
   public final void foundAttributeName(byte buffer[], int offset, int count)
   {
      flushAttribute();
      attrName = decode(buffer, offset, count);
   }

   /**
    * Override of XmlTokenizer: the value of the last attribute name has been found.
    * The name/value pair is added to the attribute list together with the
    * delimiter <code>dlm</code> passed by the tokenizer.
    */
   public final void foundAttributeValue(byte buffer[], int offset, int count, byte dlm)
   {
      attList.addAttribute(attrName, decode(buffer, offset, count), dlm);
      attrName = null;
   }

   /**
    * Override of XmlTokenizer: a comment has been found.
    * Reports whatever was pending, then reports the comment.
    */
   public final void foundComment(byte buffer[], int offset, int count)
   {
      flushPending(false);
      cntHandler.comment(decode(buffer, offset, count));
      state = SPRING_STATE;
   }

   /**
    * Override of XmlTokenizer: the end of the input has been reached.
    * Reports whatever was pending; pending character data is trimmed.
    */
   public final void foundEndOfInput(int count)
   {
      flushPending(true);
   }

   /**
    * Override of XmlTokenizer: a declaration has been found.
    * <P>
    * Whatever was pending (a start tag or character data) is reported first, so
    * that events reach the handler in document order and a start tag is reported
    * only once. Then, if the declaration starts with <code>[CDATA[</code>, its
    * contents (without the 7 leading and the 2 trailing bytes) are reported
    * through <code>ContentHandler.cdata</code> with the current tag code. Other
    * declarations are not reported.
    *
    * @since TotalCross 1.27
    */
   protected void foundDeclaration(byte[] input, int offset, int count)
   {
      flushPending(false);
      // 7 bytes for "[CDATA[" plus 2 trailing bytes: anything shorter cannot be a CDATA section
      if (count >= 9 && decode(input, offset, 7).equals("[CDATA["))
      {
         cntHandler.cdata(tagNameHashId, decode(input, offset + 7, count - 9));
      }
      // Leave START_TAG_STATE / PCDATA_STATE: what was pending has been reported.
      state = SPRING_STATE;
   }

   /** Resets the per-parse state; called at the beginning of a parse. */
   private void resetParseState()
   {
      newlineSignificant = 0;
      state = SPRING_STATE;
   }

   /**
    * Reports the event that is pending in the current state, if any: the start
    * tag in START_TAG_STATE, the buffered character data in PCDATA_STATE.
    * The state itself is left unchanged; callers set the new state.
    *
    * @param stripTrailingSpaces
    *           passed to <code>reportData</code> when character data is pending
    */
   private void flushPending(boolean stripTrailingSpaces)
   {
      if (state == START_TAG_STATE)
      {
         reportStartTag();
      }
      else if (state == PCDATA_STATE)
      {
         reportData(stripTrailingSpaces);
      }
   }

   /**
    * Converts bytes to a String using <code>Convert.charConverter</code>, the
    * same converter used for character data, so that names, attribute values,
    * comments and text are all decoded the same way.
    */
   private static String decode(byte[] bytes, int offset, int count)
   {
      return new String(totalcross.sys.Convert.charConverter.bytes2chars(bytes, offset, count));
   }

   /**
    * Append PCDATA to the pending character data buffer.
    * <P>
    * When newlines are significant, the bytes are appended untouched (after
    * removing leading white spaces if "stripLeadingSpaces" is on). Otherwise,
    * each run of white spaces found inside the byte array <code>input</code>
    * is replaced by one space (' ').
    * <P>
    * If "stripLeadingSpaces" is on, leading spaces are removed. Trailing
    * spaces are not removed.
    * <p>
    * When newlines are not significant and the input is made only of 1 or more
    * white spaces, nothing is appended if stripLeadingSpaces is true, and
    * exactly one space is appended if stripLeadingSpaces is false.
    * <p>
    * Impl Note: This applies the following HTML Rules: - the parser treats \r
    * and \r\n and \n as newlines - newlines, when relevant, are seen as spaces -
    * newlines after start tags and before end tags are ignored (this is a rough
    * simplification of SGML rules) - in HTML, 2 or more spaces are coalesced in
    * one space except if "significantWhiteSpace" is required, as for the
    * &lt;PRE&gt; tag
    *
    * <PRE>
    * Examples: " AB\r\nC " -> " AB C " with stripLeadingSpaces set to false: " AB\r\nC " -> "AB C " with
    * stripLeadingSpaces set to true; "A\tB CD" -> "A B CD"
    * </PRE>
    *
    * @param input
    *           the byte array that must be ws-coalesced
    * @param offset
    *           position of the first byte to ws-coalesce in the array
    * @param count
    *           number of bytes to ws-coalesce
    * @param stripLeadingSpaces
    *           true to drop the white spaces found at the beginning of the input
    */
   private void storeData(byte input[], int offset, int count, boolean stripLeadingSpaces)
   {
      if (newlineSignificant > 0)
      {
         if (stripLeadingSpaces)
         {
            while ((count > 0) && (input[offset] & 0xFF) <= ' ')
            {
               --count;
               ++offset;
            }
         }
         if (count > 0)
         {
            pcdata.append(totalcross.sys.Convert.charConverter.bytes2chars(input, offset, count)); // kcchan@554_39
         }
      }
      else
      {
         int from = offset - 1; // pre-incremented before each read
         if ((!stripLeadingSpaces) && (count > 0) && (input[offset] & 0xFF) <= ' ')
         {
            pcdata.append(' '); // kcchan@554_39 - leading white spaces collapse to one space
         }
         ++count; // compensates the pre-decrement of the loop condition
         while (--count > 0)
         {
            if ((input[++from] & 0xFF) > ' ')
            {
               // start of a word: scan up to the next white space or the end of the input
               int fromOrig = from++;
               while ((--count > 0) && (input[from] & 0xFF) > ' ')
               {
                  ++from;
               }
               pcdata.append(totalcross.sys.Convert.charConverter.bytes2chars(input, fromOrig, from - fromOrig)); // kcchan@554_39
               if (count == 0)
               {
                  break; // the word ended with the input: no trailing space
               }
               else
               {
                  pcdata.append(' '); // kcchan@554_39 - the word was ended by a white space
               }
            }
         }
      }
   }

   /**
    * Called when an entire chunk of PCDATA has been recognized. Reports the
    * buffered data (if any) to the content handler and empties the buffer.
    *
    * @param stripTrailingSpaces
    *           if false, the data is reported exactly as buffered (it will be followed by a space had the
    *           source data ended with one or more white spaces); if true, the data is reported after
    *           <code>String.trim()</code>, which removes trailing and also leading white spaces.
    */
   private void reportData(boolean stripTrailingSpaces)
   {
      int count = pcdata.length(); // kcchan@554_39
      if (count > 0)
      {
         String s = pcdata.toString();
         if (stripTrailingSpaces)
         {
            s = s.trim();
         }
         cntHandler.characters(s);
         pcdata.setLength(0);
      }
   }

   /**
    * Called when an attribute had no value: the pending attribute name, if
    * any, is stored with an empty value.
    * <P>
    * Note that an attribute name not followed by a value assignment (Ex: <code>&lt;dl compact&gt;</code>) is
    * perfectly legal HTML/SGML.
    */
   private void flushAttribute()
   {
      if (attrName != null)
      {
         attList.addAttribute(attrName, "", (byte) 0);
         attrName = null;
      }
   }

   /**
    * Called when a start tag has been completely tokenized: reports
    * <code>startElement</code> with the collected attributes, then empties the
    * attribute list. The state is not modified.
    */
   private void reportStartTag()
   {
      flushAttribute();
      cntHandler.startElement(tagNameHashId, attList);
      attList.clear();
   }
}