/*
* Copyright 2008 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
package com.sun.lwuit.html;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
/**
* The parser class is used to parse an XHTML-MP 1.0 document into a DOM object (Element).
* Unsupported tags and attributes as well as comments are dropped in the parsing process.
* The parser is also used to parse external CSS files, embedded CSS segments and CSS within the 'style' attribute.
*
* @author Ofir Leitner
*/
class Parser {
/**
* The names of the most common char entities. When a char entity is found it is compared against these first.
* The character code of each name is stored at the same index in COMMON_CHAR_ENTITIES_VALS.
*/
private static final String[] COMMON_CHAR_ENTITIES = {"nbsp", // non-breaking space
"lt", // less-than
"gt", // greater-than
"amp", // ampersand
"quot", // quotation mark
"apos", // apostrophe
"bull", // bullet
"euro"}; // euro sign
/**
* The numeric values of the most common char entities above, in the same order.
*/
private static final int[] COMMON_CHAR_ENTITIES_VALS = {160, // nbsp - non-breaking space
60, // lt - less-than
62, // gt - greater-than
38, // amp - ampersand
34, // quot - quotation mark
39, // apos - apostrophe
8226, // bull - bullet
8364}; // euro - euro sign
/**
* The parser is a singleton, this static member holds its instance (null until getInstance creates it)
*/
static Parser instance;
/**
* The list of empty tags (i.e. tags that naturally don't have any children).
* This is used to enable empty tags to be closed also in a non-strict way (i.e. <br> instead of <br/>).
* Some of these tags are not a part of the XHTML-MP 1.0 standard, but including them here allows smoother parsing if the document is not strictly XHTML-MP 1.0
*/
static String[] EMPTY_TAGS = {"br","link","meta","base","area","basefont","col","frame","hr","img","input","isindex","param"};
/**
* This is a list of ISO 8859-1 symbols that can be used as HTML char entities.
* The entry at index i stands for character code 161+i (getCharEntityCode adds 161 to the matched index),
* so the order of the entries must not be changed.
*/
private static final String[] CHAR_ENTITY_STRINGS = {
"iexcl","cent","pound","curren","yen","brvbar","sect","uml","copy","ordf","laquo","not","shy","reg","macr","deg","plusmn","sup2","sup3","acute",
"micro","para","middot","cedil","sup1","ordm","raquo","frac14","frac12","frac34","iquest","Agrave","Aacute","Acirc","Atilde","Auml","Aring","AElig",
"Ccedil","Egrave","Eacute","Ecirc","Euml","Igrave","Iacute","Icirc","Iuml","ETH","Ntilde","Ograve","Oacute","Ocirc","Otilde","Ouml","times","Oslash",
"Ugrave","Uacute","Ucirc","Uuml","Yacute","THORN","szlig","agrave","aacute","acirc","atilde","auml","aring","aelig","ccedil","egrave","eacute","ecirc",
"euml","igrave","iacute","icirc","iuml","eth","ntilde","ograve","oacute","ocirc","otilde","ouml","divide","oslash","ugrave","uacute","ucirc","uuml",
"yacute","thorn","yuml"};
/**
* This hashtable contains user defined char entities that can be added via HTMLComponent.addCharEntity.
* Keys are the trimmed symbol strings and values are Integer char codes; it stays null until the first entity is added.
*/
private static Hashtable USER_DEFINED_CHAR_ENTITIES;
/**
* The supported CSS media types, this is relevant for CSS at-rules (i.e. @import and @media).
* According to the WCSS specs the default media type is "handheld"; "all" is always accepted as well.
* The array can be replaced via setCSSSupportedMediaTypes.
*/
private static String[] SUPPORTED_MEDIA_TYPES = {"all","handheld"};
/**
 * Provides access to the single shared Parser, creating it on the first call.
 * The lazy creation is not synchronized.
 *
 * @return the shared Parser instance
 */
static Parser getInstance() {
    if (instance != null) {
        return instance;
    }
    instance = new Parser();
    return instance;
}
/**
 * Replaces the list of CSS media types that the parser accepts in at-rules (@import and @media).
 * The defaults ("all","handheld") usually fit, but a device matching another profile can declare it here.
 * The given array is stored as is (it is not copied).
 *
 * @param supportedMediaTypes A string array containing the media types that should be supported
 */
static void setCSSSupportedMediaTypes(String[] supportedMediaTypes) {
    Parser.SUPPORTED_MEDIA_TYPES = supportedMediaTypes;
}
/**
 * Looks the given string up in the options array (ignoring case) and returns the index of the first match.
 *
 * @param str The string to look up (may be null)
 * @param options The options to match the string against
 * @return The index X of the first option that equals the string (case ignored), or -1 if the string is null or matches none of the options
 */
static int getStringVal(String str,String[] options) {
    final int[] noValues = null; // no value mapping: the matching index itself is returned
    final int notFound = -1;
    return getStringVal(str, options, noValues, notFound);
}
/**
 * Looks the given string up in the options array (ignoring case) and returns the value mapped to the first match.
 *
 * @param str The string to look up (may be null)
 * @param options The options to match the string against
 * @param vals The value of each option, by position in the array. May be null, in which case the matching index is returned.
 * @return For a match at position X: vals[X], or X itself if vals is null. -1 if the string is null or matches none of the options.
 */
static int getStringVal(String str,String[] options,int[] vals) {
    final int notFound = -1;
    return getStringVal(str, options, vals, notFound);
}
/**
 * Looks the given string up in the options array (ignoring case) and returns the index of the first match,
 * falling back to the supplied default.
 *
 * @param str The string to look up (may be null)
 * @param options The options to match the string against
 * @param defaultValue The value to return if the string is null or not found among the options
 * @return The index X of the first option that equals the string (case ignored), or defaultValue if there is none
 */
static int getStringVal(String str,String[] options,int defaultValue) {
    final int[] noValues = null; // no value mapping: the matching index itself is returned
    return getStringVal(str, options, noValues, defaultValue);
}
/**
 * Looks the given string up in the options array (ignoring case) and returns the value mapped to the first match,
 * falling back to the supplied default.
 *
 * @param str The string to look up (may be null)
 * @param options The options to match the string against
 * @param vals The value of each option, by position in the array. May be null, in which case the matching index is returned.
 * @param defaultValue The value to return if the string is null or not found among the options
 * @return For a match at position X: vals[X], or X itself if vals is null. defaultValue if the string is null or matches none of the options.
 */
static int getStringVal(String str,String[] options,int[] vals,int defaultValue) {
    if (str == null) {
        return defaultValue;
    }
    int idx = 0;
    while (idx < options.length) {
        if (str.equalsIgnoreCase(options[idx])) {
            return (vals == null) ? idx : vals[idx];
        }
        idx++;
    }
    return defaultValue;
}
/**
 * Registers a user defined char entity.
 * The user defined table is created on first use; a leading '&amp;' and a trailing ';' are removed from the symbol before it is stored.
 *
 * @param symbol The symbol to add
 * @param code The symbol's code
 */
static void addCharEntity(String symbol,int code) {
    Hashtable entities = USER_DEFINED_CHAR_ENTITIES;
    if (entities == null) {
        entities = new Hashtable();
        USER_DEFINED_CHAR_ENTITIES = entities;
    }
    String key = trimCharEntity(symbol);
    entities.put(key, new Integer(code));
}
/**
 * Registers a run of user defined char entities with consecutive codes: the first symbol gets startcode,
 * the second startcode+1 and so on.
 * A null entry in the array is not registered, but still consumes its code number.
 *
 * @param symbols The symbols to add
 * @param startcode The code of the first symbol in the array
 */
static void addCharEntitiesRange(String[] symbols,int startcode) {
    if (USER_DEFINED_CHAR_ENTITIES == null) {
        USER_DEFINED_CHAR_ENTITIES = new Hashtable();
    }
    int code = startcode;
    for (int i = 0; i < symbols.length; i++, code++) {
        String symbol = symbols[i];
        if (symbol == null) {
            continue; // skipped symbol - its code number is skipped as well
        }
        USER_DEFINED_CHAR_ENTITIES.put(trimCharEntity(symbol), new Integer(code));
    }
}
/**
 * Strips the optional leading '&amp;' and the optional trailing ';' from a char entity symbol.
 *
 * @param symbol The char entity symbol (i.e. "&amp;nbsp;", "nbsp;" or "nbsp")
 * @return The symbol without the surrounding '&amp;' and ';'
 */
private static String trimCharEntity(String symbol) {
    int from = symbol.startsWith("&") ? 1 : 0;
    int to = symbol.length();
    if (symbol.endsWith(";")) {
        to--;
    }
    return symbol.substring(from, to);
}
/**
 * Translates an HTML char entity symbol to its char code.
 * The symbol is first compared (case ignored) to the common entities: nbsp, lt, gt, amp, quot, apos, bull and euro
 * ('bull' and 'euro' are outside the ISO-8859-1 table but popular).
 * If not found, the ISO-8859-1 symbols of char codes 161-255 are searched, and finally the user defined entities.
 *
 * The ISO-8859-1 names are case sensitive (i.e. 'Agrave' is 192 while 'agrave' is 224), so an exact-case
 * match is looked for first. Only if there is none is a case-insensitive match accepted, to stay lenient
 * towards documents that use the wrong case.
 *
 * @param symbol The symbol to lookup (without the leading '&amp;' and the trailing ';')
 * @return The char code of the symbol, or -1 if none found
 */
private static int getCharEntityCode(String symbol) {
    // First tries the most popular char entities
    int val=getStringVal(symbol, COMMON_CHAR_ENTITIES, COMMON_CHAR_ENTITIES_VALS);
    if (val!=-1) {
        return val;
    }
    if (symbol!=null) {
        // Exact-case pass over the ISO-8859-1 symbols - a case-insensitive search would always hit
        // the upper-case variant first (it appears earlier in the array) and return the wrong character
        for(int i=0;i<CHAR_ENTITY_STRINGS.length;i++) {
            if (symbol.equals(CHAR_ENTITY_STRINGS[i])) {
                return i+161;
            }
        }
    }
    // No exact match, fall back to a case-insensitive match
    val=getStringVal(symbol, CHAR_ENTITY_STRINGS);
    if (val!=-1) {
        return val+161;
    }
    // Not found in the standard symbol tables, see if it is in the user defined symbols table
    if ((USER_DEFINED_CHAR_ENTITIES!=null) && (symbol!=null)) {
        Object charObj=USER_DEFINED_CHAR_ENTITIES.get(symbol);
        if (charObj!=null) {
            return ((Integer)charObj).intValue();
        }
    }
    // Not found anywhere
    return -1;
}
/**
 * Converts a char entity to the matching character.
 * This handles both numbered char entities (decimal "#160" and hexadecimal "#xA0" / "#XA0") and symbol
 * char entities (the latter via getCharEntityCode).
 * Codes above 0xFFFF (up to 0x10FFFF) are returned as a UTF-16 surrogate pair. Negative codes and codes
 * above 0x10FFFF are treated as unrecognized.
 *
 * @param charEntity The char entity to convert, without the leading '&amp;' and the trailing ';'
 * @param callback The HTMLCallback to notify if the entity is not recognized (can be null)
 * @return A string containing the converted character, or the original entity text ("&amp;" + charEntity + ";") if it couldn't be converted
 */
private String convertCharEntity(String charEntity,HTMLCallback callback) {
    int charCode=-1;
    if (charEntity.startsWith("#")) { //numbered char entity
        try {
            if ((charEntity.length()>1) && ((charEntity.charAt(1)=='x') || (charEntity.charAt(1)=='X'))) { //hex
                charCode=Integer.parseInt(charEntity.substring(2),16);
            } else {
                charCode=Integer.parseInt(charEntity.substring(1));
            }
        } catch (NumberFormatException nfe) {
            //if not a number - simply ignore char entity (charCode stays -1)
        }
    } else { //not numbered, rather a symbol
        charCode=getCharEntityCode(charEntity);
    }
    if ((charCode>=0) && (charCode<=0xFFFF)) {
        return ""+(char)charCode;
    }
    if ((charCode>0xFFFF) && (charCode<=0x10FFFF)) {
        // Doesn't fit in a single char - encode as a UTF-16 surrogate pair instead of truncating the code
        int offset=charCode-0x10000;
        return ""+(char)(0xD800+(offset>>10))+(char)(0xDC00+(offset&0x3FF));
    }
    notifyError(callback, HTMLCallback.ERROR_UNRECOGNIZED_CHAR_ENTITY,null,null,null, "Unrecognized char entity: "+charEntity);
    // Another option is to return an empty string, but returning the entity will unravel bugs and will also
    // allow ignoring common mistakes such as using the & char as is (instead of &amp;)
    return "&"+charEntity+";";
}
/**
 * The entry point for parsing a document.
 * An IOException raised while reading is printed and swallowed, in which case the elements parsed so far are returned.
 *
 * @param is The InputStreamReader containing the XML
 * @param htmlC The HTMLComponent the document is parsed for
 * @return a dummy "ROOT" Element holding all the elements of the parsed document
 */
Element parse(InputStreamReader is,HTMLComponent htmlC) {
    // ROOT is a "dummy" element that all other document elements are added to
    final Element root = new Element("ROOT");
    try {
        parseTagContent(root, is, htmlC);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return root;
}
/**
 * This method parses tags content. It accumulates text and adds it as a child "text" element (the text is
 * stored in its "title" attribute) in the parent Element.
 * Upon bumping a start tag character it calls the parseTag method.
 * This method is called at first from the parse method, and later on from parseTag (which creates the recursion).
 * It returns when the closing tag of the given element is found or when the stream ends.
 * The content of a style tag is handed as a whole to parseCSSSegment (if CSS is supported and loaded).
 *
 * The end of the stream is detected by the -1 returned from read(). Checking the value after casting it
 * to a char/byte would mistake any character whose low byte is 0xFF (i.e. 'ÿ') for the end of the stream.
 *
 * @param element The current parent element
 * @param is The InputStreamReader containing the XML
 * @param htmlC The HTMLComponent the document is parsed for
 * @throws IOException on stream failure
 */
private void parseTagContent(Element element,InputStreamReader is, HTMLComponent htmlC) throws IOException {
    HTMLCallback callback=htmlC.getHTMLCallback();
    if ((HTMLComponent.SUPPORT_CSS) && (htmlC.loadCSS) && (element.getId() == Element.TAG_STYLE)) { // We aren't strict and don't require type="text/css" in a style tag
        CSSElement addTo = parseCSSSegment(is,null,htmlC,null);
        htmlC.addToEmebeddedCSS(addTo);
        return;
    }
    String text=null; // null as long as no non-whitespace text was found since the last tag
    boolean leadingSpace=false;
    String charEntity=null; // non-null while collecting a char entity (what follows an &)
    int read=is.read();
    while (read!=-1) {
        char c=(char)read;
        if (c=='<') {
            if (text!=null) {
                if (charEntity!=null) { //Mistakenly "collected" something that is not a char entity, perhaps misuse of the & character (instead of using &amp;)
                    text+="&"+charEntity;
                    charEntity=null;
                }
                if (leadingSpace) {
                    text=" "+text;
                }
                Element textElement=new Element("text");
                textElement.addAttribute("title", text);
                element.addChild(textElement);
                text=null;
                leadingSpace=false;
            }
            Element childElement=parseTag(is,htmlC);
            if (childElement==null) { //was actually an ending tag
                StringBuffer closingTagBuf=new StringBuffer();
                read=is.read();
                while ((read!=-1) && (read!='>')) { // stops at the end of the stream as well, otherwise a truncated document would loop forever
                    closingTagBuf.append((char)read);
                    read=is.read();
                }
                String closingTag=closingTagBuf.toString().trim(); // white space is allowed before the > (i.e. </p >)
                if (closingTag.equalsIgnoreCase(element.getName())) {
                    return;
                } else if (isEmptyTag(closingTag.toLowerCase())) { // EMPTY_TAGS are lower case, while the closing tag may use any case
                    // do nothing, someone chose to close an empty tag i.e. <img ....></img> or <br></br>
                } else {
                    notifyError(callback, HTMLCallback.ERROR_NO_CLOSE_TAG, element.getName(), null, null, "Malformed HTML - no appropriate closing tag for "+element.getName());
                }
                if (read==-1) { // stream ended in the middle of the closing tag
                    return;
                }
            } else if (childElement.getId()!=-1) { //If tag unsupported don't add it
                element.addChild(childElement);
            }
        } else if (text!=null) {
            if (charEntity!=null) {
                if (c==';') { //end
                    text+=convertCharEntity(charEntity,callback);
                    charEntity=null;
                } else {
                    charEntity+=c;
                }
            } else if (c=='&') { //start char entity
                charEntity=""; // The & is not included in the string we accumulate
            } else {
                text+=c;
            }
        } else if (!isWhiteSpace(c)) {
            if (c=='&') { //text starts with a character entity (i.e. &nbsp;)
                charEntity=""; // The & is not included in the string we accumulate
                text=""; //Initialize text so it won't be null
            } else {
                text=""+c;
            }
        } else if (c==' ') {
            leadingSpace=true;
        }
        read=is.read();
    }
}
/**
 * Tells whether the given character counts as white space for the parser:
 * a space, a tab, a line feed (10) or a carriage return (13).
 * Exposed to the package since it is used by HTMLComponent as well.
 *
 * @param ch The character to check
 * @return true if the character is a white space, false otherwise
 */
static boolean isWhiteSpace(char ch) {
    switch (ch) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
/**
* This method is called after a '&lt;' was read. It collects the tag name and all of its attributes.
* - If the tag is a closing tag ('/' follows the '&lt;'), null is returned and the tag name is left unread in the stream.
* - Comments and doctype declarations ('!' follows the '&lt;') are read by the parseCommentOrXMLDeclaration method.
* - Of the processing instructions ('?' follows the '&lt;') only xml-stylesheet is supported (treated as a link tag),
*   others are skipped and returned as an element named "unsupported".
* - Unsupported tags (the created Element has the id -1) are reported to the callback and skipped as a whole,
*   up to and including their closing tag.
* For supported tags that are neither self-closed nor listed in EMPTY_TAGS, the content is parsed as well, by calling parseTagContent.
*
* NOTE(review): none of the read loops in this method checks for the end of the stream. read() returns -1 there,
* which becomes the char 0xFFFF after the cast, so loops such as while (c!='>') do not seem to terminate on a truncated document.
*
* @param is The InputStreamReader containing the XML
* @param htmlC The HTMLComponent the document is parsed for (its HTMLCallback is notified of errors)
* @return The parsed element, or null if this was a closing tag
* @throws IOException on stream failure
*/
private Element parseTag(InputStreamReader is,HTMLComponent htmlC) throws IOException {
String tagName="";
String curAttribute="";
String curValue="";
boolean procInst=false; // true if this is a processing instruction (<? ... ?>)
HTMLCallback callback=htmlC.getHTMLCallback();
char c=(char)is.read();
if (c=='/') {
return null; //end tag
} else if (c=='!') {
// The two characters following the ! are consumed either way, they only decide which end marker to look for
c=(char)is.read();
char c2=(char)is.read();
if ((c=='-') && (c2=='-')) { //comment
return parseCommentOrXMLDeclaration(is,"-->");
} else {
return parseCommentOrXMLDeclaration(is,">"); //parse doctypes i.e. <!DOCTYPE .... > as comments as well - i.e. ignore them
}
} else if (c=='?') {
procInst=true;
c=(char)is.read();
//return parseCommentOrXMLDeclaration(is,">"); //parse XML declaration i.e. <?xml version="1.0" encoding="ISO-8859-1"?> as comments as well - i.e. ignore them
}
//read and ignore any whitespaces before tag name
while (isWhiteSpace(c)) {
c=(char)is.read();
}
//collect tag name
while ((!isWhiteSpace(c)) && (c!='>') && (c!='/')) {
tagName+=c;
c=(char)is.read();
}
//read and ignore any whitespaces after tag name
while (isWhiteSpace(c)) {
c=(char)is.read();
}
tagName=tagName.toLowerCase();
if (procInst) {
if (tagName.equals("xml-stylesheet")) { // The XML processing instruction <?xml-stylesheet ... ?> has the same parameters as <link .. > and behaves the same way
tagName="link";
} else { // Processing instruction not supported - read till its end
c=(char)is.read();
while (c!='>') {
c=(char)is.read();
}
return new Element("unsupported");
}
}
Element element=new Element(tagName);
if (element.getId()==-1) {
notifyError(callback, HTMLCallback.ERROR_TAG_NOT_SUPPORTED, tagName, null, null, "The tag '"+tagName+"' is not supported in XHTML-MP 1.0");
// If tag is not supported we skip it all till the closing tag.
// This is especially important for the script tag which may contain '<' and '>' which might confuse the parser
char lastChar=c;
while (c!='>') { // Read till the end of the tag
lastChar=c;
c=(char)is.read();
}
if (lastChar!='/') { // If this is an empty tag, no need to search for its closing tag as there's none...
// Scans for the closing tag, comparing lower-cased input against "</tagname>" character by character
// NOTE(review): on a mismatch the match restarts at 0 without re-checking the current character,
// so a closing tag directly preceded by a '<' (i.e. "<</script>") appears to be missed
String endTag="</"+tagName+">";
int index=0;
while(index<endTag.length()) {
c=(char)is.read();
if ((c>='A') && (c<='Z')) {
c=(char)(c-'A'+'a');
}
if (c==endTag.charAt(index)) {
index++;
} else {
index=0;
}
}
}
return element;
}
if (c=='>') { //tag declaration ended, process content
if (!isEmptyTag(tagName)) {
parseTagContent(element, is, htmlC);
}
return element;
} else if ((c=='/') || ((procInst) && (c=='?'))) { //closed tag - no content
c=(char)is.read();
if (c=='>') {
return element;
} else {
// The error is reported, and (if the callback allows continuing) c is treated below as the start of an attribute name
notifyError(callback, HTMLCallback.ERROR_UNEXPECTED_CHARACTER, tagName, null, null, "HTML malformed - no > after /");
}
}
// Attributes loop - each iteration reads a single attribute/value pair. c holds the first character of the attribute name
while(true) {
curAttribute=""+c;
c=(char)is.read();
while ((!isWhiteSpace(c)) && (c!='=') && (c!='>')) {
curAttribute+=c;
c=(char)is.read();
}
if (c=='>') { // tag close char shouldn't be found here, but if the HTML is slightly malformed we return the element
notifyError(callback, HTMLCallback.ERROR_UNEXPECTED_TAG_CLOSING, tagName,curAttribute,null, "Unexpected tag closing in tag "+tagName+", attribute="+curAttribute);
if (!isEmptyTag(tagName)) {
parseTagContent(element, is, htmlC);
}
return element;
}
//read and ignore any whitespaces after attribute name
while (isWhiteSpace(c)) {
c=(char)is.read();
}
if (c!='=') {
notifyError(callback, HTMLCallback.ERROR_UNEXPECTED_CHARACTER, tagName, curAttribute, null, "Unexpected character "+c+", expected '=' after attribute "+curAttribute+" in tag "+tagName);
if (c=='>') { // tag close char shouldn't be found here, but if the HTML is slightly malformed we return the element
if (!isEmptyTag(tagName)) {
parseTagContent(element, is, htmlC);
}
return element;
}
continue; //if attribute is not followed by = then process the next attribute (c becomes the first character of its name)
}
c=(char)is.read();
//read and ignore any whitespaces before attribute value
while (isWhiteSpace(c)) {
c=(char)is.read();
}
// quote holds the character that ends the value: the opening quote character, or a space if the value is unquoted
char quote=' ';
if ((c=='"') || (c=='\'')) {
quote=c;
} else {
curValue+=c;
}
String charEntity=null; // non-null while collecting a char entity (what follows an &) inside the value
boolean ended=false;
while (!ended) {
c=(char)is.read();
if (c==quote) {
ended=true;
c=(char)is.read();
} else if ((quote==' ') && ((c=='/') || (c=='>') || (isWhiteSpace(c)))) {
// An unquoted value ends on a white space or on the end of the tag
ended=true;
} else if (c=='&') {
if (charEntity!=null) {
curValue+="&"+charEntity; // Wasn't a char entity, probably a url as a parameter : i.e. param="/test?p=val&pw=val2&p3=val3
}
charEntity="";
} else {
if (charEntity!=null) {
if (c==';') {
curValue+=convertCharEntity(charEntity,callback);
charEntity=null;
} else {
charEntity+=c;
}
} else {
curValue+=c;
}
}
}
if (charEntity!=null) { // Mistaken something else for a char entity - for example an action which is action="http://domain/test.html?param1=val1&param2=val2"
curValue+="&"+charEntity;
charEntity=null;
}
curAttribute=curAttribute.toLowerCase();
int error=element.addAttribute(curAttribute, curValue);
if (error==HTMLCallback.ERROR_ATTRIBUTE_NOT_SUPPORTED) {
notifyError(callback, error, tagName, curAttribute, curValue, "Attribute '"+curAttribute+"' is not supported for tag '"+tagName+"'. Supported attributes: "+element.getSupportedAttributesList());
} else if (error==HTMLCallback.ERROR_ATTIBUTE_VALUE_INVALID) {
notifyError(callback, error, tagName, curAttribute, curValue, "Attribute '"+curAttribute+"' in tag '"+tagName+"' has an invalid value ("+curValue+")");
}
//read and ignore any whitespaces after attribute/value pair
while (isWhiteSpace(c)) {
c=(char)is.read();
}
if (c=='>') { //tag declaration ended, process content
if (!isEmptyTag(tagName)) {
parseTagContent(element, is, htmlC);
}
return element;
} else if ((c=='/') || ((procInst) && (c=='?'))) { //closed tag - no content
c=(char)is.read();
if (c=='>') {
return element;
} else {
notifyError(callback, HTMLCallback.ERROR_UNEXPECTED_CHARACTER, tagName, curAttribute, curValue, "HTML malformed - no > after /");
//throw new IllegalArgumentException("HTML malformed - no > after / - 2, instead: "+((byte)c));
}
}
// Reset for the next attribute/value pair
curAttribute="";
curValue="";
}
}
/**
 * This utility method is used to parse comments and declarations (i.e. doctype) in the HTML.
 * It reads the stream until the given end tag (or the end of the stream) is found and returns what was read
 * before the end tag as the "content" attribute of an Element named "comment" (for the end tag "--&gt;") or
 * "XML declaration" (for the end tag "&gt;").
 * In the current implementation they will be omitted from the final DOM (=the root element) as the tag name won't match supported tags.
 *
 * The end tag is detected by checking whether the text read so far ends with it, so that an end tag that is
 * preceded by a part of itself (i.e. a comment that ends with "---&gt;") is recognized as well.
 *
 * @param is The InputStreamReader containing the XML
 * @param endTag The end tag to look for
 * @return An Element holding the text of the comment/declaration
 * @throws IOException on stream failure
 */
private Element parseCommentOrXMLDeclaration(InputStreamReader is,String endTag) throws IOException {
    StringBuffer text=new StringBuffer();
    int endLen=endTag.length();
    boolean ended=false;
    int read=is.read();
    while ((!ended) && (read!=-1)) { // An unterminated comment ends at the end of the stream instead of looping forever
        text.append((char)read);
        // Checks if the text collected so far ends with the end tag
        int start=text.length()-endLen;
        boolean matches=(start>=0);
        for(int i=0;(matches) && (i<endLen);i++) {
            matches=(text.charAt(start+i)==endTag.charAt(i));
        }
        if (matches) {
            text.setLength(start); // The end tag itself is not a part of the content
            ended=true;
        } else {
            read=is.read();
        }
    }
    String elementName=null;
    if (endTag.equals("-->")) {
        elementName="comment";
    } else if (endTag.equals(">")) {
        elementName="XML declaration";
    }
    Element comment = new Element(elementName);
    comment.addAttribute("content", text.toString());
    return comment;
}
/**
 * Tells whether the given tag is one of the empty tags listed in EMPTY_TAGS.
 * The comparison is case sensitive.
 *
 * @param tagName The tag name to check
 * @return true if that tag is defined as an empty tag, false otherwise
 */
private boolean isEmptyTag(String tagName) {
    for (int i = 0; i < EMPTY_TAGS.length; i++) {
        if (tagName.equals(EMPTY_TAGS[i])) {
            return true;
        }
    }
    return false;
}
/**
 * Reports a parsing error to the HTMLCallback. If the callback's parsingError returns false, parsing is
 * aborted by throwing an IllegalArgumentException carrying the description.
 * Nothing happens if there is no callback.
 *
 * @param callback The HTMLCallback (can be null)
 * @param errorId The error ID, one of the ERROR_* constants in HTMLCallback
 * @param tag The tag in which the error occurred (Can be null for non-tag related errors)
 * @param attribute The attribute in which the error occurred (Can be null for non-attribute related errors)
 * @param value The value in which the error occurred (Can be null for non-value related errors)
 * @param description A verbal description of the error
 */
private static void notifyError(HTMLCallback callback, int errorId,String tag, String attribute,String value,String description) {
    if (callback == null) {
        return;
    }
    if (!callback.parsingError(errorId, tag, attribute, value, description)) {
        throw new IllegalArgumentException(description);
    }
}
// ***********
// CSS Parsing methods from here onward
// ***********
/**
 * Handles a possible CSS comment segment. Called after a '/' was read from the reader.
 * If the next char is a '*' the comment is skipped up to and including its closing star-slash pair, along with
 * any white space following it. Otherwise the char is pushed back to the reader and the '/' is returned.
 *
 * The star that opens the comment does not count as a part of the closing pair, so a comment starting with
 * slash-star-slash is not considered closed after its third character.
 *
 * @param r The stream reader
 * @return The next char after the comment (and the white space following it), '/' if this was not a comment,
 *         or the end of stream value if the comment was not closed before the end of the stream
 * @throws IOException on stream failure
 */
private char handleCSSComment(ExtInputStreamReader r) throws IOException {
    // assumes readCharFromReader yields (char)-1 at the end of the stream, as the (byte)c != -1 checks
    // used around it suggest - TODO confirm against ExtInputStreamReader
    final char eof=(char)-1;
    char c= r.readCharFromReader();
    if (c!='*') { // Not a comment after all
        r.unreadChar(c);
        return '/';
    }
    char lastC='\0';
    c= r.readCharFromReader(); // First char of the comment's content
    while ((c!=eof) && ((c!='/') || (lastC!='*'))) {
        lastC=c;
        c= r.readCharFromReader();
    }
    if (c==eof) { // Unterminated comment - stop instead of reading past the end of the stream forever
        return c;
    }
    c= r.readCharFromReader();
    while (isWhiteSpace(c)) { //skip white spaces
        c= r.readCharFromReader();
    }
    return c;
}
/**
* Reads the next CSS token from the reader.
* Leading white space is skipped, and then characters are accumulated until a delimiter is found:
* a white space, ';', ':', ',' or '>' (some of which can be disabled by the flags below), or one of '{', '}' and '*'
* which are returned as single-character tokens of their own.
* Delimiters (other than ';' and '>') are not considered as such inside a segment, i.e. inside (...), "..." or '...'.
*
* NOTE(review): the end of the stream is detected with ((byte)c) != -1, which is also false for any character
* whose low byte is 0xFF (i.e. 'ÿ'), so such a character seems to end the token prematurely.
*
* @param r The stream reader
* @param readNewline true to read new lines and not break when they're found, false otherwise
* @param ignoreCommas true to ignore commas and not break when they're found, false otherwise
* @param ignoreColons true to ignore colons and not break when they're found, false otherwise
* @param ignoreWhiteSpaces true to ignore white spaces and not break when they're found, false otherwise
* @return The next CSS token, or null if no token was found (or, when readNewline is false, if a new line, '{', '}' or '*' was found before any token)
* @throws IOException on stream failure
*/
private String nextToken(ExtInputStreamReader r, boolean readNewline,boolean ignoreCommas,boolean ignoreColons,boolean ignoreWhiteSpaces) throws IOException {
boolean newline = false;
StringBuffer currentToken = new StringBuffer();
char c= r.readCharFromReader();
// read the next token from the CSS stream
// Skips the leading white spaces, noting if a new line was found among them
// NOTE(review): inside this loop c is always a white space, so the ';' and ',' comparisons can never be true
while(((byte)c) != -1 && isWhiteSpace(c)) {
newline = newline || (c == 10 || c == 13 || c == ';' || ((c == ',') && (!ignoreCommas)));
if(!readNewline && newline) {
return null;
}
c= r.readCharFromReader();
}
if (c==';' && readNewline) { //leftover from compound operation
c= r.readCharFromReader();
while(((byte)c) != -1 && isWhiteSpace(c)) { // This was added since after reading ; there might be some more white spaces. However there needs to be a way to combine this with the previous white spaces code or with the revised newline detection and unreading char below
newline = newline || (c == 10 || c == 13 || c == ';' || ((c == ',') && (!ignoreCommas)));
c= r.readCharFromReader();
}
}
char segment='\0'; // segment of (...) or "..." or '...' - holds the char that closes the current segment, or '\0' when outside a segment
while(((byte)c) != -1 && ((!isWhiteSpace(c)) || (segment != '\0') || (ignoreWhiteSpaces)) && c != ';' && ((c != ':') || (segment!='\0') || (ignoreColons)) && ((c != ',') || (segment != '\0') || (ignoreCommas)) && c != '>') { //- : denotes pseudo-classes, would like to keep them as one token
if ((segment=='\0') && (c=='/')) { //comment start perhaps, if inside brackets - ignore
// c becomes the first char after the comment, or stays '/' if this was not a comment
// NOTE(review): whatever char is returned here is handled below without being re-checked against the loop's delimiters
c=handleCSSComment(r);
}
if ((c == '}' || c == '{' || c == '*' ) && (segment=='\0')) { //enter only if not in the middle of a segment. i.e. '*N'
newline = true;
if(currentToken.length() == 0) {
if(!readNewline) {
r.unreadChar(c);
return null;
}
return "" + c; // the char is a token of its own
}
// a token was already collected - leave the char for the next call
r.unreadChar(c);
break;
}
currentToken.append(c);
if (c=='(') {
segment=')';
} else if ((segment=='\0') && ((c=='\"') || (c=='\''))) { //TODO - This keeps track of one segment only, while in fact there can be "nested" segments - i.e. ("...") which is common in URLs, though not sure it is critical as such pattern works correctly even now // || (c=='`') ?
segment=c;
} else if (c==segment) {
segment='\0';
}
c= r.readCharFromReader();
}
// A comma that ended the token is kept as the last char of the token
if ((c==',') && (!ignoreCommas)) {
currentToken.append(c);
}
// When new lines are not read, a ';' that ended the token is left in the reader for the next call
if((!readNewline) && (c==';') && (currentToken.length() != 0) ) {
r.unreadChar(c);
}
if(currentToken.length() == 0) {
return null;
}
return currentToken.toString();
}
/**
 * Adds each of the grouped selectors to the main element, and copies all attributes of the given element
 * to the innermost (last nested) selector of each group member.
 * Does nothing if there are no grouped selectors.
 *
 * @param element The element to copy from
 * @param selectors A vector containing grouped selectors to copy the attributes to (can be null)
 * @param addTo The main element to add the grouped selectors to
 */
private void copyAttributes(CSSElement element,Vector selectors,Element addTo) {
    if (selectors != null) {
        Enumeration group = selectors.elements();
        while (group.hasMoreElements()) {
            CSSElement current = (CSSElement) group.nextElement();
            addTo.addChild(current);
            // Descends through the first children to get to the last nested selector
            for (; current.getNumChildren() > 0; current = current.getCSSChildAt(0)) {
                // nothing to do, the descent is done by the loop's update expression
            }
            element.copyAttributesTo(current);
        }
    }
}
/**
 * Tells whether the specified CSS media type is one of the SUPPORTED_MEDIA_TYPES (case ignored).
 *
 * @param media A string identifying a single media type (i.e. "handheld")
 * @return true if the specified CSS media type is supported, false otherwise
 */
private boolean isMediaTypeSupported(String media) {
    int i = 0;
    while (i < SUPPORTED_MEDIA_TYPES.length) {
        String supported = SUPPORTED_MEDIA_TYPES[i];
        if (media.equalsIgnoreCase(supported)) {
            return true;
        }
        i++;
    }
    return false;
}
/**
 * Checks if a comma separated list of media types (as found in a media/import at-rule) includes at least
 * one of the supported media types. White space around each media type is ignored.
 * A null or empty list matches, as no media restriction was specified.
 *
 * @param mediaTypes A string containing all media types the at-rule allows, separated by commas
 * @return true if one of the supported media types is denoted (or if none is specified), false otherwise
 */
boolean mediaTypeMatches(String mediaTypes) {
    if ((mediaTypes == null) || (mediaTypes.length() == 0)) {
        return true;
    }
    int start = 0; // start of the media type currently examined
    for (int comma = mediaTypes.indexOf(',', start); comma != -1; comma = mediaTypes.indexOf(',', start)) {
        String single = mediaTypes.substring(start, comma).trim();
        if (isMediaTypeSupported(single)) {
            return true;
        }
        start = comma + 1;
    }
    // What follows the last comma (or the whole string if there is no comma)
    return isMediaTypeSupported(mediaTypes.substring(start).trim());
}
/**
 * Returns the import URL if the specified media matches, or null otherwise.
 * The token is split on its first white space: what precedes it is the URL and what follows it is the list
 * of media types. A token without white space is a URL with no media restriction.
 *
 * @param token The string including the url and media of the import at-rule (example: url("mycss.css") handheld,tv).
 *              Can be null (i.e. when no token followed the import at-rule), in which case null is returned.
 * @return the import URL if the specified media matches, or null otherwise
 */
private String getImportURLByMediaType(String token) {
    if (token==null) { // Nothing followed the @import - there's nothing to import
        return null;
    }
    // The token may contain any kind of white space, so the separator is not necessarily a space char
    int separator=-1;
    for(int i=0;(i<token.length()) && (separator==-1);i++) {
        if (isWhiteSpace(token.charAt(i))) {
            separator=i;
        }
    }
    String url=token;
    boolean mediaMatches=true;
    if (separator!=-1) {
        url=token.substring(0, separator);
        mediaMatches=mediaTypeMatches(token.substring(separator+1));
    }
    if (!mediaMatches) {
        return null;
    }
    if (url.startsWith("url(")) {
        url=CSSEngine.getCSSUrl(url);
    }
    return url;
}
/**
 * Handles a media at-rule segment.
 * This method reads the media types following the at-rule and then the whole segment enclosed in curly
 * brackets (nested brackets are counted). If one of the media types is supported the content of the
 * segment (including its closing bracket) is returned as a separate reader, otherwise it is skipped and null is returned.
 *
 * The end of the stream is detected by comparing against the end of stream value itself. Checking the low
 * byte only would cut the segment at any character whose low byte is 0xFF (i.e. 'ÿ').
 *
 * @param r The reader representing the CSS
 * @return A reader with the relevant media segment or null if the media is not supported
 * @throws IOException on input stream failure
 */
private ExtInputStreamReader getMediaSegment(ExtInputStreamReader r) throws IOException {
    // assumes readCharFromReader yields (char)-1 at the end of the stream, which is consistent with the
    // (byte)c != -1 checks this replaces - TODO confirm against ExtInputStreamReader
    final char eof=(char)-1;
    String token = nextToken(r,true,true,true,true);
    char c= r.readCharFromReader();
    while ((c!=eof) && (c!='{')) { // Find the first { that marks the start of the media segment
        c= r.readCharFromReader();
    }
    StringBuffer segment=new StringBuffer();
    boolean match=mediaTypeMatches(token);
    int count=1; // counts the number of opened curly brackets
    while (count>0) {
        c= r.readCharFromReader();
        if (c==eof) {
            break; //end of file
        }
        if (match) { // The segment is read either way, but collected only if it is going to be used
            segment.append(c);
        }
        if (c=='{') {
            count++;
        } else if (c=='}') {
            count--;
        }
    }
    if (match) {
        // NOTE(review): the segment goes through the platform's default encoding in both directions here
        return new ExtInputStreamReader(new InputStreamReader(new ByteArrayInputStream(segment.toString().getBytes())));
    } else {
        return null;
    }
}
/**
 * Reads a CSS file/stream and returns the tokenized CSS as a single level element tree with the
 * root appearing as a "style".
 * This method is called upon finding linked/external CSS and embedded CSS segments.
 * It handles the leading at-rules (import/charset/media) and forwards the rest of the stream to the parseCSS method:
 * <ul>
 * <li>import - the referred CSS is queued for loading via the HTMLComponent's thread queue (if its media matches)</li>
 * <li>media - the media segment is parsed into the same root element (if its media matches)</li>
 * <li>charset - for external style sheets the rest of the stream is read using the denoted encoding</li>
 * </ul>
 * Other tokens starting with '@' are skipped.
 *
 * @param isr The InputStreamReader representing the stream
 * @param is The InputStream representing the stream (We need it too, in case encoding changes and we need to create another InputStreamReader)
 * @param htmlC The HTMLComponent
 * @param pageURL For external CSS the URL of the CSS, for embedded - null
 * @return A CSSElement containing all selectors found in the stream as its children
 * @throws IOException on input stream failure
 */
CSSElement parseCSSSegment(InputStreamReader isr,InputStream is,HTMLComponent htmlC,String pageURL) throws IOException {
    CSSElement addTo = new CSSElement("style");
    ExtInputStreamReader r = new ExtInputStreamReader(isr);
    DocumentInfo docInfo=null;
    String encoding=htmlC.getDocumentInfo()!=null?htmlC.getDocumentInfo().getEncoding():null;
    String token = nextToken(r,true,false,true,false);
    // parseCSS treats a null token as the end of the input, so a null has to be expected here as well
    // (an empty stream, or a stream that ends right after an at-rule) rather than dereferenced.
    while((token!=null) && (token.startsWith("@"))) {
        if (token.equals("@import")) {
            token = nextToken(r,true,true,true,true);
            String url=(token!=null)?getImportURLByMediaType(token):null;
            if (url!=null) {
                if (docInfo==null) {
                    docInfo=pageURL==null?htmlC.getDocumentInfo():new DocumentInfo(pageURL);
                }
                if (docInfo!=null) {
                    htmlC.getThreadQueue().addCSS(docInfo.convertURL(url),encoding); // Referred CSS "inherit" charset from the referring document
                } else {
                    if (DocumentInfo.isAbsoluteURL(url)) {
                        htmlC.getThreadQueue().addCSS(url,encoding); // Referred CSS "inherit" charset from the referring document
                    } else {
                        notifyError(htmlC.getHTMLCallback(), HTMLCallback.ERROR_NO_BASE_URL, "@import", null, url, "Ignoring CSS file referred in an @import rule ("+url+"), since page was set by setBody/setHTML so there's no way to access relative URLs");
                    }
                }
            }
        } else if (token.equals("@media")) {
            ExtInputStreamReader mediaReader = getMediaSegment(r); // TODO send is and encoding if any
            if (mediaReader!=null) {
                parseCSS(mediaReader, htmlC, addTo,null);
            }
        } else if (token.equals("@charset")) {
            String charset = nextToken(r,true,false,true,false);
            if (charset!=null) {
                charset = CSSEngine.omitQuotesIfExist(charset);
            }
            if ((is!=null) && (charset!=null)) { // @charset applies only to external style sheet, and the inputstream is null for embedded CSS segments
                try {
                    ExtInputStreamReader encodedReader=new ExtInputStreamReader(new InputStreamReader(is, charset));
                    r=encodedReader; // from here on the stream is read using the denoted encoding
                    encoding=charset;
                } catch (UnsupportedEncodingException uee) {
                    notifyError(htmlC.getHTMLCallback(), HTMLCallback.ERROR_ENCODING, "@charset", null, charset, "External CSS encoding @charset "+charset+" directive failed: "+uee.getMessage());
                }
            }
        }
        token = nextToken(r,true,false,true,false);
    }
    // The first token that is not an at-rule was already read, so it is handed over to parseCSS (null if the stream ended)
    return parseCSS(r, htmlC, addTo,token);
}
/**
 * Convenience entry point for parsing a CSS segment given as a plain InputStreamReader.
 * The reader is wrapped so that characters can be "unread" during parsing, and the result is collected
 * under a newly created root element (no master element and no pre-read first token are supplied).
 *
 * @param r The stream reader containing the CSS segment
 * @param htmlC The HTMLComponent
 * @return A CSSElement containing all selectors found in the stream as its children
 * @throws IOException on input stream failure
 */
CSSElement parseCSS(InputStreamReader r,HTMLComponent htmlC) throws IOException {
    return parseCSS(new ExtInputStreamReader(r), htmlC, null,null);
}
/**
* Reads a CSS file/stream and returns the tokenized CSS as a single level element tree with the
* root appearing as a "style".
* The method alternates between two modes: selector mode, in which tokens are read as selector names
* until a '{' token is found, and attribute mode, in which tokens are read as attribute names followed by
* their values until a '}' token is found.
* Parsing ends when the tokenizer returns null or when a token containing "&lt;/style" is found.
*
* @param r The stream reader containing the CSS segment
* @param htmlC The HTMLComponent (handed to the attribute adding methods, which use it to report errors)
* @param addTo the master CSSElement to add the selectors to (or null to open a new one)
* @param firstToken A first token to process before reading from the stream, or null if none
* @return The CSSElement the selectors were added to (addTo, or a new "style" element if addTo was null)
* @throws IOException on input stream failure
*/
CSSElement parseCSS(ExtInputStreamReader r,HTMLComponent htmlC,CSSElement addTo,String firstToken) throws IOException {
// The root is created here only when the caller did not supply one
if (addTo==null) {
addTo = new CSSElement("style");
}
// The element that attributes are currently added to, and that the next nested selector is added under
CSSElement parent = addTo;
// The additional selectors of a comma separated group, handed to copyAttributes when the block is closed
Vector selectors = new Vector();
// The last element of the chain that started at the most recent grouped selector (null when there is none)
CSSElement lastGroupedParent=null;
// true while reading selectors, false while reading the attributes between '{' and '}'
boolean selectorMode = true;
boolean grouping=false; // Grouping is when selector are grouped, i.e. h1,h2,h3 { ... }
String token = "";
//TODO - detect BOM for UTF8 etc.
while(true) {
// A token that the caller already read from the stream is processed first, and only once
if (firstToken!=null) {
token=firstToken;
firstToken=null;
} else {
token = nextToken(r,true,false,selectorMode,false);
}
// End of input, or the closing tag of an embedded style segment
if(token == null || token.indexOf("</style") > -1) {
break;
}
// Start of a declaration block - switch to attribute mode
if("{".equals(token)) {
selectorMode = false;
grouping=false;
continue;
}
// End of a declaration block - hand the collected group over to copyAttributes and reset the state for the next rule
// NOTE(review): copyAttributes is defined outside this section; presumably it copies parent's attributes to the grouped selectors - confirm
if("}".equals(token)) {
selectorMode = true;
copyAttributes(parent, selectors,addTo);
parent = addTo;
selectors = new Vector();
lastGroupedParent=null;
continue;
}
// Checks for grouped selectors, note that due to spacing the comma can either appear as a separate token, or at the start of a token or at its end
// All these scenarios are checked in the following lines of code.
if ((",".equals(token)) && (selectorMode)) {
grouping=true;
continue;
}
if(selectorMode) {
// Comma at the start of the token - the token is a further member of a group
if (token.startsWith(",")) {
token=token.substring(1);
grouping=true;
}
if (grouping) {
if (token.endsWith(",")) {
token=token.substring(0, token.length()-1);
} else {
grouping=false; // there was no comma at the end, so next time it is not a grouped element (unless a comma will be detected as the next token or the start of the next token)
}
// A further member of a group is not added to the tree here but kept in the selectors vector
CSSElement entry = new CSSElement(token);
selectors.addElement(entry);
lastGroupedParent=entry;
} else {
// Comma at the end of the token - the next token is a further member of a group
if (token.endsWith(",")) {
grouping=true;
token=token.substring(0, token.length()-1);
}
// Consecutive selector tokens are nested: each one becomes the child of the one before it,
// either in the main chain (parent) or in the chain of the last grouped selector
CSSElement entry = new CSSElement(token);
if (lastGroupedParent==null) {
parent.addChild(entry);
parent = entry;
} else {
lastGroupedParent.addChild(entry);
lastGroupedParent=entry;
}
}
} else {
// Attribute mode: the token is an attribute name, first it is checked against the shorthand attributes
boolean compoundToken = false;
for(int iter = 0 ; iter < CSSElement.CSS_SHORTHAND_ATTRIBUTE_LIST.length ; iter++) {
if(CSSElement.CSS_SHORTHAND_ATTRIBUTE_LIST[iter].equals(token)) {
compoundToken = true;
boolean collattable=CSSElement.CSS_IS_SHORTHAND_ATTRIBUTE_COLLATABLE[iter];
int valsAdded=0;
token = nextToken(r, false,false,false,false);
// This array is used for collatable attributes - the values can't be set as they are read, first we need to see how many values appear and set accordingly
String[] tokens = new String[4];
// Values are read until the tokenizer returns null
while(token!=null) {
if (collattable) {
// Only the first 4 values are kept, any further values are read and dropped
if (valsAdded<tokens.length) {
tokens[valsAdded]=token;
valsAdded++;
}
} else {
// Non-collatable shorthand values are assigned one by one as they are read (the returned success flag is not checked)
addShorthandAttribute(token, iter, parent);
}
token = nextToken(r, false,false,false,false);
}
// The following assigns the collatable attributes according to CSSElement.CSS_COLLATABLE_ORDER
// The order table is indexed by the number of values read (minus 1), then by the value's position, and lists the sides that value is applied to
if ((collattable) && (valsAdded>0)) {
for(int i=0;i<CSSElement.CSS_COLLATABLE_ORDER[valsAdded-1].length;i++) {
for(int j=0;j<CSSElement.CSS_COLLATABLE_ORDER[valsAdded-1][i].length;j++) {
int side=CSSElement.CSS_COLLATABLE_ORDER[valsAdded-1][i][j];
addAttributeTo(parent, CSSElement.CSS_SHORTHAND_ATTRIBUTE_INDEX[iter][side], tokens[i], htmlC);
}
}
}
break;
}
}
// if this is a "regular" css attribute is it one of the supported attributes
if(!compoundToken) {
// We ignore commas when collecting a value, since it can be for example: font-family:arial,tahoma,sans-serif etc.
// We also ignore spaces in font-family / access key since the value can be: arial, tahoma / send * , #
int result=addAttributeTo(parent, token, nextToken(r,false,true,false,(token.equalsIgnoreCase("-wap-access-key") || (token.equalsIgnoreCase("font-family")))), htmlC);
if(result!=-1) {
// The attribute was rejected (error code returned) - the remaining tokens of its value are read and discarded
//while(nextToken(r, false, false,false) != null && !newline) {} //TODO - what if that happens in the end of the file - do we get into an infinite loop?
while(nextToken(r, false, false,false,false) != null) {} //TODO - is newline truly unnecessary ? + what if that happens in the end of the file - do we get into an infinite loop?
}
}
}
}
return addTo;
}
/**
 * Assigns a single value of a shorthand attribute to the first fitting base attribute.
 * Since shorthand values can be specified in any order, the base attributes of the shorthand are tried one after
 * the other, skipping those that were already assigned, until one of them accepts the value.
 * Entries below CSSElement.CSS_STYLE_ID_OFFSET denote nested shorthand attributes (as in the 'border' attribute)
 * and are resolved recursively, collatable shorthands are delegated to addCollatableAttribute.
 *
 * @param value The attribute's value
 * @param shorthandAttr The attribute's index
 * @param selector The selector to add the attribute to
 * @return true if succeeded to add, false otherwise (for example invalid value)
 */
private boolean addShorthandAttribute(String value,int shorthandAttr,CSSElement selector) {
    if (CSSElement.CSS_IS_SHORTHAND_ATTRIBUTE_COLLATABLE[shorthandAttr]) {
        return addCollatableAttribute(value, shorthandAttr, selector);
    }
    final int candidates=CSSElement.CSS_SHORTHAND_ATTRIBUTE_INDEX[shorthandAttr].length;
    for(int candidate=0;candidate<candidates;candidate++) {
        int target=CSSElement.CSS_SHORTHAND_ATTRIBUTE_INDEX[shorthandAttr][candidate];
        boolean added;
        if (target<CSSElement.CSS_STYLE_ID_OFFSET) {
            // Another shorthand attribute nested inside this one
            added=addShorthandAttribute(value, target, selector);
        } else {
            // A base attribute: tried only if it wasn't set yet, a return value of -1 means no error
            added=(!selector.isAttributeAssigned(target)) && (selector.addAttribute(target, value)==-1);
        }
        if (added) {
            return true;
        }
    }
    return false;
}
/**
 * Assigns one value to all the base attributes of a collatable shorthand attribute.
 * This is called from addShorthandAttribute when a shorthand attribute maps to a collatable attribute.
 * While collatable attributes can usually have 1-4 values (mapped according to CSSElement.CSS_COLLATABLE_ORDER),
 * as part of a top shorthand attribute only one value can be specified and it is copied to all base attributes.
 * For example 'border-width: 5px 10px' sets the vertical border width to 5 and the horizontal to 10,
 * but 'border: 5px 10px solid red' is not possible - only a single width that applies to all sides.
 * The value is first tried on the first base attribute, and copied to the others only if it was accepted there.
 *
 * @param value The attribute's value
 * @param shorthandAttr The attribute's index
 * @param selector The selector to add the attribute to
 * @return true if succeeded to add, false otherwise (for example invalid value)
 */
private boolean addCollatableAttribute(String value,int shorthandAttr,CSSElement selector) {
    int firstAttr=CSSElement.CSS_SHORTHAND_ATTRIBUTE_INDEX[shorthandAttr][0];
    if (selector.addAttribute(firstAttr, value)!=-1) {
        return false; // an error code was returned - the value does not fit this attribute
    }
    for(int side=1;side<CSSElement.CSS_SHORTHAND_ATTRIBUTE_INDEX[shorthandAttr].length;side++) {
        int sideAttr=CSSElement.CSS_SHORTHAND_ATTRIBUTE_INDEX[shorthandAttr][side];
        selector.addAttribute(sideAttr, value);
    }
    return true;
}
/**
 * Adds an attribute denoted by its id, along with its value, to the specified selector.
 * The outcome is passed on to reportAddAttributeError together with the attribute's name.
 *
 * @param selector The selector we're working on
 * @param attrId The attribute's id
 * @param value The attribute value
 * @param htmlC The HTMLComponent (To obtain the HTMLCallback)
 * @return a positive value if an error occured, or -1 otherwise
 */
private int addAttributeTo(CSSElement selector,int attrId,String value,HTMLComponent htmlC) {
    final int errorCode=selector.addAttribute(attrId, value);
    // The name is looked up only for the sake of error reporting
    String attributeName=selector.getAttributeName(new Integer(attrId));
    reportAddAttributeError(errorCode,selector, attributeName, value, htmlC);
    return errorCode;
}
/**
 * Adds an attribute denoted by its name, along with its value, to the specified selector.
 * The outcome is passed on to reportAddAttributeError.
 *
 * @param selector The selector we're working on
 * @param attributeName The attribute's name
 * @param value The attribute value
 * @param htmlC The HTMLComponent (To obtain the HTMLCallback)
 * @return a positive value if an error occured, or -1 otherwise
 */
private int addAttributeTo(CSSElement selector,String attributeName,String value,HTMLComponent htmlC) {
    final int errorCode=selector.addAttribute(attributeName, value);
    reportAddAttributeError(errorCode,selector, attributeName, value, htmlC);
    return errorCode;
}
/**
 * A helper method that reports CSS errors to the HTMLCallback (if available).
 * Only the "attribute not supported" and "attribute value invalid" error codes are reported,
 * -1 (no error) and any other code are ignored.
 *
 * @param errorCode The error code as returned by the CSSElement.addAttribute methods
 * @param selector The selector we're working on
 * @param attributeName The attribute's name
 * @param value The attribute value
 * @param htmlC The HTMLComponent (To obtain the HTMLCallback)
 */
private void reportAddAttributeError(int errorCode,CSSElement selector,String attributeName,String value,HTMLComponent htmlC) {
    if (errorCode==-1) {
        return; // nothing to report
    }
    if (errorCode==HTMLCallback.ERROR_CSS_ATTRIBUTE_NOT_SUPPORTED) {
        notifyError(htmlC.getHTMLCallback(), errorCode, selector.getName(), attributeName, value,
                "CSS Attribute '"+attributeName+"' (Appeared in selector '"+selector.getName()+"') is not supported in WCSS.");
        return;
    }
    if (errorCode==HTMLCallback.ERROR_CSS_ATTIBUTE_VALUE_INVALID) {
        notifyError(htmlC.getHTMLCallback(), errorCode, selector.getName(), attributeName, value,
                "CSS Attribute '"+attributeName+"' (Appeared in selector '"+selector.getName()+"') has an invalid value ("+value+")");
    }
}
}
/**
 * A thin wrapper around InputStreamReader that adds the ability to "unread" a single character.
 * This makes parsing easier, and is used for CSS parsing.
 * The value (char)-1 doubles as the "nothing was unread" marker and as the end of stream indication, since
 * the int returned by the underlying reader is cast to a char as is.
 *
 * @author Ofir Leitner
 */
class ExtInputStreamReader {

    /** The value held by lastCharRead while there is no unread character waiting */
    private static final char NO_PENDING_CHAR = (char)-1;

    char lastCharRead = NO_PENDING_CHAR;
    InputStreamReader internalReader;

    ExtInputStreamReader(InputStreamReader isr) {
        internalReader=isr;
    }

    /**
     * "Unreads" a character by keeping it aside, so that the next call to readCharFromReader returns it.
     * Only one character is kept: unreading again before reading replaces the previous one.
     * Used by the CSS Parser.
     *
     * @param c The character to unread
     */
    void unreadChar(char c) {
        lastCharRead = c;
    }

    /**
     * Returns the next character, used by the CSS Parser.
     * If a character was unread it is returned (and cleared) without touching the actual stream,
     * otherwise a character is read from the wrapped reader.
     *
     * @return the next character, or (char)-1 when the wrapped reader reports the end of the stream
     * @throws IOException if reading from the wrapped reader fails
     */
    char readCharFromReader() throws IOException {
        final char pending = lastCharRead;
        if (pending == NO_PENDING_CHAR) {
            return (char)internalReader.read();
        }
        lastCharRead = NO_PENDING_CHAR;
        return pending;
    }
}