/*******************************************************************************
* Copyright (c) 2009-2011 Red Hat, Inc.
* Distributed under license by Red Hat, Inc. All rights reserved.
* This program is made available under the terms of the
* Eclipse Public License v1.0 which accompanies this distribution,
* and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Red Hat, Inc. - initial API and implementation
******************************************************************************/
package org.jboss.tools.jsf.web.validation;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.eclipse.jface.text.rules.ICharacterScanner;
import org.eclipse.jface.text.rules.IToken;
import org.jboss.tools.common.text.ext.util.TextScanner;
import org.jboss.tools.common.text.ext.util.Utils;
/**
* Detects whether the text supplied by a reader is XHTML
*
* @author Victor V. Rubezhny
*
*/
class XHTMLDetector extends TextScanner {
/** Declaration name that marks a document type declaration. */
private static final String DOCTYPE_DECLARATION = "DOCTYPE";
/** Words that must all be present in the public identifier of the DOCTYPE. */
private static final String[] VALID_DOCTYPE_DTD_DECLARATION_REQUIRED_TOKENS = { "W3C", "DTD", "XHTML"};
/** Words of which exactly one must be present in the public identifier of the DOCTYPE. */
private static final String[] VALID_DOCTYPE_DTD_DECLARATION_ONE_OF_TOKENS = { "Strict", "Transitional", "Frameset"};
/** Name of the XML namespace attribute. */
private static final String VALID_ELEMENT_XMLNS_ATTRIBUTE = "xmlns";
/** XHTML namespace URI. */
private static final String VALID_ELEMENT_XMLNS_ATTRIBUTE_VALUE = "http://www.w3.org/1999/xhtml";
// Type names handed to getToken() for the different kinds of tokens this scanner produces.
private static final String TEXT_TOKEN = "___TEXT_TOKEN";
private static final String COMMENT_TOKEN = "___COMMENT_TOKEN";
private static final String XML_DECL_TOKEN = "___XML_DECL_TOKEN";
private static final String DECL_TOKEN = "___DECL_TOKEN";
private static final String ELEMENT_TOKEN = "___ELEMENT_TOKEN";
// Keywords accepted as the external identifier kind inside a "<!...>" declaration.
private static final String PUBLIC = "PUBLIC";
private static final String SYSTEM = "SYSTEM";
/**
 * Creates a detector for the text supplied by the given reader.
 *
 * @param reader the source of characters; passed unchanged to the TextScanner constructor
 */
public XHTMLDetector(Reader reader) {
super(reader);
}
/**
 * Scans the input and reports whether it declares itself as XHTML.
 * <p>
 * The verdict is taken from the first {@code <!DOCTYPE ...>} declaration that
 * appears before any element: the declaration must use a {@code PUBLIC}
 * identifier that contains every required word (W3C, DTD, XHTML) and exactly
 * one of the flavour words (Strict, Transitional, Frameset). Meeting an element
 * before such a declaration ends the scan with a negative result.
 * <p>
 * The {@code xmlns} attribute of the root element is deliberately not checked:
 * many real-world pages omit it and are still handled as XHTML by browsers, so
 * only the DOCTYPE is used.
 *
 * @return {@code true} if a matching DOCTYPE was found before the first element,
 *         {@code false} otherwise (including when the input ends first)
 */
public boolean detect () {
    state = STATE_START;
    clearText();

    for (IToken t = nextToken(); t != null && !t.isEOF(); t = nextToken()) {
        if (!(t instanceof TextToken)) {
            continue;
        }
        TextToken token = (TextToken) t;

        // declName, idKind and publicId are filled in by nextDeclToken()
        // while it produces the declaration token.
        if (DECL_TOKEN.equals(token.getType()) && DOCTYPE_DECLARATION.equals(declName)) {
            return PUBLIC.equals(idKind)
                    && hasAllTokens(publicId, VALID_DOCTYPE_DTD_DECLARATION_REQUIRED_TOKENS)
                    && hasOneOfTokens(publicId, VALID_DOCTYPE_DTD_DECLARATION_ONE_OF_TOKENS);
        }

        // An element before any DOCTYPE: nothing declares this document as XHTML.
        if (ELEMENT_TOKEN.equals(token.getType())) {
            return false;
        }
    }
    return false;
}
/**
 * Checks that every given word occurs in the public identifier as a whole word.
 * <p>
 * Only the first occurrence of each word is examined. It counts when it is
 * followed by at least one more character and that character is not a Java
 * identifier part (so "XHTML" in "XHTMLX" does not count).
 *
 * @param publicId the public identifier text, may be {@code null}
 * @param reqiured the words that all have to be present
 * @return {@code true} if all words are present, {@code false} if any is
 *         missing or {@code publicId} is {@code null}
 */
private boolean hasAllTokens(String publicId, String[] reqiured) {
    if (publicId == null) {
        return false;
    }
    for (int i = 0; i < reqiured.length; i++) {
        String word = reqiured[i];
        int start = publicId.indexOf(word);
        int end = start + word.length();
        boolean present = start != -1
                && end < publicId.length()
                && !Character.isJavaIdentifierPart(publicId.charAt(end));
        if (!present) {
            return false;
        }
    }
    return true;
}
/**
 * Checks that exactly one of the given words occurs in the public identifier.
 * <p>
 * Only the first occurrence of each word is examined. An occurrence is ignored
 * when it is directly followed by a Java identifier part; an occurrence at the
 * very end of the text counts.
 *
 * @param publicId the public identifier text, may be {@code null}
 * @param oneOf the alternative words
 * @return {@code true} if exactly one word is present, {@code false} if none or
 *         more than one is present, or {@code publicId} is {@code null}
 */
private boolean hasOneOfTokens(String publicId, String[] oneOf) {
    if (publicId == null) {
        return false;
    }
    int hits = 0;
    for (String word : oneOf) {
        int start = publicId.indexOf(word);
        if (start == -1) {
            continue;
        }
        int end = start + word.length();
        if (end < publicId.length() && Character.isJavaIdentifierPart(publicId.charAt(end))) {
            continue; // only a prefix of a longer word
        }
        hits++;
        if (hits > 1) {
            return false; // more than one alternative present
        }
    }
    return hits == 1;
}
// Scanner states. nextToken() dispatches on STATE_ELEMENT, STATE_DECL, STATE_XML_DECL and
// STATE_COMMENT; it has no dedicated branch for STATE_START or STATE_END and falls back to
// nextTextToken() for both.
private static final int STATE_START = 0;
private static final int STATE_ELEMENT = 1;
private static final int STATE_XML_DECL = 2;
private static final int STATE_DECL = 3;
private static final int STATE_COMMENT = 4;
// Set by the token readers when the input ends or the construct being read is not well formed.
private static final int STATE_END = 5;
// Current scanner state, one of the STATE_* constants.
private int state;
// Parts of the "<!...>" declaration read by nextDeclToken():
// the declaration name (e.g. DOCTYPE),
private String declName;
// the name that follows the declaration name (the root element name for a DOCTYPE),
private String rootName;
// the external identifier keyword (PUBLIC or SYSTEM),
private String idKind;
// the public identifier literal as returned by readLiteralValue(), i.e. including its quotes,
private String publicId;
// and the system identifier literal; it is assigned but not read anywhere in this class.
private String systemId;
// Name of the tag read by the latest nextElementToken() call.
private String elementName;
// Attributes of the tag read by the latest nextElementToken() call, keyed by attribute name;
// values are stored as returned by readLiteralValue(), i.e. including their quotes.
private Map<String, String> elementAttributes = new HashMap<String, String>();
/*
 * (non-Javadoc)
 * @see org.jboss.tools.common.text.ext.util.TextScanner#nextToken()
 *
 * Moves the inherited offset forward by the inherited length, then lets the
 * reader that matches the current state produce the next token. Any state
 * without a dedicated reader is handled as plain text.
 */
public IToken nextToken() {
    offset += length;
    if (state == STATE_ELEMENT) {
        return nextElementToken();
    }
    if (state == STATE_DECL) {
        return nextDeclToken();
    }
    if (state == STATE_XML_DECL) {
        return nextXmlDeclToken();
    }
    if (state == STATE_COMMENT) {
        return nextCommentToken();
    }
    return nextTextToken();
}
/**
 * Reads plain text up to the next '&lt;' and decides what kind of markup follows.
 * <p>
 * When text (or skipped whitespace) precedes the '&lt;', a text token is returned
 * and the state is set so that the next {@link #nextToken()} call reads the
 * markup. When nothing precedes it, the markup token is read and returned
 * right away. The markup kinds are: comment ("&lt;!--"), declaration ("&lt;!"),
 * XML declaration / processing instruction ("&lt;?") and element (anything else).
 * <p>
 * All look-ahead characters, including the '&lt;', are pushed back before the
 * markup reader is involved, so that reader always starts on the '&lt;'.
 *
 * @return the text token, or the markup token if no text precedes the markup;
 *         at end of input a text token is returned and the state is STATE_END
 */
private IToken nextTextToken() {
    // NOTE(review): skipWhitespaceToken() is inherited; its result is used here as
    // the number of characters already belonging to this text token.
    int count = skipWhitespaceToken();

    for (int ch = read(); ch != ICharacterScanner.EOF; ch = read()) {
        if (ch != '<') {
            count++;
            continue;
        }

        boolean textPending = count > 0;
        ch = read();

        if (ch == '?') {
            state = STATE_XML_DECL;
            unread(); // "?" char
            unread(); // "<" char
            return textPending ? getToken(TEXT_TOKEN) : nextXmlDeclToken();
        }

        if (ch != '!') {
            state = STATE_ELEMENT;
            if (ch != ICharacterScanner.EOF) {
                unread(); // char after "<"
            }
            unread(); // "<" char
            return textPending ? getToken(TEXT_TOKEN) : nextElementToken();
        }

        // "<!" seen: look ahead up to two more characters for "--".
        // 'consumed' counts the characters actually read (an EOF result is never
        // pushed back, following the convention used throughout this class), so
        // exactly that many are unread below - no more, no less.
        int consumed = 2; // "<" and "!"
        boolean comment = false;
        ch = read();
        if (ch != ICharacterScanner.EOF) {
            consumed++;
            if (ch == '-') {
                ch = read();
                if (ch != ICharacterScanner.EOF) {
                    consumed++;
                    comment = ch == '-';
                }
            }
        }
        for (int i = 0; i < consumed; i++) {
            unread();
        }

        if (comment) {
            state = STATE_COMMENT;
            return textPending ? getToken(TEXT_TOKEN) : nextCommentToken();
        }
        state = STATE_DECL;
        return textPending ? getToken(TEXT_TOKEN) : nextDeclToken();
    }

    state = STATE_END;
    return getToken(TEXT_TOKEN);
}
/**
 * Reads a comment, starting on its '&lt;' character.
 * <p>
 * The comment ends at the first '&gt;' that is directly preceded by two or more
 * '-' characters, so both "--&gt;" and "---&gt;" close it. On success the state
 * becomes STATE_START; if the input ends first the state becomes STATE_END.
 *
 * @return a comment token covering what was read
 */
private IToken nextCommentToken() {
    // Skip "<!-" only. The second '-' of the opener is left to the loop below and
    // counts as a dash there, which is why "<!--->" is accepted as a complete comment.
    if (skip(3) < 3) {
        state = STATE_END;
        return getToken(COMMENT_TOKEN);
    }

    int dashes = 0; // length of the run of '-' characters just before the current one
    for (int ch = read(); ch != ICharacterScanner.EOF; ch = read()) {
        if (ch == '-') {
            dashes++;
        } else if (ch == '>' && dashes >= 2) {
            state = STATE_START;
            return getToken(COMMENT_TOKEN);
        } else {
            dashes = 0;
        }
    }

    state = STATE_END;
    return getToken(COMMENT_TOKEN);
}
/**
 * Reads an XML declaration or processing instruction, starting on its '&lt;'.
 * <p>
 * Quoted literals are skipped as a whole, so a "?&gt;" inside quotes does not end
 * the token. Outside literals the token ends at the first "?&gt;". On success the
 * state becomes STATE_START; if the input ends first the state becomes STATE_END.
 *
 * @return an XML declaration token covering what was read
 */
private IToken nextXmlDeclToken() {
    if (skip(2) < 2) { // Skip '<?' chars
        state = STATE_END;
        return getToken(XML_DECL_TOKEN);
    }

    int ch = read();
    while (ch != ICharacterScanner.EOF) {
        if (ch == '"' || ch == '\'') {
            skipLiteralToken(ch);
            ch = read();
        } else if (ch == '?') {
            ch = read();
            if (ch == '>') {
                state = STATE_START;
                return getToken(XML_DECL_TOKEN);
            }
            // Not the terminator. The character just read is examined by the next
            // iteration instead of being dropped: it may itself be a '?' (as in
            // "??>") or the opening quote of a literal.
        } else {
            ch = read();
        }
    }

    state = STATE_END;
    return getToken(XML_DECL_TOKEN);
}
/**
 * Reads a "&lt;!...&gt;" declaration, starting on its '&lt;' character, and records its
 * parts in {@code declName}, {@code rootName}, {@code idKind}, {@code publicId}
 * and {@code systemId}.
 * <p>
 * The expected layout is
 * {@code <!NAME root PUBLIC "public id" "system id">} or
 * {@code <!NAME root SYSTEM "system id">}, with at least one whitespace
 * character between the parts. Reading stops at the first part that does not
 * fit; the parts read up to that point stay recorded, the remaining ones are
 * {@code null}, and the state becomes STATE_END. The state becomes STATE_START
 * only when the closing '&gt;' is found where it is expected.
 *
 * @return a declaration token covering what was read
 */
private IToken nextDeclToken() {
    // Start from a clean slate so that parts of an earlier declaration can never
    // be mistaken for parts of this one.
    declName = null;
    rootName = null;
    idKind = null;
    publicId = null;
    systemId = null;

    if (skip(2) < 2) { // Skip '<!' chars
        return malformedDeclToken();
    }

    // Declaration name ('DOCTYPE' is the one detect() is interested in)
    declName = readName();
    if (declName == null || declName.length() == 0) {
        return malformedDeclToken();
    }
    if (skipWhitespaceToken() == 0) { // at least one WS-char is expected here
        return malformedDeclToken();
    }

    // Root element name (http://www.w3.org/TR/xhtml1/#strict expects 'html' here)
    rootName = readName();
    if (rootName == null || rootName.length() == 0) {
        return malformedDeclToken();
    }
    if (skipWhitespaceToken() == 0) { // at least one WS-char is expected here
        return malformedDeclToken();
    }

    // 'PUBLIC' or 'SYSTEM' keyword
    idKind = readName();
    if (idKind == null || idKind.length() == 0) {
        return malformedDeclToken();
    }
    if (skipWhitespaceToken() == 0) { // at least one WS-char is expected here
        return malformedDeclToken();
    }
    if (!PUBLIC.equals(idKind) && !SYSTEM.equals(idKind)) {
        return malformedDeclToken();
    }

    // A PUBLIC identifier consists of a public literal followed by a system literal
    if (PUBLIC.equals(idKind)) {
        publicId = readLiteralValue();
        if (skipWhitespaceToken() == 0) { // at least one WS-char is expected here
            return malformedDeclToken();
        }
    }

    systemId = readLiteralValue();

    // Whitespace before the closing '>' is optional
    skipWhitespaceToken();
    state = read() == '>' ? STATE_START : STATE_END;
    return getToken(DECL_TOKEN);
}

/** Ends a declaration that is incomplete or not well formed. */
private IToken malformedDeclToken() {
    state = STATE_END;
    return getToken(DECL_TOKEN);
}
/**
 * Reads a start, end or empty-element tag, starting on its '&lt;' character.
 * <p>
 * The tag name is stored in {@code elementName}. For start and empty-element
 * tags the attributes are collected in {@code elementAttributes} (cleared on
 * every call); each attribute must have the form {@code name = "value"} and be
 * preceded by whitespace. Values are stored as returned by
 * {@link #readLiteralValue()}, i.e. with their quotes.
 * <p>
 * The state becomes STATE_START when the tag is closed properly by '&gt;' or
 * "/&gt;". In every other case (end of input, missing name, attribute without
 * '=', unexpected character) reading stops there and the state becomes STATE_END.
 *
 * @return an element token covering what was read
 */
private IToken nextElementToken() {
    if (skip(1) < 1) { // Skip '<' char
        return brokenElementToken();
    }

    // A '/' right after '<' marks an end tag; any other char belongs to the name
    int ch = read();
    if (ch == ICharacterScanner.EOF) {
        return brokenElementToken();
    }
    boolean closingTag = ch == '/';
    if (!closingTag) {
        unread();
    }

    // Tag name (the interesting one is 'html', but it could be any tag)
    elementName = readName();
    elementAttributes.clear();
    if (elementName == null || elementName.length() == 0) {
        return brokenElementToken();
    }

    ch = read(); // Check that the next char exists
    while (ch != ICharacterScanner.EOF) {
        unread();
        int wsCount = skipWhitespaceToken();

        ch = read();
        if (ch == ICharacterScanner.EOF) {
            // Input ended inside the tag. Returning here (instead of pushing a
            // character back after an EOF read) guarantees the loop terminates.
            return brokenElementToken();
        }
        if (!closingTag && ch == '/') { // - end of tag with no body
            ch = read();
            state = ch == '>' ? STATE_START : STATE_END;
            return getToken(ELEMENT_TOKEN);
        }
        if (ch == '>') { // - end of tag with body
            state = STATE_START;
            return getToken(ELEMENT_TOKEN);
        }
        if (wsCount == 0) { // whatever follows has to be separated by whitespace
            return brokenElementToken();
        }
        unread();

        if (!closingTag) {
            String attrName = readName();
            if (attrName == null || attrName.length() == 0) {
                return brokenElementToken();
            }
            skipWhitespaceToken();
            if (read() != '=') {
                return brokenElementToken();
            }
            skipWhitespaceToken();
            elementAttributes.put(attrName, readLiteralValue());
        }
        ch = read();
    }

    return brokenElementToken();
}

/** Ends a tag that is incomplete or not well formed. */
private IToken brokenElementToken() {
    state = STATE_END;
    return getToken(ELEMENT_TOKEN);
}
/**
 * Reads and discards up to {@code count} characters.
 *
 * @param count the number of characters to discard
 * @return the number of characters actually discarded; smaller than
 *         {@code count} only when the end of input was reached
 */
int skip(int count) {
    int skipped = 0;
    while (skipped < count) {
        if (read() == -1) {
            break;
        }
        skipped++;
    }
    return skipped;
}
/**
 * Reads and discards characters up to and including the next occurrence of the
 * given quote character, or up to the end of input.
 *
 * @param quote the quote character that closes the literal
 * @return the number of characters discarded before the closing quote
 *         (the quote itself is not counted)
 */
public int skipLiteralToken(int quote) {
    int count = 0;
    int ch = read();
    while (ch != -1 && ch != quote) {
        count++;
        ch = read();
    }
    return count;
}
/**
 * Reads a literal enclosed in single or double quotes.
 * <p>
 * If the next character is not a quote, nothing is consumed and an empty string
 * is returned. Otherwise everything up to and including the matching closing
 * quote is read; when the input ends before the closing quote, the text read so
 * far is returned without it.
 *
 * @return the literal including its quote character(s), or an empty string if
 *         no literal starts here; never {@code null}
 */
String readLiteralValue() {
    int quote = read();
    if (quote != '"' && quote != '\'') {
        // Push the character back for the caller - but not an EOF result, in
        // line with the "if (ch != -1) unread()" convention used in this class.
        if (quote != ICharacterScanner.EOF) {
            unread();
        }
        return "";
    }

    StringBuilder literal = new StringBuilder();
    literal.append((char) quote);
    int ch = read();
    while (ch != ICharacterScanner.EOF && ch != quote) {
        literal.append((char) ch);
        ch = read();
    }
    if (ch != ICharacterScanner.EOF) {
        literal.append((char) ch); // closing quote
    }
    return literal.toString();
}
/**
 * Reads a name as defined by {@code NMTOKEN_DETECTOR}: one word-start character
 * followed by any number of word-part characters.
 * <p>
 * The first character that does not belong to the name is pushed back. Note
 * that when the very first character is not a word start, it stays consumed.
 *
 * @return the name, or {@code null} if the input has ended or the next
 *         character cannot start a name
 */
String readName() {
    int ch = read();
    if (ch == ICharacterScanner.EOF || !NMTOKEN_DETECTOR.isWordStart((char) ch)) {
        return null;
    }

    StringBuilder name = new StringBuilder();
    name.append((char) ch);
    for (ch = read(); ch != ICharacterScanner.EOF; ch = read()) {
        if (!NMTOKEN_DETECTOR.isWordPart((char) ch)) {
            unread();
            break;
        }
        name.append((char) ch);
    }
    return name.toString();
}
}