/**
* This file Copyright (c) 2005-2008 Aptana, Inc. This program is
* dual-licensed under both the Aptana Public License and the GNU General
* Public license. You may elect to use one or the other of these licenses.
*
* This program is distributed in the hope that it will be useful, but
* AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
* NONINFRINGEMENT. Redistribution, except as permitted by whichever of
* the GPL or APL you select, is prohibited.
*
* 1. For the GPL license (GPL), you can redistribute and/or modify this
* program under the terms of the GNU General Public License,
* Version 3, as published by the Free Software Foundation. You should
* have received a copy of the GNU General Public License, Version 3 along
* with this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Aptana provides a special exception to allow redistribution of this file
* with certain other free and open source software ("FOSS") code and certain additional terms
* pursuant to Section 7 of the GPL. You may view the exception and these
* terms on the web at http://www.aptana.com/legal/gpl/.
*
* 2. For the Aptana Public License (APL), this program and the
* accompanying materials are made available under the terms of the APL
* v1.0 which accompanies this distribution, and is available at
* http://www.aptana.com/legal/apl/.
*
* You may view the GPL, Aptana's exception and additional terms, and the
* APL in the file titled license.html at the root of the corresponding
* plugin containing this source file.
*
* Any modifications to this file must keep this entire header intact.
*/
package com.aptana.ide.editor.html.parsing;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Stack;
import com.aptana.ide.editor.html.lexing.HTMLTokenTypes;
import com.aptana.ide.editor.html.parsing.nodes.HTMLDeclarationNode;
import com.aptana.ide.editor.html.parsing.nodes.HTMLDocumentNode;
import com.aptana.ide.editor.html.parsing.nodes.HTMLElementNode;
import com.aptana.ide.editor.html.parsing.nodes.HTMLParseNode;
import com.aptana.ide.editor.html.parsing.nodes.HTMLParseNodeTypes;
import com.aptana.ide.editor.html.parsing.nodes.HTMLSpecialNode;
import com.aptana.ide.lexer.ILexer;
import com.aptana.ide.lexer.Lexeme;
import com.aptana.ide.lexer.LexemeList;
import com.aptana.ide.lexer.LexerException;
import com.aptana.ide.lexer.Range;
import com.aptana.ide.lexer.TokenCategories;
import com.aptana.ide.lexer.matcher.AndMatcher;
import com.aptana.ide.lexer.matcher.CharacterMatcher;
import com.aptana.ide.lexer.matcher.StringMatcher;
import com.aptana.ide.lexer.matcher.WhitespaceMatcher;
import com.aptana.ide.lexer.matcher.ZeroOrMoreMatcher;
import com.aptana.ide.parsing.IParser;
import com.aptana.ide.parsing.ParserInitializationException;
import com.aptana.ide.parsing.nodes.IParseNode;
import com.aptana.ide.parsing.nodes.IParseNodeAttribute;
import com.aptana.ide.parsing.nodes.QuoteType;
/**
* @author Kevin Lindsey
*/
public class HTMLParser extends HTMLParserBase
{
// Lexer group names that are only used inside this class. They are constants, so they are declared final like
// the rest of the group names below.
private static final String ATTRIBUTE_LANGUAGE_GROUP = "attribute-language"; //$NON-NLS-1$
private static final String DOUBLE_QUOTED_ATTRIBUTE_DELIMITER_GROUP = "double-quoted-attribute-delimiter"; //$NON-NLS-1$
private static final String SINGLE_QUOTED_ATTRIBUTE_DELIMITER_GROUP = "single-quoted-attribute-delimiter"; //$NON-NLS-1$
private static final String CDATA_SECTION_GROUP = "cdata-section"; //$NON-NLS-1$
private static final String DEFAULT_GROUP = "default"; //$NON-NLS-1$
private static final String PERCENT_INSTRUCTION_GROUP = "percent-instruction"; //$NON-NLS-1$
private static final String PROCESSING_INSTRUCTION_GROUP = "processing-instruction"; //$NON-NLS-1$
// private static final String TEXT_GROUP = "text"; //$NON-NLS-1$
// private static final String XML_DECLARATION_GROUP = "xml-declaration"; //$NON-NLS-1$

// Lexer group names that are part of this class's public API.
public static final String ATTRIBUTE_GROUP = "attribute"; //$NON-NLS-1$
public static final String PERCENT_INSTRUCTION_DELIMITER_GROUP = "percent-instruction-delimiter"; //$NON-NLS-1$
public static final String PROCESSING_INSTRUCTION_DELIMITER_GROUP = "processing-instruction-delimiter"; //$NON-NLS-1$
public static final String SCRIPT_DELIMITER_GROUP = "script-delimiter"; //$NON-NLS-1$
public static final String STYLE_DELIMITER_GROUP = "style-delimiter"; //$NON-NLS-1$
public static final String TAG_DELIMITER_GROUP = "tag-delimiter"; //$NON-NLS-1$

// Previously active elements; openElement pushes onto it and closeElement pops from it.
private Stack<IParseNode> _elementStack = new Stack<IParseNode>();

// Token types that terminate the attribute list of a start tag ('>' and '/>').
private static final int[] elementEndSet = new int[] { HTMLTokenTypes.GREATER_THAN,
    HTMLTokenTypes.SLASH_GREATER_THAN };

// Matches "</tagname\s*>"; assembled in the constructor. The name part is _closeTagNameMatcher, whose text is
// replaced before each use.
private AndMatcher _closeTagMatcher;
private StringMatcher _closeTagNameMatcher;

/**
 * static constructor
 */
static
{
    // make sure all of our sets are sorted so that inSet will work properly
    // (that method uses a binary search to test existence of members in the
    // set)
    Arrays.sort(elementEndSet);
}
/**
 * Create a new instance of HTMLParser using {@link HTMLMimeType#MimeType} as its MIME type.
 *
 * @throws ParserInitializationException
 *             propagated from {@link #HTMLParser(String)}
 */
public HTMLParser() throws ParserInitializationException
{
this(HTMLMimeType.MimeType);
}
/**
 * Create a new instance of HTMLParser for the given MIME type and assemble the matcher that recognises an
 * element's closing tag.
 *
 * @param mimeType
 *            handed to the super-class constructor
 * @throws ParserInitializationException
 */
public HTMLParser(String mimeType) throws ParserInitializationException
{
    super(mimeType);

    this._elementStack = new Stack<IParseNode>();

    // optional run of whitespace that may precede the closing '>'
    ZeroOrMoreMatcher optionalWhitespace = new ZeroOrMoreMatcher();
    optionalWhitespace.appendChild(new WhitespaceMatcher());

    // tag-name part of the close tag; its text is swapped out each time the matcher is needed
    StringMatcher tagName = new StringMatcher();
    tagName.setCaseInsensitive(true);
    this._closeTagNameMatcher = tagName;

    // complete pattern: </tagname\s*>
    AndMatcher closeTag = new AndMatcher();
    closeTag.appendChild(new StringMatcher("</")); //$NON-NLS-1$
    closeTag.appendChild(tagName);
    closeTag.appendChild(optionalWhitespace);
    closeTag.appendChild(new CharacterMatcher('>'));
    this._closeTagMatcher = closeTag;
}
/**
 * Finish the active element and re-activate whatever is on top of the element stack.
 * <p>
 * The current lexeme is first included in the active element's range. If the stack is empty there is no active
 * element afterwards.
 */
private void closeElement()
{
    if (this._currentElement != null)
    {
        this._currentElement.includeLexemeInRange(this.currentLexeme);
    }

    // the popped entry may itself be null: openElement pushes the active element even when there is none
    this._currentElement = this._elementStack.isEmpty() ? null : this._elementStack.pop();
}
/**
 * Attach the given element to the active element and, unless its close tag is forbidden, make it the new active
 * element (the previous one is saved on the stack).
 *
 * @param element
 *            the node that was just created for a start tag or nested language
 */
private void openElement(HTMLParseNode element)
{
    // the new node always becomes a child of the active element, if there is one
    if (this._currentElement != null)
    {
        this._currentElement.appendChild(element);
    }

    HTMLParseState state = (HTMLParseState) this.getParseState();

    if (state.getCloseTagType(element.getName()) == HTMLTagInfo.END_FORBIDDEN)
    {
        // elements whose close tag is forbidden never become the active element
        return;
    }

    this._elementStack.push(this._currentElement);
    this._currentElement = element;
}
/**
 * Create a parse node of the given type through this parser's parse-node factory.
 *
 * @param type
 *            node type handed to the factory (callers in this class pass HTMLParseNodeTypes constants)
 * @param startingLexeme
 *            lexeme handed to the factory together with the type
 * @return the node returned by the factory, cast to HTMLParseNode
 */
private HTMLParseNode createNode(int type, Lexeme startingLexeme)
{
return (HTMLParseNode) this.getParseNodeFactory().createParseNode(type, startingLexeme);
}
/**
 * Parse everything the lexer produces, attaching the resulting nodes beneath the given parent node.
 * <p>
 * The element stack is reset, the lexer is switched to this parser's language and the default group, and lexemes
 * are then dispatched on their type until EOS is reached. A {@link ParseException} raised by one of the
 * sub-parsers does not abort the parse: the lexer group is reset to the default group and the loop continues.
 * When the parent is an {@link HTMLDocumentNode}, its range is finally extended to the ending lexeme of its last
 * child.
 *
 * @param parentNode
 *            node that becomes the active element for the duration of the parse
 * @throws LexerException
 * @see com.aptana.ide.parsing.AbstractParser#parseAll(com.aptana.ide.parsing.nodes.IParseNode)
 */
public synchronized void parseAll(IParseNode parentNode) throws LexerException
{
    this._elementStack.clear();
    this._currentElement = parentNode;

    ILexer lexer = this.getLexer();
    lexer.setLanguageAndGroup(this.getLanguage(), DEFAULT_GROUP);

    this.parseText();

    while (this.currentLexeme != EOS)
    {
        try
        {
            switch (this.currentLexeme.typeIndex)
            {
                case HTMLTokenTypes.CDATA_START:
                    this.parseCDATASection();
                    break;

                case HTMLTokenTypes.COMMENT:
                    this.parseText();
                    break;

                case HTMLTokenTypes.END_TAG:
                    this.parseEndTag();
                    break;

                case HTMLTokenTypes.PERCENT_OPEN:
                    this.parsePercentInstruction();
                    this.parseText();
                    break;

                case HTMLTokenTypes.PI_OPEN:
                    this.parseProcessingInstruction();
                    this.parseText();
                    break;

                case HTMLTokenTypes.START_TAG:
                    this.parseStartTag();
                    break;

                case HTMLTokenTypes.XML_DECL:
                    this.parseXMLDeclaration();
                    break;

                // attempt error recovery for language change on malformed script and style tags
                case HTMLTokenTypes.GREATER_THAN:
                {
                    // Read the name through the node interface (as parseEndTag and switchLanguage do) rather
                    // than casting to HTMLParseNode: a failed cast would escape the ParseException handler
                    // below. Comparing from the literal also tolerates a node without a name.
                    String currentElementName = (this._currentElement != null) ? this._currentElement.getName()
                        : null;

                    if ("script".equals(currentElementName)) //$NON-NLS-1$
                    {
                        switchToScriptLanguage();
                    }
                    else if ("style".equals(currentElementName)) //$NON-NLS-1$
                    {
                        switchToStyleLanguage();
                    }
                    else
                    {
                        this.advance();
                    }
                    break;
                }

                default:
                    if (!this.currentLexeme.getLanguage().equals(HTMLMimeType.MimeType))
                    {
                        // the lexeme belongs to another language: drop it and rescan from its offset
                        LexemeList lexemes = this.getLexemeList();

                        lexemes.getAffectedRegion().includeInRange(this.currentLexeme.offset);
                        lexemes.remove(this.currentLexeme);
                        lexer.setCurrentOffset(this.currentLexeme.offset);
                    }
                    this.advance();
                    break;
            }
        }
        catch (ParseException e)
        {
            // deliberate best-effort recovery: reset group and keep parsing from the current lexeme
            lexer.setGroup(DEFAULT_GROUP);
        }
    }

    if (parentNode instanceof HTMLDocumentNode)
    {
        correctDocumentNodeEndingLexeme((HTMLDocumentNode) parentNode);
    }
}
/**
 * Work around the document node's ending lexeme not being set correctly: extend the document's range so that it
 * includes the ending lexeme of its last child. A document without children is left untouched.
 *
 * @param parentNode
 *            the document node to correct
 */
private void correctDocumentNodeEndingLexeme(HTMLDocumentNode parentNode)
{
    int lastIndex = parentNode.getChildCount() - 1;

    if (lastIndex < 0)
    {
        // no children, nothing to take the ending lexeme from
        return;
    }

    parentNode.includeLexemeInRange(parentNode.getChild(lastIndex).getEndingLexeme());
}
/**
 * Parse a single <code>name=value</code> attribute and record it on the given element.
 * <p>
 * The current lexeme must be the attribute name. After the '=' the value lexeme is inspected: a STRING value is
 * stored on the element (with surrounding quotes removed and the quote type recorded). When the value starts with
 * a quote and the language registry returns a parser for the attribute name, the value is re-lexed and handed to
 * that nested language up to the matching quote delimiter, or up to the end of the source when no delimiter is
 * found.
 *
 * @param element
 *            element that receives the attribute
 * @throws ParseException
 *             if the name or the '=' is missing
 * @throws LexerException
 */
private void parseAttribute(HTMLElementNode element) throws ParseException, LexerException
{
    // assume we have a valid attribute
    String name = this.currentLexeme.getText();

    // advance over attribute name
    this.assertAndAdvance(HTMLTokenTypes.NAME, "error.attribute"); //$NON-NLS-1$

    ILexer lexer = this.getLexer();

    // switch to attribute lexer group
    lexer.setGroup(ATTRIBUTE_GROUP);

    // advance over '='
    this.assertAndAdvance(HTMLTokenTypes.EQUAL, "error.attribute.equal"); //$NON-NLS-1$

    // update attribute value and quote flag
    if (this.currentLexeme.getCategoryIndex() != TokenCategories.ERROR)
    {
        String value = this.currentLexeme.getText();

        // An empty lexeme text has no first character; '\0' simply never matches a quote.
        char firstChar = (value.length() > 0) ? value.charAt(0) : '\0';
        boolean startsWithQuote = (firstChar == '"' || firstChar == '\'');

        // only check for attribute language if the value is in quotes
        // NOTE(review): the lookup goes through getProcessingInstructionLanguage although the key is an
        // attribute name - confirm this is the intended registry method.
        IParser parser = null;

        if (startsWithQuote)
        {
            parser = this.languageRegistry.getProcessingInstructionLanguage(name);
        }

        if (this.isType(HTMLTokenTypes.STRING))
        {
            // remove quotes, if needed
            int quoteType = QuoteType.NONE;

            if (startsWithQuote && value.length() > 1)
            {
                value = value.substring(1, value.length() - 1);
                quoteType = (firstChar == '"') ? QuoteType.DOUBLE_QUOTE : QuoteType.SINGLE_QUOTE;
            }

            // add attribute to element node
            element.setAttribute(name, value);

            // set quote type
            IParseNodeAttribute attr = element.getAttributeNode(name);
            attr.setQuoteType(quoteType);

            if (parser != null)
            {
                int offset = this.currentLexeme.offset;

                // remove lexeme from list
                LexemeList lexemes = this.getLexemeList();
                lexemes.remove(this.currentLexeme);
                lexemes.getAffectedRegion().includeInRange(offset);

                // reposition lexer
                lexer.setCurrentOffset(offset);
                lexer.setGroup(HTMLParser.ATTRIBUTE_LANGUAGE_GROUP);

                // capture quote
                this.advance();
            }
        }

        if (parser != null)
        {
            String delimiterGroup = (firstChar == '"') ? HTMLParser.DOUBLE_QUOTED_ATTRIBUTE_DELIMITER_GROUP
                : HTMLParser.SINGLE_QUOTED_ATTRIBUTE_DELIMITER_GROUP;
            Range range = lexer.find(delimiterGroup);

            // Same fallback as the other nested-language entry points in this class: when the closing
            // delimiter is not found, let the nested language run to the end of the source.
            int nestedEnd = range.isEmpty() ? lexer.getSourceLength() : range.getStartingOffset();

            // process nested language
            this.processNestedLanguage(parser, nestedEnd, false);

            // capture closing quote
            lexer.setGroup(HTMLParser.ATTRIBUTE_LANGUAGE_GROUP);
            this.advance();
        }
    }

    lexer.setGroup(DEFAULT_GROUP);
    this.advance();
}
/**
 * Consume a CDATA section: the start marker followed by the end marker, both lexed in the cdata-section group.
 *
 * @throws ParseException
 *             if the current lexeme is not CDATA_START, or the lexeme after it is not CDATA_END
 * @throws LexerException
 */
private void parseCDATASection() throws LexerException, ParseException
{
    // lex what follows with the cdata-section group
    this.getLexer().setGroup(CDATA_SECTION_GROUP);

    // step over the start marker ...
    this.assertAndAdvance(HTMLTokenTypes.CDATA_START, "error.cdata"); //$NON-NLS-1$

    // ... and over the end marker
    this.assertAndAdvance(HTMLTokenTypes.CDATA_END, "error.cdata.close"); //$NON-NLS-1$
}
/**
 * Consume an end tag (<code>&lt;/name ... &gt;</code>) and close the active element when its name matches the
 * tag name, ignoring case. The lexeme following the tag is scanned in the "text" group.
 *
 * @throws ParseException
 *             if the end tag or its closing '>' is missing
 * @throws LexerException
 */
private void parseEndTag() throws LexerException, ParseException
{
    // keep the end-tag lexeme; its text is needed after we have advanced past it
    Lexeme endTag = this.currentLexeme;

    this.assertAndAdvance(HTMLTokenTypes.END_TAG, "error.tag.end"); //$NON-NLS-1$

    // a close tag that names some other element leaves the active element open
    if (this._currentElement != null)
    {
        String closingName = HTMLUtils.stripTagEndings(endTag.getText());
        boolean closesActiveElement = this._currentElement.getName().equalsIgnoreCase(closingName);

        if (closesActiveElement)
        {
            this.closeElement();
        }
    }

    ILexer lexer = this.getLexer();

    // whatever follows the '>' is scanned as text
    lexer.setGroup("text"); //$NON-NLS-1$
    this.assertAndAdvance(HTMLTokenTypes.GREATER_THAN, "error.tag.end.close"); //$NON-NLS-1$
    lexer.setGroup(DEFAULT_GROUP);

    if (this.currentLexeme == EOS)
    {
        // rescan in case we have a false EOS
        this.advance();
    }
    else if (this.isType(HTMLTokenTypes.ERROR))
    {
        // discard the error lexeme and rescan from its offset in the default group
        lexer.setCurrentOffset(this.currentLexeme.offset);
        this.removeLexeme(this.currentLexeme);
        this.advance();
    }
}
/**
 * Handle a percent instruction (<code>&lt;% ... %&gt;</code>). When the language registry supplies a parser for
 * the opening lexeme's text, the content up to the closing delimiter (or the end of the source if none is found)
 * is processed as a nested language; otherwise it is lexed with the percent-instruction group.
 *
 * @throws LexerException
 * @throws ParseException
 */
private void parsePercentInstruction() throws LexerException, ParseException
{
    ILexer lexer = this.getLexer();

    // locate the closing delimiter; fall back to the end of the source
    Range delimiter = lexer.find(PERCENT_INSTRUCTION_DELIMITER_GROUP);
    int endOffset = delimiter.isEmpty() ? lexer.getSourceLength() : delimiter.getStartingOffset();

    IParser nested = this.languageRegistry.getPercentInstructionLanguage(this.currentLexeme.getText());

    if (nested == null)
    {
        // no nested language registered: lex the body with the percent-instruction group
        lexer.setGroup(PERCENT_INSTRUCTION_GROUP);
        this.assertAndAdvance(HTMLTokenTypes.PERCENT_OPEN, "error.percent.instruction"); //$NON-NLS-1$
    }
    else
    {
        this.processNestedLanguage(nested, endOffset);
    }

    // back to the default group, then step over '%>'
    lexer.setGroup(DEFAULT_GROUP);
    this.advance();
}
/**
 * Handle a processing instruction (<code>&lt;? ... ?&gt;</code>). When the language registry supplies a parser
 * for the opening lexeme's text, the content is processed as a nested language; otherwise it is lexed with the
 * processing-instruction group.
 * <p>
 * The nested region ends at the closing delimiter, unless the registry reports that the language handles EOF
 * itself or no delimiter is found; in both of those cases it extends to the end of the source.
 *
 * @throws LexerException
 * @throws ParseException
 */
private void parseProcessingInstruction() throws LexerException, ParseException
{
    ILexer lexer = this.getLexer();
    String elementName = this.currentLexeme.getText();

    // default: the region runs to the end of the source
    int endOffset = lexer.getSourceLength();

    if (!this.languageRegistry.getHandlesEOF(elementName))
    {
        Range delimiter = lexer.find(PROCESSING_INSTRUCTION_DELIMITER_GROUP);

        if (!delimiter.isEmpty())
        {
            endOffset = delimiter.getStartingOffset();
        }
    }

    IParser nested = this.languageRegistry.getProcessingInstructionLanguage(elementName);

    if (nested == null)
    {
        // no nested language registered: lex the body with the processing-instruction group
        lexer.setGroup(PROCESSING_INSTRUCTION_GROUP);
        this.assertAndAdvance(HTMLTokenTypes.PI_OPEN, "error.pi"); //$NON-NLS-1$
    }
    else
    {
        this.processNestedLanguage(nested, endOffset);
    }

    // back to the default group, then step over '?>'
    lexer.setGroup(DEFAULT_GROUP);
    this.advance();
}
/**
 * Parse a start tag: create and open its element node, consume its attributes (and any embedded processing or
 * percent instructions) and then handle what follows the tag's terminator.
 * <p>
 * After '>' the content of <code>script</code> and <code>style</code> elements, and of any element for which the
 * language registry supplies a parser, is handed to a nested language; other content is scanned as text. After
 * '/>' the element is closed immediately.
 *
 * @throws ParseException
 *             if the current lexeme is not a start tag, an attribute is malformed, or the tag is never closed
 * @throws LexerException
 */
private void parseStartTag() throws ParseException, LexerException
{
    Lexeme startTag = this.currentLexeme;

    // make sure we're currently on a start tag
    this.assertType(HTMLTokenTypes.START_TAG, "error.tag.start"); //$NON-NLS-1$

    // create the new element and push it onto our stack
    HTMLElementNode element = (HTMLElementNode) this.createNode(HTMLParseNodeTypes.ELEMENT, startTag);
    this.openElement(element);

    // grab element name (text after '<'). Lower-case with a fixed locale: the default-locale variant maps 'I'
    // to a dotless i under a Turkish locale, so "SCRIPT" would never equal "script" below.
    String elementName = startTag.getText().substring(1).toLowerCase(java.util.Locale.ENGLISH);

    // advance over beginning of element
    this.advance();

    // check for possible cache fault
    if (!this.isEOS() && !this.currentLexeme.getLanguage().equals(this.getLanguage()))
    {
        this.flushCache(TAG_DELIMITER_GROUP);
    }

    // process until we close this tag
    while (!this.isEOS() && !this.inSet(elementEndSet))
    {
        switch (this.currentLexeme.typeIndex)
        {
            case HTMLTokenTypes.PI_OPEN:
                this.parseProcessingInstruction();
                break;

            case HTMLTokenTypes.PERCENT_OPEN:
                this.parsePercentInstruction();
                break;

            default:
                this.parseAttribute(element);
                break;
        }
    }

    // Running out of input means the tag was never closed. Report it explicitly instead of relying on the
    // type index of the EOS marker (presumably a sentinel lexeme - not visible from this class).
    if (this.isEOS())
    {
        throwParseError("error.tag.start.unclosed"); //$NON-NLS-1$
        return;
    }

    switch (this.currentLexeme.typeIndex)
    {
        case HTMLTokenTypes.GREATER_THAN:
            // check for extension points
            if (elementName.equals("script")) //$NON-NLS-1$
            {
                this.switchToScriptLanguage();
            }
            else if (elementName.equals("style")) //$NON-NLS-1$
            {
                this.switchToStyleLanguage();
            }
            else
            {
                this.parseElementContent(elementName, startTag);
            }
            break;

        case HTMLTokenTypes.SLASH_GREATER_THAN:
            // NOTE: the current element will not equal this element if this element forbids close tags
            if (this._currentElement == element)
            {
                this.closeElement();
            }
            this.parseText();
            break;

        default:
            throwParseError("error.tag.start.unclosed"); //$NON-NLS-1$
    }
}

/**
 * Handle the content that follows the '>' of a start tag other than script/style: nested language if the
 * registry supplies a parser for the element, plain text otherwise.
 *
 * @param elementName
 *            lower-cased element name
 * @param startTag
 *            the element's start-tag lexeme
 * @throws ParseException
 * @throws LexerException
 */
private void parseElementContent(String elementName, Lexeme startTag) throws ParseException, LexerException
{
    IParser parser = this.languageRegistry.getElementLanguage(elementName, "", ""); //$NON-NLS-1$ //$NON-NLS-2$

    if (parser == null)
    {
        this.parseText();
        return;
    }

    ILexer lexer = this.getLexer();
    int offset;

    if (this.languageRegistry.getLanguageOwnsElement(elementName))
    {
        // the nested language lexes the start tag itself: rewind to it and hand over the rest of the source
        LexemeList lexemes = this.getLexemeList();
        int startingOffset = startTag.offset;

        // get the start tag's index in the list
        int index = lexemes.getLexemeIndex(startTag);

        // remove token since it is (potentially) invalid
        lexemes.remove(index);

        // update the affected region
        lexemes.getAffectedRegion().includeInRange(startingOffset);

        offset = lexer.getSourceLength();
        lexer.setCurrentOffset(startingOffset);
    }
    else
    {
        // the nested language ends at this element's close tag, which can only appear after the current '>'
        offset = this.findCloseTagOffset(lexer, elementName, this.currentLexeme.offset);
    }

    this.processNestedLanguage(parser, offset);
}

/**
 * Find the offset of the first <code>&lt;/elementName\s*&gt;</code> at or after the given offset.
 *
 * @param lexer
 *            lexer whose source is searched
 * @param elementName
 *            tag name to look for (matched case-insensitively)
 * @param startOffset
 *            offset at which the search begins
 * @return offset of the close tag, or the source length when there is none
 */
private int findCloseTagOffset(ILexer lexer, String elementName, int startOffset)
{
    this._closeTagNameMatcher.removeText();
    this._closeTagNameMatcher.appendText(elementName);

    char[] source = lexer.getSourceUnsafe();
    int sourceLength = lexer.getSourceLength();

    // Start at the current position rather than at 0: a close tag belonging to an earlier element of the same
    // name would otherwise yield an end offset that lies before the content being processed.
    for (int i = Math.max(startOffset, 0); i < sourceLength; i++)
    {
        if (this._closeTagMatcher.match(source, i, source.length) != -1)
        {
            return i;
        }
    }

    return sourceLength;
}
/**
 * Advance to the next lexeme with the lexer in the "text" group, then restore the default group. If that lands
 * on EOS or on an ERROR lexeme, the scan is repeated in the default group (an ERROR lexeme is removed and the
 * lexer repositioned at its offset first).
 *
 * @throws LexerException
 */
private void parseText() throws LexerException
{
    ILexer lexer = this.getLexer();

    // advance over '>' or '/>' while scanning for text
    lexer.setGroup("text"); //$NON-NLS-1$
    this.advance();
    lexer.setGroup(DEFAULT_GROUP);

    boolean atEOS = (this.currentLexeme == EOS);

    if (!atEOS && !this.isType(HTMLTokenTypes.ERROR))
    {
        // a regular lexeme was produced, nothing to repair
        return;
    }

    if (!atEOS)
    {
        // throw the error lexeme away and go back to where it started
        lexer.setCurrentOffset(this.currentLexeme.offset);
        this.removeLexeme(this.currentLexeme);
    }

    // rescan in case we have a false EOS
    this.advance();
}
/**
 * Convenience overload of {@link #processNestedLanguage(IParser, int, boolean)} that passes <code>false</code>
 * for <code>createNode</code>, i.e. no placeholder node is created.
 *
 * @param parser
 *            parser of the nested language
 * @param offset
 *            offset handed on unchanged to the three-argument overload
 * @throws ParseException
 * @throws LexerException
 */
private void processNestedLanguage(IParser parser, int offset) throws LexerException, ParseException
{
this.processNestedLanguage(parser, offset, false);
}
/**
 * Switch to the nested parser's language for the region ending at the given offset and, on request, add a
 * placeholder node for it to the tree.
 *
 * @param parser
 *            parser whose language is switched to
 * @param offset
 *            offset passed to <code>changeLanguage</code> together with the language and the active element
 * @param createNode
 *            when true, an {@link HTMLSpecialNode} starting at the lexeme that was current on entry is opened
 *            and immediately closed
 * @throws ParseException
 * @throws LexerException
 */
private void processNestedLanguage(IParser parser, int offset, boolean createNode) throws LexerException, ParseException
{
    // the lexeme we are on before the language switch marks the start of the placeholder
    Lexeme firstLexeme = this.currentLexeme;

    // parse nested language
    this.changeLanguage(parser.getLanguage(), offset, this._currentElement);

    if (!createNode)
    {
        return;
    }

    // placeholder node for the outline
    HTMLSpecialNode placeholder = (HTMLSpecialNode) this.createNode(HTMLParseNodeTypes.SPECIAL, firstLexeme);

    // this node needs access to the lexeme list to build its label in the outline
    placeholder.setLexemeList(this.getParseState().getLexemeList());
    placeholder.setNestedLanguage(parser.getLanguage());

    // the placeholder has no children of its own: open and close it right away
    this.openElement(placeholder);
    this.closeElement();
}
/**
 * Switch to the language of the active script element by delegating to
 * {@link #switchLanguage(String, String)} with the script delimiter group and the "type" attribute.
 *
 * @throws ParseException
 * @throws LexerException
 */
private void switchToScriptLanguage() throws ParseException, LexerException
{
this.switchLanguage(SCRIPT_DELIMITER_GROUP, "type"); //$NON-NLS-1$
}
/**
 * Switch to the language of the active style element by delegating to
 * {@link #switchLanguage(String, String)} with the style delimiter group and the "type" attribute.
 *
 * @throws ParseException
 * @throws LexerException
 */
private void switchToStyleLanguage() throws ParseException, LexerException
{
this.switchLanguage(STYLE_DELIMITER_GROUP, "type"); //$NON-NLS-1$
}
/**
 * Switch to the nested language that the registry associates with the active element and the value of one of its
 * attributes. The nested region ends at the next match of the given delimiter group, or at the end of the source
 * when there is no match. Without a registered language the content is scanned as text instead.
 *
 * @param endingDelimiterGroup
 *            lexer group used to find the end of the nested region
 * @param attributeName
 *            attribute of the active element whose value selects the language; a missing attribute is looked up
 *            as the empty string
 * @throws LexerException
 * @throws ParseException
 */
private void switchLanguage(String endingDelimiterGroup, String attributeName) throws LexerException,
    ParseException
{
    ILexer lexer = this.getLexer();

    // where does the nested region end?
    Range closing = lexer.find(endingDelimiterGroup);
    int offset = closing.isEmpty() ? lexer.getSourceLength() : closing.getStartingOffset();

    String elementName = this._currentElement.getName();
    String attributeValue = this._currentElement.getAttribute(attributeName);

    if (attributeValue == null)
    {
        attributeValue = ""; //$NON-NLS-1$
    }

    IParser parser = this.languageRegistry.getElementLanguage(elementName, attributeName, attributeValue);

    if (parser == null)
    {
        // no language for this element: mark the region as affected and treat the content as text
        this.getLexemeList().getAffectedRegion().includeInRange(offset);
        this.parseText();
        return;
    }

    this.changeLanguage(parser.getLanguage(), offset, this._currentElement);

    // advance over tag close
    this.advance();
}
/**
 * Parse an XML declaration (<code>&lt;?xml version ... &gt;</code>) using the "xml-declaration" lexer group.
 * <p>
 * A declaration node is created and filled with the version and, when present, the encoding and standalone
 * values. Note that this method does not attach the node to the element tree.
 *
 * @throws LexerException
 * @throws ParseException
 *             if the declaration start, the version or the closing '>' is missing
 */
private void parseXMLDeclaration() throws LexerException, ParseException
{
    // the declaration has its own lexer group
    this.getLexer().setGroup("xml-declaration"); //$NON-NLS-1$

    HTMLDeclarationNode declaration = (HTMLDeclarationNode) this.createNode(HTMLParseNodeTypes.DECLARATION,
        this.currentLexeme);

    // step over '<?xml'; the lexeme we land on is recorded as the version
    this.assertAndAdvance(HTMLTokenTypes.XML_DECL, "error.xml.declaration"); //$NON-NLS-1$
    declaration.setVersion(this.currentLexeme.getText());
    this.assertAndAdvance(HTMLTokenTypes.VERSION, "error.xml.declaration.version"); //$NON-NLS-1$

    // optional encoding
    if (this.isType(HTMLTokenTypes.ENCODING))
    {
        declaration.setEncoding(this.currentLexeme.getText());
        this.advance();
    }

    // optional standalone flag
    if (this.isType(HTMLTokenTypes.STANDALONE))
    {
        declaration.setStandalone(this.currentLexeme.getText());
        this.advance();
    }

    // the node's range extends to the current lexeme, which must be the closing '>'
    declaration.includeLexemeInRange(this.currentLexeme);
    this.assertAndAdvance(HTMLTokenTypes.GREATER_THAN, "error.xml.declaration.close"); //$NON-NLS-1$
}
}