/*
* Copyright (c) 2013, the Dart project authors.
*
* Licensed under the Eclipse Public License v1.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.eclipse.org/legal/epl-v10.html
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.dart.engine.html.scanner;
import com.google.dart.engine.source.Source;
import com.google.dart.engine.utilities.collection.IntList;
import com.google.dart.engine.utilities.general.StringUtilities;
import static com.google.dart.engine.html.scanner.TokenType.COMMENT;
import static com.google.dart.engine.html.scanner.TokenType.DECLARATION;
import static com.google.dart.engine.html.scanner.TokenType.DIRECTIVE;
import static com.google.dart.engine.html.scanner.TokenType.EOF;
import static com.google.dart.engine.html.scanner.TokenType.EQ;
import static com.google.dart.engine.html.scanner.TokenType.GT;
import static com.google.dart.engine.html.scanner.TokenType.LT;
import static com.google.dart.engine.html.scanner.TokenType.LT_SLASH;
import static com.google.dart.engine.html.scanner.TokenType.SLASH_GT;
import static com.google.dart.engine.html.scanner.TokenType.STRING;
import static com.google.dart.engine.html.scanner.TokenType.TAG;
import static com.google.dart.engine.html.scanner.TokenType.TEXT;
/**
* The abstract class {@code AbstractScanner} implements a scanner for HTML code. Subclasses are
* required to implement the interface used to access the characters being scanned.
*
* @coverage dart.engine.html
*/
public abstract class AbstractScanner {
private static final String[] NO_PASS_THROUGH_ELEMENTS = new String[] {};
/**
* The source being scanned.
*/
private final Source source;
/**
* The token pointing to the head of the linked list of tokens.
*/
private Token tokens;
/**
* The last token that was scanned.
*/
private Token tail;
/**
* A list containing the offsets of the first character of each line in the source code.
*/
private IntList lineStarts = new IntList();
/**
* An array of element tags for which the content between tags should be consider a single token.
*/
private String[] passThroughElements = NO_PASS_THROUGH_ELEMENTS;
/**
* Initialize a newly created scanner.
*
* @param source the source being scanned
*/
public AbstractScanner(Source source) {
this.source = source;
tokens = new Token(EOF, -1);
tokens.setNext(tokens);
tail = tokens;
recordStartOfLine();
}
/**
* Return an array containing the offsets of the first character of each line in the source code.
*
* @return an array containing the offsets of the first character of each line in the source code
*/
public int[] getLineStarts() {
return lineStarts.toArray();
}
/**
* Return the current offset relative to the beginning of the file. Return the initial offset if
* the scanner has not yet scanned the source code, and one (1) past the end of the source code if
* the source code has been scanned.
*
* @return the current offset of the scanner in the source
*/
public abstract int getOffset();
/**
* Answer the source being scanned.
*
* @return the source or {@code null} if undefined
*/
public Source getSource() {
return source;
}
/**
* Set array of element tags for which the content between tags should be consider a single token.
*/
public void setPassThroughElements(String[] passThroughElements) {
this.passThroughElements = passThroughElements != null ? passThroughElements
: NO_PASS_THROUGH_ELEMENTS;
}
/**
* Scan the source code to produce a list of tokens representing the source.
*
* @return the first token in the list of tokens that were produced
*/
public Token tokenize() {
scan();
appendEofToken();
return firstToken();
}
/**
* Advance the current position and return the character at the new current position.
*
* @return the character at the new current position
*/
protected abstract int advance();
/**
* Return the substring of the source code between the start offset and the modified current
* position. The current position is modified by adding the end delta.
*
* @param start the offset to the beginning of the string, relative to the start of the file
* @param endDelta the number of character after the current location to be included in the
* string, or the number of characters before the current location to be excluded if the
* offset is negative
* @return the specified substring of the source code
*/
protected abstract String getString(int start, int endDelta);
/**
* Return the character at the current position without changing the current position.
*
* @return the character at the current position
*/
protected abstract int peek();
/**
* Record the fact that we are at the beginning of a new line in the source.
*/
protected void recordStartOfLine() {
lineStarts.add(getOffset());
}
private void appendEofToken() {
Token eofToken = new Token(EOF, getOffset());
// The EOF token points to itself so that there is always infinite look-ahead.
eofToken.setNext(eofToken);
tail = tail.setNext(eofToken);
}
private Token emit(Token token) {
tail.setNext(token);
tail = token;
return token;
}
private Token emitWithOffset(TokenType type, int start) {
return emit(new Token(type, start));
}
private Token emitWithOffsetAndLength(TokenType type, int start, int count) {
return emit(new Token(type, start, getString(start, count)));
}
private Token firstToken() {
return tokens.getNext();
}
private int recordStartOfLineAndAdvance(int c) {
if (c == '\r') {
c = advance();
if (c == '\n') {
c = advance();
}
recordStartOfLine();
} else if (c == '\n') {
c = advance();
recordStartOfLine();
} else {
c = advance();
}
return c;
}
private void scan() {
boolean inBrackets = false;
String endPassThrough = null;
// <--, -->, <?, <, >, =, "***", '***', in brackets, normal
int c = advance();
while (c >= 0) {
final int start = getOffset();
if (c == '<') {
c = advance();
if (c == '!') {
c = advance();
if (c == '-' && peek() == '-') {
// handle a comment
c = advance();
int dashCount = 1;
while (c >= 0) {
if (c == '-') {
dashCount++;
} else if (c == '>' && dashCount >= 2) {
c = advance();
break;
} else {
dashCount = 0;
}
c = recordStartOfLineAndAdvance(c);
}
emitWithOffsetAndLength(COMMENT, start, -1);
// Capture <!--> and <!---> as tokens but report an error
if (tail.getLength() < 7) {
// TODO (danrubel): Report invalid HTML comment
}
} else {
// handle a declaration
while (c >= 0) {
if (c == '>') {
c = advance();
break;
}
c = recordStartOfLineAndAdvance(c);
}
emitWithOffsetAndLength(DECLARATION, start, -1);
if (!StringUtilities.endsWithChar(tail.getLexeme(), '>')) {
// TODO (danrubel): Report missing '>' in directive
}
}
} else if (c == '?') {
// handle a directive
while (c >= 0) {
if (c == '?') {
c = advance();
if (c == '>') {
c = advance();
break;
}
} else {
c = recordStartOfLineAndAdvance(c);
}
}
emitWithOffsetAndLength(DIRECTIVE, start, -1);
if (tail.getLength() < 4) {
// TODO (danrubel): Report invalid directive
}
} else if (c == '/') {
emitWithOffset(LT_SLASH, start);
inBrackets = true;
c = advance();
} else {
inBrackets = true;
emitWithOffset(LT, start);
// ignore whitespace in braces
while (Character.isWhitespace(c)) {
c = recordStartOfLineAndAdvance(c);
}
// get tag
if (Character.isLetterOrDigit(c)) {
int tagStart = getOffset();
c = advance();
while (Character.isLetterOrDigit(c) || c == '-' || c == '_') {
c = advance();
}
emitWithOffsetAndLength(TAG, tagStart, -1);
// check tag against passThrough elements
String tag = tail.getLexeme();
for (String str : passThroughElements) {
if (str.equals(tag)) {
endPassThrough = "</" + str + ">";
break;
}
}
}
}
} else if (c == '>') {
emitWithOffset(GT, start);
inBrackets = false;
c = advance();
// if passThrough != null, read until we match it
if (endPassThrough != null) {
boolean endFound = false;
int len = endPassThrough.length();
int firstC = endPassThrough.charAt(0);
int index = 0;
int nextC = firstC;
while (c >= 0) {
if (c == nextC) {
index++;
if (index == len) {
endFound = true;
break;
}
nextC = endPassThrough.charAt(index);
} else if (c == firstC) {
index = 1;
nextC = endPassThrough.charAt(1);
} else {
index = 0;
nextC = firstC;
}
c = recordStartOfLineAndAdvance(c);
}
if (start + 1 < getOffset()) {
if (endFound) {
emitWithOffsetAndLength(TEXT, start + 1, -len);
emitWithOffset(LT_SLASH, getOffset() - len + 1);
emitWithOffsetAndLength(TAG, getOffset() - len + 3, -1);
} else {
emitWithOffsetAndLength(TEXT, start + 1, -1);
}
}
endPassThrough = null;
}
} else if (c == '/' && peek() == '>') {
advance();
emitWithOffset(SLASH_GT, start);
inBrackets = false;
c = advance();
} else if (!inBrackets) {
c = recordStartOfLineAndAdvance(c);
while (c != '<' && c >= 0) {
c = recordStartOfLineAndAdvance(c);
}
emitWithOffsetAndLength(TEXT, start, -1);
} else if (c == '"' || c == '\'') {
// read a string
int endQuote = c;
c = advance();
while (c >= 0) {
if (c == endQuote) {
c = advance();
break;
}
c = recordStartOfLineAndAdvance(c);
}
emitWithOffsetAndLength(STRING, start, -1);
} else if (c == '=') {
// a non-char token
emitWithOffset(EQ, start);
c = advance();
} else if (Character.isWhitespace(c)) {
// ignore whitespace in braces
do {
c = recordStartOfLineAndAdvance(c);
} while (Character.isWhitespace(c));
} else if (Character.isLetterOrDigit(c)) {
c = advance();
while (Character.isLetterOrDigit(c) || c == '-' || c == '_') {
c = advance();
}
emitWithOffsetAndLength(TAG, start, -1);
} else {
// a non-char token
emitWithOffsetAndLength(TEXT, start, 0);
c = advance();
}
}
}
}