/*
 * Copyright (c) 2012, the Dart project authors.
 *
 * Licensed under the Eclipse Public License v1.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.dart.tools.core.html;

import com.google.common.io.CharStreams;

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.util.LinkedList;

/**
 * Splits HTML-like text into a flat sequence of {@link Token}s.
 * <p>
 * The input is tokenized lazily, in a single pass, the first time {@link #hasNext()},
 * {@link #next()} or {@link #peek()} is called. Typical usage:
 *
 * <pre>
 * Tokenizer tokenizer = new Tokenizer("&lt;p&gt;what is this&lt;/p&gt;");
 * while (tokenizer.hasNext()) {
 *   System.out.println(tokenizer.next());
 * }
 * </pre>
 *
 * Public for testing.
 *
 * @coverage dart.tools.core
 */
public class Tokenizer {
  /** The complete input text. */
  private final char[] buffer;

  /** Index into {@link #buffer} of the next unread character. */
  private int position = 0;

  /** Line number (1-based) of the character at {@link #position}. */
  private int line = 1;

  /** Tokens not yet handed out; {@code null} until the input has been parsed. */
  private LinkedList<Token> tokens;

  /** Element names whose content is emitted as one raw token; may be {@code null}. */
  private String[] passThroughElements;

  /**
   * Creates a tokenizer over the remaining characters of the given buffer.
   *
   * @param buffer the text to tokenize
   */
  public Tokenizer(CharBuffer buffer) {
    this(buffer.toString());
  }

  /**
   * Creates a tokenizer over everything that can be read from the given reader. The reader is
   * closed before this constructor returns, whether or not reading succeeded.
   *
   * @param reader the source of the text to tokenize
   * @throws IOException if reading from or closing the reader fails
   */
  public Tokenizer(Reader reader) throws IOException {
    this(readAndClose(reader));
  }

  /**
   * Creates a tokenizer over the given text.
   *
   * @param data the text to tokenize
   */
  public Tokenizer(String data) {
    this.buffer = data.toCharArray();
  }

  /**
   * @return {@code true} if at least one more token is available
   */
  public boolean hasNext() {
    ensureParsed();
    return !tokens.isEmpty();
  }

  /**
   * Removes and returns the next token.
   *
   * @return the next token, or {@code null} if there are no more tokens
   */
  public Token next() {
    ensureParsed();
    return tokens.poll();
  }

  /**
   * Returns the next token without removing it.
   *
   * @return the next token, or {@code null} if there are no more tokens
   */
  public Token peek() {
    ensureParsed();
    return tokens.peek();
  }

  /**
   * Sets the buffer offset at which tokenizing starts. Parsing happens only once, so this has an
   * effect only when called before the first call to {@link #hasNext()}, {@link #next()} or
   * {@link #peek()}. The line counter is not adjusted; it still starts at 1.
   *
   * @param offset index into the input text of the first character to tokenize
   */
  public void setOffset(int offset) {
    position = offset;
  }

  /**
   * Sets the element names whose content (everything between the end of the opening tag and the
   * next {@code </}) is emitted as a single raw token instead of being tokenized.
   *
   * @param passThroughElements the element names, or {@code null} for none
   */
  public void setPassThroughElements(String[] passThroughElements) {
    this.passThroughElements = passThroughElements;
  }

  /**
   * Reads the whole content of the reader and closes it, even when reading fails.
   */
  private static String readAndClose(Reader reader) throws IOException {
    try {
      return CharStreams.toString(reader);
    } finally {
      reader.close();
    }
  }

  /**
   * Runs the one-time parse if it has not happened yet.
   */
  private void ensureParsed() {
    if (tokens == null) {
      parse();
    }
  }

  /**
   * Moves the read position forward by {@code count} characters, bumping the line counter for
   * every newline passed over. The caller guarantees {@code position + count <= buffer.length}.
   */
  private void advance(int count) {
    for (int i = 0; i < count; i++) {
      if (buffer[position + i] == '\n') {
        line++;
      }
    }
    position += count;
  }

  /**
   * Appends a token made of the next {@code count} characters (clamped to the end of the input)
   * and advances past them. The token is created with its text, its start offset and the line
   * number at its start.
   *
   * @return the token that was added
   */
  private Token emit(int count) {
    if (position + count > buffer.length) {
      count = buffer.length - position;
    }

    Token token = new Token(new String(buffer, position, count), position, line);
    tokens.add(token);
    advance(count);

    return token;
  }

  /**
   * Emits a quoted string starting at the current position, which must hold the opening quote.
   * The closing quote is included when present; an unterminated string runs to the end of input.
   */
  private void emitQuoted(char quote) {
    int count = 1;
    while (peek(count) != quote && peek(count) != 0) {
      count++;
    }
    emit(peek(count) == quote ? count + 1 : count);
  }

  /**
   * Emits everything up to (but not including) the next {@code </} as one token. Nothing is
   * emitted when that content is empty or when no {@code </} follows.
   */
  private void emitPassThroughContent() {
    int count = 0;
    while (peek(count) != 0) {
      if (peek(count) == '<' && peek(count + 1) == '/') {
        if (count > 0) {
          emit(count);
        }
        return;
      }
      count++;
    }
  }

  /**
   * Checks whether the text at the current position, after optional whitespace, starts with one
   * of the pass-through element names. This is a prefix comparison only.
   */
  private boolean matchesPassThrough() {
    // The whitespace run does not depend on the candidate name, so measure it once.
    int start = 0;
    while (Character.isWhitespace(peek(start))) {
      start++;
    }

    for (String name : passThroughElements) {
      int matched = 0;
      while (matched < name.length() && peek(start + matched) == name.charAt(matched)) {
        matched++;
      }
      if (matched == name.length()) {
        return true;
      }
    }

    return false;
  }

  /**
   * Tokenizes the input from the current position to the end and stores the result in
   * {@link #tokens}.
   */
  private void parse() {
    tokens = new LinkedList<Token>();

    boolean inBrackets = false;
    boolean passThrough = false;

    // <--, -->, <?, <, >, =, "***", '***', in brackets, normal

    while (position < buffer.length) {
      final char c = peek(0);

      if (c == '<') {
        if (peek(1) == '!' && peek(2) == '-' && peek(3) == '-') {
          // a comment: runs through the closing "-->" or to the end of input
          int count = 3;
          while (!(peek(count - 2) == '-' && peek(count - 1) == '-' && peek(count) == '>')
              && peek(count) != 0) {
            count++;
          }
          emit(count + 1);
        } else if (peek(1) == '!') {
          // a <!...> directive: runs through the next '>'
          int count = 2;
          while (peek(count) != '>' && peek(count) != 0) {
            count++;
          }
          emit(count + 1);
        } else if (peek(1) == '?') {
          // a <?...?> directive: runs through the closing "?>"
          int count = 2;
          while (!(peek(count - 1) == '?' && peek(count) == '>') && peek(count) != 0) {
            count++;
          }
          emit(count + 1);
        } else if (peek(1) == '/') {
          emit(2);
          inBrackets = true;
        } else {
          inBrackets = true;
          emit(1);
          // Recompute for every opening tag so a stale flag never leaks into an unrelated tag.
          // NOTE(review): the 's' guard means only names starting with 's' can ever match;
          // presumably a shortcut for script/style - confirm before relying on other names.
          passThrough = passThroughElements != null && peek(0) == 's' && matchesPassThrough();
        }
      } else if (c == '>') {
        emit(1);
        inBrackets = false;

        if (passThrough) {
          emitPassThroughContent();
          passThrough = false;
        }
      } else if (c == '/' && peek(1) == '>') {
        emit(2);
        inBrackets = false;
        // A self-closed tag has no content to pass through.
        passThrough = false;
      } else if (!inBrackets) {
        // plain text: everything up to the next '<'
        int count = 1;
        while (peek(count) != '<' && peek(count) != 0) {
          count++;
        }
        emit(count);
      } else if (c == '"' || c == '\'') {
        emitQuoted(c);
      } else if (Character.isWhitespace(c)) {
        // Only reachable inside a tag, where whitespace produces no token. Skip it, but keep
        // the line counter in step with any newlines it contains.
        int count = 1;
        while (Character.isWhitespace(peek(count))) {
          count++;
        }
        advance(count);
      } else if (Character.isLetterOrDigit(c)) {
        // a name: letters, digits, '-' and '_'
        int count = 1;
        char ch = peek(count);
        while (Character.isLetterOrDigit(ch) || ch == '-' || ch == '_') {
          count++;
          ch = peek(count);
        }
        emit(count);
      } else {
        // a non-char token (=, ...)
        emit(1);
      }
    }
  }

  /**
   * Returns the character {@code lookAhead} positions past the current one, or {@code 0} when
   * that is at or beyond the end of the input.
   */
  private char peek(int lookAhead) {
    if (position + lookAhead >= buffer.length) {
      return 0;
    }
    return buffer[position + lookAhead];
  }
}