/** * Copyright (c) 2009-2015, Christer Sandberg */ package se.fishtank.css.selectors.tokenizer; import java.util.regex.Pattern; /** * A CSS tokenizer according to <a href="http://www.w3.org/TR/css-syntax-3/">http://www.w3.org/TR/css-syntax-3/</a> * * @author Christer Sandberg */ public class Tokenizer { /** Replacement code point. */ public static final char REPLACEMENT_CHAR = '\uFFFD'; /** End of file code point. */ public static final int EOF = -1; /** End of file token. */ public static final Token EOF_TOKEN = new Token(TokenType.EOF, EOF, ""); /** Regex used to preprocess the input (see http://www.w3.org/TR/css-syntax-3/#input-preprocessing). */ public static final Pattern PREPROCESS_REGEX = Pattern.compile("\\f|\\r\\n?"); /** The input to tokenize. */ public final String input; /** The current position. */ private int pos = 0; /** The current mark. */ private int mark = 0; /** * Create a new tokenizer. * * @param input The input to tokenize. */ public Tokenizer(String input) { this.input = PREPROCESS_REGEX.matcher(input).replaceAll("\n").replace('\u0000', REPLACEMENT_CHAR); } /** * Returns the current position in the input. * * @return The current position. */ public int getPosition() { return pos; } /** * Resets the position to {@code 0} */ public void reset() { this.pos = 0; this.mark = 0; } /** * Returns whether the given code point matches <code>[a-zA-Z]</code> * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isAlpha(int c) { return (c | 0x20) >= 'a' && (c | 0x20) <= 'z'; } /** * Returns whether the given code point matches <code>[0-9]</code> * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isDigit(int c) { return c >= '0' && c <= '9'; } /** * Returns whether the given code point matches <code>[0-9a-fA-F]</code> * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isHexDigit(int c) { return isDigit(c) || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f'); } /** * Returns whether the given code point matches <code>[ \t\r\n\f]</code> * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isSpace(int c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f'; } /** * Returns whether the given code point is a <code>name-start</code> code point * as of http://www.w3.org/TR/css-syntax-3/#name-start-code-point * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isNameStart(int c) { return c == '_' || c >= 0x80 || isAlpha(c); } /** * Returns whether the given code point is a <code>name</code> code point * as of http://www.w3.org/TR/css-syntax-3/#name-code-point * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isName(int c) { return c == '-' || isNameStart(c) || isDigit(c); } /** * Returns whether the given code point is a <code>non-printable</code> code point * as of http://www.w3.org/TR/css-syntax-3/#non-printable-code-point * * @param c Code point to check * @return {@code true} or {@code false} */ public static boolean isNonPrintable(int c) { return (c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || c == 0x7F; } /** * Returns whether the two code points are a valid <code>escape</code> * as of http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape * * @param c1 Code point to check * @param c2 Code point to check * @return {@code true} or {@code false} */ public static boolean isValidEscape(int c1, int c2) { return c1 == '\\' && c2 != '\n'; } /** * Convert the given code point to its numeric value. * * @param c The code point to convert. * @return The numeric value for the code point. */ public static int hexValue(int c) { if (c < 'A') { return c - '0'; } return (c - 'A' + 10) & 0xF; } /** * Returns whether all code points in the input have been consumed. * * @return {@code true} or {@code false} */ public boolean isEof() { return this.pos >= this.input.length(); } /** * Returns the next token. * * @return The next token. */ public Token nextToken() { if (isEof()) { return EOF_TOKEN; } skipComments(); if (isEof()) { return EOF_TOKEN; } int p = this.pos; int n = skipSpace(); if (n > 0) { return new Token(TokenType.WHITESPACE, p, ""); } mark(); int c = next(); switch (c) { case '"': setPositionToMark(); return consumeStringToken(false); case '#': if (isIdentStart()) { return new Token.Hash(p, consumeName(), true); } int[] d = peek2(); if (isName(d[0]) || isValidEscape(d[0], d[1])) { return new Token.Hash(p, consumeName(), false); } return new Token(TokenType.DELIM, p, "#"); case '$': if (peek() == '=') { next(); return new Token(TokenType.SUFFIX_MATCH, p, "$="); } return new Token(TokenType.DELIM, p, "$"); case '\'': setPositionToMark(); return consumeStringToken(true); case '(': return new Token(TokenType.LEFT_PAREN, p, "("); case ')': return new Token(TokenType.RIGHT_PAREN, p, ")"); case '*': if (peek() == '=') { next(); return new Token(TokenType.SUBSTRING_MATCH, p, "*="); } return new Token(TokenType.DELIM, p, "*"); case '+': setPositionToMark(); if (isNumberStart()) { return consumeNumericToken(); } next(); return new Token(TokenType.DELIM, p, "+"); case ',': return new Token(TokenType.COMMA, p, ","); case '-': setPositionToMark(); if (isNumberStart()) { return consumeNumericToken(); } if (isIdentStart()) { return consumeIdentLikeToken(); } if (consume("-->")) { return new Token(TokenType.CDC, p, "-->"); } next(); return new Token(TokenType.DELIM, p, "-"); case '.': setPositionToMark(); if (isNumberStart()) { return consumeNumericToken(); } next(); return new Token(TokenType.DELIM, p, "."); case ':': return new Token(TokenType.COLON, p, ":"); case ';': return new Token(TokenType.SEMICOLON, p, ";"); case '<': if (consume("!--")) { return new Token(TokenType.CDO, p, "<!--"); } return new Token(TokenType.DELIM, p, "<"); case '@': if (isIdentStart()) { return new Token(TokenType.AT_KEYWORD, p, consumeName()); } return new Token(TokenType.DELIM, p, "@"); case '[': return new Token(TokenType.LEFT_SQUARE_BRACKET, p, "["); case ']': return new Token(TokenType.RIGHT_SQUARE_BRACKET, p, "]"); case '\\': if (isValidEscape('\\', peek())) { setPositionToMark(); return consumeIdentLikeToken(); } return new Token(TokenType.DELIM, p, "\\"); case '^': if (peek() == '=') { next(); return new Token(TokenType.PREFIX_MATCH, p, "^="); } return new Token(TokenType.DELIM, p, "^"); case '{': return new Token(TokenType.LEFT_CURLY_BRACKET, p, "{"); case '}': return new Token(TokenType.RIGHT_CURLY_BRACKET, p, "}"); case '|': int x = peek(); switch (x) { case '=': next(); return new Token(TokenType.DASH_MATCH, p, "|="); case '|': next(); return new Token(TokenType.COLUMN, p, "||"); } return new Token(TokenType.DELIM, p, "|"); case '~': if (peek() == '=') { next(); return new Token(TokenType.INCLUDE_MATCH, p, "~="); } return new Token(TokenType.DELIM, p, "~"); } if (isDigit(c)) { setPositionToMark(); return consumeNumericToken(); } if (c == 'U' || c == 'u') { int[] e = peek2(); if (e[0] == '+' && (e[1] == '?' || isHexDigit(e[1]))) { next(); // Consume the '+' return consumeUnicodeRangeToken(); } setPositionToMark(); return consumeIdentLikeToken(); } if (isNameStart(c)) { setPositionToMark(); return consumeIdentLikeToken(); } return new Token(TokenType.DELIM, p, String.copyValueOf(Character.toChars(c))); } /** * Mark the current position in the input. */ private void mark() { this.mark = this.pos; } /** * Sets the position to the marked position in the input. */ private void setPositionToMark() { this.pos = this.mark; } /** * Consumes and returns the next code point in the input. * * @return The next code point in the input. */ private int next() { if (isEof()) { return EOF; } int c = this.input.codePointAt(this.pos); this.pos += Character.charCount(c); return c; } /** * Returns the next code point in the input without consuming it. * * @return The next code point in the input. */ private int peek() { int p = this.pos; int c = next(); this.pos = p; return c; } /** * Returns the next two code points in the input without consuming them. * * @return The next two code points in the input. */ private int[] peek2() { int p = this.pos; int[] c = new int[] { next(), next() }; this.pos = p; return c; } /** * Returns the next three code points in the input without consuming them. * * @return The next three code points in the input. */ private int[] peek3() { int p = this.pos; int[] c = new int[] { next(), next(), next() }; this.pos = p; return c; } /** * Skip comments at the current position. */ private void skipComments() { if (consume("/*")) { while (true) { int c = next(); if (c == EOF) { return; } if (c == '*' && peek() == '/') { next(); // Consume the '/' return; } } } } /** * Skip whitespace at the current position. * * @return The number of whitespace code points skipped. */ private int skipSpace() { int n = 0; while (isSpace(peek())) { n += 1; next(); } return n; } /** * Tries to consume the string {@code str} at the current position. * * @param str The string to consume. * @return {@code true} on success consuming {@code str} */ private boolean consume(String str) { if (!isEof() && this.input.startsWith(str, this.pos)) { this.pos += str.length(); return true; } return false; } /** * Returns whether the tokenizer could match an identifier at the current position. * <p/> * See http://www.w3.org/TR/css-syntax-3/#would-start-an-identifier * * @return {@code true} or {@code false} */ private boolean isIdentStart() { if (isEof()) { return false; } int[] c = peek3(); return isNameStart(c[0]) || isValidEscape(c[0], c[1]) || (c[0] == '-' && (isNameStart(c[1]) || isValidEscape(c[1], c[2]))); } /** * Returns whether the tokenizer could match a number at the current position. * <p/> * See http://www.w3.org/TR/css-syntax-3/#starts-with-a-number * * @return {@code true} or {@code false} */ private boolean isNumberStart() { if (isEof()) { return false; } int[] c = peek3(); if (isDigit(c[0]) || (c[0] == '.' && isDigit(c[1]))) { return true; } if (c[0] == '+' || c[0] == '-') { if (isDigit(c[1])) { return true; } if (c[1] == '.' && isDigit(c[2])) { return true; } } return false; } /** * Returns whether the next code point at the current position start the exponential part of a number. * * @return {@code true} or {@code false} */ private boolean isValidExponent() { if (isEof()) { return false; } try { mark(); int c = next(); if (c != 'e' && c != 'E') { return false; } c = next(); if (c == '+' || c == '-') { return isDigit(next()); } return isDigit(c); } finally { setPositionToMark(); } } /** * Consume an escaped code point. * <p/> * It is assumed that the U+005C REVERSE SOLIDUS (\) has already been consumed * and that the next code point in the input has been verified to not be a newline. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point * * @return The consumed code point. */ private int consumeEscape() { if (isEof()) { return REPLACEMENT_CHAR; } if (isHexDigit(peek())) { int uc = 0; int len = 6; while (len > 0 && isHexDigit(peek())) { uc = (uc << 4) + hexValue(next()); --len; } if (uc == 0 || uc > Character.MAX_CODE_POINT || (uc >= 0xD800 && uc <= 0xDFFF)) { uc = REPLACEMENT_CHAR; } if (isSpace(peek())) { next(); } return uc; } return next(); } /** * Consume a name. * <p/> * It is assumed that the current position of the tokenizer represents a name. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-a-name * * @return The consumed name. */ private String consumeName() { StringBuilder sb = new StringBuilder(); while (true) { mark(); int c = next(); if (isName(c)) { sb.appendCodePoint(c); } else if (isValidEscape(c, peek())) { sb.appendCodePoint(consumeEscape()); } else { setPositionToMark(); break; } } return sb.toString(); } /** * Consume a number. * <p/> * It is assumed that the current position of the tokenizer represents a number token. * * @return The consumed number token. */ private Token.Number consumeNumber() { StringBuilder sb = new StringBuilder(); int p = this.pos; int c = peek(); if (c == '+' || c == '-') { sb.appendCodePoint(next()); } while (isDigit(peek())) { sb.appendCodePoint(next()); } mark(); boolean integer = true; int c1 = next(); int c2 = next(); if (c1 == '.' && isDigit(c2)) { sb.appendCodePoint(c1).appendCodePoint(c2); while (isDigit(peek())) { sb.appendCodePoint(next()); } integer = false; } else { setPositionToMark(); } if (isValidExponent()) { integer = false; sb.appendCodePoint(next()).appendCodePoint(next()); while (isDigit(peek())) { sb.appendCodePoint(next()); } } return Token.Number.number(p, sb.toString(), integer); } /** * Consume a numeric token. * <p/> * It is assumed that the current position of the tokenizer represents a number token. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-a-numeric-token * * @return The consumed numeric token. */ private Token consumeNumericToken() { Token.Number token = consumeNumber(); if (peek() == '%') { next(); return Token.Number.percentage(token.position, token.value, token.integer); } if (isIdentStart()) { return new Token.Dimension(token.position, token.value, token.integer, consumeName()); } return token; } /** * Consume an ident-like token. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-an-ident-like-token * * @return The consumed token. */ private Token consumeIdentLikeToken() { int p = this.pos; String name = consumeName(); TokenType type = TokenType.IDENT; if (peek() == '(') { next(); // Consume the '(' if ("url".equalsIgnoreCase(name)) { return consumeUrlToken(); } else { type = TokenType.FUNCTION; } } return new Token(type, p, name); } /** * Consume a string token. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-a-string-token * * @param apostrophe If the string contents is surrounded by apostrophes. * @return The consumed string token. */ private Token consumeStringToken(boolean apostrophe) { StringBuilder sb = new StringBuilder(); int p = this.pos; next(); // Consume the quote while (true) { mark(); int c = next(); if (c == EOF || (c == '\'' && apostrophe) || (c == '"' && !apostrophe)) { break; } if (c == '\n') { setPositionToMark(); return new Token(TokenType.BAD_STRING, p, ""); } if (c == '\\') { int d = peek(); if (d != EOF) { if (d == '\n') { next(); // Consume the newline } else { sb.appendCodePoint(consumeEscape()); } } } else { sb.appendCodePoint(c); } } return new Token(TokenType.STRING, p, sb.toString()); } /** * Consume a URL token. * <p/> * It is assumed that the current position of the tokenizer represents a URL token. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-a-url-token * * @return The consumed URL token. */ private Token consumeUrlToken() { skipSpace(); int p = this.pos; if (isEof()) { return new Token(TokenType.URL, p, ""); } int c = peek(); if (c == '\'' || c == '"') { Token token = consumeStringToken(c != '"'); if (token.type == TokenType.BAD_STRING) { p = this.pos; consumeBadUrl(); return new Token(TokenType.BAD_URL, p, token.value); } else { skipSpace(); c = peek(); if (c == ')' || c == EOF) { if (c == ')') { next(); // Consume the ')' } return new Token(TokenType.URL, p, token.value); } p = this.pos; consumeBadUrl(); return new Token(TokenType.BAD_URL, p, token.value); } } StringBuilder sb = new StringBuilder(); boolean spaceSeen = false; while (true) { c = next(); if (c == ')' || c == EOF) { return new Token(TokenType.URL, p, sb.toString()); } if (isSpace(c)) { spaceSeen = true; skipSpace(); continue; } if (spaceSeen) { p = this.pos; consumeBadUrl(); return new Token(TokenType.BAD_URL, p, ""); } if (c == '\'' || c == '"' || c == '(' || isNonPrintable(c)) { p = this.pos; consumeBadUrl(); return new Token(TokenType.BAD_URL, p, ""); } if (c == '\\') { if (isValidEscape(c, peek())) { sb.appendCodePoint(consumeEscape()); } else { p = this.pos; consumeBadUrl(); return new Token(TokenType.BAD_URL, p, ""); } } else { sb.appendCodePoint(c); } } } /** * Consume a unicode range. * <p/> * Is is assumed that the initial {@code u+} has already been consumed and that * the next input code point has been verified to be a hex digit or a {@code ?}. * * @return The consumed Unicode range token. */ private Token.UnicodeRange consumeUnicodeRangeToken() { int p = this.pos; int start = 0; int length = 0; while (isHexDigit(peek()) && length < 6) { start = (start << 4) + hexValue(next()); ++length; } int q = 0; if (length < 6) { while (peek() == '?' && length < 6) { next(); ++length; ++q; } } if (q != 0) { int end = start; for (int i = 0; i < q; ++i) { start = start << 4; end = (end << 4) + 15; } return new Token.UnicodeRange(p, start, end); } int end = 0; int[] c = peek2(); if (c[0] == '-' && isHexDigit(c[1])) { next(); // Consume the '-' length = 0; while (isHexDigit(peek()) && length < 6) { end = (end << 4) + hexValue(next()); ++length; } } else { end = start; } return new Token.UnicodeRange(p, start, end); } /** * Consume the remnants of a bad URL. * <p/> * See http://www.w3.org/TR/css-syntax-3/#consume-the-remnants-of-a-bad-url */ private void consumeBadUrl() { while (true) { int c = next(); if (c == ')' || c == EOF) { break; } if (isValidEscape(c, peek())) { consumeEscape(); } } } }