Tokenizer.java example

Explorer
css-selectors-master
- src
  - main
    - java
      - se
        fishtank
        css
        selectors
        Selectors.java
        dom
        DOMNode.java
        Traversal.java
        Visitor.java
        W3CNode.java
        matching
        SelectorMatcher.java
        SimpleSelectorMatcher.java
        parser
        NthParser.java
        ParserException.java
        SelectorParser.java
        selector
        AttributeSelector.java
        Combinator.java
        CompoundSelector.java
        LocalNameSelector.java
        PseudoClassSelector.java
        PseudoElementSelector.java
        PseudoFunctionSelector.java
        PseudoNegationSelector.java
        PseudoNthSelector.java
        Selector.java
        SimpleSelector.java
        SimpleSelectorType.java
        tokenizer
        Token.java
        TokenType.java
        Tokenizer.java
        util
        Pair.java
        Reference.java
  - test
    - java
      - se
        fishtank
        css
        selectors
        SelectorsTest.java
        Support.java
        matching
        SelectorMatcherTest.java
        parser
        NthParserTest.java
        tokenizer
        TokenizerTest.java
/**
 * Copyright (c) 2009-2015, Christer Sandberg
 */
package se.fishtank.css.selectors.tokenizer;

import java.util.regex.Pattern;

/**
 * A CSS tokenizer according to <a href="http://www.w3.org/TR/css-syntax-3/">http://www.w3.org/TR/css-syntax-3/</a>
 *
 * @author Christer Sandberg
 */
public class Tokenizer {

    /** Replacement code point (U+FFFD); substituted for NUL in the input and for invalid escapes. */
    public static final char REPLACEMENT_CHAR = '\uFFFD';

    /** End of file code point; returned when reading past the end of the input. */
    public static final int EOF = -1;

    /** End of file token. A single shared instance whose position is {@link #EOF}. */
    public static final Token EOF_TOKEN = new Token(TokenType.EOF, EOF, "");

    /**
     * Regex used to preprocess the input (see http://www.w3.org/TR/css-syntax-3/#input-preprocessing).
     * <p>
     * Matches a form feed, a lone carriage return, or a carriage return followed by a line feed;
     * the constructor replaces every match with a single line feed.
     */
    public static final Pattern PREPROCESS_REGEX = Pattern.compile("\\f|\\r\\n?");

    /** The input to tokenize, after preprocessing. */
    public final String input;

    /** The current position, as an index into {@link #input}. */
    private int pos = 0;

    /** The position most recently saved by {@code mark()}. */
    private int mark = 0;

    /**
     * Create a new tokenizer.
     * <p>
     * The input is preprocessed before it is stored: every form feed, carriage return and
     * carriage return/line feed pair becomes a single line feed, and every NUL character
     * becomes {@link #REPLACEMENT_CHAR}.
     *
     * @param input The input to tokenize.
     */
    public Tokenizer(String input) {
        String normalizedNewlines = PREPROCESS_REGEX.matcher(input).replaceAll("\n");
        this.input = normalizedNewlines.replace('\u0000', REPLACEMENT_CHAR);
    }

    /**
     * Returns the index in the preprocessed input at which the next code point will be read.
     *
     * @return The current position.
     */
    public int getPosition() {
        return this.pos;
    }

    /**
     * Rewinds the tokenizer to the start of the input by setting both the
     * position and the mark to {@code 0}.
     */
    public void reset() {
        this.mark = 0;
        this.pos = 0;
    }

    /**
     * Returns whether the given code point is an ASCII letter, i.e. matches <code>[a-zA-Z]</code>
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isAlpha(int c) {
        boolean lowerCase = c >= 'a' && c <= 'z';
        boolean upperCase = c >= 'A' && c <= 'Z';
        return lowerCase || upperCase;
    }

    /**
     * Returns whether the given code point is an ASCII decimal digit, i.e. matches <code>[0-9]</code>
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isDigit(int c) {
        if (c < '0') {
            return false;
        }

        return c <= '9';
    }

    /**
     * Returns whether the given code point is a hexadecimal digit, i.e. matches <code>[0-9a-fA-F]</code>
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isHexDigit(int c) {
        if (isDigit(c)) {
            return true;
        }

        // Setting bit 0x20 maps 'A'..'F' onto 'a'..'f', so one range check covers both cases.
        int folded = c | 0x20;
        return folded >= 'a' && folded <= 'f';
    }

    /**
     * Returns whether the given code point is whitespace, i.e. matches <code>[ \t\r\n\f]</code>
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isSpace(int c) {
        switch (c) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
        case '\f':
            return true;
        default:
            return false;
        }
    }

    /**
     * Returns whether the given code point is a <code>name-start</code> code point
     * (an ASCII letter, an underscore, or any code point from U+0080 upwards)
     * as of http://www.w3.org/TR/css-syntax-3/#name-start-code-point
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isNameStart(int c) {
        if (c == '_' || c >= 0x80) {
            return true;
        }

        return isAlpha(c);
    }

    /**
     * Returns whether the given code point is a <code>name</code> code point
     * (a name-start code point, a digit, or a hyphen)
     * as of http://www.w3.org/TR/css-syntax-3/#name-code-point
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isName(int c) {
        return isNameStart(c) || isDigit(c) || c == '-';
    }

    /**
     * Returns whether the given code point is a <code>non-printable</code> code point
     * (U+0000 to U+0008, U+000B, U+000E to U+001F, or U+007F)
     * as of http://www.w3.org/TR/css-syntax-3/#non-printable-code-point
     *
     * @param c Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isNonPrintable(int c) {
        if (c == 0x0B || c == 0x7F) {
            return true;
        }

        boolean lowControl = 0x00 <= c && c <= 0x08;
        boolean highControl = 0x0E <= c && c <= 0x1F;
        return lowControl || highControl;
    }

    /**
     * Returns whether the two code points are a valid <code>escape</code>,
     * i.e. a backslash that is not followed by a newline,
     * as of http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape
     *
     * @param c1 Code point to check
     * @param c2 Code point to check
     * @return {@code true} or {@code false}
     */
    public static boolean isValidEscape(int c1, int c2) {
        if (c1 != '\\') {
            return false;
        }

        return c2 != '\n';
    }

    /**
     * Convert the given hexadecimal digit code point to its numeric value.
     * <p>
     * Code points below {@code 'A'} are treated as decimal digits. For letters, masking
     * with {@code 0xF} makes upper and lower case yield the same value.
     *
     * @param c The code point to convert.
     * @return The numeric value for the code point.
     */
    public static int hexValue(int c) {
        return c < 'A' ? c - '0' : (c - 'A' + 10) & 0xF;
    }

    /**
     * Returns whether the current position is at or beyond the end of the input,
     * i.e. whether all code points have been consumed.
     *
     * @return {@code true} or {@code false}
     */
    public boolean isEof() {
        return this.input.length() <= this.pos;
    }

    /**
     * Returns the next token.
     * <p>
     * A comment at the current position is skipped first. A run of whitespace is
     * reported as a single {@code WHITESPACE} token with an empty value. Once the
     * input is exhausted, {@link #EOF_TOKEN} is returned.
     * <p>
     * The first code point of the token is consumed after calling {@code mark()}.
     * Branches that delegate to a {@code consume...Token()} helper which expects to
     * see that code point itself call {@code setPositionToMark()} to rewind first.
     *
     * @return The next token.
     */
    public Token nextToken() {
        if (isEof()) {
            return EOF_TOKEN;
        }

        skipComments();
        if (isEof()) {
            return EOF_TOKEN;
        }

        // p is the start position reported for the token (after any skipped comment).
        int p = this.pos;
        int n = skipSpace();
        if (n > 0) {
            return new Token(TokenType.WHITESPACE, p, "");
        }

        // Remember where the token starts so branches below can un-read c.
        mark();
        int c = next();
        switch (c) {
        case '"':
            // Rewind: consumeStringToken() consumes the opening quote itself.
            setPositionToMark();
            return consumeStringToken(false);
        case '#':
            // The hash flag is true when what follows '#' would start an identifier.
            if (isIdentStart()) {
                return new Token.Hash(p, consumeName(), true);
            }

            int[] d = peek2();
            if (isName(d[0]) || isValidEscape(d[0], d[1])) {
                return new Token.Hash(p, consumeName(), false);
            }

            return new Token(TokenType.DELIM, p, "#");
        case '$':
            if (peek() == '=') {
                next();
                return new Token(TokenType.SUFFIX_MATCH, p, "$=");
            }

            return new Token(TokenType.DELIM, p, "$");
        case '\'':
            // Rewind: consumeStringToken() consumes the opening quote itself.
            setPositionToMark();
            return consumeStringToken(true);
        case '(':
            return new Token(TokenType.LEFT_PAREN, p, "(");
        case ')':
            return new Token(TokenType.RIGHT_PAREN, p, ")");
        case '*':
            if (peek() == '=') {
                next();
                return new Token(TokenType.SUBSTRING_MATCH, p, "*=");
            }

            return new Token(TokenType.DELIM, p, "*");
        case '+':
            // Rewind so the sign is part of the look-ahead and of the number.
            setPositionToMark();
            if (isNumberStart()) {
                return consumeNumericToken();
            }

            // Not a number: consume the '+' again and report it as a delimiter.
            next();
            return new Token(TokenType.DELIM, p, "+");
        case '-':
            // Rewind so the '-' is part of the look-ahead for number, identifier and CDC.
            setPositionToMark();
            if (isNumberStart()) {
                return consumeNumericToken();
            }

            if (isIdentStart()) {
                return consumeIdentLikeToken();
            }

            if (consume("-->")) {
                return new Token(TokenType.CDC, p, "-->");
            }

            // None of the above: consume the '-' again and report it as a delimiter.
            next();
            return new Token(TokenType.DELIM, p, "-");
        case '.':
            // Rewind so the '.' is part of the look-ahead and of the number.
            setPositionToMark();
            if (isNumberStart()) {
                return consumeNumericToken();
            }

            // Not a number: consume the '.' again and report it as a delimiter.
            next();
            return new Token(TokenType.DELIM, p, ".");
        case ':':
            return new Token(TokenType.COLON, p, ":");
        case ';':
            return new Token(TokenType.SEMICOLON, p, ";");
        case '<':
            // The '<' is already consumed, so only the rest of "<!--" is matched here.
            if (consume("!--")) {
                return new Token(TokenType.CDO, p, "<!--");
            }

            return new Token(TokenType.DELIM, p, "<");
        case '@':
            if (isIdentStart()) {
                return new Token(TokenType.AT_KEYWORD, p, consumeName());
            }

            return new Token(TokenType.DELIM, p, "@");
        case '[':
            return new Token(TokenType.LEFT_SQUARE_BRACKET, p, "[");
        case ']':
            return new Token(TokenType.RIGHT_SQUARE_BRACKET, p, "]");
        case '\\':
            // A valid escape starts an identifier; rewind so the backslash is part of the name.
            if (isValidEscape('\\', peek())) {
                setPositionToMark();
                return consumeIdentLikeToken();
            }

            return new Token(TokenType.DELIM, p, "\\");
        case '^':
            if (peek() == '=') {
                next();
                return new Token(TokenType.PREFIX_MATCH, p, "^=");
            }

            return new Token(TokenType.DELIM, p, "^");
        case '{':
            return new Token(TokenType.LEFT_CURLY_BRACKET, p, "{");
        case '}':
            return new Token(TokenType.RIGHT_CURLY_BRACKET, p, "}");
        case '|':
            int x = peek();
            switch (x) {
            case '=':
                next();
                return new Token(TokenType.DASH_MATCH, p, "|=");
            case '|':
                next();
                return new Token(TokenType.COLUMN, p, "||");
            }

            return new Token(TokenType.DELIM, p, "|");
        case '~':
            if (peek() == '=') {
                next();
                return new Token(TokenType.INCLUDE_MATCH, p, "~=");
            }

            return new Token(TokenType.DELIM, p, "~");
        }

        // Code points without a dedicated case above.
        if (isDigit(c)) {
            setPositionToMark();
            return consumeNumericToken();
        }

        if (c == 'U' || c == 'u') {
            // "u+" followed by a hex digit or '?' is a unicode range; otherwise a plain identifier.
            int[] e = peek2();
            if (e[0] == '+' && (e[1] == '?' || isHexDigit(e[1]))) {
                next(); // Consume the '+'
                return consumeUnicodeRangeToken();
            }

            setPositionToMark();
            return consumeIdentLikeToken();
        }

        if (isNameStart(c)) {
            setPositionToMark();
            return consumeIdentLikeToken();
        }

        // Anything else is a delimiter holding just the consumed code point.
        return new Token(TokenType.DELIM, p, String.copyValueOf(Character.toChars(c)));
    }

    /**
     * Remembers the current position so that it can be restored
     * later with {@link #setPositionToMark()}.
     */
    private void mark() {
        mark = pos;
    }

    /**
     * Rewinds the current position to the one most recently remembered by {@link #mark()}.
     */
    private void setPositionToMark() {
        pos = mark;
    }

    /**
     * Consumes and returns the next code point in the input.
     * <p>
     * The position advances by the number of {@code char}s the code point occupies.
     * At the end of the input nothing is consumed and {@link #EOF} is returned.
     *
     * @return The next code point in the input, or {@link #EOF}.
     */
    private int next() {
        if (this.pos >= this.input.length()) {
            return EOF;
        }

        final int codePoint = this.input.codePointAt(this.pos);
        this.pos += Character.charCount(codePoint);
        return codePoint;
    }

    /**
     * Returns the next code point in the input without consuming it.
     *
     * @return The next code point in the input, or {@link #EOF} at the end of the input.
     */
    private int peek() {
        return isEof() ? EOF : this.input.codePointAt(this.pos);
    }

    /**
     * Returns the next two code points in the input without consuming them.
     * Slots beyond the end of the input hold {@link #EOF}.
     *
     * @return The next two code points in the input.
     */
    private int[] peek2() {
        final int saved = this.pos;
        final int first = next();
        final int second = next();
        this.pos = saved;
        return new int[] { first, second };
    }

    /**
     * Returns the next three code points in the input without consuming them.
     * Slots beyond the end of the input hold {@link #EOF}.
     *
     * @return The next three code points in the input.
     */
    private int[] peek3() {
        final int saved = this.pos;
        final int[] lookahead = new int[3];
        for (int i = 0; i < lookahead.length; ++i) {
            lookahead[i] = next();
        }

        this.pos = saved;
        return lookahead;
    }

    /**
     * Skip all comments at the current position.
     * <p>
     * Consecutive comments (e.g. <code>/&#42;a&#42;//&#42;b&#42;/</code>) are all skipped, so the
     * opening {@code /} of a following comment is never tokenized as a delimiter.
     * An unterminated comment consumes the rest of the input.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-comments
     */
    private void skipComments() {
        while (consume("/*")) {
            int end = this.input.indexOf("*/", this.pos);
            // No terminator: the comment runs to the end of the input.
            this.pos = end < 0 ? this.input.length() : end + 2;
        }
    }

    /**
     * Consumes the run of whitespace code points, if any, at the current position.
     *
     * @return The number of whitespace code points skipped.
     */
    private int skipSpace() {
        int skipped = 0;
        for (int c = peek(); isSpace(c); c = peek()) {
            next();
            ++skipped;
        }

        return skipped;
    }

    /**
     * Tries to consume the string {@code str} at the current position.
     * The position only advances when the input continues with {@code str}.
     *
     * @param str The string to consume.
     * @return {@code true} on success consuming {@code str}
     */
    private boolean consume(String str) {
        if (isEof() || !this.input.startsWith(str, this.pos)) {
            return false;
        }

        this.pos += str.length();
        return true;
    }

    /**
     * Returns whether the tokenizer could match an identifier at the current position.
     * Nothing is consumed.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
     *
     * @return {@code true} or {@code false}
     */
    private boolean isIdentStart() {
        if (isEof()) {
            return false;
        }

        final int[] lookahead = peek3();
        final int first = lookahead[0];
        final int second = lookahead[1];
        if (isNameStart(first) || isValidEscape(first, second)) {
            return true;
        }

        if (first != '-') {
            return false;
        }

        // A leading hyphen must be followed by a name-start code point or an escape.
        return isNameStart(second) || isValidEscape(second, lookahead[2]);
    }

    /**
     * Returns whether the tokenizer could match a number at the current position.
     * Nothing is consumed.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#starts-with-a-number
     *
     * @return {@code true} or {@code false}
     */
    private boolean isNumberStart() {
        if (isEof()) {
            return false;
        }

        final int[] lookahead = peek3();
        final int first = lookahead[0];
        final int second = lookahead[1];

        if (isDigit(first)) {
            return true;
        }

        if (first == '.') {
            return isDigit(second);
        }

        if (first == '+' || first == '-') {
            // A sign must be followed by a digit, or by a '.' and a digit.
            return isDigit(second) || (second == '.' && isDigit(lookahead[2]));
        }

        return false;
    }

    /**
     * Returns whether the code points at the current position start the exponential part
     * of a number: an {@code e} or {@code E}, an optional sign, and at least one digit.
     * <p>
     * The position is left unchanged; the mark is overwritten.
     *
     * @return {@code true} or {@code false}
     */
    private boolean isValidExponent() {
        if (isEof()) {
            return false;
        }

        mark();
        boolean valid = false;
        final int indicator = next();
        if (indicator == 'e' || indicator == 'E') {
            final int following = next();
            if (following == '+' || following == '-') {
                valid = isDigit(next());
            } else {
                valid = isDigit(following);
            }
        }

        setPositionToMark();
        return valid;
    }

    /**
     * Consume an escaped code point.
     * <p>
     * It is assumed that the U+005C REVERSE SOLIDUS (\) has already been consumed
     * and that the next code point in the input has been verified to not be a newline.
     * <p>
     * A hexadecimal escape consists of one to six hex digits optionally followed by a
     * single whitespace code point. Zero, surrogates and values above the maximum code
     * point yield {@link #REPLACEMENT_CHAR}, as does an escape at the end of the input.
     * Any other code point after the backslash is returned as is.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
     *
     * @return The consumed code point.
     */
    private int consumeEscape() {
        if (isEof()) {
            return REPLACEMENT_CHAR;
        }

        if (!isHexDigit(peek())) {
            return next();
        }

        int value = 0;
        for (int remaining = 6; remaining > 0 && isHexDigit(peek()); --remaining) {
            value = (value << 4) + hexValue(next());
        }

        final boolean surrogate = value >= 0xD800 && value <= 0xDFFF;
        if (value == 0 || value > Character.MAX_CODE_POINT || surrogate) {
            value = REPLACEMENT_CHAR;
        }

        // A single whitespace code point terminating the escape belongs to it.
        if (isSpace(peek())) {
            next();
        }

        return value;
    }

    /**
     * Consume a name.
     * <p>
     * It is assumed that the current position of the tokenizer represents a name.
     * Name code points are taken as they are and escapes are decoded; consumption stops
     * in front of the first code point that is neither.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-a-name
     *
     * @return The consumed name.
     */
    private String consumeName() {
        final StringBuilder name = new StringBuilder();
        for (;;) {
            mark();
            final int c = next();
            if (isName(c)) {
                name.appendCodePoint(c);
                continue;
            }

            if (!isValidEscape(c, peek())) {
                // Not part of the name: hand the code point back and stop.
                setPositionToMark();
                return name.toString();
            }

            name.appendCodePoint(consumeEscape());
        }
    }

    /**
     * Consume a number: an optional sign, digits, an optional fraction and an optional exponent.
     * <p>
     * It is assumed that the current position of the tokenizer represents a number token.
     * The number counts as an integer unless it has a fraction or an exponent.
     *
     * @return The consumed number token.
     */
    private Token.Number consumeNumber() {
        final int start = this.pos;
        final StringBuilder repr = new StringBuilder();

        // Optional sign.
        final int sign = peek();
        if (sign == '+' || sign == '-') {
            next();
            repr.appendCodePoint(sign);
        }

        // Integer part.
        for (int d = peek(); isDigit(d); d = peek()) {
            next();
            repr.appendCodePoint(d);
        }

        boolean isInteger = true;

        // Fraction: only taken when the '.' is directly followed by a digit.
        mark();
        final int dot = next();
        final int firstFractionDigit = next();
        if (dot != '.' || !isDigit(firstFractionDigit)) {
            setPositionToMark();
        } else {
            isInteger = false;
            repr.appendCodePoint(dot).appendCodePoint(firstFractionDigit);
            for (int d = peek(); isDigit(d); d = peek()) {
                next();
                repr.appendCodePoint(d);
            }
        }

        // Exponent: the 'e' plus either a sign or the first digit, then remaining digits.
        if (isValidExponent()) {
            isInteger = false;
            repr.appendCodePoint(next());
            repr.appendCodePoint(next());
            for (int d = peek(); isDigit(d); d = peek()) {
                next();
                repr.appendCodePoint(d);
            }
        }

        return Token.Number.number(start, repr.toString(), isInteger);
    }

    /**
     * Consume a numeric token: a number, a percentage or a dimension.
     * <p>
     * It is assumed that the current position of the tokenizer represents a number token.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-a-numeric-token
     *
     * @return The consumed numeric token.
     */
    private Token consumeNumericToken() {
        final Token.Number number = consumeNumber();

        if (peek() == '%') {
            next();
            return Token.Number.percentage(number.position, number.value, number.integer);
        }

        if (!isIdentStart()) {
            return number;
        }

        // An identifier directly after the number is its unit.
        final String unit = consumeName();
        return new Token.Dimension(number.position, number.value, number.integer, unit);
    }

    /**
     * Consume an ident-like token: an identifier, a function or a URL.
     * <p>
     * A name directly followed by {@code (} is a function, unless the name is
     * {@code url} (in any case), in which case a URL token is consumed.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-an-ident-like-token
     *
     * @return The consumed token.
     */
    private Token consumeIdentLikeToken() {
        final int start = this.pos;
        final String name = consumeName();

        if (peek() != '(') {
            return new Token(TokenType.IDENT, start, name);
        }

        next(); // Consume the '('
        if ("url".equalsIgnoreCase(name)) {
            return consumeUrlToken();
        }

        return new Token(TokenType.FUNCTION, start, name);
    }

    /**
     * Consume a string token.
     * <p>
     * The current position must be at the opening quote. The string ends at the matching
     * quote or at the end of the input. An unescaped newline ends it early and produces a
     * {@code BAD_STRING} token, leaving the newline unconsumed. A backslash followed by a
     * newline is a line continuation and adds nothing to the value.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-a-string-token
     *
     * @param apostrophe If the string contents is surrounded by apostrophes.
     * @return The consumed string token.
     */
    private Token consumeStringToken(boolean apostrophe) {
        final int start = this.pos;
        final int quote = apostrophe ? '\'' : '"';
        final StringBuilder value = new StringBuilder();

        next(); // Consume the quote
        for (;;) {
            mark();
            final int c = next();
            if (c == EOF || c == quote) {
                return new Token(TokenType.STRING, start, value.toString());
            }

            switch (c) {
            case '\n': {
                setPositionToMark();
                return new Token(TokenType.BAD_STRING, start, "");
            }
            case '\\': {
                final int following = peek();
                if (following == '\n') {
                    next(); // Consume the newline
                } else if (following != EOF) {
                    value.appendCodePoint(consumeEscape());
                }
                break;
            }
            default:
                value.appendCodePoint(c);
                break;
            }
        }
    }

    /**
     * Consume a URL token.
     * <p>
     * It is assumed that the current position of the tokenizer represents a URL token,
     * i.e. that {@code url(} has already been consumed.
     * <p>
     * Both quoted and unquoted URLs are handled. Anything malformed produces a
     * {@code BAD_URL} token after the remnants up to the closing {@code )} have been
     * consumed.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-a-url-token
     *
     * @return The consumed URL token.
     */
    private Token consumeUrlToken() {
        skipSpace();
        int p = this.pos;
        if (isEof()) {
            return new Token(TokenType.URL, p, "");
        }

        int c = peek();
        if (c == '\'' || c == '"') {
            // Quoted URL: a string, optional whitespace and then ')' or the end of the input.
            Token token = consumeStringToken(c != '"');
            if (token.type == TokenType.BAD_STRING) {
                p = this.pos;
                consumeBadUrl();
                return new Token(TokenType.BAD_URL, p, token.value);
            }

            skipSpace();
            c = peek();
            if (c == ')' || c == EOF) {
                if (c == ')') {
                    next(); // Consume the ')'
                }

                return new Token(TokenType.URL, p, token.value);
            }

            p = this.pos;
            consumeBadUrl();
            return new Token(TokenType.BAD_URL, p, token.value);
        }

        // Unquoted URL.
        StringBuilder sb = new StringBuilder();
        boolean spaceSeen = false;
        while (true) {
            c = next();
            if (c == ')' || c == EOF) {
                return new Token(TokenType.URL, p, sb.toString());
            }

            if (isSpace(c)) {
                spaceSeen = true;
                skipSpace();
                continue;
            }

            if (spaceSeen) {
                // Only ')' or the end of the input may follow whitespace inside a URL.
                p = this.pos;
                // c is already consumed. If it starts an escape, consume the escaped code
                // point here, otherwise an escaped ')' would end the bad URL too early.
                if (isValidEscape(c, peek())) {
                    consumeEscape();
                }

                consumeBadUrl();
                return new Token(TokenType.BAD_URL, p, "");
            }

            if (c == '\'' || c == '"' || c == '(' || isNonPrintable(c)) {
                p = this.pos;
                consumeBadUrl();
                return new Token(TokenType.BAD_URL, p, "");
            }

            if (c == '\\') {
                if (isValidEscape(c, peek())) {
                    sb.appendCodePoint(consumeEscape());
                } else {
                    p = this.pos;
                    consumeBadUrl();
                    return new Token(TokenType.BAD_URL, p, "");
                }
            } else {
                sb.appendCodePoint(c);
            }
        }
    }

    /**
     * Consume a unicode range.
     * <p>
     * It is assumed that the initial {@code u+} has already been consumed and that
     * the next input code point has been verified to be a hex digit or a {@code ?}.
     * <p>
     * Up to six hex digits and {@code ?} wildcards are read in total. With wildcards, the
     * range runs from the value with every wildcard read as {@code 0} to the value with
     * every wildcard read as {@code F}. Without wildcards, an optional {@code -} followed
     * by up to six hex digits gives the end of the range; otherwise the end equals the start.
     *
     * @return The consumed Unicode range token.
     */
    private Token.UnicodeRange consumeUnicodeRangeToken() {
        final int position = this.pos;

        int start = 0;
        int consumed = 0;
        for (; consumed < 6 && isHexDigit(peek()); ++consumed) {
            start = (start << 4) + hexValue(next());
        }

        int wildcards = 0;
        while (consumed < 6 && peek() == '?') {
            next();
            ++consumed;
            ++wildcards;
        }

        if (wildcards > 0) {
            // Each wildcard is one hex digit: 0 for the lower bound, F for the upper bound.
            final int shift = 4 * wildcards;
            final int low = start << shift;
            final int high = low + ((1 << shift) - 1);
            return new Token.UnicodeRange(position, low, high);
        }

        int end = start;
        final int[] lookahead = peek2();
        if (lookahead[0] == '-' && isHexDigit(lookahead[1])) {
            next(); // Consume the '-'
            end = 0;
            for (int digits = 0; digits < 6 && isHexDigit(peek()); ++digits) {
                end = (end << 4) + hexValue(next());
            }
        }

        return new Token.UnicodeRange(position, start, end);
    }

    /**
     * Consume the remnants of a bad URL, up to and including the closing {@code )}
     * or up to the end of the input. Escapes are consumed as a unit so that an
     * escaped {@code )} does not end the URL.
     * <p>
     * See http://www.w3.org/TR/css-syntax-3/#consume-the-remnants-of-a-bad-url
     */
    private void consumeBadUrl() {
        for (int c = next(); c != ')' && c != EOF; c = next()) {
            if (isValidEscape(c, peek())) {
                consumeEscape();
            }
        }
    }

}