package org.reasm.m68k.expressions.internal; import javax.annotation.CheckForNull; import javax.annotation.Nonnull; import org.reasm.commons.source.Syntax; import org.reasm.commons.util.CharSequenceParserReader; import org.reasm.m68k.source.M68KParser; /** * The tokenizer for expressions and effective addresses. * * @author Francis Gagné */ public final class Tokenizer { private CharSequenceParserReader reader; private int endOfBrokenSequence; private TokenType tokenType; private int tokenStart; private int tokenEnd; /** * Initializes a new Tokenizer. */ public Tokenizer() { } /** * Initializes a new Tokenizer from another Tokenizer. * * @param tokenizer * the other tokenizer to copy */ private Tokenizer(@Nonnull Tokenizer tokenizer) { this.reader = tokenizer.reader.duplicate(); this.endOfBrokenSequence = tokenizer.endOfBrokenSequence; this.tokenType = tokenizer.tokenType; this.tokenStart = tokenizer.tokenStart; this.tokenEnd = tokenizer.tokenEnd; } /** * Advances to the next token. * * @see #getTokenType() * @see #getTokenStart() * @see #getTokenEnd() * @see #getTokenLength() * @see #getTokenText() */ public final void advance() { if (this.endOfBrokenSequence != -1) { int start = this.tokenEnd; if (start < this.endOfBrokenSequence) { this.setToken(TokenType.OPERATOR, start, start + 1); return; } this.endOfBrokenSequence = -1; } this.setToken(TokenType.END, this.tokenEnd, this.tokenEnd); while (Syntax.isWhitespace(this.reader.getCurrentCodePoint())) { this.reader.advance(); } final int start = this.reader.getPosition(); TokenType tokenType; final int firstCodePoint = this.reader.getCurrentCodePoint(); int codePoint; switch (firstCodePoint) { case -1: return; case '!': // either "!" or "!=" case '=': // either "=" or "==" tokenType = TokenType.OPERATOR; this.reader.advance(); switch (this.reader.getCurrentCodePoint()) { case '=': this.reader.advance(); break; } break; case '"': // a string delimited by double quotes case '\'': // a string delimited by apostrophes tokenType = TokenType.STRING; this.reader.advance(); boolean lastWasEscape = false; for (;; this.reader.advance()) { codePoint = this.reader.getCurrentCodePoint(); if (codePoint == -1) { // The string is not terminated properly: make the token invalid. tokenType = TokenType.INVALID; break; } if (lastWasEscape) { lastWasEscape = false; } else { if (codePoint == firstCodePoint) { // Finish the string. this.reader.advance(); break; } lastWasEscape = codePoint == '\\'; } } break; case '#': tokenType = TokenType.IMMEDIATE; this.reader.advance(); break; case '$': // an hexadecimal integer literal tokenType = TokenType.HEXADECIMAL_INTEGER; this.reader.advance(); boolean haveHexDigit = false; for (;; this.reader.advance()) { codePoint = this.reader.getCurrentCodePoint(); // If the next character is not a valid identifier character or if it's a period, it's the end of the token. if (!M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint) || codePoint == '.') { break; } // If the next character is not an hexadecimal digit, make the token invalid. if (!Syntax.isHexDigit(codePoint)) { tokenType = TokenType.INVALID; this.finishIdentifier(); break; } haveHexDigit = true; } // If there are no valid digits after the '$', make the token invalid. if (!haveHexDigit) { tokenType = TokenType.INVALID; this.finishIdentifier(); } break; case '%': // the '%' (modulus) operator or a binary integer literal (see changeToBinaryInteger()) case '*': case '/': case '^': case '~': tokenType = TokenType.OPERATOR; this.reader.advance(); break; case '&': // either '&' or '&&' tokenType = TokenType.OPERATOR; this.reader.advance(); switch (this.reader.getCurrentCodePoint()) { case '&': this.reader.advance(); break; } break; case '(': tokenType = TokenType.OPENING_PARENTHESIS; this.reader.advance(); break; case ')': tokenType = TokenType.CLOSING_PARENTHESIS; this.reader.advance(); break; case '+': // one or more '+' case '-': // one or more '-' tokenType = TokenType.PLUS_OR_MINUS_SEQUENCE; this.reader.advance(); while (this.reader.getCurrentCodePoint() == firstCodePoint) { this.reader.advance(); } break; case ',': tokenType = TokenType.COMMA; this.reader.advance(); break; case ':': tokenType = TokenType.CONDITIONAL_OPERATOR_SECOND; this.reader.advance(); break; case ';': // a comment (not supposed to happen!) tokenType = TokenType.INVALID; this.reader.advance(); break; case '<': // either "<", "<<", "<=", or "<>" tokenType = TokenType.OPERATOR; this.reader.advance(); switch (this.reader.getCurrentCodePoint()) { case '<': case '=': case '>': this.reader.advance(); break; } break; case '>': // either ">", ">=" or ">>" tokenType = TokenType.OPERATOR; this.reader.advance(); switch (this.reader.getCurrentCodePoint()) { case '=': case '>': this.reader.advance(); break; } break; case '?': tokenType = TokenType.CONDITIONAL_OPERATOR_FIRST; this.reader.advance(); break; case '[': tokenType = TokenType.OPENING_BRACKET; this.reader.advance(); break; case '\\': tokenType = TokenType.INVALID; this.reader.advance(); for (; M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint = this.reader.getCurrentCodePoint()); this.reader .advance()) { } break; case ']': tokenType = TokenType.CLOSING_BRACKET; this.reader.advance(); break; case '{': tokenType = TokenType.OPENING_BRACE; this.reader.advance(); break; case '|': tokenType = TokenType.OPERATOR; this.reader.advance(); switch (this.reader.getCurrentCodePoint()) { case '|': this.reader.advance(); break; } break; case '}': tokenType = TokenType.CLOSING_BRACE; this.reader.advance(); break; default: if (firstCodePoint == '.' || Syntax.isDigit(firstCodePoint)) { // If it's a digit, then it's an integer or a real. Assume it's a decimal integer literal for now. // If it's a point, then it's an operator or a real. In the first pass in the loop below, the point will be found // and the token type will switch to REAL if there is a valid real. tokenType = TokenType.DECIMAL_INTEGER; codePoint = firstCodePoint; for (; codePoint != -1; this.reader.advance(), codePoint = this.reader.getCurrentCodePoint()) { // If the character is a point, try to parse a real number. if (codePoint == '.') { CharSequenceParserReader reader2 = this.reader.duplicate(); reader2.advance(); codePoint = reader2.getCurrentCodePoint(); // If the decimal point is followed by a character that is not a valid identifier character, keep the point // as part of this token, unless the token is only a point. if (!M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint)) { if (firstCodePoint == '.') { break; } this.reader.copyFrom(reader2); tokenType = TokenType.REAL; break; } // If the decimal point is followed by a valid identifier character that is not a digit, then reject the // point as a decimal separator and stay with the integer. The point will then be parsed as an operator // and an identifier will follow it. if (!Syntax.isDigit(codePoint)) { break; } this.reader.copyFrom(reader2); tokenType = this.readRealDigits(true); if (tokenType == TokenType.INVALID) { break; } codePoint = this.reader.getCurrentCodePoint(); } // If the character is an 'E' or 'e', try to parse the exponential part of a floating-point number. if (codePoint == 'E' || codePoint == 'e') { this.reader.advance(); int codePoint2 = this.reader.getCurrentCodePoint(); // If the 'E' or 'e' is immediately followed by a '+' or '-', accept that character and advance // the reader. if (codePoint2 == '+' || codePoint2 == '-') { this.reader.advance(); codePoint2 = this.reader.getCurrentCodePoint(); if (!Syntax.isDigit(codePoint2)) { // If the '+' or '-' is not followed by a digit, make the token invalid. tokenType = TokenType.INVALID; this.finishIdentifier(); break; } } else if (!Syntax.isDigit(codePoint2)) { // If the 'E' or 'e' is not followed by a '+', a '-' or a digit, make the token invalid. tokenType = TokenType.INVALID; this.finishIdentifier(); break; } tokenType = this.readRealDigits(false); if (tokenType == TokenType.INVALID) { break; } codePoint = this.reader.getCurrentCodePoint(); } // If the token is now a real, we've reached the end of it already. if (tokenType == TokenType.REAL) { break; } // If the next character is not a valid identifier character, it's the end of the integer token. if (!M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint)) { break; } // If it's not a digit, make the token invalid. if (!Syntax.isDigit(codePoint)) { tokenType = TokenType.INVALID; this.finishIdentifier(); break; } } // If the first character was a point and the reader is still at its initial position, parse the period operator. if (firstCodePoint == '.' && this.reader.getPosition() == start) { tokenType = TokenType.PERIOD; this.reader.advance(); } } else { assert M68KParser.SYNTAX.isValidIdentifierCodePoint(firstCodePoint); // If it's a valid code point for an identifier, then it's an identifier. tokenType = TokenType.IDENTIFIER; this.finishIdentifier(); } break; } this.setToken(tokenType, start, this.reader.getPosition()); } /** * Breaks a token of type {@link TokenType#PLUS_OR_MINUS_SEQUENCE} into a series of {@link TokenType#OPERATOR} tokens. * * @throws IllegalStateException * the current token is not of type {@link TokenType#PLUS_OR_MINUS_SEQUENCE} * @see #getTokenType() * @see #getTokenStart() * @see #getTokenEnd() * @see #getTokenLength() * @see #getTokenText() */ public final void breakSequence() { if (this.tokenType != TokenType.PLUS_OR_MINUS_SEQUENCE) { throw new IllegalStateException("The current token's type is not PLUS_OR_MINUS_SEQUENCE"); } this.endOfBrokenSequence = this.tokenEnd; this.setToken(TokenType.OPERATOR, this.tokenStart, this.tokenStart + 1); } /** * Reparses the <code>%</code> operator as a binary integer. * * @throws IllegalStateException * the current token is not the <code>%</code> operator * @see #getTokenType() * @see #getTokenStart() * @see #getTokenEnd() * @see #getTokenLength() * @see #getTokenText() */ public final void changeToBinaryInteger() { if (this.tokenType != TokenType.OPERATOR || !this.tokenEqualsString("%")) { throw new IllegalStateException("The current token is not the '%' operator"); } TokenType tokenType = TokenType.BINARY_INTEGER; boolean haveBinDigit = false; for (;; this.reader.advance()) { int codePoint = this.reader.getCurrentCodePoint(); // If the next character is not a valid identifier character or if it's a period, it's the end of the token. if (!M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint) || codePoint == '.') { break; } // If the next character is not an hexadecimal digit, make the token invalid. if (!Syntax.isBinDigit(codePoint)) { tokenType = TokenType.INVALID; this.finishIdentifier(); break; } haveBinDigit = true; } // If there are no valid digits after the '%', make the token invalid. if (!haveBinDigit) { tokenType = TokenType.INVALID; } this.setToken(tokenType, this.tokenStart, this.reader.getPosition()); } /** * Copies the state from another Tokenizer that reads from the same {@link CharSequence}, usually a Tokenizer returned by * {@link #duplicateAndAdvance()}. * * @param other * the other Tokenizer * @see #duplicateAndAdvance() */ public final void copyFrom(@Nonnull Tokenizer other) { this.reader.copyFrom(other.reader); this.endOfBrokenSequence = other.endOfBrokenSequence; this.tokenType = other.tokenType; this.tokenStart = other.tokenStart; this.tokenEnd = other.tokenEnd; } /** * Creates a copy of this tokenizer and advances it to the next token. * * @return the new Tokenizer * @see #copyFrom(Tokenizer) */ @Nonnull public final Tokenizer duplicateAndAdvance() { final Tokenizer duplicate = new Tokenizer(this); duplicate.advance(); return duplicate; } /** * Gets the ending position of this tokenizer's current token. * * @return the current token's ending position */ public final int getTokenEnd() { return this.tokenEnd; } /** * Gets the length of this tokenizer's current token. * * @return the current token's length */ public final int getTokenLength() { return this.tokenEnd - this.tokenStart; } /** * Gets the starting position of this tokenizer's current token. * * @return the current token's starting position */ public final int getTokenStart() { return this.tokenStart; } /** * Gets the text of this tokenizer's current token. * * @return the current token's text */ @Nonnull public final CharSequence getTokenText() { return this.reader.getCharSequence().subSequence(this.tokenStart, this.tokenEnd); } /** * Gets the type of this tokenizer's current token. * * @return the current token's type */ public final TokenType getTokenType() { return this.tokenType; } /** * Sets that {@link CharSequence} this tokenizer will read from. The first token is parsed. * * @param charSequence * the {@link CharSequence} to read from */ public final void setCharSequence(@Nonnull CharSequence charSequence) { if (charSequence == null) { throw new NullPointerException("charSequence"); } this.reader = new CharSequenceParserReader(charSequence); this.endOfBrokenSequence = -1; this.setToken(TokenType.END, 0, 0); this.advance(); } /** * Gets the character at the specified index in the text of this tokenizer's current token. * * @param index * the index of the character to get * @return the character */ public char tokenCharAt(int index) { return this.reader.getCharSequence().charAt(this.tokenStart + index); } /** * Determines whether the text of this tokenizer's current token is the same as the specified string. * * @param string * the string to compare the token's text with * @return <code>true</code> if the token's text is equal to the string, otherwise <code>false</code> */ public final boolean tokenEqualsString(@CheckForNull String string) { if (string == null) { return false; } if (this.getTokenLength() != string.length()) { return false; } for (int i = 0; i < this.getTokenLength(); i++) { if (this.tokenCharAt(i) != string.charAt(i)) { return false; } } return true; } /** * Advances the reader until a code point that is not valid for an identifier is found. */ private final void finishIdentifier() { int codePoint; do { this.reader.advance(); codePoint = this.reader.getCurrentCodePoint(); } while (M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint)); } @Nonnull private final TokenType readRealDigits(boolean acceptScientificENotation) { for (;;) { this.reader.advance(); int codePoint = this.reader.getCurrentCodePoint(); // If the next character is not a valid identifier character, it's the end of the real token. if (!M68KParser.SYNTAX.isValidIdentifierCodePoint(codePoint)) { break; } // If scientific E notation is allowed at this point, and the next character is 'E' or 'e', stop here. if (acceptScientificENotation && (codePoint == 'E' || codePoint == 'e')) { break; } // If the next character is not a digit, make the token invalid. if (!Syntax.isDigit(codePoint)) { this.finishIdentifier(); return TokenType.INVALID; } } return TokenType.REAL; } private final void setToken(@Nonnull TokenType tokenType, int tokenStart, int tokenEnd) { this.tokenType = tokenType; this.tokenStart = tokenStart; this.tokenEnd = tokenEnd; } }