/******************************************************************************* * Copyright (c) 2007, 2010 Wind River Systems, Inc. and others. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Markus Schorn - initial API and implementation * Mike Kucera (IBM) - UTF string literals *******************************************************************************/ package org.eclipse.cdt.internal.core.parser.scanner; import org.eclipse.cdt.core.parser.IGCCToken; import org.eclipse.cdt.core.parser.IProblem; import org.eclipse.cdt.core.parser.IToken; import org.eclipse.cdt.core.parser.OffsetLimitReachedException; import org.eclipse.cdt.core.parser.util.CharArrayUtils; /** * In short this class converts line endings (to '\n') and trigraphs * (to their corresponding character), * removes line-splices, comments and whitespace other than newline. * Returns preprocessor tokens. * <p> * In addition to the preprocessor tokens the following tokens may also be returned: * {@link #tBEFORE_INPUT}, {@link IToken#tEND_OF_INPUT}, {@link IToken#tCOMPLETION}. * <p> * Number literals are split up into {@link IToken#tINTEGER} and {@link IToken#tFLOATINGPT}. * No checks are done on the number literals. * <p> * UNCs are accepted, however characters from outside of the basic source character set are * not converted to UNCs. Rather than that they are tested with * {@link Character#isUnicodeIdentifierPart(char)} and may be accepted as part of an * identifier. * <p> * The characters in string literals and char-literals are left as they are found, no conversion to * an execution character-set is performed. */ final public class Lexer implements ITokenSequence { public static final int tBEFORE_INPUT = IToken.FIRST_RESERVED_SCANNER; public static final int tNEWLINE = IToken.FIRST_RESERVED_SCANNER + 1; public static final int tQUOTE_HEADER_NAME = IToken.FIRST_RESERVED_SCANNER + 2; public static final int tSYSTEM_HEADER_NAME = IToken.FIRST_RESERVED_SCANNER + 3; public static final int tOTHER_CHARACTER = IToken.FIRST_RESERVED_SCANNER + 4; private static final int END_OF_INPUT = -1; private static final int ORIGIN_LEXER = OffsetLimitReachedException.ORIGIN_LEXER; public final static class LexerOptions implements Cloneable { public boolean fSupportDollarInIdentifiers= true; public boolean fSupportAtSignInIdentifiers= true; public boolean fSupportMinAndMax= true; public boolean fCreateImageLocations= true; public boolean fSupportSlashPercentComments= false; public boolean fSupportUTFLiterals= true; @Override public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { return null; } } } // configuration private final LexerOptions fOptions; private boolean fSupportContentAssist= false; private final ILexerLog fLog; private final Object fSource; // the input to the lexer private final AbstractCharArray fInput; private int fStart; private int fLimit; // after phase 3 (newline, trigraph, line-splice) private int fOffset; private int fEndOffset; private int fCharPhase3; private boolean fInsideIncludeDirective= false; private Token fToken; private Token fLastToken; // For the few cases where we have to lookahead more than one character private int fMarkOffset; private int fMarkEndOffset; private int fMarkPrefetchedChar; // To store the entire state. private boolean fMarkInsideIncludeDirective; private Token fMarkToken; private Token fMarkLastToken; public Lexer(char[] input, LexerOptions options, ILexerLog log, Object source) { this(new CharArray(input), 0, input.length, options, log, source); } public Lexer(AbstractCharArray input, LexerOptions options, ILexerLog log, Object source) { this(input, 0, input.tryGetLength(), options, log, source); } public Lexer(AbstractCharArray input, int start, int end, LexerOptions options, ILexerLog log, Object source) { fInput= input; fStart= fOffset= fEndOffset= start; fLimit= end; fOptions= options; fLog= log; fSource= source; fLastToken= fToken= new Token(tBEFORE_INPUT, source, start, start); nextCharPhase3(); } private boolean isValidOffset(int pos) { if (fLimit < 0) return fInput.isValidOffset(pos); return pos < fLimit; } /** * Returns the source that is attached to the tokens generated by this lexer */ public Object getSource() { return fSource; } /** * Resets the lexer to the first char and prepares for content-assist mode. */ public void setContentAssistMode(int offset) { fSupportContentAssist= true; if (isValidOffset(offset)) { fLimit= offset; } // re-initialize fOffset= fEndOffset= fStart; nextCharPhase3(); } public boolean isContentAssistMode() { return fSupportContentAssist; } /** * Call this before consuming the name-token in the include directive. It causes the header-file * tokens to be created. */ public void setInsideIncludeDirective(boolean val) { fInsideIncludeDirective= val; } /** * Returns the current preprocessor token, does not advance. */ public Token currentToken() { return fToken; } /** * Returns the endoffset of the token before the current one. */ public int getLastEndOffset() { return fLastToken.getEndOffset(); } /** * Advances to the next token, skipping whitespace other than newline. * @throws OffsetLimitReachedException when completion is requested in a literal or a header-name. */ public Token nextToken() throws OffsetLimitReachedException { fLastToken= fToken; return fToken= fetchToken(); } public boolean currentTokenIsFirstOnLine() { final int type= fLastToken.getType(); return type == tNEWLINE || type == tBEFORE_INPUT; } /** * Advances to the next newline or the end of input. The newline will not be consumed. If the * current token is a newline no action is performed. * Returns the end offset of the last token before the newline. * @param origin parameter for the {@link OffsetLimitReachedException} when it has to be thrown. * @since 5.0 */ public final int consumeLine(int origin) throws OffsetLimitReachedException { Token t= fToken; Token lt= null; while (true) { switch(t.getType()) { case IToken.tCOMPLETION: if (lt != null) { fLastToken= lt; } fToken= t; throw new OffsetLimitReachedException(origin, t); case IToken.tEND_OF_INPUT: if (fSupportContentAssist) { t.setType(IToken.tCOMPLETION); throw new OffsetLimitReachedException(origin, t); } //$FALL-THROUGH$ case Lexer.tNEWLINE: fToken= t; if (lt != null) { fLastToken= lt; } return getLastEndOffset(); } lt= t; t= fetchToken(); } } /** * Advances to the next pound token that starts a preprocessor directive. * @return pound token of the directive or end-of-input. * @throws OffsetLimitReachedException when completion is requested in a literal or an header-name. */ public Token nextDirective() throws OffsetLimitReachedException { fInsideIncludeDirective= false; final Token t= fToken; boolean haveNL= t==null || t.getType() == tNEWLINE; while (true) { final boolean hadNL= haveNL; haveNL= false; final int start= fOffset; final int c= fCharPhase3; // optimization avoids calling nextCharPhase3 int d; final int pos= fEndOffset; if (!isValidOffset(pos+1)) { d= nextCharPhase3(); } else { d= fInput.get(pos); switch(d) { case '\\': d= nextCharPhase3(); break; case '?': if (fInput.get(pos+1) == '?') { d= nextCharPhase3(); break; } fOffset= pos; fCharPhase3= d; fEndOffset= pos+1; break; default: fOffset= pos; fCharPhase3= d; fEndOffset= pos+1; break; } } switch(c) { case END_OF_INPUT: fLastToken= fToken= newToken(IToken.tEND_OF_INPUT, start); return fToken; case '\n': haveNL= true; continue; case ' ': case '\t': case 0xb: // vertical tab case '\f': case '\r': haveNL= hadNL; continue; case 'R': if (d == '"') { nextCharPhase3(); rawStringLiteral(start, 2, IToken.tSTRING); } continue; case '"': stringLiteral(start, 1, IToken.tSTRING); continue; case '\'': charLiteral(start, IToken.tCHAR); continue; case '/': switch (d) { case '/': nextCharPhase3(); lineComment(start); continue; case '*': blockComment(start, '*'); haveNL= hadNL; continue; case '%': if (fOptions.fSupportSlashPercentComments) { blockComment(start, '%'); } continue; } continue; case '%': if (hadNL) { if (d == ':') { // found at least '#' final int e= nextCharPhase3(); if (e == '%') { markPhase3(); if (nextCharPhase3() == ':') { // found '##' nextCharPhase3(); continue; } restorePhase3(); } fLastToken= new Token(tNEWLINE, fSource, 0, start); // offset not significant fToken= newDigraphToken(IToken.tPOUND, start); return fToken; } } continue; case '#': if (hadNL && d != '#') { fLastToken= new Token(tNEWLINE, fSource, 0, start); // offset not significant fToken= newToken(IToken.tPOUND, start); return fToken; } continue; default: continue; } } } /** * Computes the next token. */ private Token fetchToken() throws OffsetLimitReachedException { while (true) { final int start= fOffset; final int c= fCharPhase3; final int d= nextCharPhase3(); switch(c) { case END_OF_INPUT: return newToken(IToken.tEND_OF_INPUT, start); case '\n': fInsideIncludeDirective= false; return newToken(Lexer.tNEWLINE, start); case ' ': case '\t': case 0xb: // vertical tab case '\f': case '\r': continue; case 'L': switch(d) { case 'R': markPhase3(); if (nextCharPhase3() == '"') { nextCharPhase3(); return rawStringLiteral(start, 3, IToken.tLSTRING); } restorePhase3(); break; case '"': nextCharPhase3(); return stringLiteral(start, 2, IToken.tLSTRING); case '\'': nextCharPhase3(); return charLiteral(start, IToken.tLCHAR); } return identifier(start, 1); case 'u': case 'U': if (fOptions.fSupportUTFLiterals) { switch(d) { case 'R': markPhase3(); if (nextCharPhase3() == '"') { nextCharPhase3(); return rawStringLiteral(start, 3, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING); } restorePhase3(); break; case '"': nextCharPhase3(); return stringLiteral(start, 2, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING); case '\'': nextCharPhase3(); return charLiteral(start, c == 'u' ? IToken.tUTF16CHAR : IToken.tUTF32CHAR); case '8': if (c == 'u') { markPhase3(); switch (nextCharPhase3()) { case 'R': if (nextCharPhase3() == '"') { nextCharPhase3(); return rawStringLiteral(start, 4, IToken.tSTRING); } break; case '"': nextCharPhase3(); return stringLiteral(start, 3, IToken.tSTRING); } restorePhase3(); } break; } } return identifier(start, 1); case 'R': if (d == '"') { nextCharPhase3(); return rawStringLiteral(start, 2, IToken.tSTRING); } return identifier(start, 1); case '"': if (fInsideIncludeDirective) { return headerName(start, true); } return stringLiteral(start, 1, IToken.tSTRING); case '\'': return charLiteral(start, IToken.tCHAR); case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'S': case 'T': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': return identifier(start, 1); case '$': if (fOptions.fSupportDollarInIdentifiers) { return identifier(start, 1); } break; case '@': if (fOptions.fSupportAtSignInIdentifiers) { return identifier(start, 1); } break; case '\\': switch(d) { case 'u': case 'U': nextCharPhase3(); return identifier(start, 2); } return newToken(tOTHER_CHARACTER, start, 1); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return number(start, 1, false); case '.': switch(d) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': nextCharPhase3(); return number(start, 2, true); case '.': markPhase3(); if (nextCharPhase3() == '.') { nextCharPhase3(); return newToken(IToken.tELLIPSIS, start); } restorePhase3(); break; case '*': nextCharPhase3(); return newToken(IToken.tDOTSTAR, start); } return newToken(IToken.tDOT, start); case '#': if (d == '#') { nextCharPhase3(); return newToken(IToken.tPOUNDPOUND, start); } return newToken(IToken.tPOUND, start); case '{': return newToken(IToken.tLBRACE, start); case '}': return newToken(IToken.tRBRACE, start); case '[': return newToken(IToken.tLBRACKET, start); case ']': return newToken(IToken.tRBRACKET, start); case '(': return newToken(IToken.tLPAREN, start); case ')': return newToken(IToken.tRPAREN, start); case ';': return newToken(IToken.tSEMI, start); case ':': switch(d) { case ':': nextCharPhase3(); return newToken(IToken.tCOLONCOLON, start); case '>': nextCharPhase3(); return newDigraphToken(IToken.tRBRACKET, start); } return newToken(IToken.tCOLON, start); case '?': return newToken(IToken.tQUESTION, start); case '+': switch (d) { case '+': nextCharPhase3(); return newToken(IToken.tINCR, start); case '=': nextCharPhase3(); return newToken(IToken.tPLUSASSIGN, start); } return newToken(IToken.tPLUS, start); case '-': switch (d) { case '>': int e= nextCharPhase3(); if (e == '*') { nextCharPhase3(); return newToken(IToken.tARROWSTAR, start); } return newToken(IToken.tARROW, start); case '-': nextCharPhase3(); return newToken(IToken.tDECR, start); case '=': nextCharPhase3(); return newToken(IToken.tMINUSASSIGN, start); } return newToken(IToken.tMINUS, start); case '*': if (d == '=') { nextCharPhase3(); return newToken(IToken.tSTARASSIGN, start); } return newToken(IToken.tSTAR, start); case '/': switch (d) { case '=': nextCharPhase3(); return newToken(IToken.tDIVASSIGN, start); case '/': nextCharPhase3(); lineComment(start); continue; case '*': blockComment(start, '*'); continue; case '%': if (fOptions.fSupportSlashPercentComments) { blockComment(start, '%'); continue; } break; } return newToken(IToken.tDIV, start); case '%': switch (d) { case '=': nextCharPhase3(); return newToken(IToken.tMODASSIGN, start); case '>': nextCharPhase3(); return newDigraphToken(IToken.tRBRACE, start); case ':': final int e= nextCharPhase3(); if (e == '%') { markPhase3(); if (nextCharPhase3() == ':') { nextCharPhase3(); return newDigraphToken(IToken.tPOUNDPOUND, start); } restorePhase3(); } return newDigraphToken(IToken.tPOUND, start); } return newToken(IToken.tMOD, start); case '^': if (d == '=') { nextCharPhase3(); return newToken(IToken.tXORASSIGN, start); } return newToken(IToken.tXOR, start); case '&': switch (d) { case '&': nextCharPhase3(); return newToken(IToken.tAND, start); case '=': nextCharPhase3(); return newToken(IToken.tAMPERASSIGN, start); } return newToken(IToken.tAMPER, start); case '|': switch (d) { case '|': nextCharPhase3(); return newToken(IToken.tOR, start); case '=': nextCharPhase3(); return newToken(IToken.tBITORASSIGN, start); } return newToken(IToken.tBITOR, start); case '~': return newToken(IToken.tBITCOMPLEMENT, start); case '!': if (d == '=') { nextCharPhase3(); return newToken(IToken.tNOTEQUAL, start); } return newToken(IToken.tNOT, start); case '=': if (d == '=') { nextCharPhase3(); return newToken(IToken.tEQUAL, start); } return newToken(IToken.tASSIGN, start); case '<': if (fInsideIncludeDirective) { return headerName(start, false); } switch(d) { case '=': nextCharPhase3(); return newToken(IToken.tLTEQUAL, start); case '<': final int e= nextCharPhase3(); if (e == '=') { nextCharPhase3(); return newToken(IToken.tSHIFTLASSIGN, start); } return newToken(IToken.tSHIFTL, start); case '?': if (fOptions.fSupportMinAndMax) { nextCharPhase3(); return newToken(IGCCToken.tMIN, start); } break; case ':': nextCharPhase3(); return newDigraphToken(IToken.tLBRACKET, start); case '%': nextCharPhase3(); return newDigraphToken(IToken.tLBRACE, start); } return newToken(IToken.tLT, start); case '>': switch(d) { case '=': nextCharPhase3(); return newToken(IToken.tGTEQUAL, start); case '>': final int e= nextCharPhase3(); if (e == '=') { nextCharPhase3(); return newToken(IToken.tSHIFTRASSIGN, start); } return newToken(IToken.tSHIFTR, start); case '?': if (fOptions.fSupportMinAndMax) { nextCharPhase3(); return newToken(IGCCToken.tMAX, start); } break; } return newToken(IToken.tGT, start); case ',': return newToken(IToken.tCOMMA, start); default: // in case we have some other letter to start an identifier if (Character.isUnicodeIdentifierStart((char) c)) { return identifier(start, 1); } break; } // handles for instance @ return newToken(tOTHER_CHARACTER, start, 1); } } private Token newToken(int kind, int offset) { return new Token(kind, fSource, offset, fOffset); } private Token newDigraphToken(int kind, int offset) { return new TokenForDigraph(kind, fSource, offset, fOffset); } private Token newToken(final int kind, final int offset, final int imageLength) { final int endOffset= fOffset; final int sourceLen= endOffset-offset; char[] image; if (sourceLen != imageLength) { image= getCharImage(offset, endOffset, imageLength); } else { image= new char[imageLength]; fInput.arraycopy(offset, image, 0, imageLength); } return new TokenWithImage(kind, fSource, offset, endOffset, image); } private void handleProblem(int problemID, char[] arg, int offset) { fLog.handleProblem(problemID, arg, offset, fOffset); } private Token headerName(final int start, final boolean expectQuotes) throws OffsetLimitReachedException { int length= 1; boolean done = false; int c= fCharPhase3; loop: while (!done) { switch (c) { case END_OF_INPUT: if (fSupportContentAssist) { throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken((expectQuotes ? tQUOTE_HEADER_NAME : tSYSTEM_HEADER_NAME), start, length)); } //$FALL-THROUGH$ case '\n': handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start); break loop; case '"': done= expectQuotes; break; case '>': done= !expectQuotes; break; } length++; c= nextCharPhase3(); } return newToken((expectQuotes ? tQUOTE_HEADER_NAME : tSYSTEM_HEADER_NAME), start, length); } private void blockComment(final int start, final char trigger) { // we can ignore line-splices, trigraphs and windows newlines when searching for the '*' int pos= fEndOffset; while (isValidOffset(pos)) { if (fInput.get(pos++) == trigger) { fEndOffset= pos; if (nextCharPhase3() == '/') { nextCharPhase3(); fLog.handleComment(true, start, fOffset); return; } } } fCharPhase3= END_OF_INPUT; fOffset= fEndOffset= pos; fLog.handleComment(true, start, pos); } private void lineComment(final int start) { int c= fCharPhase3; while (true) { switch (c) { case END_OF_INPUT: case '\n': fLog.handleComment(false, start, fOffset); return; } c= nextCharPhase3(); } } private Token stringLiteral(final int start, int length, final int tokenType) throws OffsetLimitReachedException { boolean escaped = false; boolean done = false; int c= fCharPhase3; loop: while (!done) { switch(c) { case END_OF_INPUT: if (fSupportContentAssist) { throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(tokenType, start, length)); } //$FALL-THROUGH$ case '\n': handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start); break loop; case '\\': escaped= !escaped; break; case '"': if (!escaped) { done= true; } escaped= false; break; default: escaped= false; break; } length++; c= nextCharPhase3(); } return newToken(tokenType, start, length); } private Token rawStringLiteral(final int start, int length, final int tokenType) throws OffsetLimitReachedException { final int delimOffset= fOffset; int delimEndOffset = delimOffset; int offset; for(;; delimEndOffset++) { if (!fInput.isValidOffset(delimEndOffset)) { offset= delimEndOffset; break; } if (fInput.get(delimEndOffset) == '(') { offset= delimEndOffset+1; break; } } final int delimLength= delimEndOffset-delimOffset; for(;; offset++) { if (!fInput.isValidOffset(offset)) { handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, offset), start); break; } final char c= fInput.get(offset); if (c == ')') { final int endingDoubleQuoteOffset= offset+delimLength+1; if (fInput.isValidOffset(endingDoubleQuoteOffset) && fInput.get(endingDoubleQuoteOffset) == '"') { boolean prefixMatches= true; for (int i = 0; i < delimLength; i++) { if (fInput.get(offset + i + 1) != fInput.get(delimOffset+i)) { prefixMatches= false; break; } } if (prefixMatches) { offset= endingDoubleQuoteOffset+1; break; } } } } fOffset= offset-1; fEndOffset= offset; fCharPhase3= 0; nextCharPhase3(); return newToken(tokenType, start, offset-start); } private Token charLiteral(final int start, final int tokenType) throws OffsetLimitReachedException { boolean escaped = false; boolean done = false; int length= tokenType == IToken.tCHAR ? 1 : 2; int c= fCharPhase3; loop: while (!done) { switch(c) { case END_OF_INPUT: if (fSupportContentAssist) { throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(tokenType, start, length)); } //$FALL-THROUGH$ case '\n': handleProblem(IProblem.SCANNER_BAD_CHARACTER, getInputChars(start, fOffset), start); break loop; case '\\': escaped= !escaped; break; case '\'': if (!escaped) { done= true; } escaped= false; break; default: escaped= false; break; } length++; c= nextCharPhase3(); } return newToken(tokenType, start, length); } private Token identifier(int start, int length) { int tokenKind= IToken.tIDENTIFIER; boolean isPartOfIdentifier= true; int c= fCharPhase3; while (true) { switch(c) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case '\\': // universal character name markPhase3(); switch(nextCharPhase3()) { case 'u': case 'U': length++; break; default: restorePhase3(); isPartOfIdentifier= false; break; } break; case END_OF_INPUT: if (fSupportContentAssist) { tokenKind= IToken.tCOMPLETION; } isPartOfIdentifier= false; break; case ' ': case '\t': case 0xb: case '\f': case '\r': case '\n': isPartOfIdentifier= false; break; case '$': isPartOfIdentifier= fOptions.fSupportDollarInIdentifiers; break; case '@': isPartOfIdentifier= fOptions.fSupportAtSignInIdentifiers; break; case '{': case '}': case '[': case ']': case '#': case '(': case ')': case '<': case '>': case '%': case ':': case ';': case '.': case '?': case '*': case '+': case '-': case '/': case '^': case '&': case '|': case '~': case '!': case '=': case ',': case '"': case '\'': isPartOfIdentifier= false; break; default: isPartOfIdentifier= Character.isUnicodeIdentifierPart((char) c); break; } if (!isPartOfIdentifier) { break; } length++; c= nextCharPhase3(); } return newToken(tokenKind, start, length); } private Token number(final int start, int length, boolean isFloat) throws OffsetLimitReachedException { boolean isPartOfNumber= true; boolean isHex= false; int c= fCharPhase3; while (true) { switch(c) { // non-digit case 'a': case 'b': case 'c': case 'd': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'Y': case 'Z': case '_': // digit case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case 'x': case 'X': isHex= !isFloat; break; // period case '.': isFloat= true; break; // exponents case 'e': case 'E': if (isHex) break; //$FALL-THROUGH$ case 'p': case 'P': length++; c= nextCharPhase3(); switch (c) { case '+': case '-': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': isFloat= true; isHex= false; length++; c= nextCharPhase3(); break; } continue; // universal character name (non-digit) case '\\': markPhase3(); switch(nextCharPhase3()) { case 'u': case 'U': length++; break; default: restorePhase3(); isPartOfNumber= false; break; } break; case END_OF_INPUT: if (fSupportContentAssist) { throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken((isFloat ? IToken.tFLOATINGPT : IToken.tINTEGER), start, length)); } isPartOfNumber= false; break; default: isPartOfNumber= false; break; } if (!isPartOfNumber) { break; } c= nextCharPhase3(); length++; } return newToken((isFloat ? IToken.tFLOATINGPT : IToken.tINTEGER), start, length); } /** * Saves the current state of phase3, necessary for '...', '%:%:', UNCs and string literals * with a long prefix. */ private void markPhase3() { fMarkOffset= fOffset; fMarkEndOffset= fEndOffset; fMarkPrefetchedChar= fCharPhase3; } /** * Restores a previously saved state of phase3. */ private void restorePhase3() { fOffset= fMarkOffset; fEndOffset= fMarkEndOffset; fCharPhase3= fMarkPrefetchedChar; } /** * Perform phase 1-3: Replace \r\n with \n, handle trigraphs, detect line-splicing. * Changes fOffset, fEndOffset and fCharPhase3, state-less otherwise. */ private int nextCharPhase3() { int pos= fEndOffset; do { if (!isValidOffset(pos+1)) { if (!isValidOffset(pos)) { fOffset= pos; fEndOffset= pos; fCharPhase3= END_OF_INPUT; return END_OF_INPUT; } fOffset= pos; fEndOffset= pos+1; fCharPhase3= fInput.get(pos); return fCharPhase3; } final char c= fInput.get(pos); fOffset= pos; fEndOffset= ++pos; fCharPhase3= c; switch(c) { // windows line-ending case '\r': if (fInput.get(pos) == '\n') { fEndOffset= pos+1; fCharPhase3= '\n'; return '\n'; } return c; // trigraph sequences case '?': if (fInput.get(pos) != '?' || !isValidOffset(pos+1)) { return c; } final char trigraph= checkTrigraph(fInput.get(pos+1)); if (trigraph == 0) { return c; } if (trigraph != '\\') { fEndOffset= pos+2; fCharPhase3= trigraph; return trigraph; } pos+= 2; // $FALL-THROUGH$, handle backslash case '\\': final int lsPos= findEndOfLineSpliceSequence(pos); if (lsPos > pos) { pos= lsPos; continue; } fEndOffset= pos; fCharPhase3= '\\'; return '\\'; // don't return c, it may be a '?' default: return c; } } while (true); } /** * Maps a trigraph to the character it encodes. * @param c trigraph without leading question marks. * @return the character encoded or 0. */ private char checkTrigraph(char c) { switch(c) { case '=': return '#'; case '\'':return '^'; case '(': return '['; case ')': return ']'; case '!': return '|'; case '<': return '{'; case '>': return '}'; case '-': return '~'; case '/': return '\\'; } return 0; } /** * Returns the end-offset for a line-splice sequence, or -1 if there is none. */ private int findEndOfLineSpliceSequence(int pos) { boolean haveBackslash= true; int result= -1; loop: while (isValidOffset(pos)) { switch(fInput.get(pos++)) { case '\n': if (haveBackslash) { result= pos; haveBackslash= false; continue loop; } return result; case '\r': case ' ': case '\f': case '\t': case 0xb: // vertical tab if (haveBackslash) { continue loop; } return result; case '?': if (!isValidOffset(pos+1) || fInput.get(pos) != '?' || fInput.get(++pos) != '/') { return result; } // $FALL-THROUGH$ to backslash handling case '\\': if (!haveBackslash) { haveBackslash= true; continue loop; } return result; default: return result; } } return result; } /** * Returns the image from the input without any modification. */ public char[] getInputChars(int offset, int endOffset) { final int length= endOffset-offset; if (length <= 0) { return CharArrayUtils.EMPTY; } final char[] result= new char[length]; fInput.arraycopy(offset, result, 0, length); return result; } AbstractCharArray getInput() { return fInput; } /** * Returns the image with trigraphs replaced and line-splices removed. */ private char[] getCharImage(int offset, int endOffset, int imageLength) { final char[] result= new char[imageLength]; markPhase3(); fEndOffset= offset; for (int idx=0; idx<imageLength; idx++) { result[idx]= (char) nextCharPhase3(); } restorePhase3(); return result; } public void saveState() { fMarkOffset= fOffset; fMarkEndOffset= fEndOffset; fMarkPrefetchedChar= fCharPhase3; fMarkInsideIncludeDirective= fInsideIncludeDirective; fMarkToken= fToken; fMarkLastToken= fLastToken; } public void restoreState() { fOffset= fMarkOffset; fEndOffset= fMarkEndOffset; fCharPhase3= fMarkPrefetchedChar; fInsideIncludeDirective= fMarkInsideIncludeDirective; fToken= fMarkToken; fLastToken= fMarkLastToken; } }