Lexer.java example

Explorer
cdt-master
/*******************************************************************************
 * Copyright (c) 2007, 2015 Wind River Systems, Inc. and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Markus Schorn - initial API and implementation
 *     Mike Kucera (IBM) - UTF string literals
 *     Sergey Prigogin (Google)
 *     Richard Eames
 *******************************************************************************/ 
package org.eclipse.cdt.internal.core.parser.scanner;

import org.eclipse.cdt.core.parser.IGCCToken;
import org.eclipse.cdt.core.parser.IProblem;
import org.eclipse.cdt.core.parser.IToken;
import org.eclipse.cdt.core.parser.IncludeExportPatterns;
import org.eclipse.cdt.core.parser.OffsetLimitReachedException;
import org.eclipse.cdt.core.parser.util.CharArrayUtils;

/**
 * In short this class converts line endings (to '\n') and trigraphs 
 * (to their corresponding character), 
 * removes line-splices, comments and whitespace other than newline.
 * Returns preprocessor tokens.
 * <p>
 * In addition to the preprocessor tokens the following tokens may also be returned:
 * {@link #tBEFORE_INPUT}, {@link IToken#tEND_OF_INPUT}, {@link IToken#tCOMPLETION}.
 * <p>
 * Number literals are split up into {@link IToken#tINTEGER} and {@link IToken#tFLOATINGPT}. 
 * No checks are done on the number literals.
 * <p>
 * UCNs (universal character names) are accepted, however characters from outside of the basic
 * source character set are not converted to UCNs. Instead they are tested with 
 * {@link Character#isUnicodeIdentifierPart(char)} and may be accepted as part of an 
 * identifier.
 * <p>
 * The characters in string literals and char-literals are left as they are found, no conversion to
 * an execution character-set is performed.
 */
final public class Lexer implements ITokenSequence {
	public static final int tBEFORE_INPUT   = IToken.FIRST_RESERVED_SCANNER;
	public static final int tNEWLINE		= IToken.FIRST_RESERVED_SCANNER + 1;
	public static final int tQUOTE_HEADER_NAME    = IToken.FIRST_RESERVED_SCANNER + 2;
	public static final int tSYSTEM_HEADER_NAME   = IToken.FIRST_RESERVED_SCANNER + 3;
	public static final int tOTHER_CHARACTER 	  = IToken.FIRST_RESERVED_SCANNER + 4;
	
	private static final int END_OF_INPUT = -1;
	private static final int ORIGIN_LEXER = OffsetLimitReachedException.ORIGIN_LEXER;
	
	/**
	 * Switches that configure which language extensions the lexer recognizes.
	 * Instances are plain mutable value holders and can be copied with {@link #clone()}.
	 */
	public final static class LexerOptions implements Cloneable {
		/** Whether '$' may start and be part of an identifier. */
		public boolean fSupportDollarInIdentifiers= true;
		/** Whether '@' may start and be part of an identifier. */
		public boolean fSupportAtSignInIdentifiers= true;
		/** Whether the operators '&lt;?' and '&gt;?' are returned as min/max tokens. */
		public boolean fSupportMinAndMax= true;
		/** Not read by the lexer itself; presumably consumed by clients creating image locations. */
		public boolean fCreateImageLocations= true;
		/** Whether block comments of the form '/% ... %/' are recognized. */
		public boolean fSupportSlashPercentComments= false;
		/** Whether the prefixes 'u', 'U' and 'u8' introduce string and character literals. */
		public boolean fSupportUTFLiterals= true;
		/** Whether raw string literals (prefix 'R' followed by a double quote) are recognized. */
		public boolean fSupportRawStringLiterals= false;
		/** Whether a suffix following a string or character literal becomes part of the literal. */
		public boolean fSupportUserDefinedLiterals = false; 
		/** Not read by the lexer itself; presumably patterns for include-export comments. */
		public IncludeExportPatterns fIncludeExportPatterns;
		
		/**
		 * Returns a shallow copy of these options.
		 */
		@Override
		public Object clone() {
			try {
				return super.clone();
			} catch (CloneNotSupportedException e) {
				// Cannot happen because this class implements Cloneable. Fail loudly
				// rather than handing a null to the caller.
				throw new AssertionError(e);
			}
		}
	}

	// configuration
	private final LexerOptions fOptions;
	private boolean fSupportContentAssist= false;
	private final ILexerLog fLog;
	private final Object fSource;
	
	// the input to the lexer
	private final AbstractCharArray fInput;
	private final int fStart;
	private int fLimit;

	// after phase 3 (newline, trigraph, line-splice)
	private int fOffset;
	private int fEndOffset;
	private int fCharPhase3;
	
	private boolean fInsideIncludeDirective= false;
	private Token fToken;
	private Token fLastToken;
	
	// For the few cases where we have to lookahead more than one character
	private int fMarkPhase3Offset;
	private int fMarkPhase3EndOffset;
	private int fMarkPhase3PrefetchedChar;
	
	// To store the entire state. Note that we don't reuse the variables
	// used for saving the phase3 because calls to markPhase3() and
	// restorePhase3() can occur in between calls to saveState() and
	// restoreState().
	private int fMarkOffset;
	private int fMarkEndOffset;
	private int fMarkPrefetchedChar;
	private boolean fMarkInsideIncludeDirective;
	private Token fMarkToken;
	private Token fMarkLastToken;
	
	/**
	 * Creates a lexer for the entire character array.
	 * The array is wrapped in a {@link CharArray} and lexed from offset 0 to its length.
	 */
	public Lexer(char[] input, LexerOptions options, ILexerLog log, Object source) {
		this(new CharArray(input), 0, input.length, options, log, source);
	}

	/**
	 * Creates a lexer for the given input, starting at offset 0. The end is whatever
	 * {@code input.tryGetLength()} reports; a negative value makes the lexer ask the input
	 * itself whether an offset is valid (see {@code isValidOffset(int)}).
	 */
	public Lexer(AbstractCharArray input, LexerOptions options, ILexerLog log, Object source) {
		this(input, 0, input.tryGetLength(), options, log, source);
	}
	
	public Lexer(AbstractCharArray input, int start, int end, LexerOptions options, ILexerLog log, Object source) {
		fInput= input;
		fStart= fOffset= fEndOffset= start;
		fLimit= end;
		fOptions= options;
		fLog= log;
		fSource= source;
		fLastToken= fToken= new Token(tBEFORE_INPUT, source, start, start);
		nextCharPhase3();
	}
	
	/**
	 * Tells whether a character can be read at the given position. With a non-negative
	 * limit the position must lie below it; otherwise the input decides.
	 */
	private boolean isValidOffset(int pos) {
		return fLimit >= 0 ? pos < fLimit : fInput.isValidOffset(pos);
	}

	/**
	 * Returns the source object that this lexer attaches to every token it creates.
	 */
	public Object getSource() {
		return this.fSource;
	}

	/**
	 * Switches the lexer to content-assist mode and rewinds the character position to the
	 * start of the input. If the given offset is valid it becomes the new limit of the input.
	 * The current and the previous token are left untouched.
	 */
	public void setContentAssistMode(int offset) {
		fSupportContentAssist= true;

		final boolean offsetInsideInput= isValidOffset(offset);
		if (offsetInsideInput) {
			fLimit= offset;
		}

		// Start over and prefetch the first character again.
		fEndOffset= fStart;
		fOffset= fStart;
		nextCharPhase3();
	}
	
	/**
	 * Returns whether {@link #setContentAssistMode(int)} has been called on this lexer.
	 */
	public boolean isContentAssistMode() {
		return this.fSupportContentAssist;
	}

	/**
	 * To be called before the name-token of an include directive is consumed. While the
	 * flag is set, a leading '"' or '&lt;' produces a header-name token. The flag is cleared
	 * again when a newline token is computed.
	 */
	public void setInsideIncludeDirective(boolean val) {
		this.fInsideIncludeDirective= val;
	}
	
	/** 
	 * Returns the preprocessor token the lexer is currently positioned at, without advancing.
	 */
	@Override
	public Token currentToken() {
		return this.fToken;
	}

	/**
	 * Returns the end offset of the token that precedes the current one.
	 */
	@Override
	public int getLastEndOffset() {
		final Token previous= fLastToken;
		return previous.getEndOffset();
	}

	/**
	 * Moves on to the next token; whitespace other than newline is skipped. The token that
	 * was current becomes the previous token.
	 * @throws OffsetLimitReachedException when completion is requested in a literal or a header-name.
	 */
	@Override
	public Token nextToken() throws OffsetLimitReachedException {
		fLastToken= fToken;
		final Token fetched= fetchToken();
		fToken= fetched;
		return fetched;
	}

	/**
	 * Returns whether the current token starts a line, i.e. whether the previous token is
	 * a newline or the marker for the beginning of the input.
	 */
	public boolean currentTokenIsFirstOnLine() {
		final int previousKind= fLastToken.getType();
		if (previousKind == tBEFORE_INPUT) {
			return true;
		}
		return previousKind == tNEWLINE;
	}
	
	/**
	 * Skips tokens up to the next newline or the end of input. The newline itself stays
	 * unconsumed and becomes the current token. Nothing happens if the current token
	 * already is a newline.
	 * Returns the end offset of the last token before the newline. 
	 * @param origin parameter for the {@link OffsetLimitReachedException} when it has to be thrown.
	 * @throws OffsetLimitReachedException when a completion token is encountered, or when the
	 *     end of input is reached in content-assist mode.
	 * @since 5.0
	 */
	public final int consumeLine(int origin) throws OffsetLimitReachedException {
		Token previous= null;
		for (Token current= fToken; ; previous= current, current= fetchToken()) {
			final int kind= current.getType();

			if (kind == IToken.tCOMPLETION) {
				if (previous != null) {
					fLastToken= previous;
				}
				fToken= current;
				throw new OffsetLimitReachedException(origin, current);
			}

			final boolean atEndOfInput= kind == IToken.tEND_OF_INPUT;
			if (atEndOfInput && fSupportContentAssist) {
				// The end of input doubles as the completion location.
				current.setType(IToken.tCOMPLETION);
				throw new OffsetLimitReachedException(origin, current);
			}

			if (atEndOfInput || kind == Lexer.tNEWLINE) {
				fToken= current;
				if (previous != null) {
					fLastToken= previous;
				}
				return getLastEndOffset();
			}
		}
	}

	/** 
	 * Skips ahead to the next pound token that is the first token on its line, i.e. the
	 * start of a preprocessor directive. 
	 * @return pound token of the directive or end-of-input.
	 * @throws OffsetLimitReachedException when completion is requested in a literal or an header-name.
	 */
	public Token nextDirective() throws OffsetLimitReachedException {
		Token previous= fToken;
		Token current= fetchToken();
		while (true) {
			final int kind= current.getType();
			if (kind == IToken.tEND_OF_INPUT) {
				break;
			}
			if (kind == IToken.tPOUND) {
				// Only a pound at the beginning of a line introduces a directive.
				final int previousKind= previous.getType();
				if (previousKind == tNEWLINE || previousKind == tBEFORE_INPUT) {
					break;
				}
			}
			previous= current;
			current= fetchToken();
		}
		fLastToken= previous;
		fToken= current;
		return current;
	}
	
	/**
	 * Computes the next token.
	 * <p>
	 * On entry {@code fCharPhase3} holds the prefetched first character of the token. The
	 * method immediately prefetches the following character ({@code d}), so that two-character
	 * tokens can be decided without further lookahead. Longer lookahead is done with
	 * {@link #markPhase3()} and {@link #restorePhase3()}. Whitespace other than newline and
	 * comments are skipped by looping; comments are reported to the log by the helpers called.
	 * <p>
	 * Characters that do not start any known token yield a {@link #tOTHER_CHARACTER} token.
	 *
	 * @throws OffsetLimitReachedException propagated from the literal, header-name and number helpers.
	 */
	private Token fetchToken() throws OffsetLimitReachedException {
		while (true) {
			final int start= fOffset;
			final int c= fCharPhase3;
			final int d= nextCharPhase3();
			switch (c) {
			case END_OF_INPUT:
				return newToken(IToken.tEND_OF_INPUT, start);
			case '\n':
				// A newline ends any include directive.
				fInsideIncludeDirective= false;
				return newToken(Lexer.tNEWLINE, start);
			case ' ':
			case '\t':
			case 0xb:  // vertical tab
			case '\f': 
			case '\r':
				continue;

			case 'L':
				// wide literals: L"...", L'...', LR"...(...)..."
				switch (d) {
				case 'R':
					if (fOptions.fSupportRawStringLiterals) {
						markPhase3();
						if (nextCharPhase3() == '"') {
							nextCharPhase3();
							return rawStringLiteral(start, 3, IToken.tLSTRING);
						}
						restorePhase3();
					}
					break;
				case '"':
					nextCharPhase3();
					return stringLiteral(start, 2, IToken.tLSTRING);
				case '\'':
					nextCharPhase3();
					return charLiteral(start, IToken.tLCHAR);
				}
				return identifier(start, 1);

			case 'u': 	
			case 'U':
				// UTF literals: 'u' selects the UTF16 token kinds, 'U' the UTF32 ones,
				// the prefix 'u8' yields a plain string token.
				if (fOptions.fSupportUTFLiterals) {
					switch (d) {
					case 'R':
						if (fOptions.fSupportRawStringLiterals) {
							markPhase3();
							if (nextCharPhase3() == '"') {
								nextCharPhase3();
								return rawStringLiteral(start, 3, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING);
							}
							restorePhase3();
						}
						break;
					case '"':
						nextCharPhase3();
						return stringLiteral(start, 2, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING);
					case '\'':
						nextCharPhase3();
						return charLiteral(start, c == 'u' ? IToken.tUTF16CHAR : IToken.tUTF32CHAR);
					case '8':
						if (c == 'u') {
							markPhase3();
							switch (nextCharPhase3()) {
							case 'R':
								if (fOptions.fSupportRawStringLiterals && nextCharPhase3() == '"') {
									nextCharPhase3();
									return rawStringLiteral(start, 4, IToken.tSTRING);
								}
								break;
							case '"':
								nextCharPhase3();
								return stringLiteral(start, 3, IToken.tSTRING);
							}
							// Not a u8 literal: rewind, the '8' becomes part of the identifier.
							restorePhase3();
						}
						break;
					}
				}
				return identifier(start, 1);
				
			case 'R':
				if (fOptions.fSupportRawStringLiterals && d == '"') {
					nextCharPhase3();
					return rawStringLiteral(start, 2, IToken.tSTRING);
				}
				return identifier(start, 1);
				
			case '"':
				if (fInsideIncludeDirective) {
					return headerName(start, true);
				}
 				return stringLiteral(start, 1, IToken.tSTRING);

			case '\'':
				return charLiteral(start, IToken.tCHAR);

			// The letters 'u', 'U', 'L' and 'R' are handled above because they may start a literal.
			case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': 
			case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 
			case 's': case 't':           case 'v': case 'w': case 'x': case 'y': case 'z':
			case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I':
			case 'J': case 'K':           case 'M': case 'N': case 'O': case 'P': case 'Q':  
			case 'S': case 'T':           case 'V': case 'W': case 'X': case 'Y': case 'Z':
			case '_':
				return identifier(start, 1);

			case '$':
				if (fOptions.fSupportDollarInIdentifiers) {
					return identifier(start, 1);
				}
				break;
			case '@':
				if (fOptions.fSupportAtSignInIdentifiers) {
					return identifier(start, 1);
				}
				break;

			case '\\':
				// A backslash followed by 'u' or 'U' starts a universal character name.
				switch (d) {
				case 'u': case 'U':
					nextCharPhase3();
					return identifier(start, 2);
				}
				return newToken(tOTHER_CHARACTER, start, 1);

			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				return number(start, 1, false);

			case '.':
				switch (d) {
				case '0': case '1': case '2': case '3': case '4':
				case '5': case '6': case '7': case '8': case '9':
					nextCharPhase3();
					return number(start, 2, true);

				case '.':
					// Only three dots form an ellipsis, two dots are two separate tokens.
					markPhase3();
					if (nextCharPhase3() == '.') {
						nextCharPhase3();
						return newToken(IToken.tELLIPSIS, start);
					}
					restorePhase3();
					break;

				case '*':
					nextCharPhase3();
					return newToken(IToken.tDOTSTAR, start);
				}
				return newToken(IToken.tDOT, start);

			case '#':
				if (d == '#') {
					nextCharPhase3();
					return newToken(IToken.tPOUNDPOUND, start);
				}
				return newToken(IToken.tPOUND, start);

			case '{':
				return newToken(IToken.tLBRACE, start);
			case '}':
				return newToken(IToken.tRBRACE, start);
			case '[':
				return newToken(IToken.tLBRACKET, start);
			case ']':
				return newToken(IToken.tRBRACKET, start);
			case '(':
				return newToken(IToken.tLPAREN, start);
			case ')':
				return newToken(IToken.tRPAREN, start);
			case ';':
				return newToken(IToken.tSEMI, start);

			case ':':
				switch (d) {
				case ':':
					nextCharPhase3();
					return newToken(IToken.tCOLONCOLON, start);
				case '>': 
					// digraph ':>' stands for ']'
					nextCharPhase3();
					return newDigraphToken(IToken.tRBRACKET, start);
				}
				return newToken(IToken.tCOLON, start);

			case '?':
				return newToken(IToken.tQUESTION, start);

			case '+':
				switch (d) {
				case '+':
					nextCharPhase3();
					return newToken(IToken.tINCR, start);
				case '=':
					nextCharPhase3();
					return newToken(IToken.tPLUSASSIGN, start);
				}
				return newToken(IToken.tPLUS, start);

			case '-':
				switch (d) {
				case '>': 
					int e= nextCharPhase3();
					if (e == '*') {
						nextCharPhase3();
						return newToken(IToken.tARROWSTAR, start);
					}
					return newToken(IToken.tARROW, start);

				case '-':
					nextCharPhase3();
					return newToken(IToken.tDECR, start);
				case '=':
					nextCharPhase3();
					return newToken(IToken.tMINUSASSIGN, start);
				}
				return newToken(IToken.tMINUS, start);

			case '*':
				if (d == '=') {
					nextCharPhase3();
					return newToken(IToken.tSTARASSIGN, start);
				}
				return newToken(IToken.tSTAR, start);

			case '/':
				switch (d) {
				case '=':
					nextCharPhase3();
					return newToken(IToken.tDIVASSIGN, start);
				case '/':
					nextCharPhase3();
					lineComment(start);
					continue; 
				case '*':
					blockComment(start, '*');
					continue;
				case '%':
					if (fOptions.fSupportSlashPercentComments) {
						blockComment(start, '%');
						continue;
					}
					break;
				}
				return newToken(IToken.tDIV, start);

			case '%':
				switch (d) {
				case '=':
					nextCharPhase3();
					return newToken(IToken.tMODASSIGN, start);
				case '>':
					// digraph '%>' stands for '}'
					nextCharPhase3();
					return newDigraphToken(IToken.tRBRACE, start);
				case ':':
					// digraph '%:' stands for '#', '%:%:' for '##'
					final int e= nextCharPhase3();
					if (e == '%') {
						markPhase3();
						if (nextCharPhase3() == ':') {
							nextCharPhase3();
							return newDigraphToken(IToken.tPOUNDPOUND, start);
						}
						restorePhase3();
					}
					return newDigraphToken(IToken.tPOUND, start);
				}
				return newToken(IToken.tMOD, start);

			case '^':
				if (d == '=') {
					nextCharPhase3();
					return newToken(IToken.tXORASSIGN, start);
				}
				return newToken(IToken.tXOR, start);

			case '&':
				switch (d) {
				case '&':
					nextCharPhase3();
					return newToken(IToken.tAND, start);
				case '=':
					nextCharPhase3();
					return newToken(IToken.tAMPERASSIGN, start);
				}
				return newToken(IToken.tAMPER, start);

			case '|':
				switch (d) {
				case '|':
					nextCharPhase3();
					return newToken(IToken.tOR, start);
				case '=':
					nextCharPhase3();
					return newToken(IToken.tBITORASSIGN, start);
				}
				return newToken(IToken.tBITOR, start);

			case '~':
				return newToken(IToken.tBITCOMPLEMENT, start);

			case '!':
				if (d == '=') {
					nextCharPhase3();
					return newToken(IToken.tNOTEQUAL, start);
				}
				return newToken(IToken.tNOT, start);

			case '=':
				if (d == '=') {
					nextCharPhase3();
					return newToken(IToken.tEQUAL, start);
				}
				return newToken(IToken.tASSIGN, start);

			case '<':
				if (fInsideIncludeDirective) {
					return headerName(start, false);
				}

				switch (d) {
				case '=':
					nextCharPhase3();
					return newToken(IToken.tLTEQUAL, start);
				case '<':
					final int e= nextCharPhase3();
					if (e == '=') {
						nextCharPhase3();
						return newToken(IToken.tSHIFTLASSIGN, start);
					} 
					return newToken(IToken.tSHIFTL, start);
				case '?':
					if (fOptions.fSupportMinAndMax) {
						nextCharPhase3();
						return newToken(IGCCToken.tMIN, start);
					} 
					break;
				case ':':
					// 2.5-3
					// '<:' is the digraph for '[', except that '<::' is only treated that way
					// when it is followed by ':' or '>'. Otherwise '<' is a token of its own.
					markPhase3();
					if (nextCharPhase3() != ':') {
						return newDigraphToken(IToken.tLBRACKET, start);
					}
					switch (nextCharPhase3()) {
					case ':': case '>':
						restorePhase3();
						nextCharPhase3();
						return newDigraphToken(IToken.tLBRACKET, start);
					}
					restorePhase3();
					break;
				case '%':
					// digraph '<%' stands for '{'
					nextCharPhase3();
					return newDigraphToken(IToken.tLBRACE, start);
				}
				return newToken(IToken.tLT, start);

			case '>':
				switch (d) {
				case '=':
					nextCharPhase3();
					return newToken(IToken.tGTEQUAL, start);
				case '>':
					final int e= nextCharPhase3();
					if (e == '=') {
						nextCharPhase3();
						return newToken(IToken.tSHIFTRASSIGN, start);
					} 
					return newToken(IToken.tSHIFTR, start);
				case '?':
					if (fOptions.fSupportMinAndMax) {
						nextCharPhase3();
						return newToken(IGCCToken.tMAX, start);
					} 
					break;
				}
				return newToken(IToken.tGT, start);

			case ',':
				return newToken(IToken.tCOMMA, start);

			default:
				// in case we have some other letter to start an identifier
				if (Character.isUnicodeIdentifierStart((char) c)) {
					return identifier(start, 1);
				}
				break;
			}
			// handles for instance @
			return newToken(tOTHER_CHARACTER, start, 1);
		}
    }

	/**
	 * Creates a token without an image that spans from the given offset up to the offset
	 * of the prefetched character.
	 */
	private Token newToken(int kind, int offset) {
		final int endOffset= fOffset;
		return new Token(kind, fSource, offset, endOffset);
    }

	/**
	 * Creates a token for a punctuator that was spelled as a digraph. The token spans from
	 * the given offset up to the offset of the prefetched character.
	 */
	private Token newDigraphToken(int kind, int offset) {
		final int endOffset= fOffset;
		return new TokenForDigraph(kind, fSource, offset, endOffset);
    }

    /**
     * Creates a token that carries its image. When the number of source characters equals
     * the image length, the image is copied verbatim from the input. Otherwise the image is
     * recomputed with {@link #getCharImage(int, int, int)}.
     */
    private Token newToken(final int kind, final int offset, final int imageLength) {
    	final int endOffset= fOffset;
    	final char[] image;
    	if (endOffset - offset == imageLength) {
    		image= new char[imageLength];
    		fInput.arraycopy(offset, image, 0, imageLength);
    	} else {
    		image= getCharImage(offset, endOffset, imageLength);
    	}
    	return new TokenWithImage(kind, fSource, offset, endOffset, image);
    }

    /**
     * Reports a problem to the log; the reported range ends at the offset of the
     * prefetched character.
     */
    private void handleProblem(int problemID, char[] arg, int offset) {
    	final int endOffset= fOffset;
    	fLog.handleProblem(problemID, arg, offset, endOffset);
    }

	/**
	 * Reads a header name, the opening delimiter of which has already been consumed.
	 * The name ends at '"' when quotes are expected, at '&gt;' otherwise. A newline or the
	 * end of input before the closing delimiter is reported as an unbounded string.
	 *
	 * @param start offset of the opening delimiter
	 * @param expectQuotes {@code true} for the quoted form, {@code false} for the angle-bracket form
	 * @throws OffsetLimitReachedException when the end of input is hit in content-assist mode
	 */
	private Token headerName(final int start, final boolean expectQuotes) throws OffsetLimitReachedException {
		final int kind= expectQuotes ? tQUOTE_HEADER_NAME : tSYSTEM_HEADER_NAME;
		final int closingDelimiter= expectQuotes ? '"' : '>';

		int length= 1;
		int c= fCharPhase3;
		while (true) {
			if (c == END_OF_INPUT && fSupportContentAssist) {
				throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(kind, start, length));
			}
			if (c == END_OF_INPUT || c == '\n') {
				handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start);
				break;
			}

			// Consume the character; the closing delimiter is part of the token.
			final boolean closed= c == closingDelimiter;
			length++;
			c= nextCharPhase3();
			if (closed) {
				break;
			}
		}
		return newToken(kind, start, length);
	}

	/**
	 * Skips a block comment and reports it to the log. On entry the prefetched character is
	 * the one following the leading '/', so the search starts behind it.
	 * If the comment is not terminated, the lexer is positioned at the end of the input and
	 * the comment is reported as reaching up to there.
	 *
	 * @param start offset of the '/' that opens the comment
	 * @param trigger the character that has to precede the closing '/' ('*' or '%')
	 */
	private void blockComment(final int start, final char trigger) {
		// We can ignore line-splices, trigraphs and windows newlines when searching for the '*'
		int pos= fEndOffset;
		while (isValidOffset(pos)) {
			if (fInput.get(pos++) == trigger) {
				// Continue with phase 3 from behind the trigger, such that a line-splice
				// between the trigger and the '/' is handled.
				fEndOffset= pos;
				if (nextCharPhase3() == '/') {
					nextCharPhase3();
					fLog.handleComment(true, start, fOffset, fInput);
					return;
				}
			}
		}
		// unterminated comment
		fCharPhase3= END_OF_INPUT;
		fOffset= fEndOffset= pos;
		fLog.handleComment(true, start, pos, fInput);
	}

	/**
	 * Skips a line comment and reports it to the log. The newline (or end of input) that
	 * terminates the comment is not consumed.
	 *
	 * @param start offset of the first '/' of the comment
	 */
	private void lineComment(final int start) {
		for (int c= fCharPhase3; c != END_OF_INPUT && c != '\n'; c= nextCharPhase3()) {
			// skip the contents of the comment
		}
		fLog.handleComment(false, start, fOffset, fInput);
	}
	
	/**
	 * Tells whether the character can begin the suffix of a user-defined character literal:
	 * letters, digits, unicode identifier parts, the underscore and, depending on the
	 * options, '$' and '@'.
	 */
	private boolean isIdentifierStart(int c) {
		final char ch= (char) c;
		if (Character.isLetter(ch) || Character.isDigit(ch)) {
			return true;
		}
		if (Character.isUnicodeIdentifierPart(c)) {
			return true;
		}
		if (c == '$') {
			return fOptions.fSupportDollarInIdentifiers;
		}
		if (c == '@') {
			return fOptions.fSupportAtSignInIdentifiers;
		}
		return c == '_';
	}
	
	/**
	 * Reads the remainder of a string literal, the prefix and opening quote of which have
	 * already been consumed. A newline or the end of input before the closing quote is
	 * reported as an unbounded string. With user-defined literals enabled, a suffix starting
	 * with an underscore is made part of the token.
	 *
	 * @param start offset of the first character of the literal
	 * @param length number of image characters consumed so far (prefix and opening quote)
	 * @param tokenType kind of the token to create
	 * @throws OffsetLimitReachedException when the end of input is hit in content-assist mode
	 */
	private Token stringLiteral(final int start, int length, int tokenType) throws OffsetLimitReachedException {
		boolean escaped= false;
		int c= fCharPhase3;
		while (true) {
			if (c == END_OF_INPUT && fSupportContentAssist) {
				throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(tokenType, start, length));
			}
			if (c == END_OF_INPUT || c == '\n') {
				handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start);
				break;
			}

			// An unescaped quote closes the literal; a backslash escapes the next
			// character unless it is escaped itself.
			final boolean closingQuote= c == '"' && !escaped;
			escaped= c == '\\' && !escaped;

			length++;
			c= nextCharPhase3();
			if (closingQuote) {
				break;
			}
		}

		if (fOptions.fSupportUserDefinedLiterals && isUDLSuffixStart(c)) {
			final Token suffix= identifier(start + length, 0);
			tokenType= IToken.tUSER_DEFINED_STRING_LITERAL;
			length+= suffix.getLength();
		}

		return newToken(tokenType, start, length);
	}

	/**
	 * Tells whether the character following a string literal starts a user-defined literal
	 * suffix. Only the underscore qualifies.
	 */
	private boolean isUDLSuffixStart(int c) {
		// With C++11 a string literal directly followed by an identifier became a single
		// token, which breaks code like this:
		//
		// #define __STDC_FORMAT_MACROS
		// #include <inttypes.h>
		// #include <stdio.h>
		//
		// void test() {
		//   int64_t i64 = 123;
		//   printf("My int64: %"PRId64"\n", i64);
		// }
		//
		// Clang and GCC work around that by treating suffixes that do not start with an
		// underscore as separate tokens, such that they can still be expanded as macros.
		// We do the same, see
		// http://llvm.org/viewvc/llvm-project?view=rev&revision=152287 and
		// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52538
		final boolean startsWithUnderscore= c == '_';
		return startsWithUnderscore;
	}

	/**
	 * Reads the remainder of a raw string literal, the prefix and opening quote of which have
	 * already been consumed. The input is scanned directly, i.e. without the phase 3
	 * processing of line endings, trigraphs and line-splices. A missing terminator is
	 * reported as an unbounded string. With user-defined literals enabled, a suffix starting
	 * with an underscore is made part of the token.
	 * <p>
	 * NOTE(review): the scan uses {@code fInput.isValidOffset()} rather than the limit-aware
	 * {@code isValidOffset()}, so it does not stop at {@code fLimit} - confirm this is intended.
	 *
	 * @param start offset of the first character of the literal
	 * @param length not used by this method
	 * @param tokenType kind of the token to create
	 */
	private Token rawStringLiteral(final int start, int length, int tokenType) throws OffsetLimitReachedException {
		// Find the end of the delimiter, which is terminated by '('.
		final int delimOffset= fOffset;
		int delimEndOffset = delimOffset;
		int offset;
		for (;; delimEndOffset++) {
			if (!fInput.isValidOffset(delimEndOffset)) {
				offset= delimEndOffset;
				break;
			}
			if (fInput.get(delimEndOffset) == '(') {
				offset= delimEndOffset + 1;
				break;
			}
		}
		
		// Search for ')' followed by the delimiter and a double quote.
		final int delimLength= delimEndOffset - delimOffset;
		for (;; offset++) {
			if (!fInput.isValidOffset(offset)) {
				handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, offset), start);
				break;
			} 
				
			final char c= fInput.get(offset);
			if (c == ')') {
				final int endingDoubleQuoteOffset= offset + delimLength + 1;
				if (fInput.isValidOffset(endingDoubleQuoteOffset) && fInput.get(endingDoubleQuoteOffset) == '"') {
					boolean prefixMatches= true;
					for (int i = 0; i < delimLength; i++) {
						if (fInput.get(offset + i + 1) != fInput.get(delimOffset + i)) {
							prefixMatches= false;
							break;
						}
					}
					if (prefixMatches) {
						offset= endingDoubleQuoteOffset + 1;
						break;
					}
				}
			}
		}
		// Reposition phase 3 behind the literal and prefetch the next character.
		fOffset= offset - 1;
		fEndOffset= offset;
		fCharPhase3=  0;
		nextCharPhase3();
		
		if (fOptions.fSupportUserDefinedLiterals && isUDLSuffixStart(fCharPhase3)) {
			Token t = identifier(offset, 0);
			tokenType = IToken.tUSER_DEFINED_STRING_LITERAL;
			offset += t.getLength();
		}
		
		return newToken(tokenType, start, offset - start);
	}

	/**
	 * Reads the remainder of a character literal, the prefix and opening quote of which have
	 * already been consumed. A newline or the end of input before the closing quote is
	 * reported as a bad character. With user-defined literals enabled, a suffix accepted by
	 * {@link #isIdentifierStart(int)} is made part of the token.
	 *
	 * @param start offset of the first character of the literal
	 * @param tokenType kind of the token to create; any kind other than {@link IToken#tCHAR}
	 *     is taken to have a prefix of one character
	 * @throws OffsetLimitReachedException when the end of input is hit in content-assist mode
	 */
	private Token charLiteral(final int start, int tokenType) throws OffsetLimitReachedException {
		int length= 2;
		if (tokenType == IToken.tCHAR) {
			length= 1;
		}

		boolean escaped= false;
		int c= fCharPhase3;
		while (true) {
			if (c == END_OF_INPUT && fSupportContentAssist) {
				throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(tokenType, start, length));
			}
			if (c == END_OF_INPUT || c == '\n') {
				handleProblem(IProblem.SCANNER_BAD_CHARACTER, getInputChars(start, fOffset), start);
				break;
			}

			// An unescaped quote closes the literal; a backslash escapes the next
			// character unless it is escaped itself.
			final boolean closingQuote= c == '\'' && !escaped;
			escaped= c == '\\' && !escaped;

			length++;
			c= nextCharPhase3();
			if (closingQuote) {
				break;
			}
		}

		if (fOptions.fSupportUserDefinedLiterals && isIdentifierStart(c)) {
			final Token suffix= identifier(start + length, 0);
			tokenType= IToken.tUSER_DEFINED_CHAR_LITERAL;
			length+= suffix.getLength();
		}

		return newToken(tokenType, start, length);
	}
	
	/**
	 * Reads the remainder of an identifier, starting with the prefetched character.
	 * In content-assist mode an identifier that reaches the end of the input becomes a
	 * {@link IToken#tCOMPLETION} token.
	 *
	 * @param start offset of the first character of the identifier
	 * @param length number of image characters that have been consumed already
	 */
	private Token identifier(int start, int length) {
		int tokenKind= IToken.tIDENTIFIER;
    	boolean isPartOfIdentifier= true;
    	int c= fCharPhase3;
        while (true) {
        	switch (c) {
            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': 
            case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 
            case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I':
            case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 
            case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
            case '_': 
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            	break;
            	
            case '\\': // universal character name
            	// The backslash belongs to the identifier only if 'u' or 'U' follows. In that
            	// case it is counted here, the letter is counted at the end of the loop.
            	markPhase3();
            	switch (nextCharPhase3()) {
            	case 'u': case 'U':
            		length++;
            		break;
            	default:
            		restorePhase3();
            		isPartOfIdentifier= false;
            		break;
            	}
            	break;

            case END_OF_INPUT:
				if (fSupportContentAssist) {
					tokenKind= IToken.tCOMPLETION;
				}
				isPartOfIdentifier= false;
				break;
            case ' ': case '\t': case 0xb: case '\f': case '\r': case '\n':
                isPartOfIdentifier= false;
            	break;

            case '$':
            	isPartOfIdentifier= fOptions.fSupportDollarInIdentifiers;
            	break;
            case '@':
            	isPartOfIdentifier= fOptions.fSupportAtSignInIdentifiers;
            	break;
            	
            // punctuation is listed explicitly, such that it does not reach the default case
            case '{': case '}': case '[': case ']': case '#': case '(': case ')': case '<': case '>':
            case '%': case ':': case ';': case '.': case '?': case '*': case '+': case '-': case '/':
            case '^': case '&': case '|': case '~': case '!': case '=': case ',': case '"': case '\'':
            	isPartOfIdentifier= false;
            	break;
            	
            default:
            	isPartOfIdentifier= Character.isUnicodeIdentifierPart((char) c);
            	break;
        	}
        	
        	if (!isPartOfIdentifier) {
        		break;
        	}
        	
        	length++;
        	c= nextCharPhase3();
        }

        return newToken(tokenKind, start, length);
	}
	
	/**
	 * Reads the remainder of a number literal, starting with the prefetched character.
	 * Letters, digits, underscores, periods, universal character names and exponents with
	 * their sign are consumed; the validity of the number is not checked. The result is
	 * a {@link IToken#tFLOATINGPT} token if a period or an exponent followed by a sign or
	 * a digit was seen, a {@link IToken#tINTEGER} token otherwise.
	 *
	 * @param start offset of the first character of the number
	 * @param length number of image characters that have been consumed already
	 * @param isFloat whether the number is already known to be a floating point number
	 * @throws OffsetLimitReachedException when the end of input is hit in content-assist mode
	 */
	private Token number(final int start, int length, boolean isFloat) throws OffsetLimitReachedException {
		boolean isPartOfNumber= true;
		boolean isHex= false;
		int c= fCharPhase3;
		while (true) {
			switch (c) {
			// non-digit
            case 'a': case 'b': case 'c': case 'd':           case 'f': case 'g': case 'h': case 'i': 
            case 'j': case 'k': case 'l': case 'm': case 'n': case 'o':           case 'q': case 'r': 
            case 's': case 't': case 'u': case 'v': case 'w':           case 'y': case 'z':
            case 'A': case 'B': case 'C': case 'D':           case 'F': case 'G': case 'H': case 'I':
            case 'J': case 'K': case 'L': case 'M': case 'N': case 'O':           case 'Q': case 'R': 
            case 'S': case 'T': case 'U': case 'V': case 'W': 		    case 'Y': case 'Z':
            case '_': 
            	
            // digit
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            	break;
            	
            case 'x': case 'X':
            	isHex= !isFloat;
            	break;
            	
            // period
            case '.':
            	isFloat= true;
            	break;
            	
            // exponents
            case 'e':
            case 'E':
            	// in a hexadecimal number 'e' is an ordinary digit
            	if (isHex)
            		break;
            	//$FALL-THROUGH$
            case 'p':
            case 'P':
            	length++;
            	c= nextCharPhase3();
            	switch (c) {
            	case '+': case '-':
            	case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
            		isFloat= true;
            		isHex= false;
            		length++;
                	c= nextCharPhase3();
            		break;
            	}
            	// c has been advanced already, skip the advancing at the end of the loop
            	continue;
            	
            // universal character name (non-digit)
            case '\\':
            	markPhase3();
            	switch (nextCharPhase3()) {
            	case 'u': case 'U':
            		length++;
            		break;
            	default:
            		restorePhase3();
            		isPartOfNumber= false;
            		break;
            	}
            	break;
            
            case END_OF_INPUT:
				if (fSupportContentAssist) {
					throw new OffsetLimitReachedException(ORIGIN_LEXER, 
							newToken((isFloat ? IToken.tFLOATINGPT : IToken.tINTEGER), start, length));
				}
				isPartOfNumber= false;
				break;
				
            default:
            	isPartOfNumber= false;
            	break;
			}
        	if (!isPartOfNumber) {
        		break;
        	}
        	
        	c= nextCharPhase3();
        	length++;
		}
		
        return newToken((isFloat ? IToken.tFLOATINGPT : IToken.tINTEGER), start, length);
	}
	
	
	/**
	 * Remembers the current state of phase 3 for a later {@link #restorePhase3()}. Needed
	 * wherever more than one character of lookahead is required, e.g. for '...', '%:%:',
	 * UCNs and string literals with a long prefix. Only one state can be remembered at a time.
	 */
	private void markPhase3() {
		fMarkPhase3PrefetchedChar= fCharPhase3;
		fMarkPhase3EndOffset= fEndOffset;
		fMarkPhase3Offset= fOffset;
	}
	
	/**
	 * Returns to the state of phase 3 that was remembered by the last call to
	 * {@link #markPhase3()}.
	 */
	private void restorePhase3() {
		fCharPhase3= fMarkPhase3PrefetchedChar;
		fEndOffset= fMarkPhase3EndOffset;
		fOffset= fMarkPhase3Offset;
	}
	
	/**
	 * Perform phase 1-3: Replace \r\n with \n, handle trigraphs, detect line-splicing.
	 * Changes fOffset, fEndOffset and fCharPhase3, state-less otherwise.
	 * <p>
	 * Reading starts at {@code fEndOffset}. Afterwards {@code fOffset} is the offset where the
	 * character starts, {@code fEndOffset} the offset behind it and {@code fCharPhase3} the
	 * character itself.
	 *
	 * @return the character read, which is also stored in {@code fCharPhase3}, or
	 *     {@code END_OF_INPUT}
	 */
	private int nextCharPhase3() {
		int pos= fEndOffset;
		do {
			// At most one character is left: it is returned as it is, because the
			// conversions below need to look at the character that follows.
			if (!isValidOffset(pos + 1)) {
				if (!isValidOffset(pos)) {
					fOffset= pos;
					fEndOffset= pos;
					fCharPhase3= END_OF_INPUT;
					return END_OF_INPUT;
				}
				fOffset= pos;
				fEndOffset= pos + 1;
				fCharPhase3= fInput.get(pos);
				return fCharPhase3;
			}
			
			final char c= fInput.get(pos);
			fOffset= pos;
			fEndOffset= ++pos;
			fCharPhase3= c;
			switch (c) {
			case '\r':
				// windows line-ending
				if (fInput.get(pos) == '\n') {	
					fEndOffset= pos+1;
					fCharPhase3= '\n';
					return '\n';
				}
				// mac os 9 line ending
				fCharPhase3= '\n';
				return '\n';

				// trigraph sequences
			case '?':
				if (fInput.get(pos) != '?' || !isValidOffset(pos+1)) {
					return c;
				}
				final char trigraph= checkTrigraph(fInput.get(pos+1));
				if (trigraph == 0) {
					return c;
				}
				if (trigraph != '\\') {
					fEndOffset= pos+2;
					fCharPhase3= trigraph;
					return trigraph;
				}
				// The trigraph encodes a backslash, which may start a line-splice.
				pos+= 2;
				// $FALL-THROUGH$, handle backslash

			case '\\':
				final int lsPos= findEndOfLineSpliceSequence(pos);
				if (lsPos > pos) {
					// Skip the line-splice and read the character behind it.
					pos= lsPos;
					continue;
				}
				fEndOffset= pos;
				fCharPhase3= '\\';
				return '\\';	// don't return c, it may be a '?'

			default:
				return c;
			}
		}
		while (true);
	}
	
	/**
	 * Looks up the character that is encoded by a trigraph.
	 * @param c trigraph without leading question marks.
	 * @return the character encoded or 0.
	 */
	private char checkTrigraph(char c) {
		// The character at an index of the first string is the trigraph for the
		// character at the same index of the second one.
		final String trigraphChars= "='()!<>-/";
		final String encodedChars= "#^[]|{}~\\";
		final int idx= trigraphChars.indexOf(c);
		return idx < 0 ? 0 : encodedChars.charAt(idx);
	}

	/**
	 * Returns the end-offset for a line-splice sequence, or -1 if there is none.
	 * <p>
	 * A line-splice is a backslash followed by optional whitespace and a newline. Consecutive
	 * line-splices are treated as one sequence, where the backslash of a subsequent splice
	 * may also be spelled as the trigraph '??/'.
	 *
	 * @param pos the offset behind the backslash that may start a line-splice
	 */
	private int findEndOfLineSpliceSequence(int pos) {
		boolean haveBackslash= true;
		int result= -1;
		loop: while (isValidOffset(pos)) {
			switch (fInput.get(pos++)) {
			case '\n':	
				if (haveBackslash) {
					// A splice is complete, look for another one.
					result= pos;
					haveBackslash= false;
					continue loop;
				}
				return result; 					
		
			case '\r': case ' ': case '\f': case '\t': case 0xb: // vertical tab  
				// Whitespace is accepted between the backslash and the newline only.
				if (haveBackslash) {
					continue loop;
				}
				return result;
			
			case '?':
				// pos is the offset behind the first '?', the trigraph for a backslash
				// requires "?/" at this position.
				if (!isValidOffset(pos+1) || fInput.get(pos) != '?' || fInput.get(pos+1) != '/') {
					return result;
				}
				// Step over the entire trigraph, such that scanning resumes behind it.
				// Leaving pos on the '/' would end the search with the next iteration.
				pos+= 2;
				// $FALL-THROUGH$ to backslash handling
					
			case '\\':
				if (!haveBackslash) {
					haveBackslash= true;
					continue loop;
				}
				return result;

			default:
				return result;
			}
		}
		return result;
	}

	/**
	 * Copies the characters in the range {@code [offset, endOffset)} from the input, without
	 * any modification. An empty or inverted range yields {@link CharArrayUtils#EMPTY}.
	 */
	public char[] getInputChars(int offset, int endOffset) {
		final int count= endOffset - offset;
		if (count > 0) {
			final char[] chars= new char[count];
			fInput.arraycopy(offset, chars, 0, count);
			return chars;
		}
		return CharArrayUtils.EMPTY;
	}

	/**
	 * Returns the input this lexer operates on.
	 */
	AbstractCharArray getInput() {
		return this.fInput;
	}
	
	/**
	 * Computes an image with trigraphs replaced and line-splices removed, by running
	 * phase 3 again from the given offset. The state of phase 3 is preserved with
	 * {@link #markPhase3()} and {@link #restorePhase3()}. The end offset is not consulted,
	 * the image length alone determines how many characters are read.
	 */
	private char[] getCharImage(int offset, int endOffset, int imageLength) {
		final char[] image= new char[imageLength];
		markPhase3();
		fEndOffset= offset;
		int idx= 0;
		while (idx < imageLength) {
			image[idx]= (char) nextCharPhase3();
			idx++;
		}
		restorePhase3();
		return image;
	}

	/**
	 * Remembers the entire state of the lexer (position, prefetched character, include
	 * directive flag, current and previous token) for a later {@link #restoreState()}.
	 * Only one state can be remembered at a time.
	 */
	public void saveState() {
		// tokens
		fMarkLastToken= fLastToken;
		fMarkToken= fToken;
		fMarkInsideIncludeDirective= fInsideIncludeDirective;
		// phase 3
		fMarkPrefetchedChar= fCharPhase3;
		fMarkEndOffset= fEndOffset;
		fMarkOffset= fOffset;
	}

	/**
	 * Returns to the state of the lexer that was remembered by the last call to
	 * {@link #saveState()}.
	 */
	public void restoreState() {
		// tokens
		fLastToken= fMarkLastToken;
		fToken= fMarkToken;
		fInsideIncludeDirective= fMarkInsideIncludeDirective;
		// phase 3
		fCharPhase3= fMarkPrefetchedChar;
		fEndOffset= fMarkEndOffset;
		fOffset= fMarkOffset;
	}
}