QMLDirectoryLexer.java example

Explorer
cdt-master
/*******************************************************************************
 * Copyright (c) 2015 QNX Software Systems and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 * QNX Software Systems - Initial API and implementation
 *******************************************************************************/
package org.eclipse.cdt.qt.core.qmldir;

import java.io.InputStream;
import java.util.Scanner;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.eclipse.cdt.internal.qt.core.location.Position;
import org.eclipse.cdt.internal.qt.core.location.SourceLocation;
import org.eclipse.cdt.qt.core.location.ISourceLocation;

/**
 * Converts an <code>InputStream</code> representing a qmldir file into a stream of tokens through successive calls to
 * <code>nextToken</code>. The whole input is read into memory by <code>setInput</code> so that every token offset is an
 * absolute, zero-indexed character offset into the input. This lexer uses regular expressions to match its 16 valid token
 * types:
 * <ul>
 * <li><b>COMMENT</b>: A single line comment that begins with '#'
 * <li><b>MODULE</b>: The keyword 'module'
 * <li><b>TYPEINFO</b>: The keyword 'typeinfo'
 * <li><b>SINGLETON</b>: The keyword 'singleton'
 * <li><b>INTERNAL</b>: The keyword 'internal'
 * <li><b>PLUGIN</b>: The keyword 'plugin'
 * <li><b>CLASSNAME</b>: The keyword 'classname'
 * <li><b>DEPENDS</b>: The keyword 'depends'
 * <li><b>DESIGNERSUPPORTED</b>: The keyword 'designersupported'
 * <li><b>WORD</b>: A group of characters that form an identifier, filename, or path
 * <li><b>DECIMAL</b>: A number of the form [0-9]+ '.' [0-9]+
 * <li><b>INTEGER</b>: An integer of the form [0-9]+
 * <li><b>WHITESPACE</b>: A group of whitespace characters (not including newlines)
 * <li><b>COMMAND_END</b>: A newline character
 * <li><b>UNKNOWN</b>: A group of characters that does not match any of the preceding tokens
 * <li><b>EOF</b>: End of File
 * </ul>
 */
public class QMLDirectoryLexer {
	/**
	 * A single matched token returned by a <code>QMLDirectoryLexer</code>. A <code>Token</code> stores information on how it was
	 * matched including the type of token, the text that was matched, and its position in the <code>InputStream</code>.
	 */
	public static class Token {
		private final TokenType tokType;
		private final String raw;
		private final ISourceLocation location;
		private final int start;
		private final int end;

		/**
		 * Creates a token from the current state of a regular expression match.
		 *
		 * @param type
		 *            the type of token that was matched
		 * @param match
		 *            the match supplying the token's text and its start and end offsets
		 * @param line
		 *            the line number on which the token starts
		 * @param lineStart
		 *            the offset of the first character of that line
		 */
		private Token(TokenType type, MatchResult match, int line, int lineStart) {
			this(type, match.group(), match.start(), match.end(), line, lineStart);
		}

		/**
		 * Creates a token from explicit text and offsets.
		 *
		 * @param type
		 *            the type of token that was matched
		 * @param raw
		 *            the matched text
		 * @param start
		 *            the offset of the first character of the token
		 * @param end
		 *            the offset just past the last character of the token
		 * @param line
		 *            the line number on which the token starts
		 * @param lineStart
		 *            the offset of the first character of that line; subtracted from <code>start</code> and <code>end</code>
		 *            to obtain the positions stored in the token's location
		 */
		private Token(TokenType type, String raw, int start, int end, int line, int lineStart) {
			this.tokType = type;
			// Store line breaks as the two-character sequences \n and \r so the text stays on a single line
			String escaped = raw.replace("\n", "\\n"); //$NON-NLS-1$ //$NON-NLS-2$
			escaped = escaped.replace("\r", "\\r"); //$NON-NLS-1$ //$NON-NLS-2$
			this.raw = escaped;
			this.start = start;
			this.end = end;
			this.location = new SourceLocation(null,
					new Position(line, start - lineStart),
					new Position(line, end - lineStart));
		}

		/**
		 * Get the type of token that was matched.
		 *
		 * @return the type of token
		 */
		public TokenType getType() {
			return tokType;
		}

		/**
		 * Gets the text that this token was matched with. Line feed and carriage return characters in the matched text are
		 * returned as the two-character escape sequences <code>\n</code> and <code>\r</code>.
		 *
		 * @return a String representing the matched text
		 */
		public String getText() {
			return raw;
		}

		/**
		 * Gets a more detailed description of this token's location in the <code>InputStream</code> than {@link Token#getStart()}
		 * and {@link Token#getEnd()}. This method allows the retrieval of line and column information in order to make output for
		 * syntax errors and the like more human-readable.
		 *
		 * @return the {@link ISourceLocation} representing this token's location in the <code>InputStream</code>
		 */
		public ISourceLocation getLocation() {
			return location;
		}

		/**
		 * Gets the zero-indexed offset indicating the start of this token in the <code>InputStream</code>.
		 *
		 * @return the token's start offset
		 */
		public int getStart() {
			return start;
		}

		/**
		 * Gets the zero-indexed offset indicating the end of this token in the <code>InputStream</code>.
		 *
		 * @return the token's end offset
		 */
		public int getEnd() {
			return end;
		}
	}

	/**
	 * An Enumeration encompassing the 16 possible types of tokens returned by a <code>QMLDirectoryLexer</code>.
	 * <p>
	 * The declaration order is significant: the regular expressions are combined into a single alternation in this order, so
	 * an earlier type wins over a later one when both could match at the same position. Each regular expression must not
	 * contain capturing groups of its own, because the lexer identifies the matched type by its capturing group index.
	 *
	 * @see org.eclipse.cdt.qt.core.qmldir.QMLDirectoryLexer
	 */
	public enum TokenType {
		COMMENT("#.*$"), //$NON-NLS-1$
		MODULE("module(?=\\s|$)"), //$NON-NLS-1$
		TYPEINFO("typeinfo(?=\\s|$)"), //$NON-NLS-1$
		SINGLETON("singleton(?=\\s|$)"), //$NON-NLS-1$
		INTERNAL("internal(?=\\s|$)"), //$NON-NLS-1$
		PLUGIN("plugin(?=\\s|$)"), //$NON-NLS-1$
		CLASSNAME("classname(?=\\s|$)"), //$NON-NLS-1$
		DEPENDS("depends(?=\\s|$)"), //$NON-NLS-1$
		DESIGNERSUPPORTED("designersupported(?=\\s|$)"), //$NON-NLS-1$
		WORD("[^0-9\\s][^\\s]*"), //$NON-NLS-1$
		DECIMAL("[0-9]+\\.[0-9]+"), //$NON-NLS-1$
		INTEGER("[0-9]+"), //$NON-NLS-1$
		WHITESPACE("\\h+"), //$NON-NLS-1$
		COMMAND_END("(?:\r\n)|\n"), //$NON-NLS-1$
		UNKNOWN(".+"), //$NON-NLS-1$
		EOF(null);

		// Compiled once during class initialization, after all constants above have been created
		private static final Pattern PATTERN = compileAllTerminals();

		/**
		 * Gets the pattern that matches any token type having a regular expression. Capturing group <code>n</code> of the
		 * pattern corresponds to the <code>n</code>-th such type in declaration order.
		 *
		 * @return the combined pattern
		 */
		private static Pattern patternForAllTerminals() {
			return PATTERN;
		}

		/**
		 * Joins the regular expression of every token type that has one into a single alternation, wrapping each in its own
		 * capturing group.
		 *
		 * @return the compiled alternation, using {@link Pattern#MULTILINE}
		 */
		private static Pattern compileAllTerminals() {
			StringBuilder alternatives = new StringBuilder();
			for (TokenType type : values()) {
				if (type.regex == null) {
					continue;
				}
				// Separate on content rather than on index so that a type without a regex never yields an empty alternative
				if (alternatives.length() > 0) {
					alternatives.append('|');
				}
				alternatives.append('(').append(type.regex).append(')');
			}
			return Pattern.compile(alternatives.toString(), Pattern.MULTILINE);
		}

		private final String regex;

		private TokenType(String regex) {
			this.regex = regex;
		}
	}

	// Matcher over the complete text of the current input; null until setInput has been called
	private Matcher matcher;
	// Offset just past the most recent match (including skipped whitespace); the next search resumes here
	private int lastEnd;
	private int currentLine;
	private int currentLineStart;

	/**
	 * Creates a new <code>QMLDirectoryLexer</code> without initializing any of its internal state. A call to
	 * <code>setInput</code> is necessary to fully initialize the lexer before any calls to <code>nextToken</code>.
	 */
	public QMLDirectoryLexer() {
	}

	/**
	 * Prepares for lexical analysis by giving the lexer an <code>InputStream</code> to retrieve text from. The stream is read
	 * to its end immediately, decoded in the same way as <code>new Scanner(input)</code> decodes it, and is not closed by the
	 * lexer.
	 * <p>
	 * The text is buffered because the match offsets reported by a <code>Scanner</code> refer to its internal buffer rather
	 * than to the input as a whole, which makes them unsuitable as token offsets for inputs larger than that buffer.
	 *
	 * @param input
	 *            the input to perform lexical analysis on
	 */
	@SuppressWarnings("resource")
	public void setInput(InputStream input) {
		// The Scanner is deliberately left open: closing it would also close the stream handed in by the caller
		Scanner reader = new Scanner(input).useDelimiter("\\A"); //$NON-NLS-1$
		// With the beginning-of-input delimiter the first token is the entire input; there is no token if the input is empty
		String text = reader.hasNext() ? reader.next() : ""; //$NON-NLS-1$
		this.matcher = TokenType.patternForAllTerminals().matcher(text);
		this.lastEnd = 0;
		this.currentLine = 1;
		this.currentLineStart = 0;
	}

	/**
	 * Retrieves the next valid token from the <code>InputStream</code> given by <code>setInput</code>. This is a helper method to
	 * skip whitespace that is equivalent to <code>QMLDirectoryLexer.nextToken(true)</code>.
	 *
	 * @return the next token in the <code>InputStream</code>
	 * @throws IllegalArgumentException
	 *             if <code>setInput</code> has not been called
	 */
	public Token nextToken() throws IllegalArgumentException {
		return nextToken(true);
	}

	/**
	 * Retrieves the next valid token from the <code>InputStream</code> given by <code>setInput</code>. This method has the ability
	 * to skip over whitespace tokens by setting <code>skipWhitespace</code> to <code>true</code>. Once the input is exhausted
	 * every call returns an <code>EOF</code> token with empty text, positioned at the end of the last match.
	 *
	 * @param skipWhitespace
	 *            whether or not the lexer should skip whitespace tokens
	 * @return the next token in the <code>InputStream</code>
	 * @throws IllegalArgumentException
	 *             if <code>setInput</code> has not been called
	 */
	public Token nextToken(boolean skipWhitespace) throws IllegalArgumentException {
		if (matcher == null) {
			throw new IllegalArgumentException("Input cannot be null"); //$NON-NLS-1$
		}
		// Every alternative consumes at least one character, so lastEnd strictly advances on each iteration
		while (matcher.find(lastEnd)) {
			lastEnd = matcher.end();
			TokenType type = matchedType();
			if (type == TokenType.WHITESPACE && skipWhitespace) {
				continue;
			}
			// The token is created before the line bookkeeping changes so a newline is reported on the line it terminates
			Token next = new Token(type, matcher, currentLine, currentLineStart);
			if (type == TokenType.COMMAND_END) {
				// Advance the line number information
				currentLine++;
				currentLineStart = matcher.end();
			}
			return next;
		}
		return new Token(TokenType.EOF, "", lastEnd, lastEnd, currentLine, currentLineStart); //$NON-NLS-1$
	}

	/**
	 * Determines which token type the current match belongs to by finding the capturing group that participated in it.
	 *
	 * @return the matched token type, or <code>UNKNOWN</code> if no group participated in the match
	 */
	private TokenType matchedType() {
		int group = 1;
		for (TokenType type : TokenType.values()) {
			if (type.regex != null) {
				if (matcher.start(group) != -1) {
					return type;
				}
				group++;
			}
		}
		return TokenType.UNKNOWN;
	}
}