/**
 * This file Copyright (c) 2005-2008 Aptana, Inc. This program is
 * dual-licensed under both the Aptana Public License and the GNU General
 * Public license. You may elect to use one or the other of these licenses.
 *
 * This program is distributed in the hope that it will be useful, but
 * AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
 * NONINFRINGEMENT. Redistribution, except as permitted by whichever of
 * the GPL or APL you select, is prohibited.
 *
 * 1. For the GPL license (GPL), you can redistribute and/or modify this
 * program under the terms of the GNU General Public License,
 * Version 3, as published by the Free Software Foundation. You should
 * have received a copy of the GNU General Public License, Version 3 along
 * with this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Aptana provides a special exception to allow redistribution of this file
 * with certain other free and open source software ("FOSS") code and certain additional terms
 * pursuant to Section 7 of the GPL. You may view the exception and these
 * terms on the web at http://www.aptana.com/legal/gpl/.
 *
 * 2. For the Aptana Public License (APL), this program and the
 * accompanying materials are made available under the terms of the APL
 * v1.0 which accompanies this distribution, and is available at
 * http://www.aptana.com/legal/apl/.
 *
 * You may view the GPL, Aptana's exception and additional terms, and the
 * APL in the file titled license.html at the root of the corresponding
 * plugin containing this source file.
 *
 * Any modifications to this file must keep this entire header intact.
 */
package com.aptana.ide.lexer;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * @author Kevin Lindsey
 */
public abstract class AbstractLexer implements ILexer
{
    private Map<String,ITokenList> _tokensByLanguage;
    private ITokenList _currentTokenList;
    private int _currentGroupIndex;
    private LexemeList _lexemeCache;

    public int hitCount;
    public int missCount;

    /**
     * The source code being lexed
     */
    protected char[] source;

    /**
     * The current offset that indicates the end of file
     */
    protected int eofOffset;

    /**
     * The token index of the last match.
     * This value will be -1 if getNextLexeme did not find a new lexeme
     */
    protected int lastMatchedTokenIndex;

    /**
     * Current offset within the source code where the next match will begin
     */
    protected int currentOffset;

    /**
     * Create a new instance of Lexer
     */
    public AbstractLexer()
    {
        this._tokensByLanguage = new HashMap<String,ITokenList>();
        this.setSource(new char[0]);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setLexemeCache(com.aptana.ide.lexer.LexemeList)
     */
    public void setLexemeCache(LexemeList lexemeCache)
    {
        this._lexemeCache = lexemeCache;
    }

    /**
     * getCachedLexeme
     *
     * @return Lexeme
     */
    protected Lexeme getCachedLexeme()
    {
        Lexeme result = null;

        if (this._lexemeCache != null)
        {
            // search for lexeme at the current offset in our lexeme cache
            int index = this._lexemeCache.getLexemeIndex(this.currentOffset);

            // the lexeme already exists, if the resulting index is positive
            if (index >= 0)
            {
                // grab the result
                result = this._lexemeCache.get(index);

                // update our current position
                this.currentOffset += result.length;
            }
            else
            {
                // get range
                Range range = this._lexemeCache.getAffectedRegion();

                // make sure we're not in the affected region
                if (range.containsOffset(this.currentOffset) == false)
                {
                    // we aren't in the affected region, so adjust
                    // the index to the next item in our cache
                    index = -(index + 1);

                    // make sure our index is not off the end of the cache list
                    if (index < this._lexemeCache.size())
                    {
                        // get the starting offset of the affected region
                        int startingOffset = range.getStartingOffset();

                        // get our candidate lexeme from the cache
                        Lexeme candidate = this._lexemeCache.get(index);

                        // make sure we're either already past the affected region OR that the
                        // candidate in the cache does not cross through the affected region
                        if (this.currentOffset >= range.getEndingOffset()
                            || (this.currentOffset < startingOffset && candidate.getEndingOffset() <= startingOffset))
                        {
                            result = candidate;
                            this.currentOffset = result.getEndingOffset();
                        }
                    }
                }
            }
        }

        if (result == null)
        {
            missCount++;
        }
        else
        {
            hitCount++;
        }

        return result;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getCharacterAt(int)
     */
    public char getCharacterAt(int offset)
    {
        char result = '\0';

        if (offset < this.source.length)
        {
            result = this.source[offset];
        }

        return result;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getCurrentOffset()
     */
    public int getCurrentOffset()
    {
        return this.currentOffset;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getCurrentTokenList()
     */
    public ITokenList getCurrentTokenList()
    {
        return this._currentTokenList;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setCurrentOffset(int)
     */
    public void setCurrentOffset(int offset)
    {
        this.currentOffset = offset;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getEOFOffset()
     */
    public int getEOFOffset()
    {
        return this.eofOffset;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setEOFOffset(int)
     */
    public void setEOFOffset(int offset)
    {
        this.eofOffset = offset;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getGroup()
     */
    public String getGroup()
    {
        return this._currentTokenList.getCurrentGroup();
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setGroup(java.lang.String)
     */
    public void setGroup(String groupName) throws LexerException
    {
        this._currentTokenList.setCurrentGroup(groupName);
        this._currentGroupIndex = -1;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setIgnoreSet(java.lang.String, int[])
     */
    public void setIgnoreSet(String language, int[] set)
    {
        ITokenList tokens = this.getTokenList(language);

        tokens.setIgnoreSet(set);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getLanguage()
     */
    public String getLanguage()
    {
        return this._currentTokenList.getLanguage();
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getLanguages()
     */
    public String[] getLanguages()
    {
        Set<String> keySet = this._tokensByLanguage.keySet();

        return keySet.toArray(new String[keySet.size()]);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setLanguage(java.lang.String)
     */
    public void setLanguage(String language) throws LexerException
    {
        ITokenList tokenList = this._tokensByLanguage.get(language);

        if (tokenList == null)
        {
            throw new LexerException(Messages.Lexer_Unrecognized_Language + language, null);
        }

        this._currentTokenList = tokenList;

        // reset stats for this language. We may want to emit the old values to the log
        this.hitCount = 0;
        this.missCount = 0;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setLanguageAndGroup(java.lang.String, java.lang.String)
     */
    public void setLanguageAndGroup(String language, String group) throws LexerException
    {
        this.setLanguage(language);
        this.setGroup(group);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getSource()
     */
    public String getSource()
    {
        return new String(this.source);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getSourceLength()
     */
    public int getSourceLength()
    {
        return this.source.length;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setSource(char[])
     */
    public void setSource(char[] value)
    {
        this.source = value;
        this.currentOffset = 0;
        this.setEOFOffset(this.source.length);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setSource(java.lang.String)
     */
    public void setSource(String value)
    {
        this.setSource(value.toCharArray());
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getTokenList(java.lang.String)
     */
    public ITokenList getTokenList(String language)
    {
        ITokenList result = null;

        if (this._tokensByLanguage.containsKey(language))
        {
            result = this._tokensByLanguage.get(language);
        }

        return result;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#isEOS()
     */
    public boolean isEOS()
    {
        // return this.getSourceLength() == 0 || this.currentOffset >= this._eofOffset;
        return this.currentOffset >= this.eofOffset;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#addLanguage(com.aptana.ide.lexer.ITokenList)
     */
    public void addLanguage(ITokenList tokens)
    {
        this._tokensByLanguage.put(tokens.getLanguage(), tokens);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#seal()
     */
    public void seal() throws LexerException
    {
        String[] languages = this.getLanguages();

        for (int i = 0; i < languages.length; i++)
        {
            String language = languages[i];
            ITokenList tokens = this.getTokenList(language);

            tokens.seal();
        }
    }

    /**
     * Create a new lexeme.
     * Subclasses will need to override this method to create their own Lexeme subclasses
     *
     * @param token
     *            The token class for this lexeme
     * @param text
     *            The token's associated text
     * @param offset
     *            The token's offset within the source file
     * @return Returns a newly created lexeme
     */
    protected Lexeme createLexeme(IToken token, String text, int offset)
    {
        return new Lexeme(token, text, offset);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#find(java.lang.String)
     */
    public abstract Range find(String groupName) throws LexerException;

    /**
     * match
     *
     * @return Returns the position of the last failed or successful match
     */
    protected abstract int match();

    /**
     * @see com.aptana.ide.lexer.ILexer#setLexerState(java.lang.String, int)
     */
    public void setLexerState(String group, int offset) throws LexerException
    {
        this.setGroup(group);
        this.setCurrentOffset(offset);
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#setLexerState(java.lang.String, char[], int, com.aptana.ide.lexer.LexemeList)
     */
    public void setLexerState(String group, char[] source, int offset, LexemeList cache) throws LexerException
    {
        this.setGroup(group);
        this.setSource(source);
        this.setCurrentOffset(offset);
        this._lexemeCache = cache;
    }

    /**
     * @see com.aptana.ide.lexer.ILexer#getNextLexeme()
     */
    public Lexeme getNextLexeme()
    {
        Lexeme result = null;

        // Start out as if we've found a token we want to ignore
        boolean inIgnoreSet = true;

        // keep advancing until we find a token that is not in our ignore set
        // while (inIgnoreSet && this.isEOS() == false)
        while (inIgnoreSet && this.currentOffset < this.eofOffset)
        {
            result = this.getCachedLexeme();

            if (result != null)
            {
                break;
            }

            // cache our current "start" position
            int start = this.currentOffset;

            // perform a match at the current offset
            int position = this.match();

            // get the index of the token that matched, if any
            int tokenIndex = this.lastMatchedTokenIndex;

            // process token type, if we had a match
            if (tokenIndex != -1)
            {
                ITokenList tokenList = this.getCurrentTokenList();
                IToken token = tokenList.get(tokenIndex);
                int[] ignoreSet = tokenList.getIgnoreSet();
                boolean desirableType = true;

                // NOTE: These sets should be small (<= 3) and linear search wins over
                // binary search with those sizes. Plus our lists are sorted, so we can
                // stop as soon as we reach values greater than what we're looking for
                if (ignoreSet != null)
                {
                    int typeIndex = token.getTypeIndex();

                    for (int i = 0; i < ignoreSet.length; i++)
                    {
                        int current = ignoreSet[i];

                        if (current >= typeIndex)
                        {
                            desirableType = (current > typeIndex);
                            break;
                        }
                    }
                }

                // See if this token is in our set of tokens to ignore
                if (desirableType)
                {
                    // determine text length of this token instance
                    int lexemeLength = position - start;

                    // grab lexeme text
                    String text = new String(this.source, start, lexemeLength);

                    // create resulting lexeme
                    result = this.createLexeme(token, text, start);

                    // flag to exit loop
                    inIgnoreSet = false;
                }

                // update current position in the source text
                this.currentOffset = position;

                // switch to new lexer group associated with the matched token
                int groupIndex = token.getNewLexerGroupIndex();

                if (groupIndex != this._currentGroupIndex)
                {
                    tokenList.setCurrentGroup(groupIndex);
                    this._currentGroupIndex = groupIndex;
                }
            }
            else
            {
                inIgnoreSet = false;
            }
        }

        return result;
    }
}
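
/*
 * Usage sketch (illustrative only, not part of the original source): how a client
 * typically drives a lexer built on this class, using only methods declared above.
 * "MyLexer" is a hypothetical concrete subclass that implements match() and find(),
 * and the language/group names are assumptions.
 *
 *   ILexer lexer = new MyLexer();                           // hypothetical concrete subclass
 *   lexer.setLanguageAndGroup("text/example", "default");   // throws LexerException for an unregistered language
 *   lexer.setSource("source text to scan");
 *
 *   while (lexer.isEOS() == false)
 *   {
 *       Lexeme lexeme = lexer.getNextLexeme();              // null when nothing matches at the current offset
 *
 *       if (lexeme == null)
 *       {
 *           break;
 *       }
 *
 *       // consume the lexeme here ...
 *   }
 */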