/** * Copyright 2012 Tobias Gierke <tobias.gierke@code-sourcery.de> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.codesourcery.jasm16.lexer; import java.util.*; import org.apache.commons.lang.StringUtils; import de.codesourcery.jasm16.OpCode; import de.codesourcery.jasm16.exceptions.EOFException; import de.codesourcery.jasm16.exceptions.ParseException; import de.codesourcery.jasm16.parser.Operator; import de.codesourcery.jasm16.scanner.IScanner; import de.codesourcery.jasm16.utils.NumberLiteralHelper; /** * Default {@link ILexer} implementation. * * @author tobias.gierke@code-sourcery.de */ public final class Lexer implements ILexer { private final IScanner scanner; private final StringBuilder buffer = new StringBuilder(); private final Set<LexerOption> options = new HashSet<LexerOption>(); private boolean caseSensitiveOpCodes = true; // internal state private final List<IToken> currentTokens=new ArrayList<IToken>(); private final Stack<State> marks = new Stack<State>(); private final ParseOffset parseOffset; public static final class ParseOffset { // offset relative to actual scanner offset, used // when expanding macro invocations private int baseOffset; private int currentLineNumber; private int currentLineStartOffset; public ParseOffset() { this(0,1,0); } public ParseOffset(int baseOffset, int currentLineNumber,int currentLineStartOffset) { this.baseOffset = baseOffset; this.currentLineNumber = currentLineNumber; this.currentLineStartOffset = currentLineStartOffset; } public ParseOffset(ParseOffset offset) { this.baseOffset = offset.baseOffset; this.currentLineNumber = offset.currentLineNumber; this.currentLineStartOffset = offset.currentLineStartOffset; } @Override public String toString() { return "ParseOffset[ base_offset="+baseOffset+", line_nr="+currentLineNumber+",lineStartingOffset="+currentLineStartOffset+"]"; } public int baseOffset() { return baseOffset; } public int currentLineNumber() { return currentLineNumber;} public int currentLineStartOffset() { return currentLineStartOffset; } public void apply(ParseOffset offset) { this.baseOffset = offset.baseOffset; this.currentLineNumber = offset.currentLineNumber; this.currentLineStartOffset = offset.currentLineStartOffset; } public void newLine(int newLineStartOffset) { this.currentLineNumber++; this.currentLineStartOffset = newLineStartOffset; } } protected final class State { private final List<IToken> markedTokens = new ArrayList<IToken>(); private final int scannerOffset; private final ParseOffset offset; private final Set<LexerOption> options; protected State() { this.markedTokens.addAll( Lexer.this.currentTokens ); this.scannerOffset = Lexer.this.scanner.currentParseIndex(); this.offset = new ParseOffset( Lexer.this.parseOffset ); this.options = new HashSet<>( Lexer.this.options ); } public void apply() { Lexer.this.scanner.setCurrentParseIndex( this.scannerOffset ); Lexer.this.currentTokens.clear(); Lexer.this.currentTokens.addAll( this.markedTokens ); Lexer.this.parseOffset.apply( this.offset ); Lexer.this.options.clear(); Lexer.this.options.addAll( this.options ); } } public Lexer(IScanner scanner) { this(scanner,new ParseOffset()); } public Lexer(IScanner scanner,ParseOffset offset) { this.scanner = scanner; this.parseOffset = offset; } @Override public void mark() { marks.push( new State() ); } @Override public void clearMark() { if ( marks.isEmpty() ) { throw new IllegalStateException("Must call mark() first"); } marks.pop(); } @Override public void reset() throws IllegalStateException { if ( marks.isEmpty() ) { throw new IllegalStateException("Must call mark() first"); } // TODO: Maybe should be pop() here ??? marks.peek().apply(); } private void parseNextToken() { if ( scanner.eof() ) { return; } // clear buffer buffer.setLength(0); // skip whitespace int startIndex = relativeParseIndex(); while ( ! scanner.eof() && isWhitespace( scanner.peek() ) ) { buffer.append( scanner.read() ); } if ( buffer.length() > 0 ) { currentTokens.add( new Token( TokenType.WHITESPACE , buffer.toString(), startIndex ) ); } if ( scanner.eof() ) { return; } startIndex = relativeParseIndex(); char currentChar = scanner.peek(); buffer.setLength( 0 ); while ( ! scanner.eof() ) { currentChar = scanner.peek(); switch( currentChar ) { case ' ': // whitespace case '\t': // whitespace handleString( buffer.toString() , startIndex ); return; case ';': // single-line comment handleString( buffer.toString() , startIndex ); startIndex = relativeParseIndex(); scanner.read(); currentTokens.add( new Token(TokenType.SINGLE_LINE_COMMENT, ";" , relativeParseIndex()-1 ) ); return; case '\\': handleString( buffer.toString() , startIndex ); startIndex = relativeParseIndex(); scanner.read(); currentTokens.add( new Token(TokenType.STRING_ESCAPE, "\\", relativeParseIndex()-1 ) ); return; case '\'': case '"': // string delimiter handleString( buffer.toString() , startIndex ); startIndex = relativeParseIndex(); scanner.read(); currentTokens.add( new Token(TokenType.STRING_DELIMITER, Character.toString( currentChar ) , relativeParseIndex()-1 ) ); return; case '\n': // parse unix-style newline handleString( buffer.toString() , startIndex ); startIndex = relativeParseIndex(); scanner.read(); currentTokens.add( new Token(TokenType.EOL, "\n" , relativeParseIndex()-1 ) ); return; case '\r': // parse DOS-style newline buffer.append( scanner.read() ); if ( ! scanner.eof() && scanner.peek() == '\n' ) { handleString( buffer.toString() , buffer.length()-1 , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.EOL, "\r\n" , relativeParseIndex()-2 ) ); return; } continue; case ':': handleString( buffer.toString() , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.COLON , ":" , relativeParseIndex()-1 ) ); return; case '(': handleString( buffer.toString() , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.PARENS_OPEN , "(" , relativeParseIndex()-1) ); return; case ')': handleString( buffer.toString() , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.PARENS_CLOSE, ")" , relativeParseIndex()-1 ) ); return; case '[': handleString( buffer.toString() , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.ANGLE_BRACKET_OPEN , "[" , relativeParseIndex()-1) ); return; case ']': handleString( buffer.toString() , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.ANGLE_BRACKET_CLOSE, "]" , relativeParseIndex()-1 ) ); return; case ',': handleString( buffer.toString() , startIndex ); scanner.read(); currentTokens.add( new Token(TokenType.COMMA , "," , relativeParseIndex()-1 ) ); return; } if ( Operator.isOperatorPrefix( currentChar ) ) { parseOperator( startIndex ); return; } // ...keep the rest...some unrecognized character sequence buffer.append( scanner.read() ); } handleString( buffer.toString() , startIndex ); } /** * Returns the scanner's current parse offset plus the parsing base offset. * @return */ private int relativeParseIndex() { return this.parseOffset.baseOffset+scanner.currentParseIndex(); } private void parseOperator(int lastStartIndex) { handleString( buffer.toString() , lastStartIndex ); buffer.setLength( 0 ); // consume first character final int startIndex = relativeParseIndex(); buffer.append( scanner.read() ); List<Operator> possibleOperators = Operator.getPossibleOperatorsByPrefix( buffer.toString() ); while ( ! scanner.eof() && ( possibleOperators.size() > 1 || ( possibleOperators.size() == 1 && ! Operator.isValidOperator( buffer.toString() ) ) ) ) { char peek = scanner.peek(); if ( Operator.isOperatorPrefix( buffer.toString()+peek ) ) { buffer.append( scanner.read() ); possibleOperators = Operator.getPossibleOperatorsByPrefix( buffer.toString() ); } else { break; } } final String operator; if ( possibleOperators.size() > 1 ) { operator = Operator.pickOperatorWithLongestMatch( buffer.toString() ).getLiteral(); } else { operator = buffer.toString(); } currentTokens.add( new Token( TokenType.OPERATOR , operator , startIndex ) ); } private void handleString(String buffer, int startIndex) { handleString(buffer,buffer.length() , startIndex ); } private void handleString(String s, int length , int startIndex) { /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! * MAKE SURE TO ADJUST isKeyword(String) when changing keywords here * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ /* * Note that all comparisons here are ordered by * their probabilities (more likely checks come first). */ if ( s.length() == 0 || length <= 0 ) { return; } final String buffer = s.substring(0,length); OpCode opCode = caseSensitiveOpCodes ? OpCode.fromIdentifier( buffer ) : OpCode.fromIdentifier( buffer.toUpperCase() ); if ( opCode != null ) { currentTokens.add( new Token( TokenType.INSTRUCTION , buffer , startIndex ) ); return; } if ( NumberLiteralHelper.isNumberLiteral( buffer ) ) { currentTokens.add( new Token(TokenType.NUMBER_LITERAL , buffer , startIndex ) ); return; } if ( "push".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.PUSH , buffer , startIndex ) ); return ; } if ( "pop".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.POP , buffer , startIndex ) ); return ; } if ( ".word".equalsIgnoreCase( buffer ) || "dat".equalsIgnoreCase( buffer ) || ".dat".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.INITIALIZED_MEMORY_WORD , buffer , startIndex ) ); return ; } if ( ".equ".equalsIgnoreCase( buffer ) || "#define".equalsIgnoreCase(buffer) ) { currentTokens.add( new Token(TokenType.EQUATION , buffer , startIndex ) ); return ; } if ( "pick".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.PICK , buffer , startIndex ) ); return ; } if ( "peek".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.PEEK , buffer , startIndex ) ); return ; } if ( ".byte".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.INITIALIZED_MEMORY_BYTE , buffer , startIndex ) ); return ; } if ( "pack".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.INITIALIZED_MEMORY_PACK , buffer , startIndex ) ); return ; } if ( "reserve".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.UNINITIALIZED_MEMORY_WORDS , buffer , startIndex ) ); return ; } if ( ".bss".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.UNINITIALIZED_MEMORY_BYTES , buffer , startIndex ) ); return ; } if ( "#include".equals( buffer ) || ".include".equals( buffer ) || "include".equalsIgnoreCase( buffer) || ".incsource".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.INCLUDE_SOURCE, buffer , startIndex ) ); return ; } if ( ".incbin".equalsIgnoreCase( buffer ) || "incbin".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.INCLUDE_BINARY , buffer , startIndex ) ); return ; } if ( "org".equalsIgnoreCase( buffer ) || ".org".equalsIgnoreCase( buffer ) || ".origin".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.ORIGIN , buffer , startIndex ) ); return ; } if ( ".macro".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.START_MACRO , buffer , startIndex ) ); return ; } if ( ".endmacro".equalsIgnoreCase( buffer ) ) { currentTokens.add( new Token(TokenType.END_MACRO , buffer , startIndex ) ); return ; } if ( buffer.contains("." ) ) { int idx = startIndex; int lastIndex = startIndex; final StringBuilder tmp = new StringBuilder(); final int len = buffer.length(); for ( int i = 0 ; i <len ; i++ , idx++) { final char c = buffer.charAt( i ); if ( c == '.' ) { if ( tmp.length() > 0 ) { currentTokens.add( new Token(TokenType.CHARACTERS, tmp.toString() , lastIndex ) ); tmp.setLength(0); } currentTokens.add( new Token(TokenType.DOT, "." , idx ) ); lastIndex = idx+1; continue; } tmp.append( c ); } if ( tmp.length() > 0 ) { currentTokens.add( new Token(TokenType.CHARACTERS, tmp.toString() , lastIndex ) ); } return; } currentTokens.add( new Token( TokenType.CHARACTERS , buffer , startIndex ) ); } /** * Returns whether a given string matches a keyword (case-insensitive). * * @param s * @return */ public boolean isKeyword(String buffer) { if ( StringUtils.isBlank(buffer) ) { return false; } if ( OpCode.fromIdentifier( buffer ) != null ) { return true; } if ( "push".equalsIgnoreCase( buffer ) ) { return true; } if ( "pop".equalsIgnoreCase( buffer ) ) { return true; } if ( ".word".equalsIgnoreCase( buffer ) || "dat".equalsIgnoreCase( buffer ) || ".dat".equalsIgnoreCase( buffer ) ) { return true; } if ( ".equ".equalsIgnoreCase( buffer ) || "#define".equalsIgnoreCase(buffer) ) { return true; } if ( "pick".equalsIgnoreCase( buffer ) ) { return true; } if ( "peek".equalsIgnoreCase( buffer ) ) { return true; } if ( ".byte".equalsIgnoreCase( buffer ) ) { return true; } if ( "pack".equalsIgnoreCase( buffer ) ) { return true; } if ( "reserve".equalsIgnoreCase( buffer ) ) { return true ; } if ( ".bss".equalsIgnoreCase( buffer ) ) { return true; } if ( "#include".equals( buffer ) || ".include".equals( buffer ) || "include".equalsIgnoreCase( buffer) || ".incsource".equalsIgnoreCase( buffer ) ) { return true; } if ( ".incbin".equalsIgnoreCase( buffer ) || "incbin".equalsIgnoreCase( buffer ) ) { return true; } if ( "org".equalsIgnoreCase( buffer ) || ".org".equalsIgnoreCase( buffer ) || ".origin".equalsIgnoreCase( buffer ) ) { return true; } if ( ".macro".equalsIgnoreCase( buffer ) ) { return true; } if ( ".endmacro".equalsIgnoreCase( buffer ) ) { return true; } return false; } private static boolean isWhitespace(char c ) { return c == ' ' || c == '\t'; } private IToken currentToken() { if ( currentTokens.isEmpty() ) { parseNextToken(); if ( currentTokens.isEmpty() ) { return null; } return currentTokens.get(0); } return currentTokens.get(0); } @Override public boolean eof() { return currentToken() == null; } @Override public IToken peek() throws EOFException { if ( eof() ) { throw new EOFException("Premature end of file",currentParseIndex() ); } return currentToken(); } @Override public boolean peek(TokenType t) throws EOFException { if ( eof() ) { throw new EOFException("Premature end of file",currentParseIndex() ); } return currentToken().hasType(t); } @Override public IToken read() throws EOFException { if ( eof() ) { throw new EOFException("Premature end of file",currentParseIndex() ); } final IToken result = currentToken(); currentTokens.remove( 0 ); if ( result.isEOL() ) { this.parseOffset.newLine( result.getStartingOffset()+1); } return result; } @Override public int currentParseIndex() { final IToken tok = currentToken(); return tok != null ? tok.getStartingOffset() : relativeParseIndex(); } @Override public IToken read(TokenType expectedType) throws ParseException,EOFException { return read((String) null,expectedType); } @Override public IToken read(String errorMessage, TokenType expectedType) throws ParseException,EOFException { final IToken tok = peek(); if ( tok.getType() != expectedType ) { if ( StringUtils.isBlank( errorMessage ) ) { if ( expectedType != TokenType.EOL && expectedType != TokenType.WHITESPACE ) { throw new ParseException( "Expected token of type "+expectedType+" but got '"+tok.getContents()+"'", tok ); } throw new ParseException( "Expected token of type "+expectedType+" but got "+tok.getType(), tok ); } throw new ParseException( errorMessage, tok ); } return read(); } @Override public List<IToken> advanceTo(TokenType[] expectedTypes,boolean advancePastMatchedToken) { if ( expectedTypes == null ) { throw new IllegalArgumentException("expectedTokenTypes must not be NULL."); } boolean expectingEOL = false; for ( TokenType t : expectedTypes ) { if ( TokenType.EOL == t ) { expectingEOL = true; break; } } final List<IToken> result = new ArrayList<IToken>(); while( ! eof() ) { if ( peek().isEOL() ) { if ( expectingEOL ) { if ( advancePastMatchedToken ) { result.add( read() ); } } return result; // RETURN } for ( TokenType expectedType : expectedTypes ) { if ( peek().hasType( expectedType ) ) { if ( advancePastMatchedToken ) { result.add( read() ); } return result; // RETURN ! } } result.add( read() ); } return result; } @Override public int getCurrentLineNumber() { return parseOffset.currentLineNumber(); } @Override public int getCurrentLineStartOffset() { return parseOffset.currentLineStartOffset(); } @Override public String toString() { return eof() ? "Lexer is at EOF" : peek().toString(); } @Override public boolean hasLexerOption(LexerOption option) { if (option == null) { throw new IllegalArgumentException("option must not be NULL"); } return this.options.contains( option ); } @Override public void setLexerOption(LexerOption option, boolean enabled) { if ( option == null ) { throw new IllegalArgumentException("option must not be NULL"); } if ( enabled ) { options.add( option ); } else { options.remove( option ); } if ( option == LexerOption.CASE_INSENSITIVE_OPCODES ) { caseSensitiveOpCodes = ! enabled; } } @Override public List<IToken> skipWhitespace(boolean skipEOL) { List<IToken> result = new ArrayList<>(); while ( ! eof() && ( peek().isWhitespace() || (skipEOL && peek().isEOL() ) ) ) { result.add( read() ); } return result; } }