RuleLexer.java example

Explorer
CIMTool-master
/*
 * This software is Copyright 2005,2006,2007,2008 Langdale Consultants.
 * Langdale Consultants can be contacted at: http://www.langdale.com.au
 */
package au.com.langdale.inference;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Lexical analyser for the rule language.  Splits a stream of characters into tokens.
 */
public class RuleLexer {

	// this character class is defined in XML
	public static final String NCNAME_CHARS = "_-."; // plus the alphanumerics 

	// these character classes are defined in rfc3986 for URI's
	public static final String UNRESERVED_CHARS = NCNAME_CHARS + "~";
	public static final String SUB_DELIM_CHARS = "!$&'()*+,;=";
	public static final String PATH_CHARS = UNRESERVED_CHARS + SUB_DELIM_CHARS + ":@"; 
	public static final String FRAGMENT_CHARS = PATH_CHARS + "/?";

	// these are delimiters we can use in a rule following to a qname without whitespace
	public static final String RULE_DELIM_CHARS = "(,)";

	/**
	 * Determine if the character may occour in a URI fragment field
	 * as defined by  rfc3986.
	 */
	public static boolean isFragmentChar(int code) {
		return (Character.isLetterOrDigit(code) || FRAGMENT_CHARS.indexOf(code) != -1) && RULE_DELIM_CHARS.indexOf(code) == -1;
	}

	// End of line and end of input special characters
	private static final int EOL = '\n';
	private static final int EOI = -1;
	
	private Reader source;
	StringBuffer lookahead = new StringBuffer();
	List comments = new ArrayList();
	int cursor = 0;

	private int nextLineNumber = 1;
	private int lineNumber = 1;
	/**
	 * Apply lexical analysis to a character stream.
	 * @param source: the character stream.
	 * @throws IOException
	 */
	public RuleLexer( Reader source) throws IOException {
		this.source = source;
	}

	/**
	 * Return the current character or -1 for end of input.  
	 * 
	 * The current character is initially the first character 
	 * in the input stream.  This is changed by next() and revert().
	 * 
	 * This method fills a lookahead buffer by reading the stream
	 * if necessary.
	 */
	private int get() throws IOException {
		while( cursor >= lookahead.length()) {
			int code = source.read();
			if( code == EOI )
				break;
			lookahead.appendCodePoint(code);
		}
		
		if( cursor >= lookahead.length()) 
			return EOI;
		else
			return lookahead.charAt(cursor);
	}
	
	/**
	 * Queue the current character and make the following character
	 * in the input stream current.  
	 * 
	 * Return this character or -1 for end of input.
	 */
	private int next() throws IOException {
		int code = get();
		if( code == EOI) 
			return EOI;
		if( code == EOL )
			nextLineNumber ++;
		
		cursor++;
		return get();
	}
	
	/**
	 * Revert the lexer to its state following the last take(). 
	 * The current character becomes the character after the 
	 * string returned by take(). 
	 */
	private void revert() {
		cursor = 0;
		nextLineNumber = lineNumber;
	}
	
	/**
	 * Return the string of characters that follow the last take() result
	 * in the input stream up to but excluding the current character.
	 * 
	 * That is, the characters queued by next() since the last 
	 * take() or revert().  
	 */
	private String take() {
		String result = lookahead.substring(0, cursor);
		lookahead.delete(0, cursor);
		cursor = 0;
		lineNumber = nextLineNumber;
		return result;
	}
	/**
	 * @return: the line number at which the last reported token was found.
	 */
	public int getLineNumber() {
		return lineNumber;
	}
	/**
	 * Extract a token.  This may advance the input stream beyond the
	 * token in order to find a match.
	 * 
	 * @return: the next token or an empty string on end of input.
	 *  
	 * @throws IOException
	 */
	public String nextToken() throws IOException {
		comments.clear();
		
		for(;;) {
			if( Character.isWhitespace(get())) {
				next();
				take();
			}
			else if( get() == '#') {
				next();
				take();
				String comment = interpolate();
				while(comment.length() > 0) {
					comments.add(comment);
					comment = interpolate();
				}
			}
			else {
				break;
			}
		}
		
		if( get() == '<' && next() == '-') {
			next();
			return take();
		}
		else
			revert();
		
		if(get() == '-' && next() == '>') {
			next();
			return take();
		}
		else
			revert();
		
		if(get() == '^' && next() == '^') {
			next();
			return take();
		}
		else
			revert();
		
		if(get() == '|' && next() == '|') {
			next();
			return take();
		}
		else
			revert();

		if( get() == '<' )
			return quoted('>');
		
		if( get() == '"')
			return quoted('"');
		
		if( get() == '\'')
			return quoted('\'');
		
		if(isWord()) { 
			next();
			while( isWord())
				next();
			return take();
		}
		
		next();
		return take();
	}
	
	/**
	 * The comments that preceded the token delivered by the most recent call to nextToken()
	 */
	public List getComments() {
		return comments;
	}
	
	private String interpolate() throws IOException {
		while(get() != EOL && Character.isWhitespace(get())) {
			next();
			take();
		}
		
		if( get() == '?') {
			next();
			while( isWord())
				next();
			return take();
		}
		else {
			while(get() != EOL && get() != EOI && get() != '?')
				next();
			return take().trim();
		}
	}
	
	private String quoted(char delim) throws IOException {
		next();
		for(;;) {
			if(get() == EOI) {
				break;
			}
			else if(get() ==  '\\') {
				next();
				next();
			}
			else if( get() == delim ) {
				next();
				break;
			}
			else {
				next();
			}
		}
		String result = take();
		result.replace("\\\\", "\\");
		result.replace("\\" + delim, "" + delim);
		return result;
	}

	private boolean isWord() throws IOException {
		int code = get();
		return isFragmentChar(code);
	}
	
	public static void main(String[] args) {
		RuleLexer  l;
		
		try {
			l = new RuleLexer( new BufferedReader(new InputStreamReader( RuleLexer.class.getResourceAsStream("/au/com/langdale/cim/cimtool-simple.rules"))));
		} catch (IOException e) {
			e.printStackTrace();
			return;
		}
		
		for(;;) {
			String token;
			try {
				token = l.nextToken();
			} catch (IOException e) {
				e.printStackTrace();
				return;
			}
			
			if( token.length() == 0)
				return;
			System.out.println(token + "\t" + l.getLineNumber());
		}
	}
}