/*
* This software is Copyright 2005,2006,2007,2008 Langdale Consultants.
* Langdale Consultants can be contacted at: http://www.langdale.com.au
*/
package au.com.langdale.inference;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Lexical analyser for the rule language. Splits a stream of characters into tokens.
 *
 * Each call to {@link #nextToken()} skips whitespace and <code>#</code> comments and
 * then returns one of: a two character operator (<code>&lt;-</code>, <code>-&gt;</code>,
 * <code>^^</code>, <code>||</code>), a quoted string or angle-bracketed reference
 * (returned with its delimiters), a word made of fragment characters, or a single
 * character. An empty string signals end of input.
 */
public class RuleLexer {
    // this character class is defined in XML
    public static final String NCNAME_CHARS = "_-."; // plus the alphanumerics

    // these character classes are defined in rfc3986 for URIs
    public static final String UNRESERVED_CHARS = NCNAME_CHARS + "~";
    public static final String SUB_DELIM_CHARS = "!$&'()*+,;=";
    public static final String PATH_CHARS = UNRESERVED_CHARS + SUB_DELIM_CHARS + ":@";
    public static final String FRAGMENT_CHARS = PATH_CHARS + "/?";

    // these are delimiters that may follow a qname in a rule without intervening whitespace
    public static final String RULE_DELIM_CHARS = "(,)";

    // two character operators, probed in this order before any other token kind
    private static final String[] OPERATORS = { "<-", "->", "^^", "||" };

    /**
     * Determine if the character may occur in a URI fragment field
     * as defined by rfc3986, excluding the rule delimiters in
     * {@link #RULE_DELIM_CHARS} so that they always terminate a word.
     *
     * @param code the character to test; -1 (end of input) is never a fragment character.
     * @return true if the character can be part of a word token.
     */
    public static boolean isFragmentChar(int code) {
        if (RULE_DELIM_CHARS.indexOf(code) != -1)
            return false;
        return Character.isLetterOrDigit(code) || FRAGMENT_CHARS.indexOf(code) != -1;
    }

    // End of line and end of input special characters
    private static final int EOL = '\n';
    private static final int EOI = -1;

    private final Reader source;

    // characters read from the source but not yet delivered by take()
    StringBuffer lookahead = new StringBuffer();

    // comment fragments collected by the current nextToken() call
    List<String> comments = new ArrayList<String>();

    // index of the current character within lookahead
    int cursor = 0;

    // line number at the cursor, and line number as of the last take()
    private int nextLineNumber = 1;
    private int lineNumber = 1;

    /**
     * Apply lexical analysis to a character stream.
     *
     * Nothing is read from the stream until the first token is requested.
     *
     * @param source the character stream.
     * @throws IOException declared for compatibility with existing callers;
     *         this constructor does not itself read from the stream.
     */
    public RuleLexer(Reader source) throws IOException {
        this.source = source;
    }

    /**
     * Return the current character or -1 for end of input.
     *
     * The current character is initially the first character
     * in the input stream. This is changed by next() and revert().
     *
     * This method fills a lookahead buffer by reading the stream
     * if necessary.
     */
    private int get() throws IOException {
        while (cursor >= lookahead.length()) {
            int code = source.read();
            if (code == EOI)
                return EOI;
            // Reader.read() delivers a single UTF-16 code unit
            lookahead.append((char) code);
        }
        return lookahead.charAt(cursor);
    }

    /**
     * Queue the current character and make the following character
     * in the input stream current.
     *
     * Return this character or -1 for end of input. At end of input
     * nothing is queued and the state is unchanged.
     */
    private int next() throws IOException {
        int code = get();
        if (code == EOI)
            return EOI;
        if (code == EOL)
            nextLineNumber++;
        cursor++;
        return get();
    }

    /**
     * Revert the lexer to its state following the last take().
     * The current character becomes the character after the
     * string returned by take().
     */
    private void revert() {
        cursor = 0;
        nextLineNumber = lineNumber;
    }

    /**
     * Return the string of characters that follow the last take() result
     * in the input stream up to but excluding the current character.
     *
     * That is, the characters queued by next() since the last
     * take() or revert(). The reported line number is brought up to date
     * with the lines passed over so far.
     */
    private String take() {
        String result = lookahead.substring(0, cursor);
        lookahead.delete(0, cursor);
        cursor = 0;
        lineNumber = nextLineNumber;
        return result;
    }

    /**
     * @return the line number at which the last reported token was found.
     *         This counts every end of line consumed up to the end of that token.
     */
    public int getLineNumber() {
        return lineNumber;
    }

    /**
     * Extract a token. This may advance the input stream beyond the
     * token in order to find a match.
     *
     * Any comments skipped on the way are available from {@link #getComments()}
     * until the next call.
     *
     * @return the next token or an empty string on end of input.
     *
     * @throws IOException if the underlying stream fails.
     */
    public String nextToken() throws IOException {
        comments.clear();
        skipWhitespaceAndComments();

        for (int ix = 0; ix < OPERATORS.length; ix++) {
            String op = OPERATORS[ix];
            if (matchPair(op.charAt(0), op.charAt(1)))
                return take();
        }

        int code = get();
        if (code == '<')
            return quoted('>');
        if (code == '"' || code == '\'')
            return quoted((char) code);

        // a word, or failing that a single character; at end of input
        // next() queues nothing and take() yields the empty string
        next();
        if (isFragmentChar(code)) {
            while (isWord())
                next();
        }
        return take();
    }

    /**
     * Discard whitespace and collect '#' comments up to the start of the next token.
     */
    private void skipWhitespaceAndComments() throws IOException {
        for (;;) {
            int code = get();
            if (Character.isWhitespace(code)) {
                next();
                take();
            }
            else if (code == '#') {
                next();
                take();
                // Read fragments up to the end of the line. An empty fragment does not
                // end the comment: the remainder must not be lexed as tokens.
                while (get() != EOL && get() != EOI) {
                    String comment = interpolate();
                    if (comment.length() > 0)
                        comments.add(comment);
                }
            }
            else {
                return;
            }
        }
    }

    /**
     * Try to match a two character operator at the current position.
     * On success both characters are queued for take(); otherwise
     * the lexer is reverted and nothing is queued.
     */
    private boolean matchPair(char first, char second) throws IOException {
        if (get() == first && next() == second) {
            next();
            return true;
        }
        revert();
        return false;
    }

    /**
     * The comments that preceded the token delivered by the most recent call to nextToken().
     *
     * Each comment line is split into plain text fragments and <code>?name</code>
     * fragments. The returned list is the lexer's own list and is cleared
     * by the next call to nextToken().
     */
    public List<String> getComments() {
        return comments;
    }

    /**
     * Extract the next fragment of the current comment line: either a
     * '?' followed by word characters, or the trimmed text up to the next '?',
     * end of line or end of input. The end of line itself is not consumed.
     */
    private String interpolate() throws IOException {
        while (get() != EOL && Character.isWhitespace(get())) {
            next();
            take();
        }
        if (get() == '?') {
            next();
            while (isWord())
                next();
            return take();
        }
        while (get() != EOL && get() != EOI && get() != '?')
            next();
        return take().trim();
    }

    /**
     * Extract a quoted token. The current character is the opening delimiter.
     * The token extends to the closing delimiter or to end of input if there is none.
     * A backslash causes the following character to be passed over so that
     * an escaped delimiter does not end the token.
     *
     * @param delim the closing delimiter.
     * @return the token including its delimiters, with escaped backslashes and
     *         escaped delimiters replaced by the characters they stand for.
     */
    private String quoted(char delim) throws IOException {
        next(); // the opening delimiter
        for (;;) {
            int code = get();
            if (code == EOI)
                break;
            if (code == '\\') {
                next();
                next();
            }
            else {
                next();
                if (code == delim)
                    break;
            }
        }
        return unescape(take(), delim);
    }

    /**
     * Replace each backslash-backslash pair with a single backslash and each
     * backslash-delimiter pair with the delimiter. This is done in one pass,
     * left to right, so that the result of one replacement is never mistaken
     * for the start of another escape. Other backslash sequences are left as is.
     */
    private static String unescape(String raw, char delim) {
        if (raw.indexOf('\\') == -1)
            return raw;
        StringBuilder out = new StringBuilder(raw.length());
        int ix = 0;
        while (ix < raw.length()) {
            char ch = raw.charAt(ix);
            if (ch == '\\' && ix + 1 < raw.length()) {
                char escaped = raw.charAt(ix + 1);
                if (escaped == '\\' || escaped == delim) {
                    out.append(escaped);
                    ix += 2;
                    continue;
                }
            }
            out.append(ch);
            ix++;
        }
        return out.toString();
    }

    /**
     * Determine whether the current character can be part of a word.
     */
    private boolean isWord() throws IOException {
        return isFragmentChar(get());
    }

    /**
     * Diagnostic entry point: print each token of a bundled rule file
     * together with its line number.
     */
    public static void main(String[] args) {
        String resource = "/au/com/langdale/cim/cimtool-simple.rules";
        java.io.InputStream stream = RuleLexer.class.getResourceAsStream(resource);
        if (stream == null) {
            System.err.println("resource not found: " + resource);
            return;
        }
        Reader reader = new BufferedReader(new InputStreamReader(stream));
        try {
            RuleLexer lexer = new RuleLexer(reader);
            for (;;) {
                String token = lexer.nextToken();
                if (token.length() == 0)
                    break;
                System.out.println(token + "\t" + lexer.getLineNumber());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a read-only resource fails
            }
        }
    }
}