/*
* This software and all files contained in it are distrubted under the MIT license.
*
* Copyright (c) 2013 Cogito Learning Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package uk.co.cogitolearning.cogpar;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A class for reading an input string and separating it into tokens that can be
* fed into Parser.
*
* The user can add regular expressions that will be matched against the front
* of the string. Regular expressions should not contain beginning-of-string or
* end-of-string anchors or any capturing groups as these will be added by the
* tokenizer itslef.
*/
public class Tokenizer
{
/**
* Internal class holding the information about a token type.
*/
private class TokenInfo
{
/** the regular expression to match against */
public final Pattern regex;
/** the token id that the regular expression is linked to */
public final int token;
/**
* Construct TokenInfo with its values
*/
public TokenInfo(Pattern regex, int token)
{
super();
this.regex = regex;
this.token = token;
}
}
/**
* a list of TokenInfo objects
*
* Each token type corresponds to one entry in the list
*/
private LinkedList<TokenInfo> tokenInfos;
/** the list of tokens produced when tokenizing the input */
private LinkedList<Token> tokens;
/** a tokenizer that can handle mathematical expressions */
private static Tokenizer expressionTokenizer = null;
/**
* Default constructor
*/
public Tokenizer()
{
super();
tokenInfos = new LinkedList<TokenInfo>();
tokens = new LinkedList<Token>();
}
/**
* A static method that returns a tokenizer for mathematical expressions
* @return a tokenizer that can handle mathematical expressions
*/
public static Tokenizer getExpressionTokenizer()
{
if (expressionTokenizer == null)
expressionTokenizer = createExpressionTokenizer();
return expressionTokenizer;
}
/**
* A static method that actually creates a tokenizer for mathematical expressions
* @return a tokenizer that can handle mathematical expressions
*/
private static Tokenizer createExpressionTokenizer()
{
Tokenizer tokenizer = new Tokenizer();
tokenizer.add("[+-]", Token.PLUSMINUS);
tokenizer.add("[*/]", Token.MULTDIV);
tokenizer.add("\\^", Token.RAISED);
String funcs = FunctionExpressionNode.getAllFunctions();
tokenizer.add("(" + funcs + ")(?!\\w)", Token.FUNCTION);
tokenizer.add("\\(", Token.OPEN_BRACKET);
tokenizer.add("\\)", Token.CLOSE_BRACKET);
tokenizer.add("(?:\\d+\\.?|\\.\\d)\\d*(?:[Ee][-+]?\\d+)?", Token.NUMBER);
tokenizer.add("[a-zA-Z]\\w*", Token.VARIABLE);
return tokenizer;
}
/**
* Add a regular expression and a token id to the internal list of recognized tokens
* @param regex the regular expression to match against
* @param token the token id that the regular expression is linked to
*/
public void add(String regex, int token)
{
tokenInfos.add(new TokenInfo(Pattern.compile("^(" + regex+")"), token));
}
/**
* Tokenize an input string.
*
* The reult of tokenizing can be accessed via getTokens
*
* @param str the string to tokenize
*/
public void tokenize(String str)
{
String s = str.trim();
int totalLength = s.length();
tokens.clear();
while (!s.equals(""))
{
int remaining = s.length();
boolean match = false;
for (TokenInfo info : tokenInfos)
{
Matcher m = info.regex.matcher(s);
if (m.find())
{
match = true;
String tok = m.group().trim();
// System.out.println("Success matching " + s + " against " +
// info.regex.pattern() + " : " + tok);
s = m.replaceFirst("").trim();
tokens.add(new Token(info.token, tok, totalLength - remaining));
break;
}
}
if (!match)
throw new ParserException("Unexpected character in input: " + s);
}
}
/**
* Get the tokens generated in the last call to tokenize.
* @return a list of tokens to be fed to Parser
*/
public LinkedList<Token> getTokens()
{
return tokens;
}
}