/*
* Copyright 2014 Igor Maznitsa (http://www.igormaznitsa.com).
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.igormaznitsa.prol.parser;
import com.igormaznitsa.prol.containers.KnowledgeBase;
import com.igormaznitsa.prol.containers.OperatorContainer;
import com.igormaznitsa.prol.data.Term;
import com.igormaznitsa.prol.data.TermFloat;
import com.igormaznitsa.prol.data.TermInteger;
import com.igormaznitsa.prol.data.Var;
import com.igormaznitsa.prol.exceptions.ProlCriticalError;
import com.igormaznitsa.prol.exceptions.ParserException;
import java.io.IOException;
/**
* The class implements a tokenizer which can parse a prolog source
*
* @author Igor Maznitsa (igor.maznitsa@igormaznitsa.com)
*/
public final class ProlTokenizer {
/**
 * Immutable holder for one token read by the tokenizer: the term that was
 * built from the token text plus the state-machine state it was recognised in.
 *
 * @author Igor Maznitsa (igor.maznitsa@igormaznitsa.com)
 */
public static final class ProlTokenizerResult {

  /**
   * Result state: the tokenizer was still looking for a token.
   */
  public static final int STATE_LOOKFOR = 0;

  /**
   * Result state: a text atom has been found.
   */
  public static final int STATE_ATOM = 1;

  /**
   * Result state: a string (an atom bounded by \') has been found.
   */
  public static final int STATE_STRING = 2;

  /**
   * Result state: an operator has been found (it is an operator in the current context).
   */
  public static final int STATE_OPERATOR = 3;

  /**
   * Result state: a variable has been found.
   */
  public static final int STATE_VARIABLE = 4;

  /**
   * The term built from the read token.
   */
  private final Term term;

  /**
   * The state the state machine associated with the read token.
   */
  private final int state;

  /**
   * Create a result for a read token.
   *
   * @param term the read term, must not be null
   * @param state the state of the state machine for the term
   */
  public ProlTokenizerResult(final Term term, final int state) {
    this.term = term;
    this.state = state;
  }

  /**
   * Get the type of the wrapped term, as reported by the term itself.
   *
   * @return the term type as integer
   * @see com.igormaznitsa.prol.data.Term
   */
  public final int getTermType() {
    return this.term.getTermType();
  }

  /**
   * Get the text of the wrapped term, as reported by the term itself.
   *
   * @return the text of the read term
   */
  public final String getText() {
    return this.term.getText();
  }

  /**
   * Get the state-machine state associated with the read term.
   *
   * @return the state as integer
   */
  public final int getState() {
    return this.state;
  }

  /**
   * Get the wrapped term.
   *
   * @return the read term
   */
  public final Term getTerm() {
    return this.term;
  }
}
/**
 * State-machine state: no token has been started yet, the machine is looking for the next one.
 * The values 0..4 of the inside states equal the public STATE_* constants of
 * ProlTokenizerResult, and they are passed as result states when tokens are created.
 */
private static final int INSIDE_STATE_LOOKFOR = 0;

/**
 * State-machine state: an atom is being collected in the buffer.
 */
private static final int INSIDE_STATE_ATOM = 1;

/**
 * State-machine state: a quoted string is being collected in the buffer.
 */
private static final int INSIDE_STATE_STRING = 2;

/**
 * State-machine state: an operator is being collected in the buffer.
 */
private static final int INSIDE_STATE_OPERATOR = 3;

/**
 * State-machine state: a variable name is being collected in the buffer.
 */
private static final int INSIDE_STATE_VARIABLE = 4;

/**
 * State-machine state: an integer value is being collected in the buffer.
 */
private static final int INSIDE_STATE_INTEGER = 5;

/**
 * State-machine state: a float value is being collected in the buffer.
 */
private static final int INSIDE_STATE_FLOAT = 6;

/**
 * One-element push-back slot: a token that has been read already but was
 * returned to the tokenizer so that it is delivered again by the next read.
 * It is null when nothing is pushed back.
 */
private ProlTokenizerResult lastPushedTerm;

/**
 * Line number recorded for the most recently started token.
 */
private int lastReadTokenLineNum;

/**
 * String position recorded for the most recently started token.
 */
private int lastReadTokenStrPos;

/**
 * Line number that had been recorded before the most recently started token.
 */
private int prevReadTokenLineNum;

/**
 * String position that had been recorded before the most recently started token.
 */
private int prevReadTokenStrPos;
/**
* The constructor
*/
public ProlTokenizer() {
super();
}
/**
 * Push a read token back into the one-element buffer so that it is delivered
 * again by the next read.
 *
 * @param object the token to be pushed back into the buffer, null will clear the buffer
 * @throws IllegalStateException if a non-null token is pushed while another
 * token is still waiting in the buffer
 */
public void pushTermBack(final ProlTokenizerResult object) {
  if (object == null) {
    // documented contract: null empties the slot, it must not be rejected
    // just because the slot happens to be occupied
    lastPushedTerm = null;
    return;
  }
  if (lastPushedTerm != null) {
    throw new IllegalStateException("An object has been pushed already");
  }
  lastPushedTerm = object;
}
/**
 * Peek the next token of the incoming stream. The token is read and returned
 * but it stays available: it is kept in the push-back slot and will be
 * delivered again by the next read.
 *
 * @param reader the reader to get the incoming token from, must not be null
 * @param voc the knowledge base which will be used for the operation, must not be null
 * @return the read token as a ProlTokenizerResult, or null if there is no token in the stream
 * @throws IOException it will be thrown if there is any transport problem
 */
public ProlTokenizerResult peekToken(final ProlReader reader, final KnowledgeBase voc) throws IOException {
  if (lastPushedTerm != null) {
    // a token is waiting already, it is the one the next read will deliver
    return lastPushedTerm;
  }
  final ProlTokenizerResult peeked = nextToken(reader, voc);
  pushTermBack(peeked);
  return peeked;
}
/**
 * Get the string position of the last read token. While a token is waiting in
 * the push-back slot the previously recorded position is reported instead of
 * the latest one.
 *
 * @return the string position for the last read token as integer
 */
public int getLastTokenStrPos() {
  if (lastPushedTerm != null) {
    return prevReadTokenStrPos;
  }
  return lastReadTokenStrPos;
}
/**
 * Get the line number of the last read token. While a token is waiting in
 * the push-back slot the previously recorded line number is reported instead
 * of the latest one.
 *
 * @return the line number for the last read token as integer
 */
public int getLastTokenLineNum() {
  if (lastPushedTerm != null) {
    return prevReadTokenLineNum;
  }
  return lastReadTokenLineNum;
}
/**
 * Remember the current position of the reader as the position of the token
 * being started; the values remembered before become the "previous" ones.
 *
 * @param reader the reader whose line number and string position should be saved, must not be null
 */
private void fixPosition(final ProlReader reader) {
  // line number: shift the last value into "previous", then take the new one
  this.prevReadTokenLineNum = this.lastReadTokenLineNum;
  this.lastReadTokenLineNum = reader.getLineNumber();

  // string position: same shift
  this.prevReadTokenStrPos = this.lastReadTokenStrPos;
  this.lastReadTokenStrPos = reader.getStrPos();
}
/**
 * Skip the rest of a line comment (started with %) in the incoming stream:
 * characters are consumed up to and including the next line feed, or until
 * the stream ends.
 *
 * @param reader the reader whose comment should be skipped, must not be null
 * @throws IOException it will be thrown if there is any transport problem during the operation
 */
private void skipComments(final ProlReader reader) throws IOException {
  int current;
  do {
    current = reader.read();
  } while (current >= 0 && current != '\n');
}
/**
 * Read the next token from a reader.
 * <p>
 * If a token is waiting in the push-back slot it is returned and the slot is
 * emptied. Otherwise characters are read one by one and fed into a state
 * machine; the first significant character selects the kind of the token
 * (variable, quoted string, operator, number or atom) and the following
 * characters are collected until the token ends. A character which ends a
 * token but does not belong to it is pushed back into the reader (white space
 * ending an atom or a variable is simply consumed).
 * <p>
 * Numbers are reported with the atom result state; the kind of the number is
 * carried by the created term.
 *
 * @param reader the reader which will be used to read the next token, must not be null
 * @param voc the knowledge base whose operators are used to recognise operator tokens, must not be null
 * @return the next token as a ProlTokenizerResult object, or null if the stream
 * ended before any token was started
 * @throws IOException it will be thrown if there is any transport error during the operation
 * @throws ParserException if the stream ends inside a quoted string or a
 * quoted string contains an unsupported escape sequence
 */
public ProlTokenizerResult nextToken(final ProlReader reader, final KnowledgeBase voc) throws IOException {
  if (lastPushedTerm != null) {
    // deliver the pushed back token exactly once
    final ProlTokenizerResult pushed = lastPushedTerm;
    lastPushedTerm = null;
    return pushed;
  }

  int state = INSIDE_STATE_LOOKFOR;
  // true right after a backslash inside a quoted string
  boolean specialchar = false;
  final StringBuilder strbuffer = new StringBuilder();
  // the longest complete operator recognised so far in the OPERATOR state
  OperatorContainer lastFoundFullOperator = null;
  // character class of the first char of the token, a change of the class ends an atom or an operator
  boolean letterOrDigitOnly = false;

  while (true) {
    final int readchar = reader.read();

    if (readchar < 0) {
      // end of the stream, flush what has been collected so far
      final String str = strbuffer.toString();
      switch (state) {
        case INSIDE_STATE_LOOKFOR:
          return null;
        case INSIDE_STATE_FLOAT: {
          if (str.charAt(str.length() - 1) == '.') {
            // non ended float, so it is an integer followed by '.'
            reader.pushCharBack('.');
            return new ProlTokenizerResult(makeTermFromString(str.substring(0, str.length() - 1), INSIDE_STATE_INTEGER), INSIDE_STATE_ATOM);
          }
        }
        // intentional fall-through: the float text is converted by the next case with the FLOAT state
        case INSIDE_STATE_INTEGER:
          return new ProlTokenizerResult(makeTermFromString(str, state), INSIDE_STATE_ATOM);
        case INSIDE_STATE_ATOM:
          return new ProlTokenizerResult(makeTermFromString(str, state), INSIDE_STATE_ATOM);
        case INSIDE_STATE_VARIABLE:
          if (str.equals("_")) {
            // a lone underscore is an anonymous variable
            return new ProlTokenizerResult(new Var(), state);
          }
          else {
            return new ProlTokenizerResult(new Var(str), state);
          }
        case INSIDE_STATE_STRING:
          throw new ParserException("Unclosed string found", lastReadTokenLineNum, lastReadTokenStrPos);
        case INSIDE_STATE_OPERATOR: {
          if (lastFoundFullOperator == null) {
            return new ProlTokenizerResult(makeTermFromString(str, state), state);
          }
          else {
            // give back the chars read beyond the recognised operator
            reader.pushBufferDifference(lastFoundFullOperator.getText(), strbuffer);
            return new ProlTokenizerResult(lastFoundFullOperator, state);
          }
        }
        default:
          throw new ProlCriticalError("Unknown reader state");
      }
    }

    final char chr = (char) readchar;

    switch (state) {
      case INSIDE_STATE_LOOKFOR: {
        if (Character.isISOControl(chr) || Character.isWhitespace(chr)) {
          continue;
        }

        switch (chr) {
          case '%': {
            // comments
            skipComments(reader);
          }
          break;
          case '_': {
            fixPosition(reader);
            strbuffer.append(chr);
            state = INSIDE_STATE_VARIABLE;
          }
          break;
          case '\'': {
            // the opening quote is not a part of the string text
            fixPosition(reader);
            state = INSIDE_STATE_STRING;
          }
          break;
          default: {
            fixPosition(reader);
            strbuffer.append(chr);

            if (Character.isLetter(chr) && Character.isUpperCase(chr)) {
              state = INSIDE_STATE_VARIABLE;
            }
            else {
              letterOrDigitOnly = Character.isLetterOrDigit(chr);
              String operator = Character.toString(chr);
              if (voc.hasOperatorStartsWith(operator)) {
                // may be null if the single char is only a prefix of an operator
                lastFoundFullOperator = voc.findOperatorForName(operator);
                state = INSIDE_STATE_OPERATOR;
              }
              else {
                if (Character.isDigit(chr)) {
                  state = INSIDE_STATE_INTEGER;
                }
                else {
                  state = INSIDE_STATE_ATOM;
                }
              }
            }
          }
        }
      }
      break;
      case INSIDE_STATE_ATOM: {
        if (chr == '_') {
          strbuffer.append(chr);
        }
        else if (Character.isWhitespace(chr) || Character.isISOControl(chr)) {
          // the white space is consumed, it is not needed by the next token
          return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
        }
        else if (chr == '\'' || (letterOrDigitOnly != Character.isLetterOrDigit(chr)) || voc.findOperatorForName(Character.toString(chr)) != null) {
          reader.pushCharBack(chr);
          return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
        }
        else {
          strbuffer.append(chr);
        }
      }
      break;
      case INSIDE_STATE_INTEGER: {
        if (Character.isDigit(chr)) {
          strbuffer.append(chr);
        }
        else if (chr == '.') {
          strbuffer.append('.');
          state = INSIDE_STATE_FLOAT;
        }
        else if (chr == 'e' || chr == 'E') {
          // the exponent marker is always stored in lower case because the
          // FLOAT state looks only for 'e' in the buffer; storing 'E' as is
          // made "1E-5" break on the sign and allowed a second marker
          strbuffer.append('e');
          state = INSIDE_STATE_FLOAT;
        }
        else {
          reader.pushCharBack(chr);
          return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), INSIDE_STATE_ATOM);
        }
      }
      break;
      case INSIDE_STATE_FLOAT: {
        if (Character.isDigit(chr)) {
          strbuffer.append(chr);
        }
        else {
          switch (chr) {
            case '-':
            case '+':
              if (strbuffer.charAt(strbuffer.length() - 1) == 'e') {
                // sign of the exponent
                strbuffer.append(chr);
              }
              else {
                reader.pushCharBack(chr);
                return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), INSIDE_STATE_FLOAT), INSIDE_STATE_ATOM);
              }
              break;
            case 'e':
            case 'E':
              if (strbuffer.indexOf("e") < 0) {
                strbuffer.append('e');
              }
              else {
                // second exponent marker, the number ends before it
                reader.pushCharBack(chr);
                final int lastIndex = strbuffer.length() - 1;
                if (strbuffer.charAt(lastIndex) == 'e') {
                  // the first marker has no exponent, so it is not a part of
                  // the number: return it to the reader (after chr, so that
                  // it is read first) instead of dropping it silently
                  reader.pushCharBack('e');
                  return new ProlTokenizerResult(makeTermFromString(strbuffer.substring(0, lastIndex), INSIDE_STATE_FLOAT), INSIDE_STATE_ATOM);
                }
                // the whole buffer is the number, no collected char may be cut off
                return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), INSIDE_STATE_FLOAT), INSIDE_STATE_ATOM);
              }
              break;
            default:
              reader.pushCharBack(chr);
              if (strbuffer.charAt(strbuffer.length() - 1) == '.') {
                // it was an integer followed by '.', the dot is given back after chr so it is read first
                reader.pushCharBack('.');
                return new ProlTokenizerResult(makeTermFromString(strbuffer.substring(0, strbuffer.length() - 1), INSIDE_STATE_INTEGER), INSIDE_STATE_ATOM);
              }
              else {
                // it is float
                return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), INSIDE_STATE_ATOM);
              }
          }
        }
      }
      break;
      case INSIDE_STATE_OPERATOR: {
        if (chr != '_' && letterOrDigitOnly != Character.isLetterOrDigit(chr)) {
          // the character class has changed, the operator text ends here
          if (lastFoundFullOperator != null) {
            if (strbuffer.length() > lastFoundFullOperator.getText().length()) {
              // the buffer holds chars collected while a longer operator was
              // being probed; they must be given back together with chr,
              // in the same way as the other branches of this state do it
              strbuffer.append(chr);
              reader.pushBufferDifference(lastFoundFullOperator.getText(), strbuffer);
            }
            else {
              reader.pushCharBack(chr);
            }
            return new ProlTokenizerResult(lastFoundFullOperator, state);
          }
          else {
            reader.pushCharBack(chr);
            return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
          }
        }
        else {
          final OperatorContainer prevoperators = lastFoundFullOperator;
          strbuffer.append(chr);
          final String operator = strbuffer.toString();
          lastFoundFullOperator = voc.findOperatorForName(operator);
          if (prevoperators != null) {
            if (lastFoundFullOperator == null) {
              if (!voc.hasOperatorStartsWith(operator)) {
                if (letterOrDigitOnly) {
                  // an alphanumeric text which only started like an operator
                  state = INSIDE_STATE_ATOM;
                }
                else {
                  reader.pushBufferDifference(prevoperators.getText(), strbuffer);
                  return new ProlTokenizerResult(prevoperators, state);
                }
              }
              else {
                // still a prefix of a longer operator, keep the shorter one as the fallback
                lastFoundFullOperator = prevoperators;
              }
            }
            else {
              if (!voc.hasOperatorStartsWith(operator)) {
                reader.pushBufferDifference(prevoperators.getText(), strbuffer);
                return new ProlTokenizerResult(prevoperators, state);
              }
            }
          }
          else {
            if (!voc.hasOperatorStartsWith(operator)) {
              if (voc.hasOperatorStartsWith(Character.toString(chr))) {
                // next char can be the start char of an operator so we need get back it into the buffer
                strbuffer.setLength(strbuffer.length() - 1);
                reader.pushCharBack(chr);
              }
              state = INSIDE_STATE_ATOM;
            }
          }
        }
      }
      break;
      case INSIDE_STATE_STRING: {
        if (specialchar) {
          // the char after a backslash selects the escaped character
          switch (chr) {
            case '\'':
              strbuffer.append('\'');
              break;
            case '\"':
              strbuffer.append('\"');
              break;
            case 'n':
              strbuffer.append('\n');
              break;
            case 'f':
              strbuffer.append('\f');
              break;
            case 'r':
              strbuffer.append('\r');
              break;
            case 't':
              strbuffer.append('\t');
              break;
            case '\\':
              strbuffer.append('\\');
              break;
            default:
              throw new ParserException("Unsupported special char", reader.getPrevLineNumber(), reader.getPrevStrPos());
          }
          specialchar = false;
        }
        else {
          switch (chr) {
            case '\'':
              // the closing quote ends the string
              return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
            case '\\': {
              specialchar = true;
            }
            break;
            default: {
              strbuffer.append(chr);
            }
          }
        }
      }
      break;
      case INSIDE_STATE_VARIABLE: {
        if (Character.isISOControl(chr) || Character.isWhitespace(chr)) {
          final String name = strbuffer.toString();
          if (name.equals("_")) {
            return new ProlTokenizerResult(new Var(), state);
          }
          return new ProlTokenizerResult(new Var(name), state);
        }
        else if (chr != '_' && !Character.isLetterOrDigit(chr)) {
          reader.pushCharBack(chr);
          final String name = strbuffer.toString();
          if (name.equals("_")) {
            return new ProlTokenizerResult(new Var(), state);
          }
          return new ProlTokenizerResult(new Var(name), state);
        }
        else {
          strbuffer.append(chr);
        }
      }
      break;
    }
  }
}
/**
 * Inside auxiliary function to make a term from a String. For the integer and
 * float states a numeric term is created from the text; if the text can't be
 * parsed as such a number, or the state is any other one, a plain term with
 * the text is created.
 *
 * @param string the source string object, must not be null
 * @param state the state of the inside state machine which was used to read the term
 * @return a Term object as the result, it is never null
 */
private Term makeTermFromString(final String string, final int state) {
  if (state == INSIDE_STATE_INTEGER) {
    try {
      return new TermInteger(string);
    }
    catch (NumberFormatException ignored) {
      // not a valid integer text, a plain term is made below
    }
  }
  else if (state == INSIDE_STATE_FLOAT) {
    try {
      return new TermFloat(string);
    }
    catch (NumberFormatException ignored) {
      // not a valid float text, a plain term is made below
    }
  }
  return new Term(string);
}
}