ProlTokenizer.java example

Explorer

jprol-master
- engine
  - src
    - main
      - java
        com
        igormaznitsa
        prol
        annotations
        Consult.java
        Determined.java
        Evaluable.java
        ItChangesGoalChain.java
        Predicate.java
        PredicateSynonyms.java
        ProlOperator.java
        ProlOperators.java
        WrappedPredicate.java
        containers
        ClauseIterator.java
        FactIterator.java
        InsideClauseListItem.java
        KnowledgeBase.java
        KnowledgeBaseInsideClauseList.java
        MemoryClauseIterator.java
        MemoryFactIterator.java
        MemoryKnowledgeBase.java
        MemoryRuleIterator.java
        OperatorContainer.java
        RuleIterator.java
        data
        ConvertableToTerm.java
        NumericTerm.java
        Operator.java
        Term.java
        TermFloat.java
        TermInteger.java
        TermList.java
        TermStruct.java
        Var.java
        easygui
        AbstractProlEditor.java
        DialogEditor.java
        EditorPane.java
        FontChooserDialog.java
        HelpDialog.java
        JHtmlLabel.java
        KnowledgeBaseSnapshotViewDialog.java
        LibraryInfoDialog.java
        MainFrame.java
        MessageEditor.java
        OptionsDialog.java
        RecentlyOpenedFileFixedList.java
        TraceDialog.java
        UIUtils.java
        main.java
        exceptions
        ParserException.java
        ProlAbstractCatcheableException.java
        ProlCriticalError.java
        ProlCustomErrorException.java
        ProlDomainErrorException.java
        ProlEvaluationErrorException.java
        ProlException.java
        ProlExistenceErrorException.java
        ProlForkExecutionException.java
        ProlHaltExecutionException.java
        ProlInstantiationErrorException.java
        ProlKnowledgeBaseException.java
        ProlPermissionErrorException.java
        ProlRepresentationErrorException.java
        ProlTypeErrorException.java
        ProlWrongGoalException.java
        io
        DefaultProlStreamManagerImpl.java
        ProlMemoryPipe.java
        ProlStream.java
        ProlStreamManager.java
        ProlTextInputStream.java
        ProlTextOutputStream.java
        ProlTextReader.java
        ProlTextWriter.java
        libraries
        PredicateProcessor.java
        PredicateTemplate.java
        ProlAbstractLibrary.java
        ProlCoreLibrary.java
        ProlGraphicLibrary.java
        ProlLibraryWrapper.java
        ProlStringLibrary.java
        logic
        DefaultKnowledgeBaseFactory.java
        Goal.java
        IsolatedGoal.java
        KnowledgeBaseFactory.java
        PreparedGoal.java
        ProlContext.java
        ProlMappedObjectSearcher.java
        VariableStateSnapshot.java
        triggers
        AbstractProlTrigger.java
        ProlTrigger.java
        ProlTriggerGoal.java
        ProlTriggerType.java
        TriggerEvent.java
        parser
        ProlConsult.java
        ProlReader.java
        ProlTokenizer.java
        ProlTreeBuilder.java
        trace
        TraceListener.java
        utils
        IntegerHashSet.java
        Utils.java
        wordpress
        tips4java
        TextLineNumber.java
    - test
      - java
        com
        igormaznitsa
        prol
        test
        AllTests.java
        DeepStackTest.java
        EightQueens.java
        EinsteinTest.java
        HanoiTowers.java
        IOPipeMemoryTest.java
        LibraryWrapperTest.java
        ListTest.java
        MiscAlgorithms.java
        MiscTest.java
        NonDeterministicAutomata.java
        OperatorTest.java
        PreparedGoalTest.java
        PrimitiveTest.java
        PuzzleTest.java
        SomeFromISOTest.java
        StrongTest.java
        TriggerTest.java
- examples
  - java
    - elife
      - src
        main
        java
        com
        igormaznitsa
        elife
        AboutDialog.java
        ElifeModelViewer.java
        MainForm.java
        WorldModel.java
        main.java

/* 
 * Copyright 2014 Igor Maznitsa (http://www.igormaznitsa.com).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.igormaznitsa.prol.parser;

import com.igormaznitsa.prol.containers.KnowledgeBase;
import com.igormaznitsa.prol.containers.OperatorContainer;
import com.igormaznitsa.prol.data.Term;
import com.igormaznitsa.prol.data.TermFloat;
import com.igormaznitsa.prol.data.TermInteger;
import com.igormaznitsa.prol.data.Var;
import com.igormaznitsa.prol.exceptions.ProlCriticalError;
import com.igormaznitsa.prol.exceptions.ParserException;
import java.io.IOException;

/**
 * A tokenizer that splits Prolog source text into tokens (atoms, quoted strings, numbers, variables and operators).
 *
 * @author Igor Maznitsa (igor.maznitsa@igormaznitsa.com)
 */
public final class ProlTokenizer {

  /**
   * Immutable holder of a single token read from the source input stream: the parsed term plus the
   * state-machine state which was reported for it.
   *
   * @author Igor Maznitsa (igor.maznitsa@igormaznitsa.com)
   */
  public static final class ProlTokenizerResult {

    /**
     * Result state: the "look for" mode
     */
    public static final int STATE_LOOKFOR = 0;
    /**
     * Result state: a text atom has been found
     */
    public static final int STATE_ATOM = 1;
    /**
     * Result state: a string (an atom bounded by \') has been found
     */
    public static final int STATE_STRING = 2;
    /**
     * Result state: an operator has been found (the operator has been found at current context)
     */
    public static final int STATE_OPERATOR = 3;
    /**
     * Result state: a variable has been found
     */
    public static final int STATE_VARIABLE = 4;

    /**
     * The term which has been read
     */
    private final Term term;
    /**
     * The state associated by the state machine with the read term
     */
    private final int state;

    /**
     * Make a result for a read term.
     *
     * @param term the read term, must not be null (it is not checked here, the accessors delegating to the term will fail for null)
     * @param state the state of the state machine
     */
    public ProlTokenizerResult(final Term term, final int state) {
      this.state = state;
      this.term = term;
    }

    /**
     * Get the read term
     *
     * @return the read term
     */
    public Term getTerm() {
      return this.term;
    }

    /**
     * Get the state of the state machine associated with the read term
     *
     * @return the state as integer
     */
    public int getState() {
      return this.state;
    }

    /**
     * Get the type of the read term, the call is delegated to the term
     *
     * @return the result term type as integer
     * @see com.igormaznitsa.prol.data.Term
     */
    public int getTermType() {
      return this.term.getTermType();
    }

    /**
     * Get the text of the read term, the call is delegated to the term
     *
     * @return the text of the read term
     */
    public String getText() {
      return this.term.getText();
    }
  }
  /**
   * The last pushed-back token. It is set by pushTermBack (and by peekToken) and is returned and cleared by the next nextToken call;
   * null means that there is no pushed-back token.
   */
  private ProlTokenizerResult lastPushedTerm;
  /**
   * The line number which was saved for the token read before the last one (updated by fixPosition).
   * It is reported by getLastTokenLineNum while a pushed-back token is pending.
   */
  private int prevReadTokenLineNum;
  /**
   * The string position which was saved for the token read before the last one (updated by fixPosition).
   * It is reported by getLastTokenStrPos while a pushed-back token is pending.
   */
  private int prevReadTokenStrPos;
  /**
   * The line number saved by fixPosition when the last token was started
   */
  private int lastReadTokenLineNum;
  /**
   * The string position saved by fixPosition when the last token was started
   */
  private int lastReadTokenStrPos;
  /**
   * Inner state of the state machine: the machine is looking for the start of the next token
   */
  private static final int INSIDE_STATE_LOOKFOR = 0;
  /**
   * Inner state of the state machine: an atom is being collected in the buffer
   */
  private static final int INSIDE_STATE_ATOM = 1;
  /**
   * Inner state of the state machine: a quoted string is being collected in the buffer
   */
  private static final int INSIDE_STATE_STRING = 2;
  /**
   * Inner state of the state machine: an operator is being collected in the buffer
   */
  private static final int INSIDE_STATE_OPERATOR = 3;
  /**
   * Inner state of the state machine: a variable name is being collected in the buffer
   */
  private static final int INSIDE_STATE_VARIABLE = 4;
  /**
   * Inner state of the state machine: an integer value is being collected in the buffer
   */
  private static final int INSIDE_STATE_INTEGER = 5;
  /**
   * Inner state of the state machine: a float value is being collected in the buffer
   */
  private static final int INSIDE_STATE_FLOAT = 6;

  /**
   * The constructor
   */
  public ProlTokenizer() {
    super();
  }

  /**
   * Push a read token back into the one-slot buffer so that the next {@code nextToken} call returns it again.
   * Only one token can be held at a time. Passing {@code null} clears the buffer.
   *
   * @param object the token to be pushed back into the buffer, null will clear the buffer
   * @throws IllegalStateException if a non-null token is pushed while another token is still held
   */
  public void pushTermBack(final ProlTokenizerResult object) {
    if (object == null) {
      // the documented contract: null clears the buffer, so it must be accepted even if a token is held
      lastPushedTerm = null;
      return;
    }
    if (lastPushedTerm != null) {
      throw new IllegalStateException("An object has been pushed already");
    }
    lastPushedTerm = object;
  }

  /**
   * Peek the next token of the incoming stream. The token is read and returned but it stays available,
   * so the following {@code nextToken} call returns the same token.
   *
   * @param reader the reader to get the incoming token, must not be null
   * @param voc the knowledge base which will be used for the operation, must not be null
   * @return the read token as a ProlTokenizerResult, or null if there is not any token in the stream
   * @throws IOException it will be thrown if there is any transport problem
   */
  public ProlTokenizerResult peekToken(final ProlReader reader, final KnowledgeBase voc) throws IOException {
    if (lastPushedTerm != null) {
      // a token is already waiting in the buffer, show it without touching the reader
      return lastPushedTerm;
    }
    final ProlTokenizerResult peeked = nextToken(reader, voc);
    pushTermBack(peeked);
    return peeked;
  }

  /**
   * Get the string position saved for the last read token. While a pushed-back token is pending
   * the previously saved position is returned instead of the last one.
   *
   * @return the string position for the last read token as integer
   */
  public int getLastTokenStrPos() {
    if (lastPushedTerm != null) {
      return prevReadTokenStrPos;
    }
    return lastReadTokenStrPos;
  }

  /**
   * Get the line number saved for the last read token. While a pushed-back token is pending
   * the previously saved line number is returned instead of the last one.
   *
   * @return the line number for the last read token as integer
   */
  public int getLastTokenLineNum() {
    if (lastPushedTerm != null) {
      return prevReadTokenLineNum;
    }
    return lastReadTokenLineNum;
  }

  /**
   * Inside function to remember the current reader position as the position of the last read token.
   * The values saved before are moved into the "previous" fields.
   *
   * @param reader the reader whose line number and string position should be saved, must not be null
   */
  private void fixPosition(final ProlReader reader) {
    // shift the last saved position into the "previous" slot
    this.prevReadTokenLineNum = this.lastReadTokenLineNum;
    this.prevReadTokenStrPos = this.lastReadTokenStrPos;

    // take the fresh position from the reader
    final int currentLine = reader.getLineNumber();
    final int currentStrPos = reader.getStrPos();
    this.lastReadTokenLineNum = currentLine;
    this.lastReadTokenStrPos = currentStrPos;
  }

  /**
   * Skip the rest of a line comment: chars are read from the reader and dropped until
   * the next line char ('\n') has been consumed or the end of the stream is reached.
   * It is called after the comment start char (%) has been read.
   *
   * @param reader the reader whose comment should be skipped, must not be null
   * @throws IOException it will be thrown if there will be any transport problem during the operation
   */
  private void skipComments(final ProlReader reader) throws IOException {
    int current = reader.read();
    while (current >= 0 && current != '\n') {
      current = reader.read();
    }
  }

  /**
   * Read the next token from a reader.
   * <p>
   * If a token has been pushed back it is returned (and removed from the buffer) without any reading.
   * Otherwise chars are read one by one and processed by a state machine which recognizes variables,
   * quoted strings, operators known by the knowledge base, integer and float numbers and atoms.
   * Line comments started with % are skipped. Numbers are reported with the ATOM state.
   * An exponent marker of a number is always kept in the buffer as the lower-case 'e'.
   *
   * @param reader the reader which will be used to read next token, must not be null
   * @param voc the knowledge base which will be used to look for operators, must not be null
   * @return next token as a ProlTokenizerResult object, or null if the end of the stream is reached before any token start
   * @throws IOException it will be thrown if there is any transport error during the operation
   * @throws ParserException it will be thrown if a quoted string is not closed or contains an unsupported escape char
   */
  public ProlTokenizerResult nextToken(final ProlReader reader, final KnowledgeBase voc) throws IOException {

    if (lastPushedTerm != null) {
      // a token was pushed back earlier, return it once and make the buffer empty
      final ProlTokenizerResult pushed = lastPushedTerm;
      lastPushedTerm = null;
      return pushed;
    }

    int state = INSIDE_STATE_LOOKFOR;
    // true if the previous char inside a quoted string was the escape char '\'
    boolean specialchar = false;

    final StringBuilder strbuffer = new StringBuilder();

    // the longest operator which fully matched the buffer so far
    OperatorContainer lastFoundFullOperator = null;

    // the class (letter-or-digit or not) of the first char of an atom or an operator,
    // a char of the other class ends the token
    boolean letterOrDigitOnly = false;

    while (true) {
      final int readchar = reader.read();

      if (readchar < 0) {
        // the end of the stream, return what has been collected
        final String str = strbuffer.toString();
        switch (state) {
          case INSIDE_STATE_LOOKFOR:
            return null;
          case INSIDE_STATE_FLOAT: {
            if (str.charAt(str.length() - 1) == '.') {
              // non ended float then it is integer + '.'
              reader.pushCharBack('.');
              return new ProlTokenizerResult(makeTermFromString(str.substring(0, str.length() - 1), INSIDE_STATE_INTEGER), INSIDE_STATE_ATOM);
            }
            return new ProlTokenizerResult(makeTermFromString(str, state), INSIDE_STATE_ATOM);
          }
          case INSIDE_STATE_INTEGER:
          case INSIDE_STATE_ATOM:
            return new ProlTokenizerResult(makeTermFromString(str, state), INSIDE_STATE_ATOM);
          case INSIDE_STATE_VARIABLE:
            return makeVariableResult(str);
          case INSIDE_STATE_STRING:
            throw new ParserException("Unclosed string found", lastReadTokenLineNum, lastReadTokenStrPos);
          case INSIDE_STATE_OPERATOR: {
            if (lastFoundFullOperator == null) {
              return new ProlTokenizerResult(makeTermFromString(str, state), state);
            }
            else {
              // give back the chars which were read after the end of the recognized operator
              reader.pushBufferDifference(lastFoundFullOperator.getText(), strbuffer);
              return new ProlTokenizerResult(lastFoundFullOperator, state);
            }
          }
          default:
            throw new ProlCriticalError("Unknown reader state");
        }
      }

      final char chr = (char) readchar;

      switch (state) {
        case INSIDE_STATE_LOOKFOR: {
          if (Character.isISOControl(chr) || Character.isWhitespace(chr)) {
            continue;
          }

          switch (chr) {
            case '%': {
              // comments
              skipComments(reader);
            }
            break;
            case '_': {
              fixPosition(reader);
              strbuffer.append(chr);
              state = INSIDE_STATE_VARIABLE;
            }
            break;
            case '\'': {
              // the opening quote is not a part of the string value
              fixPosition(reader);
              state = INSIDE_STATE_STRING;
            }
            break;

            default: {
              fixPosition(reader);

              strbuffer.append(chr);

              if (Character.isLetter(chr) && Character.isUpperCase(chr)) {
                state = INSIDE_STATE_VARIABLE;
              }
              else {
                letterOrDigitOnly = Character.isLetterOrDigit(chr);
                final String operator = Character.toString(chr);
                if (voc.hasOperatorStartsWith(operator)) {
                  lastFoundFullOperator = voc.findOperatorForName(operator);
                  state = INSIDE_STATE_OPERATOR;
                }
                else {
                  if (Character.isDigit(chr)) {
                    state = INSIDE_STATE_INTEGER;
                  }
                  else {
                    state = INSIDE_STATE_ATOM;
                  }
                }
              }
            }
          }
        }
        break;
        case INSIDE_STATE_ATOM: {
          if (chr == '_') {
            strbuffer.append(chr);
          }
          else if (Character.isWhitespace(chr) || Character.isISOControl(chr)) {
            return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
          }
          else if (chr == '\'' || (letterOrDigitOnly != Character.isLetterOrDigit(chr)) || voc.findOperatorForName(Character.toString(chr)) != null) {
            reader.pushCharBack(chr);
            return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
          }
          else {
            strbuffer.append(chr);
          }
        }
        break;
        case INSIDE_STATE_INTEGER: {
          if (Character.isDigit(chr)) {
            strbuffer.append(chr);
          }
          else {
            if (chr == '.' || chr == 'e' || chr == 'E') {
              // the exponent marker is normalized to 'e' because the FLOAT state
              // looks only for the lower-case marker in the buffer
              strbuffer.append(chr == 'E' ? 'e' : chr);
              state = INSIDE_STATE_FLOAT;
            }
            else {
              reader.pushCharBack(chr);
              return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), INSIDE_STATE_ATOM);
            }
          }
        }
        break;
        case INSIDE_STATE_FLOAT: {
          if (Character.isDigit(chr)) {
            strbuffer.append(chr);
          }
          else {
            switch (chr) {
              case '-':
              case '+':
                if (strbuffer.charAt(strbuffer.length() - 1) == 'e') {
                  // the sign of the exponent
                  strbuffer.append(chr);
                }
                else {
                  reader.pushCharBack(chr);
                  return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), INSIDE_STATE_FLOAT), INSIDE_STATE_ATOM);
                }
                break;
              case 'e':
              case 'E':
                if (strbuffer.indexOf("e") < 0) {
                  strbuffer.append('e');
                }
                else {
                  // the second exponent marker ends the number, the char itself has not been added into the buffer
                  reader.pushCharBack(chr);
                  final int lastIndex = strbuffer.length() - 1;
                  // only a dangling exponent marker is dropped, a complete number is kept as is
                  final String numberText = strbuffer.charAt(lastIndex) == 'e' ? strbuffer.substring(0, lastIndex) : strbuffer.toString();
                  return new ProlTokenizerResult(makeTermFromString(numberText, INSIDE_STATE_FLOAT), INSIDE_STATE_ATOM);
                }
                break;
              default:
                reader.pushCharBack(chr);

                if (strbuffer.charAt(strbuffer.length() - 1) == '.') {
                  // it was an integer, the dot is given back to be read as a separate token
                  reader.pushCharBack('.');
                  return new ProlTokenizerResult(makeTermFromString(strbuffer.substring(0, strbuffer.length() - 1), INSIDE_STATE_INTEGER), INSIDE_STATE_ATOM);
                }
                else {
                  // it is float
                  return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), INSIDE_STATE_ATOM);
                }
            }
          }
        }
        break;
        case INSIDE_STATE_OPERATOR: {
          if (chr != '_' && letterOrDigitOnly != Character.isLetterOrDigit(chr)) {
            // a char of the other class ends the operator
            reader.pushCharBack(chr);

            if (lastFoundFullOperator != null) {
              return new ProlTokenizerResult(lastFoundFullOperator, state);
            }
            else {
              return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
            }
          }
          else {
            final OperatorContainer prevoperators = lastFoundFullOperator;
            strbuffer.append(chr);
            final String operator = strbuffer.toString();
            lastFoundFullOperator = voc.findOperatorForName(operator);
            if (prevoperators != null) {
              if (lastFoundFullOperator == null) {
                if (!voc.hasOperatorStartsWith(operator)) {
                  if (letterOrDigitOnly) {
                    // a word which only starts like an operator is an atom
                    state = INSIDE_STATE_ATOM;
                  }
                  else {
                    reader.pushBufferDifference(prevoperators.getText(), strbuffer);
                    return new ProlTokenizerResult(prevoperators, state);
                  }
                }
                else {
                  // the buffer is still a prefix of some operator, keep the last full match
                  lastFoundFullOperator = prevoperators;
                }

              }
              else {
                if (!voc.hasOperatorStartsWith(operator)) {
                  reader.pushBufferDifference(prevoperators.getText(), strbuffer);
                  return new ProlTokenizerResult(prevoperators, state);
                }
              }
            }
            else {
              if (!voc.hasOperatorStartsWith(operator)) {
                if (voc.hasOperatorStartsWith(Character.toString(chr))) {
                  // next char can be the start char of an operator so we need get back it into the buffer
                  strbuffer.setLength(strbuffer.length() - 1);
                  reader.pushCharBack(chr);
                }
                state = INSIDE_STATE_ATOM;
              }
            }
          }
        }
        break;
        case INSIDE_STATE_STRING: {
          if (specialchar) {
            // the char after the escape char
            switch (chr) {
              case '\'':
                strbuffer.append('\'');
                break;

              case '\"':
                strbuffer.append('\"');
                break;

              case 'n':
                strbuffer.append('\n');
                break;

              case 'f':
                strbuffer.append('\f');
                break;

              case 'r':
                strbuffer.append('\r');
                break;

              case 't':
                strbuffer.append('\t');
                break;

              case '\\':
                strbuffer.append('\\');
                break;

              default:
                throw new ParserException("Unsupported special char", reader.getPrevLineNumber(), reader.getPrevStrPos());
            }
            specialchar = false;
          }
          else {
            switch (chr) {
              case '\'':
                // the closing quote
                return new ProlTokenizerResult(makeTermFromString(strbuffer.toString(), state), state);
              case '\\': {
                specialchar = true;
              }
              break;
              default: {
                strbuffer.append(chr);
              }

            }
          }
        }
        break;
        case INSIDE_STATE_VARIABLE: {
          if (Character.isISOControl(chr) || Character.isWhitespace(chr)) {
            return makeVariableResult(strbuffer.toString());
          }
          else if (chr != '_' && !Character.isLetterOrDigit(chr)) {
            reader.pushCharBack(chr);
            return makeVariableResult(strbuffer.toString());
          }
          else {
            strbuffer.append(chr);
          }
        }
        break;
      }
    }
  }

  /**
   * Inside auxiliary function to make the result for a read variable name, the name "_" makes an anonymous variable
   *
   * @param name the read variable name, must not be null
   * @return the result which contains a Var and has the VARIABLE state
   */
  private static ProlTokenizerResult makeVariableResult(final String name) {
    final Var variable = "_".equals(name) ? new Var() : new Var(name);
    return new ProlTokenizerResult(variable, INSIDE_STATE_VARIABLE);
  }

  /**
   * Inside auxiliary function to make a term from a String. For the INTEGER and FLOAT states
   * a numeric term is made; if the text can't be parsed as a number, or for any other state,
   * a plain Term with the text is made.
   *
   * @param string the source string object, must not be null
   * @param state the state of inside state machine which was used to read the term
   * @return a Term object as the result, must not be null
   */
  private Term makeTermFromString(final String string, final int state) {
    try {
      if (state == INSIDE_STATE_INTEGER) {
        return new TermInteger(string);
      }
      if (state == INSIDE_STATE_FLOAT) {
        return new TermFloat(string);
      }
    }
    catch (NumberFormatException ignored) {
      // the text is not a valid number, it is returned below as a plain term
    }
    return new Term(string);
  }
}