/* * Sun Public License Notice * * The contents of this file are subject to the Sun Public License * Version 1.0 (the "License"). You may not use this file except in * compliance with the License. A copy of the License is available at * http://www.sun.com/ * * The Original Code is NetBeans. The Initial Developer of the Original * Code is Sun Microsystems, Inc. Portions Copyright 1997-2003 Sun * Microsystems, Inc. All Rights Reserved. */ package org.netbeans.editor; /** * Lexical analyzer that works on a given text buffer. It allows * to sequentially parse a given character buffer by calling * <tt>nextToken()</tt> that returns the token-ids. * * After the token is found by calling the <tt>nextToken</tt> method, * the <tt>getTokenOffset()</tt> method can be used * to get the starting offset of the current * token in the buffer. The <tt>getTokenLength()</tt> gives the length * of the current token. * * The heart of the analyzer is the <tt>parseToken()</tt> method which * parses the text and returns the token-id of the last token found. * The <tt>parseToken()</tt> method is called from the <tt>nextToken()</tt>. * It operates with two important variables. The <tt>offset</tt> * variable identifies the currently scanned character in the buffer. * The <tt>tokenOffset</tt> is the begining of the current token. * The <tt>state</tt> variable that identifies the current internal * state of the analyzer is set accordingly when the characters are parsed. * If the <tt>parseToken()</tt> recognizes a token, it returns its ID * and the <tt>tokenOffset</tt> is its begining in the buffer and * <tt>offset - tokenOffset</tt> is its length. When the token is processed * the value of <tt>tokenOffset</tt> is set to be the same as current * value of the <tt>offset</tt> and the parsing continues. * * Internal states are the integer constants used internally by analyzer. * They are assigned to the <tt>state</tt> variable to express * that the analyzer has moved from one state to another. * They are usually numbered starting from zero but they don't * have to. The only reserved value is -1 which is reserved * for the INIT state - the initial internal state of the analyzer. * * There is also the support for defining the persistent info about * the current state of the analyzer. This info can be later used * to restore the parsing from some particular state instead of * parsing from the begining of the buffer. This feature is very * useful if there are the modifications performed in the document. * The info is stored in the <tt>StateInfo</tt> interface * with the <tt>BaseStateInfo</tt> as the basic implementation. * It enables to get and set the two important values * from the persistent point of view. * The first one is the value of the <tt>state</tt> variable. * The other one is the difference <tt>offset - tokenOffset</tt> * which is called pre-scan. The particular analyzer can define * additional values important for the persistent storage. * The <tt>createStateInfo()</tt> can be overriden to create * custom state-info and <tt>loadState()</tt> and <tt>storeState()</tt> * can be overriden to get/set the additional values. * * The <tt>load()</tt> method sets the buffer to be parsed. * There is a special parameter in the load() method called position * that allows a relation of the character buffer passed to the load() * method and the position of the buffer's data in the document. * For this extended functionality the document must be passed * to the constructor of the lexical analyzer at some level. * * * @author Miloslav Metelka * @version 1.00 */ public class Syntax { /** Is the state of analyzer equal to a given state info? */ public static final int EQUAL_STATE = 0; /** Is the state of analyzer different from given state info? */ public static final int DIFFERENT_STATE = 1; /** Initial internal state of the analyzer */ public static final int INIT = -1; /** Internal state of the lexical analyzer. At the begining * it's set to INIT value but it is changed by <tt>parseToken()</tt> * as the characters are processed one by one. */ protected int state = INIT; /** Text buffer to scan */ protected char buffer[]; /** Current offset in the buffer */ protected int offset; /** Offset holding the begining of the current token */ protected int tokenOffset; /** This variable is the length of the token that was found */ protected int tokenLength; /** Path from which the found token-id comes from. * The <tt>TokenContext.getContextPath()</tt> can be used * to get the path. If the lexical analyzer doesn't use * any children token-contexts it can assign * the path in the constructor. */ protected TokenContextPath tokenContextPath; /** Setting this flag to true means that there are currently no more * buffers available so that analyzer should return all the tokens * including those whose successful scanning would be otherwise * left for later when the next buffer will be available. Setting * this flag to true ensures that all the characters in the current * buffer will be processed. * The lexical analyzer should on one hand process all the characters * but on the other hand it should "save" its context. For example * if the scanner finds the unclosed comment at the end of the buffer * it should return the comment token but * stay in the "being in comment" internal state. */ protected boolean lastBuffer; /** On which offset in the buffer scanning should stop. */ protected int stopOffset; /** The position in the document that logically corresponds * to the stopOffset value. If there's no relation * to the document, it's -1. The reason why the relation * to the document's data is expressed through * the stopOffset to stopPosition relation is because * the stopOffset is the only offset that doesn't change * rapidly in the operation of the lexical analyzer. */ protected int stopPosition; /** This variable can be populated by the parseToken() method * in case the user types an errorneous construction but * it's clear what correct token he meant to write. * For example if the user writes a single '0x' it's an errorneous * construct but it's clear that the user wants to enter * the hexa-number. In this situation the parseToken() * should report error, but it should also set the supposedTokenID * to the hexa-number token. * This information is used while drawing the text. If the caret * stand inside or around such token, it calls the getSupposedTokenID() * after calling the nextToken() and if it's non-null it uses it * instead of the original token. */ protected TokenID supposedTokenID; /** Function that should be called externally to scan the text. * It manages the call to parseToken() and cares about the proper * setting of the offsets. * It can be extended to support any custom debugging required. */ public TokenID nextToken() { // Return immediately when at the end of buffer if (tokenOffset >= stopOffset) { tokenLength = 0; return null; // signal no token found } // Divide non-debug and debug sections supposedTokenID = null; TokenID tokenID = parseToken(); if (tokenID != null) { // regular token found tokenLength = offset - tokenOffset; tokenOffset = offset; // move to the next token if (tokenLength == 0) { // test for empty token return nextToken(); // repeat until non-empty token is found } } else { // EOT reached tokenLength = 0; } return tokenID; } /** This is core function of analyzer and it returns either the token-id * or null to indicate that the end of buffer was found. * The function scans the active character and does one or more * of the following actions: * 1. change internal analyzer state * 2. set the token-context-path and return token-id * 3. adjust current position to signal different end of token; * the character that offset points to is not included in the token */ protected TokenID parseToken() { return null; } /** Load the state from syntax mark into analyzer. This method is used when * @param stateInfo info about the state of the lexical analyzer to load. * It can be null to indicate there's no previous state so the analyzer * starts from its initial state. * @param buffer buffer that will be scanned * @param offset offset of the first character that will be scanned * @param len length of the area to be scanned * @param lastBuffer whether this is the last buffer in the document. All the tokens * will be returned including the last possibly incomplete one. If the data * come from the document, the simple rule for this parameter * is (doc.getLength() == stop-position) where stop-position * is the position corresponding to the (offset + len) in the buffer * that comes from the document data. * @param stopPosition position in the document that corresponds to (offset + len) offset * in the provided buffer. It has only sense if the data in the buffer come from the document. * It helps in writing the advanced analyzers that need to interact with some other data * in the document than only those provided in the character buffer. * If there is no relation to the document data, the stopPosition parameter * must be filled with -1 which means an invalid value. * The stop-position is passed (instead of start-position) because it doesn't * change through the analyzer operation. It corresponds to the <tt>stopOffset</tt> * that also doesn't change through the analyzer operation so any * buffer-offset can be transferred to position by computing * <tt>stopPosition + buffer-offset - stopOffset</tt> * where stopOffset is the instance variable that is assigned * to <tt>offset + len</tt> in the body of relocate(). */ public void load(StateInfo stateInfo, char buffer[], int offset, int len, boolean lastBuffer, int stopPosition) { this.buffer = buffer; this.offset = offset; this.tokenOffset = offset; this.stopOffset = offset + len; this.lastBuffer = lastBuffer; this.stopPosition = stopPosition; if (stateInfo != null) { loadState(stateInfo); } else { loadInitState(); } } /** Relocate scanning to another buffer. * This is used to continue scanning after previously * reported EOT. Relocation delta between current offset and the requested offset * is computed and all the offsets are relocated. If there's a non-zero preScan * in the analyzer, it is a caller's responsibility to provide all the preScan * characters in the relocation buffer. * @param buffer next buffer where the scan will continue. * @param offset offset where the scan will continue. * It's not decremented by the current preScan. * @param len length of the area to be scanned. * It's not extended by the current preScan. * @param lastBuffer whether this is the last buffer in the document. All the tokens * will be returned including the last possibly incomplete one. If the data * come from the document, the simple rule for this parameter * is (doc.getLength() == stop-position) where stop-position * is the position corresponding to the (offset + len) in the buffer * that comes from the document data. * @param stopPosition position in the document that corresponds to (offset + len) offset * in the provided buffer. It has only sense if the data in the buffer come from the document. * It helps in writing the advanced analyzers that need to interact with some other data * in the document than only those provided in the character buffer. * If there is no relation to the document data, the stopPosition parameter * must be filled with -1 which means an invalid value. * The stop-position is passed (instead of start-position) because it doesn't * change through the analyzer operation. It corresponds to the <tt>stopOffset</tt> * that also doesn't change through the analyzer operation so any * buffer-offset can be transferred to position by computing * <tt>stopPosition + buffer-offset - stopOffset</tt> * where stopOffset is the instance variable that is assigned * to <tt>offset + len</tt> in the body of relocate(). */ public void relocate(char buffer[], int offset, int len, boolean lastBuffer, int stopPosition) { this.buffer = buffer; this.lastBuffer = lastBuffer; int delta = offset - this.offset; // delta according to current offset this.offset += delta; this.tokenOffset += delta; this.stopOffset = offset + len; this.stopPosition = stopPosition; } /** Get the current buffer */ public char[] getBuffer() { return buffer; } /** Get the current scanning offset */ public int getOffset() { return offset; } /** Get start of token in scanned buffer. */ public int getTokenOffset() { return offset - tokenLength; } /** Get length of token in scanned buffer. */ public int getTokenLength() { return tokenLength; } /** Get the token-context-path of the returned token. */ public TokenContextPath getTokenContextPath() { return tokenContextPath; } public TokenID getSupposedTokenID() { return supposedTokenID; } /** Get the pre-scan which is a number * of characters between offset and tokenOffset. * If there's no more characters in the current buffer, * the analyzer returns EOT, but it can be in a state when * there are already some characters parsed at the end of * the current buffer but the token * is still incomplete and it cannot be returned yet. * The pre-scan value helps to determine how many characters * from the end of the current buffer should be present * at the begining of the next buffer so that the current * incomplete token can be returned as the first token * when parsing the next buffer. */ public int getPreScan() { return offset - tokenOffset; } /** Initialize the analyzer when scanning from the begining * of the document or when the state stored in syntax mark * is null for some reason or to explicitly reset the analyzer * to the initial state. The offsets must not be touched by this method. */ public void loadInitState() { state = INIT; } public void reset() { tokenLength = stopOffset = tokenOffset = offset = 0; loadInitState(); } /** Load valid mark state into the analyzer. Offsets * are already initialized when this method is called. This method * must get the state from the mark and set it to the analyzer. Then * it must decrease tokenOffset by the preScan stored in the mark state. * @param markState mark state to be loaded into syntax. It must be non-null value. */ public void loadState(StateInfo stateInfo) { state = stateInfo.getState(); tokenOffset -= stateInfo.getPreScan(); } /** Store state of this analyzer into given mark state. */ public void storeState(StateInfo stateInfo) { stateInfo.setState(state); stateInfo.setPreScan(getPreScan()); } /** Compare state of this analyzer to given state info */ public int compareState(StateInfo stateInfo) { if (stateInfo != null) { return ((stateInfo.getState() == state) && stateInfo.getPreScan() == getPreScan()) ? EQUAL_STATE : DIFFERENT_STATE; } else { return DIFFERENT_STATE; } } /** Create state info appropriate for particular analyzer */ public StateInfo createStateInfo() { return new BaseStateInfo(); } /** Get state name as string. It can be used for debugging purposes * by developer of new syntax analyzer. The states that this function * recognizes can include all constants used in analyzer so that it can * be used everywhere in analyzer to convert numbers to more practical strings. */ public String getStateName(int stateNumber) { switch(stateNumber) { case INIT: return "INIT"; // NOI18N default: return "Unknown state " + stateNumber; // NOI18N } } /** Syntax information as String */ public String toString() { return "tokenOffset=" + tokenOffset // NOI18N + ", offset=" + offset // NOI18N + ", state=" + getStateName(state) // NOI18N + ", stopOffset=" + stopOffset // NOI18N + ", lastBuffer=" + lastBuffer; // NOI18N } /** Interface that stores two basic pieces of information about * the state of the whole lexical analyzer - its internal state and preScan. */ public interface StateInfo { /** Get the internal state */ public int getState(); /** Store the internal state */ public void setState(int state); /** Get the preScan value */ public int getPreScan(); /** Store the preScan value */ public void setPreScan(int preScan); } /** Base implementation of the StateInfo interface */ public static class BaseStateInfo implements StateInfo { /** analyzer state */ private int state; /** Pre-scan length */ private int preScan; public int getState() { return state; } public void setState(int state) { this.state = state; } public int getPreScan() { return preScan; } public void setPreScan(int preScan) { this.preScan = preScan; } public String toString(Syntax syntax) { return "state=" + ((syntax != null) ? syntax.getStateName(getState()) : Integer.toString(getState())) + ", preScan=" + getPreScan(); // NOI18N } public String toString() { return toString(null); } } }