/*
* Copyright (c) 2010-2016, Sikuli.org, sikulix.com
* Released under the MIT License.
*
*/
package org.sikuli.idesupport;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.sikuli.basics.Debug;
/**
* This class is used to determine the state at any given position in a python
* document. Here, state means the following:
* <ul>
* <li>the nesting level of parentheses and string literals at the given
* position,
* <li>whether the given position is inside a comment,
* <li>whether the given position terminates a physical line,
* <li>whether the given position terminates a logical line,
* <li>the indentation of the last physical line,
* <li>the indentation of the last logical line.
* </ul>
* See <a href=
* "http://docs.python.org/reference/lexical_analysis.html#line-structure">line
* structure</a> in the <a href="http://docs.python.org/reference/">python
* language reference</a> for information about physical and logical lines.
* <p>
* To determine the state of a python document at a given position, you feed a
* {@link PythonState} object with the prefix of the document ending at the
* given position. The text can be fed in any number of consecutive chunks
* (input chunks). Typically text is fed to the object in chunks that represent
* (physical) lines, although this is not a requirement.
* <p>
* After each chunk of text, the state is updated and represents the state of
* (the prefix of) the document given by the concatenation of all the chunks
* seen so far. For example, if each chunk is one line of the python document,
* you get information about the state of the document at the end of each line.
* Possible applications are:
* <ul>
* <li>represent a python document as a sequence of logical lines,
* <li>change the indentation level of the next line.
* </ul>
* <p>
* You can retrieve the last complete physical and logical line seen by the
* state object.
* <p>
* You can retrieve a string that represents the structure of the last logical
* line seen by the state object (see {@link #getLastLogicalLineStructure()}).
* The line structure can be used as a hint for automatic code completion, e.g.
* to automatically add a colon to certain python statements (<tt>if</tt>,
* <tt>except</tt>).
* <p>
* This class does not perform syntactic analysis. You cannot use it to find
* syntax errors in a python document.
* <p>
* This class does not perform full lexical analysis. It does not recognize
* keywords, identifiers, or numbers.
*/
public class PythonState {

  /** Number of columns a tab character counts for unless {@link #setTabSize(int)} is called. */
  public static final int DEFAULT_TABSIZE = 4;

  /**
   * The lexical contexts tracked by this class. {@link #DEFAULT} is the bottom
   * of the state stack; every other value is pushed when its opening delimiter
   * is seen and popped when the matching end is seen.
   */
  public enum State {
    DEFAULT,
    IN_SINGLE_QUOTED_STRING,
    IN_DOUBLE_QUOTED_STRING,
    IN_LONG_SINGLE_QUOTED_STRING,
    IN_LONG_DOUBLE_QUOTED_STRING,
    IN_PARENTHESIS,
    IN_COMMENT
  }

  // Patterns that decide what to do next in each state.
  //
  // Every delimiter pattern shares the same tail: an end-of-line sequence that
  // is optionally escaped by a backslash, or a backslash followed by any other
  // character. "\r\n" has to be listed before "\r": regex alternation is
  // ordered, so with "\r" first a CRLF pair would be split into two separate
  // end-of-line matches (a phantom empty line, and broken backslash-CRLF
  // joining).
  private static final String EOL_OR_ESCAPE = "\\\\?(?:\r\n|\r|\n)|\\\\.";

  // starts a string, parenthesis or comment
  private static final Pattern START_DELIMITER =
      Pattern.compile("('''|\"\"\"|['\"(\\[{#]|" + EOL_OR_ESCAPE + ")");
  // starts or ends a string, parenthesis, or starts a comment
  private static final Pattern DELIMITER =
      Pattern.compile("('''|\"\"\"|['\"()\\[\\]{}#]|" + EOL_OR_ESCAPE + ")");
  // ends a single quoted string
  private static final Pattern SINGLE_QUOTE_DELIMITER =
      Pattern.compile("('|" + EOL_OR_ESCAPE + ")");
  // ends a double quoted string
  private static final Pattern DOUBLE_QUOTE_DELIMITER =
      Pattern.compile("(\"|" + EOL_OR_ESCAPE + ")");
  // ends a single quoted long string
  private static final Pattern LONG_SINGLE_QUOTE_DELIMITER =
      Pattern.compile("('''|" + EOL_OR_ESCAPE + ")");
  // ends a double quoted long string
  private static final Pattern LONG_DOUBLE_QUOTE_DELIMITER =
      Pattern.compile("(\"\"\"|" + EOL_OR_ESCAPE + ")");
  // EOL (the only thing that ends a comment)
  private static final Pattern END_OF_LINE = Pattern.compile("(?:\n|\r\n?)");

  // One reusable matcher per pattern; each is re-targeted at unmatchedChunk on
  // every scan step.
  private final Matcher startDelimiterMatcher =
      START_DELIMITER.matcher("").useAnchoringBounds(true);
  private final Matcher delimiterMatcher =
      DELIMITER.matcher("").useAnchoringBounds(true);
  private final Matcher singleQuoteMatcher =
      SINGLE_QUOTE_DELIMITER.matcher("").useAnchoringBounds(true);
  private final Matcher doubleQuoteMatcher =
      DOUBLE_QUOTE_DELIMITER.matcher("").useAnchoringBounds(true);
  private final Matcher longSingleQuoteMatcher =
      LONG_SINGLE_QUOTE_DELIMITER.matcher("").useAnchoringBounds(true);
  private final Matcher longDoubleQuoteMatcher =
      LONG_DOUBLE_QUOTE_DELIMITER.matcher("").useAnchoringBounds(true);
  private final Matcher endOfLineMatcher =
      END_OF_LINE.matcher("").useAnchoringBounds(true);

  /** Text of the physical line currently being assembled. */
  private final StringBuilder physicalLine;
  /** Text of the logical line currently being assembled (escaped EOLs removed). */
  private final StringBuilder logicalLine;
  /** Input that has been fed but not yet consumed by a delimiter match. */
  private final StringBuilder unmatchedChunk;
  private boolean completePhysicalLine;
  private boolean completeLogicalLine;
  /**
   * Set to true to indicate that the next physical line is a continuation of
   * the previous physical line.
   */
  private boolean explicitJoining;
  /**
   * If {@link #explicitJoining} is true, the length of the prefix of
   * {@link #unmatchedChunk} that has already been added to
   * {@link #logicalLine}.
   */
  private int explicitJoinOffset;
  private int physicalLineNumber;
  private int logicalLineNumber;
  private int logicalLinePhysicalStartLineNumber;
  // Indentations use -1 as the "not known yet" marker.
  private int physicalLineIndentation;
  private int logicalLineIndentation;
  private int prevPhysicalLineIndentation;
  private int prevLogicalLineIndentation;
  private int tabsize = DEFAULT_TABSIZE;
  /** Nesting stack; the bottom element is always {@link State#DEFAULT}. */
  private final Stack<State> state;
  /** Depth-0 skeleton of the current logical line. */
  private final StringBuilder logicalLineStructure;

  /**
   * Creates a state object that represents an empty document.
   */
  public PythonState() {
    state = new Stack<State>();
    state.push(State.DEFAULT);
    physicalLine = new StringBuilder();
    logicalLine = new StringBuilder();
    unmatchedChunk = new StringBuilder();
    logicalLineStructure = new StringBuilder();
    reset();
  }

  /**
   * Sets the number of whitespace columns that equals a single tab. This is
   * used to calculate the indentation of lines.
   *
   * @param tabsize
   *          the number of whitespace columns that equals a single tab
   */
  public void setTabSize(int tabsize) {
    this.tabsize = tabsize;
  }

  /**
   * Returns the number of whitespace columns equalling a single tab that is
   * used to calculate the indentation of lines.
   *
   * @return the number of whitespace columns that equals a single tab
   */
  public int getTabSize() {
    return tabsize;
  }

  /**
   * Resets the state of this object. The new state is equivalent to an empty
   * document. The tab size is not changed.
   */
  public void reset() {
    // keep only the DEFAULT entry at the bottom of the stack
    state.setSize(1);
    physicalLine.setLength(0);
    logicalLine.setLength(0);
    unmatchedChunk.setLength(0);
    logicalLineStructure.setLength(0);
    completePhysicalLine = false;
    completeLogicalLine = false;
    explicitJoining = false;
    explicitJoinOffset = 0;
    physicalLineNumber = 0;
    logicalLineNumber = 0;
    logicalLinePhysicalStartLineNumber = 0;
    physicalLineIndentation = -1;
    logicalLineIndentation = -1;
    prevPhysicalLineIndentation = -1;
    prevLogicalLineIndentation = -1;
  }

  /** Tells whether {@code s} is exactly one end-of-line sequence (CR, LF or CRLF). */
  private static boolean isEOL(String s) {
    return s.equals("\r") || s.equals("\n") || s.equals("\r\n");
  }

  /** Tells whether {@code s} is a backslash followed by one end-of-line sequence. */
  private static boolean isEscapedEOL(String s) {
    return s.length() >= 2 && s.charAt(0) == '\\' && isEOL(s.substring(1));
  }

  /**
   * Tells whether {@code s} is a backslash followed by a single character.
   * A backslash followed by a lone CR or LF also satisfies this test, so
   * callers must check {@link #isEscapedEOL(String)} first.
   */
  private static boolean isEscapedChar(String s) {
    return s.length() == 2 && s.charAt(0) == '\\';
  }

  /**
   * Feeds a chunk of text to this object. The text will be (virtually)
   * appended to any text that was fed to this object earlier since the last
   * reset.
   * <p>
   * CR, LF and CRLF are each recognised as one end-of-line sequence. A CRLF
   * pair is only recognised as a single sequence if both characters arrive in
   * the same chunk; a chunk that ends in a bare CR is treated as ending a
   * line.
   *
   * @param newChunk
   *          a new chunk of text
   */
  public void update(String newChunk) {
    unmatchedChunk.append(newChunk);
    // all indexes below are positions in unmatchedChunk
    int nextSearchStart = 0;
    while (nextSearchStart < unmatchedChunk.length()) {
      final int searchStart = nextSearchStart;
      Debug.log(9, "%s: [%s]", state.peek().name(),
          unmatchedChunk.substring(searchStart));
      // more input to match: lines completed by the previous step are now
      // "previous" lines
      beginNextLines();
      explicitJoining = false;

      // use different matchers, depending on current state
      final State current = state.peek();
      final Matcher matcher = matcherFor(current);
      matcher.reset(unmatchedChunk);
      matcher.region(searchStart, unmatchedChunk.length());
      if (!matcher.find()) {
        // nothing decisive in the remaining input; keep it for the next call
        break;
      }
      final String match = matcher.group();
      int matchEnd = matcher.end();
      applyMatch(current, match, searchStart, matchEnd);
      Debug.log(9, "matcher=[%s]", match);

      // add matched input to physical line
      physicalLine.append(unmatchedChunk.substring(
          searchStart + explicitJoinOffset, matchEnd));
      if (completePhysicalLine) {
        physicalLineIndentation = getPhysicalLineIndentation();
        // if this is the first physical line of a logical line, set the
        // logical line indentation
        if (logicalLineIndentation < 0) {
          logicalLineIndentation = physicalLineIndentation;
        }
      }

      if (explicitJoining) {
        // delete backslash-EOL
        unmatchedChunk.delete(matchEnd - match.length(), matchEnd);
        matchEnd -= match.length();
        // add matched input to logical line (minus input that was already
        // added)
        logicalLine.append(unmatchedChunk.substring(
            searchStart + explicitJoinOffset, matchEnd));
        explicitJoinOffset = matchEnd - searchStart;
        completeLogicalLine = false;
        // deleting the backslash-EOL effectively merges the current line
        // with the next line, and we attempt to match it again from the
        // start
        nextSearchStart = searchStart;
        if (matchEnd == unmatchedChunk.length()) {
          // no further match is possible until there is new input
          break;
        }
      } else {
        logicalLine.append(unmatchedChunk.substring(
            searchStart + explicitJoinOffset, matchEnd));
        completeLogicalLine = completePhysicalLine && inDefaultState();
        explicitJoinOffset = 0;
        nextSearchStart = matchEnd;
      }
    }
    unmatchedChunk.delete(0, nextSearchStart);
    Debug.log(9, "%s: unmatched: [%s]", state.peek().name(), unmatchedChunk);
  }

  /**
   * Starts a fresh physical and/or logical line if the previous scan step
   * completed one, moving the current indentation into the "previous" slot.
   */
  private void beginNextLines() {
    if (completePhysicalLine) {
      physicalLine.setLength(0);
      completePhysicalLine = false;
      physicalLineNumber++;
      prevPhysicalLineIndentation = physicalLineIndentation;
    }
    if (completeLogicalLine) {
      logicalLine.setLength(0);
      logicalLineStructure.setLength(0);
      completeLogicalLine = false;
      logicalLineNumber++;
      logicalLinePhysicalStartLineNumber = physicalLineNumber;
      prevLogicalLineIndentation = logicalLineIndentation;
      logicalLineIndentation = -1;
    }
  }

  /** Returns the matcher that finds the next relevant token in the given state. */
  private Matcher matcherFor(State current) {
    switch (current) {
      case DEFAULT:
        return startDelimiterMatcher;
      case IN_PARENTHESIS:
        return delimiterMatcher;
      case IN_SINGLE_QUOTED_STRING:
        return singleQuoteMatcher;
      case IN_DOUBLE_QUOTED_STRING:
        return doubleQuoteMatcher;
      case IN_LONG_SINGLE_QUOTED_STRING:
        return longSingleQuoteMatcher;
      case IN_LONG_DOUBLE_QUOTED_STRING:
        return longDoubleQuoteMatcher;
      case IN_COMMENT:
        return endOfLineMatcher;
      default:
        throw new AssertionError("This should never happen (probably a bug)");
    }
  }

  /**
   * Updates the state stack, the line-completion flags and the logical line
   * structure for one matched token.
   *
   * @param current the state the token was matched in
   * @param match the matched token
   * @param searchStart index in {@link #unmatchedChunk} where the scan started
   * @param matchEnd index in {@link #unmatchedChunk} just after the token
   */
  private void applyMatch(State current, String match, int searchStart,
      int matchEnd) {
    switch (current) {
      case DEFAULT:
        applyInDefault(match, searchStart, matchEnd);
        break;
      case IN_PARENTHESIS:
        applyInParenthesis(match);
        break;
      case IN_SINGLE_QUOTED_STRING:
        applyInString(match, "'");
        break;
      case IN_DOUBLE_QUOTED_STRING:
        applyInString(match, "\"");
        break;
      case IN_LONG_SINGLE_QUOTED_STRING:
        applyInString(match, "'''");
        break;
      case IN_LONG_DOUBLE_QUOTED_STRING:
        applyInString(match, "\"\"\"");
        break;
      case IN_COMMENT:
        // the only token searched for in a comment is the EOL that ends it
        state.pop();
        completePhysicalLine = true;
        break;
      default:
        throw new AssertionError("This should never happen (probably a bug)");
    }
  }

  /**
   * Handles a token found at nesting level 0: an EOL, an escaped EOL, an
   * escaped character, or the start of a string, parenthesis or comment.
   * Everything scanned at level 0 is copied to the logical line structure,
   * except end-of-line sequences.
   */
  private void applyInDefault(String match, int searchStart, int matchEnd) {
    if (isEscapedEOL(match)) {
      // nothing is copied here: the joined line is scanned again from
      // searchStart once the backslash-EOL has been removed
      completePhysicalLine = true;
      explicitJoining = true;
      return;
    }
    if (isEOL(match)) {
      completePhysicalLine = true;
      // append scanned input except EOL
      logicalLineStructure.append(unmatchedChunk.substring(
          searchStart, matchEnd - match.length()));
      return;
    }
    final State opened = stateOpenedBy(match);
    if (opened != null) {
      state.push(opened);
    } else if (!isEscapedChar(match)) {
      throw new AssertionError("unexpected match \"" + match + "\"");
    }
    logicalLineStructure.append(unmatchedChunk.substring(searchStart, matchEnd));
  }

  /**
   * Handles a token found inside a parenthesis: the start of a nested string,
   * parenthesis or comment, a closing parenthesis, or an EOL / escape.
   */
  private void applyInParenthesis(String match) {
    final State opened = stateOpenedBy(match);
    if (opened != null) {
      state.push(opened);
    } else if (match.equals(")") || match.equals("]") || match.equals("}")) {
      closeNestedState(match);
    } else {
      applyLineEndOrEscape(match);
    }
  }

  /**
   * Handles a token found inside a string: either the delimiter that closes
   * the string, or an EOL / escape.
   */
  private void applyInString(String match, String closingDelimiter) {
    if (match.equals(closingDelimiter)) {
      closeNestedState(match);
    } else {
      applyLineEndOrEscape(match);
    }
  }

  /**
   * Leaves the innermost string or parenthesis. If that brings the scanner
   * back to nesting level 0, the closing delimiter is part of the logical
   * line structure.
   */
  private void closeNestedState(String closingDelimiter) {
    state.pop();
    if (state.peek() == State.DEFAULT) {
      logicalLineStructure.append(closingDelimiter);
    }
  }

  /**
   * Handles the tokens shared by all nested states: an EOL ends the physical
   * line, an escaped EOL additionally joins it with the next one, and any
   * other escaped character is skipped.
   */
  private void applyLineEndOrEscape(String match) {
    if (isEOL(match)) {
      completePhysicalLine = true;
    } else if (isEscapedEOL(match)) {
      completePhysicalLine = true;
      explicitJoining = true;
    } else if (!isEscapedChar(match)) {
      throw new AssertionError("unexpected match");
    }
  }

  /**
   * Maps an opening delimiter to the state it enters.
   *
   * @return the state opened by {@code match}, or null if {@code match} does
   *         not open a string, parenthesis or comment
   */
  private static State stateOpenedBy(String match) {
    if (match.equals("'")) {
      return State.IN_SINGLE_QUOTED_STRING;
    }
    if (match.equals("\"")) {
      return State.IN_DOUBLE_QUOTED_STRING;
    }
    if (match.equals("'''")) {
      return State.IN_LONG_SINGLE_QUOTED_STRING;
    }
    if (match.equals("\"\"\"")) {
      return State.IN_LONG_DOUBLE_QUOTED_STRING;
    }
    if (match.equals("(") || match.equals("[") || match.equals("{")) {
      return State.IN_PARENTHESIS;
    }
    if (match.equals("#")) {
      return State.IN_COMMENT;
    }
    return null;
  }

  /**
   * Returns the state of the python document seen so far.
   *
   * @return the current state
   */
  public State getState() {
    return state.peek();
  }

  /**
   * Returns true if the state of the document seen by this object is not
   * inside any parenthesis, string or comment.
   *
   * @return true if the current state is the default state
   */
  public boolean inDefaultState() {
    return state.peek() == State.DEFAULT;
  }

  /**
   * Returns true if the state of the document seen by this object is inside a
   * parenthesis (including square brackets and curly braces).
   *
   * @return true if the current state is inside a parenthesis
   */
  public boolean inParenthesis() {
    return state.peek() == State.IN_PARENTHESIS;
  }

  /**
   * Returns true if the state of the document seen by this object is inside a
   * string (short string or long string).
   *
   * @return true if the current state is inside a string
   */
  public boolean inString() {
    final State current = state.peek();
    return current == State.IN_SINGLE_QUOTED_STRING
        || current == State.IN_DOUBLE_QUOTED_STRING
        || current == State.IN_LONG_SINGLE_QUOTED_STRING
        || current == State.IN_LONG_DOUBLE_QUOTED_STRING;
  }

  /**
   * Returns true if the state of the document seen by this object is inside a
   * long string.
   *
   * @return true if the current state is inside a long string
   */
  public boolean inLongString() {
    final State current = state.peek();
    return current == State.IN_LONG_SINGLE_QUOTED_STRING
        || current == State.IN_LONG_DOUBLE_QUOTED_STRING;
  }

  /**
   * Returns true if the state of the document seen by this object is inside a
   * comment.
   *
   * @return true if the current state is inside a comment
   */
  public boolean inComment() {
    return state.peek() == State.IN_COMMENT;
  }

  /**
   * Returns the nesting level of parentheses and strings that the state of the
   * document seen by this object is in. The nesting level in the default state
   * is 0.
   * <p>
   * Note that parentheses can be nested at any depth, but only one level of
   * string can be nested inside the innermost parentheses because anything
   * inside a string is not interpreted.
   *
   * @return the nesting level of parentheses and strings of the current state
   */
  public int getDepth() {
    return state.size() - 1;
  }

  /**
   * Returns a string that represents the structure of the last logical line.
   * The returned string is identical to the last logical line except that the
   * contents of any strings or parenthesised expression, and any comment (i.e.
   * any input text with a nesting level greater than 0), and the trailing
   * end-of-line character, are deleted.
   * <table border="1">
   * <caption>Examples:</caption>
   * <tr>
   * <th>Input</th>
   * <th>Structure</th>
   * </tr>
   * <tr>
   * <td>{@code print x}</td>
   * <td>{@code print x}</td>
   * </tr>
   * <tr>
   * <td>{@code print 'x'}</td>
   * <td>{@code print ''}</td>
   * </tr>
   * <tr>
   * <td>{@code print '%s=%d\n' % ('a', f(x[0]))}</td>
   * <td>{@code print '' % ()}</td>
   * </tr>
   * <tr>
   * <td>{@code """a long comment"""}</td>
   * <td>{@code """"""}</td>
   * </tr>
   * <tr>
   * <td>{@code if x: pass # case 1}</td>
   * <td>{@code if x: pass #}</td>
   * </tr>
   * </table>
   *
   * @return the structure of the last logical line seen by this object
   */
  public String getLastLogicalLineStructure() {
    return logicalLineStructure.toString();
  }

  /**
   * Returns the last physical line seen by this object, including the
   * terminating end-of-line sequence. If the last line seen by this object is
   * not a complete physical line, the return value is undefined.
   *
   * @return the last complete physical line seen by this object
   */
  public String getLastPhysicalLine() {
    return physicalLine.toString();
  }

  /**
   * Returns the last logical line seen by this object, including the
   * terminating end-of-line sequence. If the input seen by this object does
   * not end with a complete logical line, the return value is guaranteed to
   * include all complete physical lines seen of which the logical line is
   * comprised. If explicit line joining has occurred, any escaped end-of-line
   * sequence is not included in the logical line.
   *
   * @return the last complete logical line seen by this instance
   */
  public String getLastLogicalLine() {
    return logicalLine.toString();
  }

  /**
   * Returns the physical line number of the last physical line seen by this
   * object.
   *
   * @return the physical line number of the line returned by
   *         {@link #getLastPhysicalLine()} (0-based)
   */
  public int getPhysicalLineNumber() {
    return physicalLineNumber;
  }

  /**
   * Returns the logical line number of the last logical line seen by this
   * object.
   *
   * @return the logical line number of the line returned by
   *         {@link #getLastLogicalLine()} (0-based)
   */
  public int getLogicalLineNumber() {
    return logicalLineNumber;
  }

  /**
   * Returns the physical line number of the first physical line in the last
   * logical line seen by this object.
   *
   * @return the physical line number of the first physical line in the logical
   *         line returned by {@link #getLastLogicalLine()} (0-based)
   */
  public int getLogicalLinePhysicalStartLineNumber() {
    return logicalLinePhysicalStartLineNumber;
  }

  /**
   * Returns whether the last physical line seen by this object is complete. A
   * physical line is complete if it is terminated by an end-of-line sequence.
   *
   * @return true if the line returned by {@link #getLastPhysicalLine()} is
   *         complete
   */
  public boolean isPhysicalLineComplete() {
    return completePhysicalLine;
  }

  /**
   * Returns whether the last logical line seen by this object is complete. A
   * logical line is complete if all of the following are true:
   * <ul>
   * <li>the physical lines that it is comprised of are complete (i.e. it is
   * terminated by an end-of-line sequence)
   * <li>it does not end with a physical line that is explicitly joined with
   * the following line (i.e. the final end-of-line sequence is not preceded by
   * a backslash, unless the backslash is part of a comment)
   * <li>it does not contain any open parenthesis or string delimiter without
   * the matching closing parenthesis or string delimiter
   * </ul>
   *
   * @return true if the line returned by {@link #getLastLogicalLine()} is
   *         complete
   */
  public boolean isLogicalLineComplete() {
    return completeLogicalLine;
  }

  /**
   * Returns whether the last physical line seen by this object is explicitly
   * joined with the following line, i.e. whether its end-of-line sequence is
   * escaped with a backslash and the backslash is not inside a comment. If the
   * last physical line seen is not complete, the return value is undefined.
   *
   * @return true if the last complete physical line is explicitly joined with
   *         the following line
   */
  public boolean isExplicitLineJoining() {
    return explicitJoining;
  }

  /**
   * Measures the leading whitespace of {@link #physicalLine}: each blank
   * counts as one column and each tab as {@link #tabsize} columns. Counting
   * stops at the first character that is neither.
   */
  private int getPhysicalLineIndentation() {
    int indentation = 0;
    for (int i = 0; i < physicalLine.length(); i++) {
      final char c = physicalLine.charAt(i);
      if (c == ' ') {
        indentation++;
      } else if (c == '\t') {
        indentation += tabsize;
      } else {
        break;
      }
    }
    return indentation;
  }

  /**
   * Returns the indentation (in columns of whitespace) of the last complete
   * physical line seen by this object.
   * <p>
   * Any tab characters in the leading whitespace of the line are counted as
   * the equivalent number of blank characters.
   *
   * @return the indentation of the last complete physical line
   * @throws IllegalStateException
   *           if the last physical line is not complete
   */
  public int getLastPhysicalLineIndentation() throws IllegalStateException {
    if (!completePhysicalLine) {
      throw new IllegalStateException("incomplete physical line");
    }
    return physicalLineIndentation;
  }

  /**
   * Returns the indentation (in columns of whitespace) of the last logical
   * line seen by this object. This is the indentation of the physical line
   * which is the first line in the logical line.
   * <p>
   * Any tab characters in the leading whitespace of the line are counted as
   * the equivalent number of blank characters.
   *
   * @return the indentation of the last logical line
   * @throws IllegalStateException
   *           if the first physical line in the last logical line is not
   *           complete
   */
  public int getLastLogicalLineIndentation() throws IllegalStateException {
    if (logicalLineIndentation < 0) {
      throw new IllegalStateException("incomplete logical line");
    }
    return logicalLineIndentation;
  }

  /**
   * Returns the indentation of the previous physical line.
   *
   * @return the indentation of the previous physical line
   * @throws IllegalStateException
   *           if no complete physical line or only one complete physical line
   *           has been seen by this object.
   */
  public int getPrevPhysicalLineIndentation() throws IllegalStateException {
    if (prevPhysicalLineIndentation < 0) {
      throw new IllegalStateException("not enough physical lines");
    }
    return prevPhysicalLineIndentation;
  }

  /**
   * Returns the indentation of the previous logical line.
   *
   * @return the indentation of the previous logical line
   * @throws IllegalStateException
   *           if no logical line or only one logical line has been seen by
   *           this instance
   */
  public int getPrevLogicalLineIndentation() throws IllegalStateException {
    if (prevLogicalLineIndentation < 0) {
      throw new IllegalStateException("not enough logical lines");
    }
    return prevLogicalLineIndentation;
  }
}