FastDefinitionsParser.java example

Explorer
Pydev-master
- plugins
/**
 * Copyright (c) 2005-2013 by Appcelerator, Inc. All Rights Reserved.
 * Licensed under the terms of the Eclipse Public License (EPL).
 * Please see the license.txt included with this distribution for details.
 * Any modifications to this file must keep this entire header intact.
 */
package org.python.pydev.parser.fastparser;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.python.pydev.core.ObjectsInternPool;
import org.python.pydev.core.ObjectsInternPool.ObjectsPoolMap;
import org.python.pydev.core.docutils.ParsingUtils;
import org.python.pydev.core.docutils.PySelection;
import org.python.pydev.core.docutils.SyntaxErrorException;
import org.python.pydev.core.log.Log;
import org.python.pydev.parser.jython.SimpleNode;
import org.python.pydev.parser.jython.ast.Assign;
import org.python.pydev.parser.jython.ast.Attribute;
import org.python.pydev.parser.jython.ast.ClassDef;
import org.python.pydev.parser.jython.ast.FunctionDef;
import org.python.pydev.parser.jython.ast.Module;
import org.python.pydev.parser.jython.ast.Name;
import org.python.pydev.parser.jython.ast.NameTok;
import org.python.pydev.parser.jython.ast.exprType;
import org.python.pydev.parser.jython.ast.stmtType;
import org.python.pydev.shared_core.callbacks.ICallback;
import org.python.pydev.shared_core.string.FastStringBuffer;
import org.python.pydev.shared_core.string.StringUtils;
import org.python.pydev.shared_core.structure.FastStack;
import org.python.pydev.shared_core.structure.LowMemoryArrayList;
import org.python.pydev.shared_core.structure.Tuple;

/**
 * @note: Unfinished
 *
 * This class should be able to gather the definitions found in a module in a very fast way.
 *
 * The target is having a performance around 5x faster than doing a regular parse, focusing on getting
 * the name tokens for:
 *
 * classes, functions, class attributes, instance attributes -- basically the tokens that provide a
 * definition that can be 'globally' accessed.
 *
 * This should work the following way:
 *
 * We should have a single stack where all the statements we find are added. When we find a column
 * which indicates a new statement, we close any statement whose column is greater than or equal to
 * the column of the new statement and in the process add those statements to the parent statement
 * as needed (or in some cases, discard them -- e.g.: a method inside a method is discarded, but an
 * attribute inside a method is not).
 *
 * This means that we usually do not put the final element there, but a wrapper which has a body
 * where we can add elements (i.e.: array list), which is converted to a body when its own scope ends.
 *
 * @author Fabio
 */
public final class FastDefinitionsParser {

    /**
     * An entry of the scope stack: the statement that opened the scope, the children collected
     * for it so far and the column used to decide when the scope is over.
     */
    private static class NodeEntry {

        /** The statement (class, function or assign) this entry wraps. */
        public final stmtType node;

        /** Children gathered while the scope is open; moved into the node by {@link #onEndScope()}. */
        public final List<SimpleNode> body = new LowMemoryArrayList<>();

        /** The begin column of the node where every leading tab is counted as 8 columns. */
        public final int logicalColumn;

        /**
         * @param stmt the statement that starts the scope (its beginColumn must already be set).
         * @param leadingTabs how many of the columns in stmt.beginColumn were found as tabs.
         */
        public NodeEntry(stmtType stmt, int leadingTabs) {
            this.node = stmt;
            final int columnsWithoutTabs = stmt.beginColumn - leadingTabs;
            this.logicalColumn = columnsWithoutTabs + (leadingTabs * 8);
        }

        /**
         * Called when the scope is closed: if children were collected, they become the body of
         * the wrapped class/function. Any other node kind having children is reported (or raised
         * when throwErrorOnWarnings is set).
         */
        public void onEndScope() {
            if (body.size() == 0) {
                //nothing was collected: leave the node untouched.
                return;
            }

            final stmtType[] statements = body.toArray(new stmtType[body.size()]);

            if (this.node instanceof ClassDef) {
                ((ClassDef) this.node).body = statements;
                return;
            }

            if (this.node instanceof FunctionDef) {
                ((FunctionDef) this.node).body = statements;
                return;
            }

            final String msg = "Assign statement is not expected to have body!";
            if (throwErrorOnWarnings) {
                throw new RuntimeException(msg);
            }
            Log.log(msg);
        }

    }

    /**
     * Set and kept in the constructor
     */

    /**
     * The chars we should iterate through.
     */
    final private char[] cs;

    /**
     * May be null (just used for reporting errors).
     */
    final private File file;

    /**
     * The length of the buffer we're iterating.
     */
    final private int length;

    /**
     * Current iteration index
     */
    private int currIndex = 0;

    /**
     * The current column
     */
    private int col;

    /**
     * How many leading tabs we've found.
     */
    private int leadingTabsInLine;

    /**
     * The current row
     */
    private int row = 0;

    /**
     * The column where the 1st char was found
     */
    private int firstCharCol = 1;

    /**
     * Holds things added to the 'global' module
     */
    private final ArrayList<stmtType> body = new ArrayList<stmtType>(16);

    /**
     * Holds a stack of classes so that we create a new one in each new scope to be filled and when the scope is ended,
     * it should have its body filled with the stackBody contents related to each
     */
    private final FastStack<NodeEntry> stack = new FastStack<NodeEntry>(10);

    /**
     * Buffer with the contents of a line.
     */
    private final FastStringBuffer lineBuffer = new FastStringBuffer();

    private final String moduleName;

    public static boolean throwErrorOnWarnings = false;

    /**
     * Should we debug?
     */
    private final static boolean DEBUG = false;

    /**
     * Creates a parser which considers the whole passed array.
     *
     * @param cs array of chars that should be considered.
     * @param moduleName the name of the module (only used when reporting problems).
     * @param f the related file (may be null, only used when reporting problems).
     */
    private FastDefinitionsParser(char[] cs, String moduleName, File f) {
        this(cs, cs.length, moduleName, f);
    }

    /**
     * Creates a parser which considers only the first len chars of the passed array.
     *
     * @param cs array of chars that should be considered.
     * @param len the number of chars to be used (usually cs.length).
     * @param moduleName the name of the module (only used when reporting problems).
     * @param f the related file (may be null, only used when reporting problems).
     */
    private FastDefinitionsParser(char[] cs, int len, String moduleName, File f) {
        this.moduleName = moduleName;
        this.file = f;
        this.cs = cs;
        this.length = len;
    }

    /**
     * This is the method that actually extracts things from the passed buffer.
     *
     * It walks the buffer one char at a time: string literals, comments and anything inside
     * parenthesis/brackets/braces are skipped, each line start is passed to handleNewLine (which
     * detects class/def statements) and each '=' triggers the analysis of the whole line to
     * create an Assign for the names found on its left side. When the buffer ends, all the scopes
     * still open are closed, so that the top-level statements end up in the module body.
     *
     * @throws SyntaxErrorException
     */
    private void extractBody() throws SyntaxErrorException {
        ParsingUtils parsingUtils = ParsingUtils.create(cs, false, length);

        //the 1st line is not preceded by a new line char, so, it has to be handled explicitly.
        if (currIndex < length) {
            handleNewLine(parsingUtils);
        }
        //in the 1st attempt to handle the 1st line, if it had nothing we could actually go backward 1 char
        if (currIndex < 0) {
            currIndex = 0;
        }

        for (; currIndex < length; currIndex++, col++) {
            char c = cs[currIndex];

            switch (c) {

                case '\'':
                case '"':
                    if (DEBUG) {
                        System.out.println("literal");
                    }
                    //go to the end of the literal
                    int initialIndex = currIndex;
                    currIndex = parsingUtils.getLiteralEnd(currIndex, c);

                    //keep the row count correct
                    updateCountRow(initialIndex, currIndex);
                    break;

                case '#':
                    if (DEBUG) {
                        System.out.println("comment");
                    }
                    //go to the end of the comment
                    while (currIndex < length) {
                        c = cs[currIndex];
                        if (c == '\r' || c == '\n') {
                            //step back so that the increment of the main loop lands on the new line char
                            //(which is then handled by the new line cases below).
                            currIndex--;
                            break;
                        }
                        currIndex++;
                    }

                    break;

                case '{':
                case '[':
                case '(':
                    //starting some call, dict, list, tuple... those don't count on getting some actual definition
                    initialIndex = currIndex;
                    currIndex = parsingUtils.eatPar(currIndex, null, c);

                    //keep the row count correct
                    updateCountRow(initialIndex, currIndex);
                    break;

                case '\r':
                    //treat \r\n as a single new line
                    if (currIndex < length - 1 && cs[currIndex + 1] == '\n') {
                        currIndex++;
                    }
                    /*FALLTHROUGH**/
                case '\n':
                    //go to the 1st char of the next line and let handleNewLine deal with it.
                    currIndex++;
                    handleNewLine(parsingUtils);
                    if (currIndex < length) {
                        c = cs[currIndex];
                    }

                    break;

                case '=':
                    if ((currIndex < length - 1 && cs[currIndex + 1] != '=' && currIndex > 0
                            && cs[currIndex - 1] != '=')) {
                        //should not be ==
                        //other cases such as !=, +=, -= are already treated because they don't constitute valid
                        //chars for an identifier.

                        if (DEBUG) {
                            System.out.println("Found possible attribute:" + lineBuffer + " col:" + firstCharCol);
                        }

                        //if we've an '=', let's get the whole line contents to analyze...
                        //Note: should have stopped just before the new line (so, as we'll do currIndex++ in the
                        //next loop, that's ok).
                        initialIndex = currIndex;
                        currIndex = parsingUtils.getFullFlattenedLine(currIndex, lineBuffer);

                        //keep the row count correct
                        updateCountRow(initialIndex, currIndex);

                        String equalsLine = lineBuffer.toString().trim();
                        if (!PySelection.startsWithIndentToken(equalsLine)) {

                            lineBuffer.clear();

                            //each part before the last '=' is a possible target (i.e.: a = b = 10)
                            final List<String> splitted = StringUtils.split(equalsLine, '=');
                            final int splittedLen = splitted.size();
                            ArrayList<exprType> targets = new ArrayList<exprType>(2);

                            for (int j = 0; j < splittedLen - 1 || (splittedLen == 1 && j == 0); j++) { //we don't want to get the last one.
                                String lineContents = splitted.get(j).trim();
                                if (lineContents.length() == 0) {
                                    continue;
                                }
                                boolean add = true;
                                int lineContentsLen = lineContents.length();
                                for (int i = 0; i < lineContentsLen; i++) {
                                    char lineC = lineContents.charAt(i);
                                    //can only be made of valid java chars (no spaces or similar things)
                                    if (lineC != '.' && !Character.isJavaIdentifierPart(lineC)) {
                                        add = false;
                                        break;
                                    }
                                }
                                if (add) {
                                    //only add if it was something valid
                                    if (lineContents.indexOf('.') != -1) {
                                        //dotted targets are only kept in the form self.<name>
                                        List<String> dotSplit = StringUtils.dotSplit(lineContents);
                                        if (dotSplit.size() == 2 && dotSplit.get(0).equals("self")) {
                                            Attribute attribute = new Attribute(new Name("self", Name.Load, false),
                                                    new NameTok(dotSplit.get(1), NameTok.Attrib), Attribute.Load);
                                            targets.add(attribute);
                                        }

                                    } else {
                                        Name name = new Name(lineContents, Name.Store, false);
                                        targets.add(name);
                                    }
                                }
                            }

                            if (targets.size() > 0) {
                                //the value is not kept: only the targets matter for the definitions.
                                Assign assign = new Assign(targets.toArray(new exprType[targets.size()]), null, null);
                                assign.beginColumn = this.firstCharCol;
                                assign.beginLine = this.row;
                                stack.push(new NodeEntry(assign, leadingTabsInLine));
                            }
                        }
                    }
                    //No default
            }
            //Note: c may have been reassigned in the cases above (comment and new line handling).
            lineBuffer.append(c);
        }

        //close everything still open so that the top-level statements are added to the module body.
        endScopesInStack(0);
    }

    /**
     * Adds to the current row the number of new lines found in the buffer between the passed
     * indexes (both inclusive, limited to the length being parsed). A \r\n pair is counted as a
     * single new line when both chars are inside of the range.
     *
     * @param initialIndex the first index to be checked.
     * @param currIndex the last index to be checked.
     */
    public void updateCountRow(int initialIndex, int currIndex) {
        final int lastIndex = Math.min(currIndex, length - 1);

        int k = initialIndex;
        while (k <= lastIndex) {
            final char ch = cs[k];
            if (ch == '\r') {
                row++;
                if (k < lastIndex && cs[k + 1] == '\n') {
                    k++; //skip the \n after the \r
                }
            } else if (ch == '\n') {
                row++;
            }
            k++;
        }
    }

    /**
     * Called when a new line is found (currIndex must be at the 1st char of the line). Tries to make
     * the match of function and class definitions.
     *
     * What is done here:
     * - col, leadingTabsInLine and lineBuffer are reset and row is incremented;
     * - the leading whitespaces are skipped (counting the tabs found);
     * - if the line has actual contents (not empty nor a comment), scopes which are at the same or at
     *   a deeper logical column are closed;
     * - if the line starts with 'class', 'def' or 'async def', a new scope is pushed to the stack;
     * - if what follows is "(...):" with more contents in the same line (i.e.: def m2(self): self.a = 10),
     *   currIndex and firstCharCol are updated to point to such contents.
     *
     * In the regular case currIndex is left 1 char before the position where the parsing should
     * proceed (the loop in the caller increments it afterwards).
     *
     * Nothing is done if there are less than 2 chars available from currIndex.
     *
     * @param parsingUtils helper used to skip parenthesis.
     * @throws SyntaxErrorException
     */
    private void handleNewLine(ParsingUtils parsingUtils) throws SyntaxErrorException {
        if (currIndex >= length - 1) {
            return;
        }

        col = 1;
        leadingTabsInLine = 0;
        row++;
        if (DEBUG) {
            System.out.println("Handling new line:" + row);
        }

        lineBuffer.clear();
        char c = cs[currIndex];

        while (currIndex < length - 1 && Character.isWhitespace(c) && c != '\r' && c != '\n') {
            currIndex++;
            col++;
            if (c == '\t') {
                leadingTabsInLine++;
            }
            c = cs[currIndex];
        }

        if (!Character.isWhitespace(c) && c != '#') {
            //same computation done for NodeEntry.logicalColumn (each tab counts as 8 columns).
            endScopesInStack((col - leadingTabsInLine) + (leadingTabsInLine * 8));
        }

        int funcDefIndex = -1;
        if (c == 'c' && matchClass()) {
            int startClassCol = col;
            currIndex += 6;
            col += 6;

            if (this.length <= currIndex) {
                return;
            }
            startClass(getNextIdentifier(c), row, startClassCol, leadingTabsInLine);

        } else if ((c == 'd' && (funcDefIndex = matchFunction()) != -1) ||
                (c == 'a' && (funcDefIndex = matchAsyncFunction()) != -1)) {
            if (DEBUG) {
                System.out.println("Found method");
            }
            int startMethodCol = col;
            //funcDefIndex is an index in the buffer (not a column), so, the column has to be advanced
            //by the number of chars consumed (as is done for the class with col += 6).
            col += (funcDefIndex + 1) - currIndex;
            currIndex = funcDefIndex + 1;

            if (this.length <= currIndex) {
                return;
            }
            startMethod(getNextIdentifier(c), row, startMethodCol, leadingTabsInLine);
        }
        firstCharCol = col;
        if (currIndex < length) {

            //starting some call, dict, list, tuple... those don't count on getting some actual definition
            int initialIndex = currIndex;

            int tempIndex = skipWhitespaces(currIndex);

            if (tempIndex >= length) {
                return;
            }
            c = cs[tempIndex];

            boolean updateIndex = false;
            switch (c) {
                case '(':
                    tempIndex = parsingUtils.eatPar(tempIndex, null, c);

                    //Note: skipWhitespaces may return length (blanks up to the end of the buffer), so,
                    //the bounds must be checked again before reading the char.
                    if (tempIndex < length) {
                        tempIndex = skipWhitespaces(tempIndex);

                        if (tempIndex < length) {
                            c = cs[tempIndex];
                            if (c == ')') {
                                tempIndex++;
                            }
                        }
                    }

                    if (tempIndex < length) {
                        tempIndex = skipWhitespaces(tempIndex);

                        if (tempIndex < length) {
                            c = cs[tempIndex];
                            if (c == ':') {
                                tempIndex++;

                                tempIndex = skipWhitespaces(tempIndex);
                                if (tempIndex < length) {
                                    c = cs[tempIndex];
                                    if (c != '\r' && c != '\n') {
                                        //there's something after the ':' in the same line.
                                        updateIndex = true;
                                    }
                                }
                            }
                        }
                    }

                    if (updateIndex) {
                        tempIndex = skipWhitespaces(tempIndex);
                        currIndex = tempIndex;
                        //keep the row count correct
                        updateCountRow(initialIndex, currIndex);

                        //now, update the first char col to be the char after the ':' in "def m2(self):", in a line as
                        //def m2(self): self.a = 10 (all in a single line)
                        int i = tempIndex;
                        while (i > 0 && i < length) {
                            c = cs[i];
                            if (c == '\r' || c == '\n') {
                                break;
                            }
                            i--;
                        }
                        firstCharCol = tempIndex - i;
                    } else {
                        //the loop in the caller will increment it again.
                        currIndex--;
                    }

                    break;

                default:
                    //the loop in the caller will increment it again.
                    currIndex--;
                    break;

            }
        }
    }

    /**
     * Note that it'll only skip whitespaces (spaces and tabs), not newlines.
     *
     * @param tempIndex the index where the search should start.
     * @return the index of the first char which is not a space nor a tab at or after the passed
     * index (or the length being parsed if there's no such char).
     */
    private int skipWhitespaces(int tempIndex) {
        int index = tempIndex;
        for (; index < length; index++) {
            final char ch = cs[index];
            if (ch != ' ' && ch != '\t') {
                break;
            }
        }
        return index;
    }

    /**
     * Get the next identifier available.
     *
     * Starting at currIndex, whitespaces are skipped and then all the chars which are valid as part
     * of a java identifier are consumed. When it returns, currIndex points to the first char after
     * the identifier (or to the length being parsed if the buffer ended).
     *
     * The buffer is never read at or after the length being parsed: if only whitespaces are
     * available until the end (i.e.: a "class   " at the end of the buffer), an empty identifier
     * is returned.
     *
     * @param c the current char (not used: the char is always read from the buffer at currIndex).
     * @return the identifier found (interned in the local pool). It's empty if no identifier char
     * was found after the whitespaces.
     */
    private String getNextIdentifier(char c) {
        //skip the whitespaces before the identifier (checking the bounds before each read).
        while (currIndex < length && Character.isWhitespace(this.cs[currIndex])) {
            currIndex++;
        }

        final int identifierStart = currIndex;
        while (currIndex < length && Character.isJavaIdentifierPart(this.cs[currIndex])) {
            currIndex++;
        }

        return ObjectsInternPool.internLocal(interned,
                new String(this.cs, identifierStart, currIndex - identifierStart));
    }

    private final ObjectsPoolMap interned = new ObjectsPoolMap();

    /**
     * Start a new method scope with the given row and column (a FunctionDef with only the name
     * set is pushed to the stack).
     *
     * @param name the name of the method
     * @param startMethodRow the row where the scope should start
     * @param startMethodCol the column where the scope should start
     * @param leadingTabs how many of the columns in startMethodCol were found as tabs
     */
    private void startMethod(String name, int startMethodRow, int startMethodCol, int leadingTabs) {
        //NOTE(review): the name of the function is created with NameTok.ClassName -- confirm whether
        //that's intended.
        final FunctionDef def = new FunctionDef(new NameTok(name, NameTok.ClassName), null, null, null, null,
                false);
        def.beginColumn = startMethodCol;
        def.beginLine = startMethodRow;

        final NodeEntry entry = new NodeEntry(def, leadingTabs);
        stack.push(entry);
    }

    /**
     * Start a new class scope with the given row and column (a ClassDef with only the name set is
     * pushed to the stack).
     *
     * @param name the name of the class
     * @param startClassRow the row where the scope should start
     * @param startClassCol the column where the scope should start
     * @param leadingTabs how many of the columns in startClassCol were found as tabs
     */
    private void startClass(String name, int startClassRow, int startClassCol, int leadingTabs) {
        final ClassDef cls = new ClassDef(new NameTok(name, NameTok.ClassName), null, null, null, null, null,
                null);
        cls.beginColumn = startClassCol;
        cls.beginLine = startClassRow;

        final NodeEntry entry = new NodeEntry(cls, leadingTabs);
        stack.push(entry);
    }

    /**
     * Closes all the scopes at the top of the stack whose logical column is greater than or equal
     * to the passed column.
     *
     * Each closed node has its body assigned and is then added to its parent:
     * - no parent in the stack: it's added to the module body;
     * - parent is a class: it's always added;
     * - parent is a function: it's only added if it's an assign whose first target is an attribute
     *   and the function is directly inside of a class (anything else is discarded);
     * - any other parent: the situation is reported (or raised when throwErrorOnWarnings is set).
     *
     * @param currLogicalCol the logical column of the statement being started (0 closes everything).
     */
    private void endScopesInStack(int currLogicalCol) {
        while (stack.size() > 0 && stack.peek().logicalColumn >= currLogicalCol) {
            final NodeEntry closed = stack.pop();
            closed.onEndScope();

            if (stack.size() == 0) {
                //top-level statement
                body.add(closed.node);
                continue;
            }

            final NodeEntry parent = stack.peek();
            if (parent.node instanceof FunctionDef) {
                // Inside a function def, only deal with attributes (if func inside class)
                if (closed.node instanceof Assign && stack.size() > 1) {
                    final exprType firstTarget = ((Assign) closed.node).targets[0];
                    if (firstTarget instanceof Attribute && stack.peek(1).node instanceof ClassDef) {
                        parent.body.add(closed.node);
                    }
                }
                continue;
            }

            if (parent.node instanceof ClassDef) {
                parent.body.add(closed.node);
                continue;
            }

            final String msg = "Did not expect to find item below node: " + parent.node + " (module: "
                    + this.moduleName + " file: " + this.file + " row: " + row
                    + ").";
            if (throwErrorOnWarnings) {
                throw new RuntimeException(msg);
            }
            Log.log(msg);
        }
    }

    /**
     * Checks for 'class' followed by a whitespace char starting at the current index.
     *
     * @return true if we have a match for 'class' in the current index (the 'c' must be already matched at this point)
     */
    private boolean matchClass() {
        final int base = currIndex;
        if (base + 5 >= this.length) {
            //not enough chars for 'class' + whitespace
            return false;
        }

        final char[] chars = this.cs;
        if (chars[base + 1] != 'l' || chars[base + 2] != 'a') {
            return false;
        }
        if (chars[base + 3] != 's' || chars[base + 4] != 's') {
            return false;
        }
        return Character.isWhitespace(chars[base + 5]);
    }

    /**
     * Checks for 'def' followed by a whitespace char starting at the current index (the 'd' must be
     * already matched at this point).
     *
     * @return the index of the whitespace char found right after the 'def' or -1 if there's no match.
     */
    private int matchFunction() {
        final int afterDef = currIndex + 3;
        if (afterDef >= this.length) {
            //not enough chars for 'def' + whitespace
            return -1;
        }

        final boolean isDef = this.cs[currIndex + 1] == 'e'
                && this.cs[currIndex + 2] == 'f'
                && Character.isWhitespace(this.cs[afterDef]);
        return isDef ? afterDef : -1;
    }

    /**
     * Checks for 'async', whitespaces and then 'def' followed by a whitespace char starting at the
     * current index (the 'a' must be already matched at this point).
     *
     * @return the index of the whitespace char found right after the 'def' or -1 if there's no match.
     */
    private int matchAsyncFunction() {
        final int base = currIndex;
        if (base + 5 >= this.length) {
            //not enough chars for 'async' + whitespace
            return -1;
        }

        final boolean isAsync = this.cs[base + 1] == 's' && this.cs[base + 2] == 'y'
                && this.cs[base + 3] == 'n' && this.cs[base + 4] == 'c'
                && Character.isWhitespace(this.cs[base + 5]);
        if (!isAsync) {
            return -1;
        }

        //skip any additional whitespace between 'async' and 'def'
        int defStart = base + 6;
        while (defStart < this.length && Character.isWhitespace(this.cs[defStart])) {
            defStart++;
        }

        final int afterDef = defStart + 3;
        if (afterDef >= this.length) {
            return -1;
        }
        if (this.cs[defStart] != 'd' || this.cs[defStart + 1] != 'e' || this.cs[defStart + 2] != 'f') {
            return -1;
        }
        return Character.isWhitespace(this.cs[afterDef]) ? afterDef : -1;
    }

    /**
     * Callbacks called just before returning a parsed object. Used for tests
     */
    public static List<ICallback<Object, Tuple<String, SimpleNode>>> parseCallbacks = new ArrayList<ICallback<Object, Tuple<String, SimpleNode>>>();

    /**
     * Convenience method for parse(s.toCharArray(), moduleName, f)
     *
     * @param s the string to be parsed
     * @param moduleName the name of the module (only used when reporting problems)
     * @param f the related file (may be null, only used when reporting problems)
     * @return a Module node with the structure found
     */
    public static SimpleNode parse(String s, String moduleName, File f) {
        final char[] chars = s.toCharArray();
        return parse(chars, moduleName, f);
    }

    /**
     * This method will parse the whole char array passed and will build a structure with the
     * contents of the file.
     *
     * @param cs the char array to be parsed
     * @param moduleName the name of the module (only used when reporting problems)
     * @param f the related file (may be null, only used when reporting problems)
     * @return a Module node with the structure found
     */
    public static SimpleNode parse(char[] cs, String moduleName, File f) {
        final int len = cs.length;
        return parse(cs, moduleName, len, f);
    }

    /**
     * Parses the first len chars of the passed array and builds a Module with the definitions
     * found. The registered parseCallbacks are called with the (moduleName, Module) just before
     * returning.
     *
     * @param cs the char array to be parsed
     * @param moduleName the name of the module (used when reporting problems and in the callbacks)
     * @param len the number of chars to be considered in the array
     * @param f the related file (may be null, only used when reporting problems)
     * @return a Module node with the structure found
     * @throws RuntimeException wrapping a SyntaxErrorException or a StackOverflowError raised
     * while parsing (in the latter case the start of the contents is also logged).
     */
    public static SimpleNode parse(char[] cs, String moduleName, int len, File f) {
        final FastDefinitionsParser parser = new FastDefinitionsParser(cs, len, moduleName, f);
        try {
            parser.extractBody();
        } catch (SyntaxErrorException e) {
            throw new RuntimeException(e);
        } catch (StackOverflowError e) {
            final RuntimeException wrapped = new RuntimeException(e);
            final int reportedLen = Math.min(len, 1000); //report at most 1000 chars...
            Log.log("Error parsing: " + moduleName + " - " + f + "\nContents:\n"
                    + new String(cs, 0, reportedLen),
                    wrapped);
            throw wrapped;
        }

        final List<stmtType> found = parser.body;
        final stmtType[] statements = found.toArray(new stmtType[found.size()]);
        final Module ret = new Module(statements);

        if (parseCallbacks.size() > 0) {
            final Tuple<String, SimpleNode> arg = new Tuple<String, SimpleNode>(moduleName, ret);
            for (ICallback<Object, Tuple<String, SimpleNode>> callback : parseCallbacks) {
                callback.call(arg);
            }
        }
        return ret;
    }

    /**
     * Convenience method to parse a string when there's no module name nor file available.
     *
     * @param s the string to be parsed
     * @return a Module node with the structure found
     */
    public static SimpleNode parse(String s) {
        final char[] chars = s.toCharArray();
        return parse(chars, null, null);
    }

}