// CodeAnalyzer.java example
//
// Explorer
// Pydev-master
// - plugins
/******************************************************************************
* Copyright (C) 2015  Fabio Zadrozny and others
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*     Fabio Zadrozny <fabiofz@gmail.com>    - initial API and implementation
******************************************************************************/
package org.python.pydev.shared_core.index;

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharTokenizer;

/**
 * Analyzer for source code which picks the token stream to use based on the name of the
 * field being analyzed.
 *
 * The tokenizers are registered externally for this analyzer (see
 * {@link #registerTokenizer(String, TokenStreamComponents)}). A field without a registered
 * tokenizer falls back to the entry stored under the {@code "__default__"} name, which the
 * constructor fills with {@link #createDefaultComponents(String...)}.
 */
public class CodeAnalyzer extends Analyzer {

    /** Key under which the fallback components are stored. */
    private static final String DEFAULT_FIELD = "__default__";

    /** Components to use for each field name (also holds the {@code "__default__"} entry). */
    Map<String, TokenStreamComponents> fieldNameToStreamComponents = new HashMap<>();

    /**
     * Creates the analyzer with only the default (no stop words) components registered.
     */
    public CodeAnalyzer() {
        // The components differ per field, so they must also be cached per field: the
        // default (global) reuse strategy keeps a single TokenStreamComponents per thread
        // and would reuse whatever was created for the first analyzed field for all others.
        super(PER_FIELD_REUSE_STRATEGY);
        fieldNameToStreamComponents.put(DEFAULT_FIELD, createDefaultComponents());
    }

    /**
     * @param fieldName the name of the field being analyzed.
     * @return the components registered for the given field or, if there are none, the
     * ones registered under {@code "__default__"}.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // NOTE(review): the very same registered instance is handed out on every call, so
        // it'd be shared if this analyzer is used from more than one thread -- confirm
        // that callers use it from a single thread.
        TokenStreamComponents streamComponents = fieldNameToStreamComponents.get(fieldName);
        if (streamComponents != null) {
            return streamComponents;
        }
        return fieldNameToStreamComponents.get(DEFAULT_FIELD);
    }

    /**
     * Registers (or replaces) the components to be used for a given field.
     *
     * @param fieldName the name of the field which should use the given components.
     * @param tokenStream the components to be returned for that field.
     */
    public void registerTokenizer(String fieldName, TokenStreamComponents tokenStream) {
        fieldNameToStreamComponents.put(fieldName, tokenStream);
    }

    // Code in general

    /**
     * Creates components which split the contents in runs of Java identifier chars,
     * lower-case them and drop the given words.
     *
     * @param ignoreWords the words which should not be emitted as tokens (matched
     * regardless of case).
     * @return the tokenizer along with its lower-case/stop-words filter chain.
     */
    public static TokenStreamComponents createDefaultComponents(String... ignoreWords) {
        Tokenizer src = new CharTokenizer() {

            @Override
            protected boolean isTokenChar(int c) {
                return Character.isJavaIdentifierPart(c);
            }

            @Override
            protected int normalize(int c) {
                return Character.toLowerCase(c);
            }
        };

        TokenFilter tok = new LowerCaseFilter(src);
        // The tokens reaching the stop filter are always lower-cased, so, the stop set has
        // to ignore case: otherwise entries such as "None" or "True" would never match.
        CharArraySet stopWords = StopFilter.makeStopSet(ignoreWords, true);
        tok = new StopFilter(tok, stopWords);

        return new TokenStreamComponents(src, tok);
    }

    // Python-related
    private static final String[] PYTHON_KEYWORDS = new String[] {
            "False", "None", "True", "and", "as", "assert",
            "break", "class", "continue", "def", "del", "elif",
            "else", "except", "finally", "for", "from", "global",
            "if", "import", "in", "is", "lambda", "nonlocal",
            "not", "or", "pass", "raise", "return", "try", "while",
            "with", "yield" };

    /**
     * @return the default components configured to drop the Python keywords.
     */
    public static TokenStreamComponents createPythonStreamComponents() {
        return createDefaultComponents(PYTHON_KEYWORDS);
    }

    // Things to ignore in comments/strings
    private static final String[] GENERAL_STOP_WORDS = {
            "a", "an", "and", "are", "as", "at", "be", "but",
            "by", "for", "if", "in", "into", "is", "it", "i",
            "no", "not", "of", "on", "or", "s", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with", "we", "you" };

    /**
     * @return the default components configured to drop common English words (to be used
     * for the contents of strings/comments).
     */
    public static TokenStreamComponents createStringsOrCommentsStreamComponents() {
        return createDefaultComponents(GENERAL_STOP_WORDS);
    }

}