/******************************************************************************
* Copyright (C) 2015 Fabio Zadrozny and others
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Fabio Zadrozny <fabiofz@gmail.com> - initial API and implementation
******************************************************************************/
package org.python.pydev.shared_core.index;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharTokenizer;
/**
 * Analyzer for source code in which the token stream used for each field is
 * registered externally through {@link #registerTokenizer(String, TokenStreamComponents)}.
 *
 * Fields without a registered entry fall back to the components stored under
 * the {@code "__default__"} key, which the constructor fills in with
 * {@link #createDefaultComponents(String...)} (no stop words).
 *
 * NOTE(review): the very same {@link TokenStreamComponents} instance is handed
 * out on every {@link #createComponents(String)} call for a given field.
 * Lucene normally caches components per thread, so sharing one instance may
 * not be safe if this analyzer is used from several threads -- confirm against
 * the callers before relying on concurrent use.
 */
public class CodeAnalyzer extends Analyzer {

    /** Key under which the fallback components are stored. */
    private static final String DEFAULT_FIELD_KEY = "__default__";

    /** Field name -> components registered for that field (plus the default entry). */
    Map<String, TokenStreamComponents> fieldNameToStreamComponents = new HashMap<>();

    /**
     * Creates the analyzer with only the default (fallback) components registered.
     */
    public CodeAnalyzer() {
        super();
        fieldNameToStreamComponents.put(DEFAULT_FIELD_KEY, createDefaultComponents());
    }

    /**
     * Returns the components registered for the given field or, when nothing
     * (or {@code null}) was registered for it, the default components.
     *
     * @param fieldName the name of the field being analyzed.
     * @return the components registered for the field or the default ones.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        TokenStreamComponents streamComponents = fieldNameToStreamComponents.get(fieldName);
        if (streamComponents != null) {
            return streamComponents;
        }
        return fieldNameToStreamComponents.get(DEFAULT_FIELD_KEY);
    }

    /**
     * Registers (or replaces) the components to be used for a field.
     *
     * @param fieldName the field for which the components should be used.
     * @param tokenStream the components to return for that field.
     */
    public void registerTokenizer(String fieldName, TokenStreamComponents tokenStream) {
        fieldNameToStreamComponents.put(fieldName, tokenStream);
    }

    // Code in general

    /**
     * Creates components which split the input on anything that is not a Java
     * identifier part, lower-case the resulting tokens and drop the given words.
     *
     * The words to ignore are matched case-insensitively: tokens are always
     * lower-cased before reaching the stop filter, so a case-sensitive set would
     * never match entries such as {@code "False"}, {@code "None"} or {@code "True"}.
     *
     * @param ignoreWords the words which should not be emitted as tokens
     *        (may be empty; {@code null} is treated as empty).
     * @return the tokenizer and the filter chain wrapping it.
     */
    public static TokenStreamComponents createDefaultComponents(String... ignoreWords) {
        Tokenizer src = new CharTokenizer() {

            @Override
            protected boolean isTokenChar(int c) {
                return Character.isJavaIdentifierPart(c);
            }

            @Override
            protected int normalize(int c) {
                return Character.toLowerCase(c);
            }
        };

        String[] words = ignoreWords != null ? ignoreWords : new String[0];

        // Kept from the original chain (the tokenizer already lower-cases each char).
        TokenFilter tok = new LowerCaseFilter(src);

        // ignoreCase == true: mixed-case stop words must match the lower-cased tokens.
        CharArraySet stopWords = StopFilter.makeStopSet(words, true);
        tok = new StopFilter(tok, stopWords);

        return new TokenStreamComponents(src, tok);
    }

    // Python-related

    /** Python keywords, which are not worth indexing in Python code. */
    private static final String[] PYTHON_KEYWORDS = new String[] {
            "False", "None", "True", "and", "as", "assert",
            "break", "class", "continue", "def", "del", "elif",
            "else", "except", "finally", "for", "from", "global",
            "if", "import", "in", "is", "lambda", "nonlocal",
            "not", "or", "pass", "raise", "return", "try", "while",
            "with", "yield" };

    /**
     * @return components equal to the default ones, but ignoring the Python keywords.
     */
    public static TokenStreamComponents createPythonStreamComponents() {
        return createDefaultComponents(PYTHON_KEYWORDS);
    }

    // Things to ignore in comments/strings

    /** Common English words, which are not worth indexing in comments/strings. */
    private static final String[] GENERAL_STOP_WORDS = {
            "a", "an", "and", "are", "as", "at", "be", "but",
            "by", "for", "if", "in", "into", "is", "it", "i",
            "no", "not", "of", "on", "or", "s", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with", "we", "you" };

    /**
     * @return components equal to the default ones, but ignoring common English words.
     */
    public static TokenStreamComponents createStringsOrCommentsStreamComponents() {
        return createDefaultComponents(GENERAL_STOP_WORDS);
    }
}