/******************************************************************************
* Copyright (C) 2015 Fabio Zadrozny and others
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Fabio Zadrozny <fabiofz@gmail.com> - initial API and implementation
******************************************************************************/
package org.python.pydev.shared_core.index;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;
import org.eclipse.jface.text.rules.IToken;
import org.eclipse.jface.text.rules.ITokenScanner;
import org.python.pydev.shared_core.callbacks.ICallback;
import org.python.pydev.shared_core.io.FileUtils;
import org.python.pydev.shared_core.log.Log;
import org.python.pydev.shared_core.partitioner.IContentsScanner;
import org.python.pydev.shared_core.string.FastStringBuffer;
import org.python.pydev.shared_core.string.StringUtils;
import org.python.pydev.shared_core.structure.OrderedMap;
import org.python.pydev.shared_core.utils.Timer;
public class IndexApi {
/** When true, searchWildcard prints the query it built to stdout before running it. */
public static final boolean DEBUG = false;
// Suffix appended to the path of file-system based indexes which don't already end with it
// (see the protected constructor).
private static final String lucene6dot1Suffix = "L6dot1";
// Directory on which the IndexWriter is opened.
private final Directory indexDir;
// Created in init(); set to null by dispose() (several methods are no-ops when it's null).
private IndexWriter writer;
// Created in init() on top of the writer; closed and set to null by dispose().
private SearcherManager searchManager;
// Used to create the IndexSearcher instances in search() and visitAllDocs().
private SearcherFactory searcherFactory;
// Maximum number of hits requested from the searcher in search().
private int maxMatches = Integer.MAX_VALUE;
// Analyzer given to the writer configuration; tokenizers are registered in it per field name.
private CodeAnalyzer analyzer;
// Object handed out by getLock() so that clients can synchronize externally.
private final Object lock = new Object();
/**
 * Creates the API on top of an already created Lucene directory.
 *
 * If the directory is file-system based and its absolute path doesn't end with the
 * {@code L6dot1} suffix, a sibling directory whose path has that suffix appended is
 * opened and used instead of the one which was passed.
 *
 * @param indexDirObj must be an org.apache.lucene.store.Directory (anything else fails the cast).
 * @param applyAllDeletes forwarded to {@link #init(boolean)}.
 * @throws IOException if the directory or the index writer can't be opened.
 */
protected IndexApi(Object /*Directory*/ indexDirObj, boolean applyAllDeletes) throws IOException {
    // Note: indexDirObj must actually be a org.apache.lucene.store.Directory. It's declared as
    // Object so that Lucene isn't part of the public API -- that way clients don't need to depend
    // on it (they'll usually use the other constructor, which receives a File, anyways).
    Directory requested = (Directory) indexDirObj;
    Directory effective = requested;
    if (requested instanceof FSDirectory) {
        File location = ((FSDirectory) requested).getDirectory().toFile();
        String absolutePath = location.getAbsolutePath();
        if (!absolutePath.endsWith(lucene6dot1Suffix)) {
            effective = FSDirectory.open(new File(absolutePath + lucene6dot1Suffix).toPath());
        }
    }
    this.indexDir = effective;
    init(applyAllDeletes);
}
/**
 * Gives access to the object external users can synchronize on.
 *
 * The methods in this API aren't synchronized, so, if more than one thread may use the
 * same instance in a given use-case, callers are expected to synchronize on this object.
 *
 * @return the lock object (always the same instance for a given IndexApi).
 */
public Object getLock() {
    return this.lock;
}
/**
 * Creates the API for an index stored in the file system.
 *
 * An FSDirectory is opened for the given location and passed to the Directory-based
 * constructor (which switches to a path with the "L6dot1" suffix appended when the
 * given path doesn't already end with it).
 *
 * @param indexDir the location of the index in the file system.
 * @param applyAllDeletes forwarded to the Directory-based constructor.
 * @throws IOException if the directory or the index writer can't be opened.
 */
public IndexApi(File indexDir, boolean applyAllDeletes) throws IOException {
this(FSDirectory.open(indexDir.toPath()), applyAllDeletes);
}
/**
 * Creates the analyzer, the index writer and the searcher manager for the index directory.
 *
 * The writer is first opened in CREATE_OR_APPEND mode; if that fails with an IOException
 * the failure is logged and the index is opened again in CREATE mode (i.e.: recreated).
 *
 * Note: this method is called from the constructor.
 *
 * @param applyAllDeletes passed to the SearcherManager created on top of the writer.
 * @throws IOException if the writer can't be opened even in CREATE mode or if the
 * searcher manager can't be created.
 */
public void init(boolean applyAllDeletes) throws IOException {
    this.analyzer = new CodeAnalyzer();
    try {
        writer = new IndexWriter(this.indexDir, newWriterConfig(OpenMode.CREATE_OR_APPEND));
    } catch (IOException e) {
        Log.log(e);
        // A new configuration is needed for the second attempt: Lucene binds an IndexWriterConfig
        // to the first IndexWriter constructed with it and rejects sharing it with another writer
        // (so, reusing the config of the failed attempt would make this fallback always fail).
        writer = new IndexWriter(this.indexDir, newWriterConfig(OpenMode.CREATE));
    }
    searcherFactory = new SearcherFactory();
    searchManager = new SearcherManager(writer, applyAllDeletes, false, searcherFactory);
}

/**
 * Creates a writer configuration (which commits on close) using the current analyzer.
 */
private IndexWriterConfig newWriterConfig(OpenMode openMode) {
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setCommitOnClose(true);
    config.setOpenMode(openMode);
    return config;
}
/**
 * Registers in the analyzer the token stream components to be used for the given field.
 *
 * @param fieldName the name of the field.
 * @param tokenStream the components forwarded to the analyzer for that field.
 */
public void registerTokenizer(String fieldName, TokenStreamComponents tokenStream) {
    analyzer.registerTokenizer(fieldName, tokenStream);
}
/**
 * Commits the pending changes of the index writer (does nothing if there's no writer,
 * which is the case after {@link #dispose()}).
 *
 * @throws IOException if the commit fails.
 */
public void commit() throws IOException {
    if (this.writer == null) {
        return;
    }
    this.writer.commit();
}
/**
 * Commits and closes the writer and then closes the searcher manager.
 *
 * Failures in any of those steps are logged (not thrown) and both references are
 * set to null afterwards.
 */
public void dispose() {
    IndexWriter openWriter = this.writer;
    if (openWriter != null) {
        try {
            openWriter.commit();
        } catch (IOException e) {
            Log.log(e);
        }

        // Closed even if the commit failed.
        try {
            openWriter.close();
        } catch (Exception e) {
            Log.log(e);
        }
        this.writer = null;
    }

    SearcherManager openManager = this.searchManager;
    if (openManager != null) {
        try {
            openManager.close();
        } catch (Exception e) {
            Log.log(e);
        }
        this.searchManager = null;
    }
}
/**
 * Creates a document with one stored StringField (i.e.: not analyzed) for each entry
 * of the given map (key: field name, value: field value).
 */
private Document createDocument(Map<String, String> fieldsToIndex) {
    Document result = new Document();
    for (Entry<String, String> field : fieldsToIndex.entrySet()) {
        String name = field.getKey();
        String value = field.getValue();
        result.add(new StringField(name, value, Field.Store.YES));
    }
    return result;
}
/**
 * Creates a document with the stored (and not analyzed) fields which describe a file:
 * the portable path, the modified time, the file name without the extension and the
 * extension (empty strings are used when the path has no last segment/extension).
 *
 * @param additionalStringFields if not null, each entry is also added as a stored StringField
 * (those are added before the extension field).
 */
private Document createDocument(IPath filepath, long modifiedTime, Map<String, String> additionalStringFields) {
    // Note: a StringField is not analyzed.
    Document result = new Document();
    result.add(new StringField(IFields.FILEPATH, filepath.toPortableString(), Field.Store.YES));
    result.add(new StringField(IFields.MODIFIED_TIME, String.valueOf(modifiedTime), Field.Store.YES));

    String nameWithoutExtension = filepath.removeFileExtension().lastSegment();
    result.add(new StringField(IFields.FILENAME, nameWithoutExtension != null ? nameWithoutExtension : "",
            Field.Store.YES));

    if (additionalStringFields != null) {
        for (Entry<String, String> additional : additionalStringFields.entrySet()) {
            result.add(new StringField(additional.getKey(), additional.getValue(), Field.Store.YES));
        }
    }

    String extension = filepath.getFileExtension();
    result.add(new StringField(IFields.EXTENSION, extension != null ? extension : "", Field.Store.YES));
    return result;
}
/**
 * Indexes the given contents for the file without any additional string fields.
 *
 * @throws IOException if the document can't be added to the index.
 */
public void index(Path filepath, long modifiedTime, String general) throws IOException {
    index(filepath, modifiedTime, general, null);
}
/**
 * Indexes the given contents for the file in the IFields.GENERAL_CONTENTS field.
 *
 * @param additionalStringFields additional stored string fields for the document (may be null).
 * @throws IOException if the document can't be added to the index.
 */
public void index(Path filepath, long modifiedTime, String general, Map<String, String> additionalStringFields)
        throws IOException {
    index(filepath, modifiedTime, general, IFields.GENERAL_CONTENTS, additionalStringFields);
}
/**
 * Adds a document for the file to the index, with the given contents as a (not stored)
 * text field named fieldName. Does nothing if there's no writer.
 *
 * @param additionalStringFields additional stored string fields for the document (may be null).
 * @throws IOException if the document can't be added to the index.
 */
public void index(Path filepath, long modifiedTime, String general, String fieldName,
        Map<String, String> additionalStringFields) throws IOException {
    if (this.writer != null) {
        Document fileDocument = createDocument(filepath, modifiedTime, additionalStringFields);
        // Note: a TextField should be analyzed/normalized in Analyzer.createComponents(String)
        TextField contents = new TextField(fieldName, general, Field.Store.NO);
        fileDocument.add(contents);
        this.writer.addDocument(fileDocument);
    }
}
/**
 * Adds a document to the index with the given stored string fields plus a text field
 * (named fieldName) whose contents are obtained from the reader. Does nothing if
 * there's no writer.
 *
 * @throws IOException if the document can't be added to the index.
 */
public void index(Map<String, String> fieldsToIndex, Reader reader, String fieldName) throws IOException {
    if (this.writer != null) {
        Document fieldsDocument = createDocument(fieldsToIndex);
        // Note: a TextField should be analyzed/normalized in Analyzer.createComponents(String)
        TextField contents = new TextField(fieldName, reader);
        fieldsDocument.add(contents);
        this.writer.addDocument(fieldsDocument);
    }
}
/**
 * Adds a document for the file to the index with a text field (named fieldName) whose
 * contents are obtained from the reader. Does nothing if there's no writer.
 *
 * @throws IOException if the document can't be added to the index.
 */
public void index(IPath filepath, long modifiedTime, Reader reader, String fieldName) throws IOException {
    if (this.writer != null) {
        Document fileDocument = createDocument(filepath, modifiedTime, null);
        // Note: a TextField should be analyzed/normalized in Analyzer.createComponents(String)
        TextField contents = new TextField(fieldName, reader);
        fileDocument.add(contents);
        this.writer.addDocument(fileDocument);
    }
}
/**
 * We index based on what we want to search later on!
 *
 * The path given for the file must be the workspace-relative path. The project isn't
 * passed because the idea is having one index for each project.
 *
 * The scanner and the mapper work together: the scanner generates the tokens and the
 * mapper gives the name of the field in which the contents of each token are indexed
 * (tokens for which the mapper returns null as well as undefined/whitespace tokens
 * aren't indexed).
 *
 * Does nothing if there's no writer.
 *
 * @param tokenScanner must also implement IContentsScanner (it's cast to it to get the
 * contents of each token).
 * @throws IOException if the document can't be added to the index.
 */
public void index(Path filepath, long modifiedTime, ITokenScanner tokenScanner, IFields mapper)
        throws IOException {
    if (this.writer == null) {
        return;
    }
    IContentsScanner contents = (IContentsScanner) tokenScanner;
    Document fileDocument = createDocument(filepath, modifiedTime, null);
    FastStringBuffer tokenText = new FastStringBuffer();

    for (IToken token = tokenScanner.nextToken(); !token.isEOF(); token = tokenScanner.nextToken()) {
        if (token.isUndefined() || token.isWhitespace()) {
            continue;
        }
        // The buffer is reused: it's cleared and filled with the contents of the current token.
        contents.getContents(tokenScanner.getTokenOffset(), tokenScanner.getTokenLength(), tokenText.clear());

        String fieldName = mapper.getTokenFieldName(token);
        if (fieldName == null) {
            continue;
        }
        // Note: a TextField should be analyzed/normalized in Analyzer.createComponents(String)
        fileDocument.add(new TextField(fieldName, tokenText.toString(), Field.Store.NO));
    }
    this.writer.addDocument(fileDocument);
}
/**
 * Searches for documents which have exactly the given term in the given field
 * (without visiting the matched documents).
 *
 * @throws IOException if the search fails.
 */
public SearchResult searchExact(String string, String fieldName, boolean applyAllDeletes) throws IOException {
    return this.searchExact(string, fieldName, applyAllDeletes, null);
}
/**
 * Searches (with a TermQuery) for documents which have exactly the given term in the given field.
 *
 * @param visitor if not null, it's called for each matched document.
 * @param fieldsToLoad the fields to load in the documents passed to the visitor.
 * @throws IOException if the search fails.
 */
public SearchResult searchExact(String string, String fieldName, boolean applyAllDeletes, IDocumentsVisitor visitor,
        String... fieldsToLoad)
        throws IOException {
    Term exactTerm = new Term(fieldName, string);
    return search(new TermQuery(exactTerm), applyAllDeletes, visitor, fieldsToLoad);
}
/**
 * Wildcard search for a single field: the values are put in a map under the given field
 * name and the search is delegated to the map-based searchWildcard.
 *
 * @throws IOException if the search fails.
 */
public SearchResult searchWildcard(Set<String> string, String fieldName, boolean applyAllDeletes,
        IDocumentsVisitor visitor, Map<String, String> translateFields, String... fieldsToLoad)
        throws IOException {
    OrderedMap<String, Set<String>> singleField = new OrderedMap<>();
    singleField.put(fieldName, string);
    return this.searchWildcard(singleField, applyAllDeletes, visitor, translateFields, fieldsToLoad);
}
/**
 * Search where, for each field, a document matches if any of the given strings appear
 * (and a document must match for all the fields given).
 *
 * Accepts wildcards ('*' and '?') in the values. For fields in
 * IFields.FIELDS_NEGATED_WITH_EXCLAMATION, a value starting with '!' excludes the
 * documents which match the remainder of the value.
 *
 * @param fieldNameToValues the values to search for each field name.
 * @param translateFields if not null, field names found as keys are replaced by the mapped name.
 * @param visitor if not null, it's called for each matched document.
 * @param fieldsToLoad the fields to load in the documents passed to the visitor.
 * @throws RuntimeException if a value is empty or has only wildcards.
 * @throws IOException if the search fails.
 */
public SearchResult searchWildcard(OrderedMap<String, Set<String>> fieldNameToValues, boolean applyAllDeletes,
        IDocumentsVisitor visitor, Map<String, String> translateFields, String... fieldsToLoad)
        throws IOException {
    Builder allFieldsBuilder = new BooleanQuery.Builder();

    for (Entry<String, Set<String>> fieldEntry : fieldNameToValues.entrySet()) {
        String fieldName = fieldEntry.getKey();
        if (translateFields != null) {
            String translated = translateFields.get(fieldName);
            if (translated != null) {
                fieldName = translated;
            }
        }

        Builder singleFieldBuilder = new BooleanQuery.Builder();
        boolean onlyNegations = true;
        for (String value : fieldEntry.getValue()) {
            if (value.length() == 0) {
                throw new RuntimeException("Unable to create term for searching empty string.");
            }

            // A leading '!' is only a negation for the fields which support it (i.e.: paths).
            boolean negate = value.startsWith("!") && IFields.FIELDS_NEGATED_WITH_EXCLAMATION.contains(fieldName);
            if (negate) {
                value = value.substring(1);
                if (value.length() == 0) {
                    // Only a single '!' for the negate.
                    continue;
                }
            }

            boolean hasWildcard = value.indexOf('*') != -1 || value.indexOf('?') != -1;
            if (hasWildcard && StringUtils.containsOnlyWildCards(value)) {
                throw new RuntimeException("Unable to create term for searching only wildcards: " + value);
            }

            Term term = new Term(fieldName, value);
            Query valueQuery = hasWildcard ? new WildcardQuery(term) : new TermQuery(term);
            BooleanClause.Occur occur = negate ? BooleanClause.Occur.MUST_NOT : BooleanClause.Occur.SHOULD;
            singleFieldBuilder.add(valueQuery, occur);

            onlyNegations = onlyNegations && negate;
        }

        if (singleFieldBuilder.build().clauses().size() == 0) {
            // Nothing to search in this field.
            continue;
        }
        if (onlyNegations) {
            // If all are negations, we actually have to add one which would
            // match all to remove the negations.
            singleFieldBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
        }
        allFieldsBuilder.add(singleFieldBuilder.build(), BooleanClause.Occur.MUST);
    }

    BooleanQuery fullQuery = allFieldsBuilder.build();
    if (DEBUG) {
        System.out.println("Searching: " + fullQuery);
    }
    return search(fullQuery, applyAllDeletes, visitor, fieldsToLoad);
}
/**
 * Searches for documents with terms matching the given regular expression in the given
 * field (without visiting the matched documents).
 *
 * @throws IOException if the search fails.
 */
public SearchResult searchRegexp(String string, String fieldName, boolean applyAllDeletes) throws IOException {
    return this.searchRegexp(string, fieldName, applyAllDeletes, null);
}
/**
 * Searches (with a RegexpQuery) for documents with terms matching the given regular
 * expression in the given field.
 *
 * @param visitor if not null, it's called for each matched document.
 * @param fieldsToLoad the fields to load in the documents passed to the visitor.
 * @throws IOException if the search fails.
 */
public SearchResult searchRegexp(String string, String fieldName,
        boolean applyAllDeletes, IDocumentsVisitor visitor, String... fieldsToLoad) throws IOException {
    Term regexpTerm = new Term(fieldName, string);
    return search(new RegexpQuery(regexpTerm), applyAllDeletes, visitor, fieldsToLoad);
}
/**
 * A matched document along with its id (this is what's passed to the documents visitor).
 */
public static class DocumentInfo {

    private final Document document;

    private final int documentId;

    /**
     * @param document the document with the loaded fields.
     * @param doc the id of the document.
     */
    public DocumentInfo(Document document, int doc) {
        this.document = document;
        this.documentId = doc;
    }

    /**
     * @return what the wrapped document returns for the given field name.
     */
    public String get(String field) {
        return document.get(field);
    }

    /**
     * @return the id given in the constructor.
     */
    public int getDocId() {
        return documentId;
    }
}
public static interface IDocumentsVisitor {
void visit(DocumentInfo documentInfo);
}
/**
 * Visits all the documents in the index (a reader is opened on top of the writer with
 * all the deletes applied and a match-all query is executed in it).
 *
 * Does nothing if there's no writer (i.e.: after {@link #dispose()}), as is the case
 * in the methods which index contents.
 *
 * @param visitor called once for each document in the index.
 * @param fields the fields to be loaded.
 * @throws IOException if the reader can't be opened or if the search/document loading fails.
 */
public void visitAllDocs(IDocumentsVisitor visitor, String... fields) throws IOException {
    if (this.writer == null) {
        // Already disposed: there's nothing to visit.
        return;
    }
    boolean applyAllDeletes = true;
    try (IndexReader reader = DirectoryReader.open(writer, applyAllDeletes, false)) {
        IndexSearcher searcher = searcherFactory.newSearcher(reader, null);
        TopDocs docs = searcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            // A new field visitor is needed for each document (it holds the document being loaded).
            DocumentStoredFieldVisitor fieldVisitor = new DocumentStoredFieldVisitor(fields);
            reader.document(scoreDoc.doc, fieldVisitor);
            visitor.visit(new DocumentInfo(fieldVisitor.getDocument(), scoreDoc.doc));
        }
    }
}
/**
 * Executes the given query, optionally visiting the matched documents.
 *
 * The writer is committed before the search (a failure in the commit is only logged)
 * and the search is done in a reader opened on top of the writer.
 *
 * @param query the query to be executed.
 * @param applyAllDeletes passed to DirectoryReader.open when opening the reader.
 * @param visitor if not null, it's called once for each matched document.
 * @param fields the fields to be loaded in the documents passed to the visitor.
 * @return the matches found (limited to the value of {@link #getMaxMatches()}). If there's no
 * writer (i.e.: after {@link #dispose()}), a result without any match is returned.
 * @throws IOException if the reader can't be opened or if the search/document loading fails.
 */
public SearchResult search(Query query, boolean applyAllDeletes, IDocumentsVisitor visitor, String... fields)
        throws IOException {
    if (this.writer == null) {
        // Already disposed: without this check a NullPointerException would be logged for the
        // commit and another one would be thrown when opening the reader.
        return new SearchResult(new ScoreDoc[0]);
    }
    try {
        this.writer.commit();
    } catch (Exception e) {
        Log.log(e);
    }
    try (IndexReader reader = DirectoryReader.open(writer, applyAllDeletes, false)) {
        IndexSearcher searcher = searcherFactory.newSearcher(reader, null);
        ScoreDoc[] scoreDocs = searcher.search(query, maxMatches).scoreDocs;
        if (visitor != null) {
            for (ScoreDoc scoreDoc : scoreDocs) {
                // A new field visitor is needed for each document (it holds the document being loaded).
                DocumentStoredFieldVisitor fieldVisitor = new DocumentStoredFieldVisitor(fields);
                reader.document(scoreDoc.doc, fieldVisitor);
                visitor.visit(new DocumentInfo(fieldVisitor.getDocument(), scoreDoc.doc));
            }
        }
        return new SearchResult(scoreDocs);
    }
}
/**
 * Removes from the index the documents which have any of the given values in the related field.
 *
 * Does nothing if there are no values to remove or if there's no writer (i.e.: after
 * {@link #dispose()}), as is the case in the methods which index contents.
 *
 * @param fieldToValuesToRemove maps the field name to the values which identify the
 * documents to be removed.
 * @throws IOException if the deletion fails.
 */
public void removeDocs(Map<String, Collection<String>> fieldToValuesToRemove) throws IOException {
    if (this.writer == null) {
        // Already disposed: there's nothing to remove from.
        return;
    }
    ArrayList<Term> terms = new ArrayList<>();
    for (Entry<String, Collection<String>> entry : fieldToValuesToRemove.entrySet()) {
        String fieldName = entry.getKey();
        for (String value : entry.getValue()) {
            terms.add(new Term(fieldName, value));
        }
    }
    if (terms.isEmpty()) {
        return;
    }
    this.writer.deleteDocuments(terms.toArray(new Term[0]));
}
/**
 * Sets the maximum number of matches requested from the searcher in search()
 * (visitAllDocs() doesn't use it).
 *
 * @param maxMatches the new limit (it's stored as is, without any validation).
 */
public void setMaxMatches(int maxMatches) {
this.maxMatches = maxMatches;
}
/**
 * @return the maximum number of matches requested from the searcher in search()
 * (Integer.MAX_VALUE unless changed through setMaxMatches).
 */
public int getMaxMatches() {
    return this.maxMatches;
}
/**
 * Manual test-drive: opens the index at a hard-coded location, runs a regexp search
 * which matches anything in the general contents and prints the number of matches and
 * the time taken.
 *
 * The index is always disposed at the end (so the writer is committed/closed even if
 * the search fails).
 */
public static void main(String[] args) throws IOException {
    File f = new File("x:\\index");
    final IndexApi indexApi = new IndexApi(f, true);
    try {
        // Callback to index .py files (only used when the directory visit below is uncommented).
        ICallback<Object, java.nio.file.Path> onFile = new ICallback<Object, java.nio.file.Path>() {

            @Override
            public Object call(java.nio.file.Path path) {
                String string = path.toString();
                if (string.endsWith(".py")) {
                    // The charset is explicit so that the indexed contents don't depend on the
                    // platform default encoding.
                    try (SeekableByteChannel sbc = Files.newByteChannel(path);
                            InputStream in = Channels.newInputStream(sbc);
                            Reader reader = new BufferedReader(
                                    new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
                        IPath path2 = Path.fromOSString(string);
                        indexApi.index(path2, FileUtils.lastModified(path.toFile()),
                                reader, IFields.GENERAL_CONTENTS);
                    } catch (Exception e) {
                        Log.log("Error parsing: " + path, e);
                    }
                }
                return null;
            }
        };

        Timer timer = new Timer();
        // FileUtils.visitDirectory(new File("x:\\etk"), true, onFile);
        // indexApi.commit();
        indexApi.setMaxMatches(Integer.MAX_VALUE);
        SearchResult searchResult = indexApi.searchRegexp(".*", IFields.GENERAL_CONTENTS, true);
        System.out.println("Matched: " + searchResult.getNumberOfDocumentMatches());
        timer.printDiff("Total time");
        // indexApi.index(filepath, modifiedTime, general);
    } finally {
        indexApi.dispose();
    }
}
}