IndexApi.java example

Explorer
Pydev-master
- plugins
/******************************************************************************
* Copyright (C) 2015  Fabio Zadrozny and others
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*     Fabio Zadrozny <fabiofz@gmail.com>    - initial API and implementation
******************************************************************************/
package org.python.pydev.shared_core.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;
import org.eclipse.jface.text.rules.IToken;
import org.eclipse.jface.text.rules.ITokenScanner;
import org.python.pydev.shared_core.callbacks.ICallback;
import org.python.pydev.shared_core.io.FileUtils;
import org.python.pydev.shared_core.log.Log;
import org.python.pydev.shared_core.partitioner.IContentsScanner;
import org.python.pydev.shared_core.string.FastStringBuffer;
import org.python.pydev.shared_core.string.StringUtils;
import org.python.pydev.shared_core.structure.OrderedMap;
import org.python.pydev.shared_core.utils.Timer;

public class IndexApi {

    /** When true, the wildcard search prints the query it built to System.out before running it. */
    public static final boolean DEBUG = false;

    // Suffix appended to the path of file-system based index directories whose path does not
    // already end with it (see the protected constructor).
    // NOTE(review): presumably this keeps indexes written by other Lucene versions apart -- confirm.
    private static final String lucene6dot1Suffix = "L6dot1";

    // Directory actually used by the writer (may differ from the one given to the constructor).
    private final Directory indexDir;
    // Created in init(boolean) and reset to null in dispose().
    private IndexWriter writer;
    // Created in init(boolean) on top of the writer and closed in dispose().
    private SearcherManager searchManager;
    // Used to create the IndexSearcher instances in search(...) and visitAllDocs(...).
    private SearcherFactory searcherFactory;
    // Maximum number of hits requested in search(...).
    private int maxMatches = Integer.MAX_VALUE;
    // Analyzer given to the writer; tokenizers may be registered in it through registerTokenizer(...).
    private CodeAnalyzer analyzer;
    // Object handed out by getLock() so that clients can do their own synchronization.
    private final Object lock = new Object();

    /**
     * Creates the API on top of the given Lucene directory and initializes it.
     *
     * If the directory is file-system based and its absolute path does not end with the
     * suffix used by this class, a directory at the same path with the suffix appended
     * is opened and used instead of the one passed.
     *
     * @param indexDirObj must be an org.apache.lucene.store.Directory.
     * @param applyAllDeletes forwarded to {@link #init(boolean)}.
     * @throws IOException if the directory or the index cannot be opened.
     */
    protected IndexApi(Object /*Directory*/ indexDirObj, boolean applyAllDeletes) throws IOException {
        // Note: indexDirObj must actually be a org.apache.lucene.store.Directory (but it's declared
        // as Object so that the Lucene type isn't part of the public API -- that way clients don't
        // need to depend on it as they'll usually use the other constructor, which receives a File).
        Directory target = (Directory) indexDirObj;
        if (target instanceof FSDirectory) {
            String absolutePath = ((FSDirectory) target).getDirectory().toFile().getAbsolutePath();
            if (!absolutePath.endsWith(lucene6dot1Suffix)) {
                // Switch to the sibling location which carries the suffix.
                target = FSDirectory.open(new File(absolutePath + lucene6dot1Suffix).toPath());
            }
        }
        this.indexDir = target;
        init(applyAllDeletes);
    }

    /**
     * Gives access to the object which external users may synchronize on. The methods in
     * this API are not synchronized by themselves, so, when more than one thread may use the
     * same instance in a given use-case, this lock should be used for synchronization.
     *
     * @return the lock object of this instance.
     */
    public Object getLock() {
        return this.lock;
    }

    /**
     * Opens a file-system based Lucene directory at the given location and delegates to the
     * constructor which receives the directory (which, in turn, switches to the same path with
     * the suffix used by this class appended if the path doesn't end with it already).
     *
     * @param indexDir the location of the index in the file system.
     * @param applyAllDeletes forwarded to the other constructor.
     * @throws IOException if the directory or the index cannot be opened.
     */
    public IndexApi(File indexDir, boolean applyAllDeletes) throws IOException {
        this(FSDirectory.open(indexDir.toPath()), applyAllDeletes);
    }

    /**
     * Creates the analyzer, the index writer and the searcher infrastructure on top of the
     * index directory.
     *
     * The writer is first opened in CREATE_OR_APPEND mode; if that fails with an IOException,
     * the failure is logged and a second attempt is made in CREATE mode.
     *
     * @param applyAllDeletes forwarded to the SearcherManager constructor.
     * @throws IOException if the writer can't be created in CREATE mode either or if the
     * searcher manager can't be created.
     */
    public void init(boolean applyAllDeletes) throws IOException {
        this.analyzer = new CodeAnalyzer();
        try {
            writer = new IndexWriter(this.indexDir, newWriterConfig(OpenMode.CREATE_OR_APPEND));
        } catch (IOException e) {
            Log.log(e);
            // A fresh config is required for the retry: an IndexWriterConfig gets bound to the
            // IndexWriter it was handed to and Lucene refuses to share it with another writer
            // (so, reusing the config of the failed attempt would make the fallback always fail).
            writer = new IndexWriter(this.indexDir, newWriterConfig(OpenMode.CREATE));
        }

        searcherFactory = new SearcherFactory();
        searchManager = new SearcherManager(writer, applyAllDeletes, false, searcherFactory);
    }

    /**
     * Builds a new writer configuration (commit-on-close enabled) with the given open mode.
     */
    private IndexWriterConfig newWriterConfig(OpenMode openMode) {
        IndexWriterConfig config = new IndexWriterConfig(this.analyzer);
        config.setCommitOnClose(true);
        config.setOpenMode(openMode);
        return config;
    }

    /**
     * Registers, in the analyzer used by this index, the token stream components to be
     * used for the given field.
     *
     * @param fieldName the name of the field.
     * @param tokenStream the components to be registered for that field.
     */
    public void registerTokenizer(String fieldName, TokenStreamComponents tokenStream) {
        analyzer.registerTokenizer(fieldName, tokenStream);
    }

    /**
     * Commits the pending changes of the writer (does nothing if there's no writer, which
     * is the case after {@link #dispose()}).
     *
     * @throws IOException if the commit fails.
     */
    public void commit() throws IOException {
        if (this.writer == null) {
            return;
        }
        this.writer.commit();
    }

    /**
     * Commits the pending changes and closes the writer and the searcher manager.
     *
     * This is a best-effort operation: failures are logged (not thrown) and the related
     * fields are reset to null afterwards, so, calling it more than once is harmless.
     * After disposing, the index(...) methods return without doing anything.
     */
    public void dispose() {
        if (this.writer != null) {
            try {
                this.writer.commit();
            } catch (Exception e) {
                // Unchecked exceptions are also caught here so that a failed commit never
                // prevents the writer and the searcher manager from being closed below.
                Log.log(e);
            }
            try {
                this.writer.close();
            } catch (Exception e) {
                Log.log(e);
            }
            this.writer = null;
        }

        if (this.searchManager != null) {
            try {
                this.searchManager.close();
            } catch (Exception e) {
                Log.log(e);
            }
            this.searchManager = null;
        }
    }

    /**
     * Creates a document which has one stored (and not analyzed) string field for each
     * entry of the given map.
     *
     * @param fieldsToIndex maps the field name to the field value.
     * @return the document created.
     */
    private Document createDocument(Map<String, String> fieldsToIndex) {
        Document document = new Document();
        for (Entry<String, String> nameAndValue : fieldsToIndex.entrySet()) {
            String name = nameAndValue.getKey();
            String value = nameAndValue.getValue();
            document.add(new StringField(name, value, Field.Store.YES));
        }
        return document;
    }

    /**
     * Creates a document with the stored string fields which describe a file: its portable
     * path, modification time, name (last segment without the extension) and extension,
     * plus any additional string field passed. The name and the extension are stored as an
     * empty string when the path has none.
     *
     * @param filepath the path of the file being indexed.
     * @param modifiedTime the modification time to be stored (as a string).
     * @param additionalStringFields optional (may be null) field name to value mapping.
     * @return the document created.
     */
    private Document createDocument(IPath filepath, long modifiedTime, Map<String, String> additionalStringFields) {
        String nameWithoutExtension = filepath.removeFileExtension().lastSegment();
        String extension = filepath.getFileExtension();

        // Note: the contents of a StringField are not analyzed.
        Document document = new Document();
        document.add(new StringField(IFields.FILEPATH, filepath.toPortableString(), Field.Store.YES));
        document.add(new StringField(IFields.MODIFIED_TIME, String.valueOf(modifiedTime), Field.Store.YES));
        document.add(new StringField(IFields.FILENAME, nameWithoutExtension != null ? nameWithoutExtension : "",
                Field.Store.YES));

        if (additionalStringFields != null) {
            for (Entry<String, String> nameAndValue : additionalStringFields.entrySet()) {
                document.add(new StringField(nameAndValue.getKey(), nameAndValue.getValue(), Field.Store.YES));
            }
        }

        // The extension is kept as the last field (after the additional ones).
        document.add(new StringField(IFields.EXTENSION, extension != null ? extension : "", Field.Store.YES));
        return document;
    }

    /**
     * Indexes the given contents for the file without any additional string field.
     *
     * @param filepath the path of the file being indexed.
     * @param modifiedTime the modification time to be stored for the file.
     * @param general the contents to be indexed.
     * @throws IOException if the document can't be added to the index.
     */
    public void index(Path filepath, long modifiedTime, String general) throws IOException {
        Map<String, String> noAdditionalFields = null;
        this.index(filepath, modifiedTime, general, noAdditionalFields);
    }

    /**
     * Indexes the given contents for the file in the field for the general contents.
     *
     * @param filepath the path of the file being indexed.
     * @param modifiedTime the modification time to be stored for the file.
     * @param general the contents to be indexed.
     * @param additionalStringFields optional (may be null) stored string fields to add to the document.
     * @throws IOException if the document can't be added to the index.
     */
    public void index(Path filepath, long modifiedTime, String general, Map<String, String> additionalStringFields)
            throws IOException {
        String fieldName = IFields.GENERAL_CONTENTS;
        this.index(filepath, modifiedTime, general, fieldName, additionalStringFields);
    }

    /**
     * Adds to the index a document for the file in which the given contents are indexed
     * (analyzed, but not stored) in the given field. Does nothing if there's no writer.
     *
     * @param filepath the path of the file being indexed.
     * @param modifiedTime the modification time to be stored for the file.
     * @param general the contents to be indexed.
     * @param fieldName the name of the field in which the contents are indexed.
     * @param additionalStringFields optional (may be null) stored string fields to add to the document.
     * @throws IOException if the document can't be added to the index.
     */
    public void index(Path filepath, long modifiedTime, String general, String fieldName,
            Map<String, String> additionalStringFields) throws IOException {
        if (this.writer != null) {
            Document document = createDocument(filepath, modifiedTime, additionalStringFields);

            //Note: TextField should be analyzed/normalized in Analyzer.createComponents(String)
            TextField contentsField = new TextField(fieldName, general, Field.Store.NO);
            document.add(contentsField);

            this.writer.addDocument(document);
        }
    }

    /**
     * Adds to the index a document which has the given string fields plus a text field
     * whose contents are obtained from the reader. Does nothing if there's no writer.
     *
     * @param fieldsToIndex maps the name to the value of the stored string fields.
     * @param reader provides the contents to be indexed.
     * @param fieldName the name of the field in which the contents of the reader are indexed.
     * @throws IOException if the document can't be added to the index.
     */
    public void index(Map<String, String> fieldsToIndex, Reader reader, String fieldName) throws IOException {
        if (this.writer != null) {
            Document document = createDocument(fieldsToIndex);

            //Note: TextField should be analyzed/normalized in Analyzer.createComponents(String)
            TextField contentsField = new TextField(fieldName, reader);
            document.add(contentsField);

            this.writer.addDocument(document);
        }
    }

    /**
     * Adds to the index a document for the file with a text field whose contents are
     * obtained from the reader. Does nothing if there's no writer.
     *
     * @param filepath the path of the file being indexed.
     * @param modifiedTime the modification time to be stored for the file.
     * @param reader provides the contents to be indexed.
     * @param fieldName the name of the field in which the contents of the reader are indexed.
     * @throws IOException if the document can't be added to the index.
     */
    public void index(IPath filepath, long modifiedTime, Reader reader, String fieldName) throws IOException {
        if (this.writer != null) {
            Document document = createDocument(filepath, modifiedTime, null);

            //Note: TextField should be analyzed/normalized in Analyzer.createComponents(String)
            TextField contentsField = new TextField(fieldName, reader);
            document.add(contentsField);

            this.writer.addDocument(document);
        }
    }

    /**
     * Indexes the tokens generated by a scanner (what's indexed is based on what we want
     * to search later on).
     *
     * The path for the file has to be given (workspace-relative path). The project is not
     * expected to be passed because the idea is having one index for each project.
     *
     * The scanner and the mapper work together: the scanner generates the tokens and the
     * mapper gives the name of the field in which the contents of each token are indexed
     * (tokens for which the mapper gives no field name are not indexed, as well as
     * undefined and whitespace tokens).
     *
     * Does nothing if there's no writer.
     *
     * @param tokenScanner the scanner (which must also be an IContentsScanner).
     */
    public void index(Path filepath, long modifiedTime, ITokenScanner tokenScanner, IFields mapper)
            throws IOException {
        if (this.writer == null) {
            return;
        }
        IContentsScanner contentsProvider = (IContentsScanner) tokenScanner;
        Document document = createDocument(filepath, modifiedTime, null);

        FastStringBuffer tokenContents = new FastStringBuffer();
        for (IToken token = tokenScanner.nextToken(); !token.isEOF(); token = tokenScanner.nextToken()) {
            if (token.isUndefined() || token.isWhitespace()) {
                continue;
            }
            int tokenOffset = tokenScanner.getTokenOffset();
            int tokenLength = tokenScanner.getTokenLength();
            contentsProvider.getContents(tokenOffset, tokenLength, tokenContents.clear());

            String fieldName = mapper.getTokenFieldName(token);
            if (fieldName == null) {
                continue;
            }
            //Note: TextField should be analyzed/normalized in Analyzer.createComponents(String)
            document.add(new TextField(fieldName, tokenContents.toString(), Field.Store.NO));
        }

        this.writer.addDocument(document);
    }

    /**
     * Searches for the documents which have exactly the given term in the given field
     * (without visiting the documents found).
     *
     * @param string the term to be searched.
     * @param fieldName the field in which the term is searched.
     * @param applyAllDeletes forwarded to the reader opened for the search.
     * @return the result of the search.
     * @throws IOException if the index can't be read.
     */
    public SearchResult searchExact(String string, String fieldName, boolean applyAllDeletes) throws IOException {
        IDocumentsVisitor noVisitor = null;
        return searchExact(string, fieldName, applyAllDeletes, noVisitor);
    }

    /**
     * Searches for the documents which have exactly the given term in the given field.
     *
     * @param string the term to be searched.
     * @param fieldName the field in which the term is searched.
     * @param applyAllDeletes forwarded to the reader opened for the search.
     * @param visitor optional (may be null) visitor called for each document found.
     * @param fieldsToLoad the names of the stored fields to be loaded for the visitor.
     * @return the result of the search.
     * @throws IOException if the index can't be read.
     */
    public SearchResult searchExact(String string, String fieldName, boolean applyAllDeletes, IDocumentsVisitor visitor,
            String... fieldsToLoad)
            throws IOException {
        Term exactTerm = new Term(fieldName, string);
        return search(new TermQuery(exactTerm), applyAllDeletes, visitor, fieldsToLoad);
    }

    /**
     * Wildcard search in a single field: convenience for the version which receives the
     * mapping from the field name to the values to be searched.
     *
     * @param string the values to be searched (may have wildcards).
     * @param fieldName the field in which the values are searched.
     * @param applyAllDeletes forwarded to the reader opened for the search.
     * @param visitor optional (may be null) visitor called for each document found.
     * @param translateFields optional (may be null) mapping used to translate field names.
     * @param fieldsToLoad the names of the stored fields to be loaded for the visitor.
     * @return the result of the search.
     * @throws IOException if the index can't be read.
     */
    public SearchResult searchWildcard(Set<String> string, String fieldName, boolean applyAllDeletes,
            IDocumentsVisitor visitor, Map<String, String> translateFields, String... fieldsToLoad)
            throws IOException {
        OrderedMap<String, Set<String>> singleFieldToValues = new OrderedMap<>();
        singleFieldToValues.put(fieldName, string);
        return searchWildcard(singleFieldToValues, applyAllDeletes, visitor, translateFields, fieldsToLoad);
    }

    /**
     * Search where, for each field, we match if any of the given strings appear (a document
     * has to match the requirements of all the fields passed).
     *
     * Accepts wildcards ('*' and '?') in the strings. For the fields which accept it, a string
     * which starts with '!' is a negation (documents which match it are excluded).
     *
     * @param fieldNameToValues maps the field name to the values to be searched in that field.
     * @param applyAllDeletes forwarded to the reader opened for the search.
     * @param visitor optional (may be null) visitor called for each document found.
     * @param translateFields optional (may be null) mapping from the field name given to the
     * field name which should actually be searched.
     * @param fieldsToLoad the names of the stored fields to be loaded for the visitor.
     * @return the result of the search.
     * @throws RuntimeException if a value is empty or has only wildcards.
     * @throws IOException if the index can't be read.
     */
    public SearchResult searchWildcard(OrderedMap<String, Set<String>> fieldNameToValues, boolean applyAllDeletes,
            IDocumentsVisitor visitor, Map<String, String> translateFields, String... fieldsToLoad)
            throws IOException {
        Builder allFieldsBuilder = new BooleanQuery.Builder();
        for (Entry<String, Set<String>> fieldAndValues : fieldNameToValues.entrySet()) {
            String fieldName = fieldAndValues.getKey();
            if (translateFields != null) {
                String translated = translateFields.get(fieldName);
                if (translated != null) {
                    fieldName = translated;
                }
            }

            Builder singleFieldBuilder = new BooleanQuery.Builder();
            boolean foundNonNegated = false;
            for (String value : fieldAndValues.getValue()) {
                if (value.length() == 0) {
                    throw new RuntimeException("Unable to create term for searching empty string.");
                }
                // A leading '!' is only a negation in the fields which support it (i.e.: paths).
                boolean negate = value.startsWith("!")
                        && IFields.FIELDS_NEGATED_WITH_EXCLAMATION.contains(fieldName);
                String searchFor = negate ? value.substring(1) : value;
                if (searchFor.length() == 0) {
                    // Only a single '!' for the negate.
                    continue;
                }

                BooleanClause.Occur occur = negate ? BooleanClause.Occur.MUST_NOT : BooleanClause.Occur.SHOULD;
                boolean hasWildcard = searchFor.indexOf('*') != -1 || searchFor.indexOf('?') != -1;
                if (hasWildcard) {
                    if (StringUtils.containsOnlyWildCards(searchFor)) {
                        throw new RuntimeException("Unable to create term for searching only wildcards: " + searchFor);
                    }
                    singleFieldBuilder.add(new WildcardQuery(new Term(fieldName, searchFor)), occur);
                } else {
                    singleFieldBuilder.add(new TermQuery(new Term(fieldName, searchFor)), occur);
                }
                if (!negate) {
                    foundNonNegated = true;
                }
            }

            if (singleFieldBuilder.build().clauses().size() == 0) {
                // Nothing to be searched in this field.
                continue;
            }
            if (!foundNonNegated) {
                // If all are negations, we actually have to add one which would
                // match all to remove the negations.
                singleFieldBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
            }
            allFieldsBuilder.add(singleFieldBuilder.build(), BooleanClause.Occur.MUST);
        }

        BooleanQuery fullQuery = allFieldsBuilder.build();
        if (DEBUG) {
            System.out.println("Searching: " + fullQuery);
        }
        return search(fullQuery, applyAllDeletes, visitor, fieldsToLoad);
    }

    /**
     * Searches for the documents in which the given field matches the given regular
     * expression (without visiting the documents found).
     *
     * @param string the regular expression.
     * @param fieldName the field to be searched.
     * @param applyAllDeletes forwarded to the reader opened for the search.
     * @return the result of the search.
     * @throws IOException if the index can't be read.
     */
    public SearchResult searchRegexp(String string, String fieldName, boolean applyAllDeletes) throws IOException {
        IDocumentsVisitor noVisitor = null;
        return searchRegexp(string, fieldName, applyAllDeletes, noVisitor);
    }

    /**
     * Searches for the documents in which the given field matches the given regular expression.
     *
     * @param string the regular expression.
     * @param fieldName the field to be searched.
     * @param applyAllDeletes forwarded to the reader opened for the search.
     * @param visitor optional (may be null) visitor called for each document found.
     * @param fieldsToLoad the names of the stored fields to be loaded for the visitor.
     * @return the result of the search.
     * @throws IOException if the index can't be read.
     */
    public SearchResult searchRegexp(String string, String fieldName,
            boolean applyAllDeletes, IDocumentsVisitor visitor, String... fieldsToLoad) throws IOException {
        Term regexpTerm = new Term(fieldName, string);
        return search(new RegexpQuery(regexpTerm), applyAllDeletes, visitor, fieldsToLoad);
    }

    /**
     * What's given to the visitors for each document found: the document (with the fields
     * which were loaded) along with its id in the index.
     */
    public static class DocumentInfo {

        private final Document document;
        private final int documentId;

        /**
         * @param document the document loaded.
         * @param doc the id of the document.
         */
        public DocumentInfo(Document document, int doc) {
            this.documentId = doc;
            this.document = document;
        }

        /**
         * @return the id of the document (as passed in the constructor).
         */
        public int getDocId() {
            return documentId;
        }

        /**
         * @param field the name of the field.
         * @return what the document gives as the value for the given field.
         */
        public String get(String field) {
            return document.get(field);
        }

    }

    public static interface IDocumentsVisitor {

        void visit(DocumentInfo documentInfo);

    }

    /**
     * Visits all the documents in the index (considering changes not yet committed in the
     * writer, as the reader used is opened from the writer).
     *
     * Does nothing if there's no writer (which is the case after {@link #dispose()}), as is
     * the case for the index(...) methods.
     *
     * @param visitor called once for each document.
     * @param fields the fields to be loaded.
     * @throws IOException if the index can't be read.
     */
    public void visitAllDocs(IDocumentsVisitor visitor, String... fields) throws IOException {
        IndexWriter currentWriter = this.writer;
        if (currentWriter == null) {
            // Disposed: there's nothing to visit (previously this would fail with a NullPointerException).
            return;
        }
        boolean applyAllDeletes = true;
        try (IndexReader reader = DirectoryReader.open(currentWriter, applyAllDeletes, false)) {

            IndexSearcher searcher = searcherFactory.newSearcher(reader, null);
            Query query = new MatchAllDocsQuery();
            TopDocs docs = searcher.search(query, Integer.MAX_VALUE);
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                // A new field visitor is needed for each document (it holds the document being loaded).
                DocumentStoredFieldVisitor fieldVisitor = new DocumentStoredFieldVisitor(fields);
                reader.document(scoreDoc.doc, fieldVisitor);
                Document document = fieldVisitor.getDocument();
                visitor.visit(new DocumentInfo(document, scoreDoc.doc));
            }
        }
    }

    /**
     * Runs the given query (after trying to commit the pending changes of the writer) and
     * gives the documents found, up to the maximum number of matches configured.
     *
     * If there's no writer (which is the case after {@link #dispose()}), a result without
     * any match is returned, in line with the index(...) methods, which also do nothing
     * in that case.
     *
     * @param query the query to be run.
     * @param applyAllDeletes forwarded to the reader opened from the writer for the search.
     * @param visitor optional (may be null) visitor called for each document found.
     * @param fields the names of the stored fields to be loaded for the visitor.
     * @return the result of the search.
     * @throws IOException if the index can't be read.
     */
    public SearchResult search(Query query, boolean applyAllDeletes, IDocumentsVisitor visitor, String... fields)
            throws IOException {
        IndexWriter currentWriter = this.writer;
        if (currentWriter == null) {
            // Disposed: previously a NullPointerException was logged for the commit
            // and then another one was thrown when opening the reader.
            return new SearchResult(new ScoreDoc[0]);
        }
        try {
            currentWriter.commit();
        } catch (Exception e) {
            // Best-effort: a search is still possible if the commit failed.
            Log.log(e);
        }
        try (IndexReader reader = DirectoryReader.open(currentWriter, applyAllDeletes, false)) {
            IndexSearcher searcher = searcherFactory.newSearcher(reader, null);

            TopDocs search = searcher.search(query, maxMatches);
            ScoreDoc[] scoreDocs = search.scoreDocs;

            if (visitor != null) {
                for (ScoreDoc scoreDoc : scoreDocs) {
                    // A new field visitor is needed for each document (it holds the document being loaded).
                    DocumentStoredFieldVisitor fieldVisitor = new DocumentStoredFieldVisitor(fields);
                    reader.document(scoreDoc.doc, fieldVisitor);
                    Document document = fieldVisitor.getDocument();
                    visitor.visit(new DocumentInfo(document, scoreDoc.doc));
                }
            }

            return new SearchResult(scoreDocs);
        }
    }

    /**
     * Removes from the index the documents which have any of the given values in the
     * related field.
     *
     * Does nothing if there are no values to remove or if there's no writer (which is the
     * case after {@link #dispose()}), as is the case for the index(...) methods.
     *
     * @param fieldToValuesToRemove maps the field name to the values which identify the
     * documents to be removed.
     * @throws IOException if the documents can't be removed.
     */
    public void removeDocs(Map<String, Collection<String>> fieldToValuesToRemove) throws IOException {
        ArrayList<Term> terms = new ArrayList<>();
        for (Entry<String, Collection<String>> entry : fieldToValuesToRemove.entrySet()) {
            String fieldName = entry.getKey();
            for (String value : entry.getValue()) {
                terms.add(new Term(fieldName, value));
            }
        }
        if (terms.isEmpty()) {
            return;
        }

        IndexWriter currentWriter = this.writer;
        if (currentWriter == null) {
            // Disposed: nothing to remove from (previously this would fail with a NullPointerException).
            return;
        }
        currentWriter.deleteDocuments(terms.toArray(new Term[0]));
    }

    /**
     * @param maxMatches the maximum number of matches to be requested in the searches done.
     */
    public void setMaxMatches(final int maxMatches) {
        this.maxMatches = maxMatches;
    }

    /**
     * @return the maximum number of matches requested in the searches done.
     */
    public int getMaxMatches() {
        return this.maxMatches;
    }

    /**
     * Manual test entry point: opens the index at a hard-coded location, searches for anything
     * in the general contents and prints the number of matches and the time taken.
     *
     * The index is always disposed at the end so that the writer (and its lock on the index
     * directory) is properly released.
     */
    public static void main(String[] args) throws IOException {
        File f = new File("x:\\index");
        final IndexApi indexApi = new IndexApi(f, true);
        try {
            // Callback which indexes the contents of .py files (only used when the directory
            // visiting below is uncommented).
            ICallback<Object, java.nio.file.Path> onFile = new ICallback<Object, java.nio.file.Path>() {

                @Override
                public Object call(java.nio.file.Path path) {
                    String string = path.toString();
                    if (string.endsWith(".py")) {
                        try (SeekableByteChannel sbc = Files.newByteChannel(path);
                                InputStream in = Channels.newInputStream(sbc)) {
                            // Explicit charset: don't depend on the default of the platform
                            // to decode the sources (UTF-8 is the usual encoding for Python files).
                            Reader reader = new BufferedReader(
                                    new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8));
                            IPath path2 = Path.fromOSString(string);
                            indexApi.index(path2, FileUtils.lastModified(path.toFile()),
                                    reader, IFields.GENERAL_CONTENTS);
                        } catch (Exception e) {
                            Log.log("Error parsing: " + path, e);
                        }
                    }

                    return null;
                }
            };
            Timer timer = new Timer();
            // Uncomment to (re)populate the index before searching:
            //        FileUtils.visitDirectory(new File("x:\\etk"), true, onFile);
            //        indexApi.commit();
            indexApi.setMaxMatches(Integer.MAX_VALUE);
            SearchResult searchResult = indexApi.searchRegexp(".*", IFields.GENERAL_CONTENTS, true);

            System.out.println("Matched: " + searchResult.getNumberOfDocumentMatches());
            timer.printDiff("Total time");
        } finally {
            indexApi.dispose();
        }
    }

}