/*
* Copyright 2012 James Moger
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.moxie.proxy;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Method;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.moxie.IMavenCache;
import org.moxie.MoxieCache;
import org.moxie.Pom;
import org.moxie.PomReader;
import org.moxie.RemoteRepository;
import org.moxie.utils.FileUtils;
import org.moxie.utils.StringUtils;
/**
* The Lucene executor handles indexing and searching POM files.
*
* @author James Moger
*
*/
public class LuceneExecutor implements Runnable {
// On-disk index format version; bump to force a full rebuild (see shouldReindex).
private static final int INDEX_VERSION = 1;
// Lucene document field names shared by the indexing and search code.
private static final String FIELD_PACKAGING = "type";
private static final String FIELD_GROUPID = "groupid";
private static final String FIELD_ARTIFACTID = "artifactid";
private static final String FIELD_VERSION = "version";
private static final String FIELD_NAME = "name";
private static final String FIELD_DESCRIPTION = "description";
private static final String FIELD_DATE = "date";
// Name of the folder (under the Moxie root) that holds all repository indexes.
private static final String LUCENE_DIR = "lucene";
// Property key in config.properties that records the index format version.
private static final String CONF_VERSION = "version";
private static final Version LUCENE_VERSION = Version.LUCENE_35;
private final Logger logger = Logger.getLogger(LuceneExecutor.class.getSimpleName());
private final ProxyConfig config;
// Root folder containing one Lucene index sub-folder per repository.
private final File indexesFolder;
// Caches of open searchers/writers keyed by repository id.
// Access is serialized by the synchronized methods that mutate them.
private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>();
private final Map<String, IndexWriter> writers = new ConcurrentHashMap<String, IndexWriter>();
// Pending pom files awaiting incremental indexing; drained by run().
private final Queue<IndexPom> queue;
/**
 * Creates the executor rooted at {@code <moxieRoot>/lucene}.
 *
 * @param config the proxy configuration supplying repositories and caches
 */
public LuceneExecutor(ProxyConfig config) {
this.config = config;
this.indexesFolder = new File(config.getMoxieRoot(), LUCENE_DIR);
queue = new ConcurrentLinkedQueue<IndexPom>();
}
/**
 * Blocking call that synchronously rebuilds the Lucene index of every
 * configured local and remote repository.
 */
public synchronized void reindex() {
    for (String localRepository : config.getLocalRepositories()) {
        index(localRepository);
    }
    for (RemoteRepository remoteRepository : config.getRemoteRepositories()) {
        index(remoteRepository.id);
    }
    // nudge the VM to reclaim the transient objects created while rebuilding
    System.gc();
}
/**
 * Run is executed by a scheduled executor service at a fixed rate. This
 * guarantees no concurrent repository index updates. Index updates are
 * queued and processed asynchronously by the executor service.
 */
@Override
public void run() {
    if (queue.isEmpty()) {
        return;
    }
    Set<String> repositories = new TreeSet<String>();
    long minDiff = 60 * 1000L; // 1 min
    while (!queue.isEmpty()) {
        IndexPom pom = queue.peek();
        // Wait till oldest element has been in queue for minimum time.
        //
        // This is a practical workaround for expecting parent pom files
        // to have been retrieved. The alternative is to make the proxy
        // smart enough to identify and retrieve parent poms. The current
        // design relies on the client instructing the proxy to retrieve
        // parent poms.
        long remaining;
        while ((remaining = minDiff - (System.currentTimeMillis() - pom.date.getTime())) > 0) {
            try {
                Thread.sleep(Math.min(remaining, 200));
            } catch (InterruptedException e) {
                // BUG FIX: the original swallowed the interrupt, which could
                // leave an interrupted thread spinning here forever. Restore
                // the flag and bail out; the task is re-run on a fixed rate,
                // so queued poms are processed on the next tick.
                Thread.currentThread().interrupt();
                return;
            }
        }
        queue.poll();
        logger.info("indexing " + pom.file);
        incrementalIndex(pom.file);
        // remember the repository that we just indexed
        String repository = config.getRepositoryId(pom.file);
        repositories.add(repository);
    }
    // create/update the prefix indexes for the touched repositories
    for (String repository : repositories) {
        IMavenCache cache = config.getMavenCache(repository);
        cache.updatePrefixesIndex();
    }
}
/**
 * Reads the Lucene config file for the repository to check the index
 * version. If the index version is different, then rebuild the repository
 * index.
 *
 * @param repository the repository id
 * @return true if the on-disk index format is different than INDEX_VERSION
 *         or the config file is missing/unreadable
 */
private boolean shouldReindex(String repository) {
    // BUG FIX: the original ignored the repository parameter and looked in
    // <indexesFolder>/lucene; per-repository indexes live in
    // <indexesFolder>/<repository> (see deleteIndex/getIndexWriter).
    File folder = new File(indexesFolder, repository);
    File file = new File(folder, "config.properties");
    // BUG FIX: use try-with-resources so the FileReader is always closed
    // (the original leaked the reader on every call).
    try (FileReader reader = new FileReader(file)) {
        Properties props = new Properties();
        props.load(reader);
        int indexVersion = Integer.parseInt(props.getProperty(CONF_VERSION, "0"));
        // reindex if versions do not match
        return indexVersion != INDEX_VERSION;
    } catch (Exception e) {
        // missing or unparseable config => rebuild the index
        return true;
    }
}
/**
 * Synchronously indexes a repository. Either rebuilds the complete index
 * (when the on-disk format is stale) or incrementally updates it, then
 * refreshes the repository's prefix index.
 *
 * @param repository the repository id to index
 */
private void index(String repository) {
    try {
        boolean rebuild = shouldReindex(repository);
        IndexResult result = rebuild ? reindex(repository) : updateIndex(repository);
        if (result.success) {
            if (result.artifactCount > 0) {
                String msg = rebuild
                        ? "Built {0} Lucene index from {1} artifacts in {2} secs"
                        : "Updated {0} Lucene index with {1} artifacts in {2} secs";
                logger.info(MessageFormat.format(msg, repository, result.artifactCount,
                        result.duration()));
            }
        } else {
            String msg = rebuild
                    ? "Could not build {0} Lucene index!"
                    : "Could not update {0} Lucene index!";
            logger.severe(MessageFormat.format(msg, repository));
        }
        // create/update the prefix index for the repository
        IMavenCache cache = config.getMavenCache(repository);
        cache.updatePrefixesIndex();
    } catch (Throwable t) {
        logger.log(Level.SEVERE, MessageFormat.format("Lucene indexing failure for {0}", repository), t);
    }
}
/**
 * Updates a repository index incrementally from the last indexed artifacts.
 * <p>
 * NOTE: incremental batch updates are not implemented; the returned result
 * reports zero artifacts and {@code success == false}, so the caller logs
 * an update failure. Incremental indexing is instead driven through
 * {@link #index(File)} / {@link #incrementalIndex(File)}.
 *
 * @param repository the repository id
 * @return an empty IndexResult
 */
private IndexResult updateIndex(String repository) {
    return new IndexResult();
}
/**
 * Closes and evicts the cached searcher and writer for a repository.
 * Failures are logged but never propagated.
 *
 * @param repositoryName the repository whose index resources are released
 */
public synchronized void close(String repositoryName) {
    IndexSearcher searcher = searchers.remove(repositoryName);
    if (searcher != null) {
        try {
            searcher.getIndexReader().close();
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Failed to close index searcher for " + repositoryName, e);
        }
    }
    IndexWriter writer = writers.remove(repositoryName);
    if (writer != null) {
        try {
            writer.close();
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Failed to close index writer for " + repositoryName, e);
        }
    }
}
/**
 * Closes every cached Lucene writer and searcher. Each close failure is
 * logged individually; the caches are cleared regardless.
 */
public synchronized void close() {
    // close all writers, waiting for any running merges to complete
    for (Map.Entry<String, IndexWriter> entry : writers.entrySet()) {
        try {
            entry.getValue().close(true);
        } catch (Throwable t) {
            logger.log(Level.SEVERE, "Failed to close Lucene writer for " + entry.getKey(), t);
        }
    }
    writers.clear();
    // close all searchers via their underlying readers
    for (Map.Entry<String, IndexSearcher> entry : searchers.entrySet()) {
        try {
            entry.getValue().getIndexReader().close();
        } catch (Throwable t) {
            logger.log(Level.SEVERE, "Failed to close Lucene searcher for " + entry.getKey(), t);
        }
    }
    searchers.clear();
}
/**
 * Deletes the Lucene index for the specified repository, closing any open
 * writer/searcher first.
 *
 * @param repositoryName the repository whose index folder is removed
 * @return true, if successful
 */
public boolean deleteIndex(String repositoryName) {
    // release any cached writer/searcher before touching the folder
    close(repositoryName);
    File indexFolder = new File(indexesFolder, repositoryName);
    if (indexFolder.exists()) {
        FileUtils.delete(indexFolder);
    }
    return true;
}
/**
 * This completely indexes the repository and will destroy any existing
 * index.
 *
 * @param repository the repository id to rebuild
 * @return IndexResult with the count of successfully indexed artifacts
 */
public IndexResult reindex(String repository) {
    IndexResult result = new IndexResult();
    if (!deleteIndex(repository)) {
        return result;
    }
    try {
        MoxieCache moxieCache = config.getMoxieCache();
        IMavenCache repositoryCache = config.getMavenCache(repository);
        Collection<File> files = repositoryCache.getFiles("." + org.moxie.Constants.POM);
        IndexWriter writer = getIndexWriter(repository);
        for (File pomFile : files) {
            try {
                Pom pom = PomReader.readPom(moxieCache, pomFile);
                String date = DateTools.timeToString(pomFile.lastModified(), Resolution.MINUTE);
                Document doc = new Document();
                doc.add(new Field(FIELD_PACKAGING, pom.packaging, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field(FIELD_GROUPID, pom.groupId, Store.YES, Index.ANALYZED));
                doc.add(new Field(FIELD_ARTIFACTID, pom.artifactId, Store.YES, Index.ANALYZED));
                doc.add(new Field(FIELD_VERSION, pom.version, Store.YES, Index.ANALYZED));
                if (!StringUtils.isEmpty(pom.name)) {
                    doc.add(new Field(FIELD_NAME, pom.name, Store.YES, Index.ANALYZED));
                }
                if (!StringUtils.isEmpty(pom.description)) {
                    doc.add(new Field(FIELD_DESCRIPTION, pom.description, Store.YES, Index.ANALYZED));
                }
                doc.add(new Field(FIELD_DATE, date, Store.YES, Index.ANALYZED));
                // add the pom to the index
                writer.addDocument(doc);
                // BUG FIX: count only artifacts that were actually indexed;
                // previously the counter also incremented for poms that
                // threw during parsing/indexing.
                result.artifactCount++;
            } catch (Exception e) {
                logger.log(Level.SEVERE, MessageFormat.format("Exception while reindexing {0} in {1}", pomFile, repository), e);
            }
        }
        writer.commit();
        resetIndexSearcher(repository);
        result.success();
    } catch (Exception e) {
        logger.log(Level.SEVERE, "Exception while reindexing " + repository, e);
    }
    return result;
}
/**
 * Queues a pom file for asynchronous incremental indexing. The queued
 * entry is timestamped and drained later by the scheduled run() pass.
 *
 * @param pomFile the pom file to index
 */
public void index(File pomFile) {
queue.add(new IndexPom(pomFile));
}
/**
 * Incrementally updates the repository index for a single pom file: any
 * existing document for the same group/artifact/version is deleted, then
 * a fresh document is added and committed.
 *
 * @param pomFile the pom file to (re)index
 */
private void incrementalIndex(File pomFile) {
    try {
        String repository = config.getRepositoryId(pomFile);
        IMavenCache cache = config.getMavenCache(repository);
        Pom pom = PomReader.readPom(cache, pomFile);
        // drop any stale document for this artifact before re-adding it
        delete(repository, pom);
        IndexWriter writer = getIndexWriter(repository);
        Document document = new Document();
        document.add(new Field(FIELD_PACKAGING, pom.packaging, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        document.add(new Field(FIELD_GROUPID, pom.groupId, Store.YES, Index.ANALYZED));
        document.add(new Field(FIELD_ARTIFACTID, pom.artifactId, Store.YES, Index.ANALYZED));
        document.add(new Field(FIELD_VERSION, pom.version, Store.YES, Index.ANALYZED));
        if (!StringUtils.isEmpty(pom.name)) {
            document.add(new Field(FIELD_NAME, pom.name, Store.YES, Index.ANALYZED));
        }
        if (!StringUtils.isEmpty(pom.description)) {
            document.add(new Field(FIELD_DESCRIPTION, pom.description, Store.YES, Index.ANALYZED));
        }
        String lastModified = DateTools.timeToString(pomFile.lastModified(), Resolution.MINUTE);
        document.add(new Field(FIELD_DATE, lastModified, Store.YES, Index.ANALYZED));
        // add the pom to the index and make it visible to searchers
        writer.addDocument(document);
        writer.commit();
        resetIndexSearcher(repository);
        config.resetRepositorySize(repository);
    } catch (Exception e) {
        logger.log(Level.SEVERE, "Exception while indexing " + pomFile, e);
    }
}
/**
 * Deletes any indexed documents matching the pom's exact
 * group/artifact/version coordinates.
 *
 * @param repository the repository whose index is modified
 * @param pom supplies the coordinates to match
 * @return true if at least one document was deleted
 * @throws IOException on index access failure
 */
private boolean delete(String repository, Pom pom) throws IOException {
    BooleanQuery query = new BooleanQuery();
    query.add(new TermQuery(new Term(FIELD_GROUPID, pom.groupId)), Occur.MUST);
    query.add(new TermQuery(new Term(FIELD_ARTIFACTID, pom.artifactId)), Occur.MUST);
    query.add(new TermQuery(new Term(FIELD_VERSION, pom.version)), Occur.MUST);
    IndexWriter writer = getIndexWriter(repository);
    // compare doc counts around the delete to learn how many were removed
    int before = writer.numDocs();
    writer.deleteDocuments(query);
    writer.commit();
    int deleted = before - writer.numDocs();
    if (deleted == 0) {
        logger.fine(MessageFormat.format("no records found to delete {0}", query.toString()));
        return false;
    }
    logger.fine(MessageFormat.format("deleted {0} records with {1}", deleted, query.toString()));
    return true;
}
/**
 * Converts a Lucene document hit into a SearchResult.
 *
 * @param doc the matched Lucene document
 * @param hitId 1-indexed position of this hit within the result page
 * @param totalHits total number of hits for the query
 * @return the populated SearchResult
 * @throws ParseException if the stored date field cannot be parsed
 */
private SearchResult createSearchResult(Document doc, int hitId, int totalHits) throws ParseException {
    SearchResult searchResult = new SearchResult();
    searchResult.hitId = hitId;
    searchResult.totalHits = totalHits;
    searchResult.groupId = doc.get(FIELD_GROUPID);
    searchResult.artifactId = doc.get(FIELD_ARTIFACTID);
    searchResult.version = doc.get(FIELD_VERSION);
    searchResult.packaging = doc.get(FIELD_PACKAGING);
    searchResult.name = doc.get(FIELD_NAME);
    searchResult.description = doc.get(FIELD_DESCRIPTION);
    searchResult.date = DateTools.stringToDate(doc.get(FIELD_DATE));
    return searchResult;
}
/**
 * Evicts and closes the cached searcher for a repository so the next
 * search re-opens a reader that sees the latest commits.
 *
 * @param repository the repository whose searcher is invalidated
 * @throws IOException if closing the underlying reader fails
 */
private synchronized void resetIndexSearcher(String repository) throws IOException {
    IndexSearcher stale = searchers.remove(repository);
    if (stale != null) {
        stale.getIndexReader().close();
    }
}
/**
 * Gets (or lazily creates and caches) an index searcher for the
 * repository. The searcher is backed by a near-real-time reader opened
 * from the repository's writer.
 *
 * @param repository the repository id
 * @return a cached or freshly opened IndexSearcher
 * @throws IOException on index access failure
 */
private IndexSearcher getIndexSearcher(String repository) throws IOException {
    IndexSearcher cached = searchers.get(repository);
    if (cached != null) {
        return cached;
    }
    IndexWriter writer = getIndexWriter(repository);
    IndexSearcher searcher = new IndexSearcher(IndexReader.open(writer, true));
    searchers.put(repository, searcher);
    return searcher;
}
/**
 * Gets an index writer for the repository. The index will be created if it
 * does not already exist; otherwise the cached writer is returned.
 *
 * @param repository the repository id
 * @return an IndexWriter
 * @throws IOException on index access failure
 */
private IndexWriter getIndexWriter(String repository) throws IOException {
    IndexWriter indexWriter = writers.get(repository);
    if (indexWriter == null) {
        File indexFolder = new File(indexesFolder, repository);
        if (!indexFolder.exists()) {
            indexFolder.mkdirs();
        }
        // BUG FIX: only open the FSDirectory when a new writer is actually
        // created; the original opened (and leaked) a Directory on every
        // call, even when a cached writer was returned.
        Directory directory = FSDirectory.open(indexFolder);
        StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        indexWriter = new IndexWriter(directory, config);
        writers.put(repository, indexWriter);
    }
    return indexWriter;
}
/**
 * Searches the specified repositories for the given text or query.
 *
 * @param text
 *            if the text is null or empty, null is returned
 * @param page
 *            the page number to retrieve. page is 1-indexed.
 * @param pageSize
 *            the number of elements to return for this page
 * @param repositories
 *            a list of repositories to search. if no repositories are
 *            specified null is returned.
 * @return a list of SearchResults in order from highest to the lowest score
 */
public List<SearchResult> search(String text, int page, int pageSize, List<String> repositories) {
    if (repositories == null || repositories.isEmpty()) {
        return null;
    }
    // delegate to the varargs overload
    String[] names = repositories.toArray(new String[repositories.size()]);
    return search(text, page, pageSize, names);
}
/**
 * Searches the specified repositories for the given text or query.
 * The query text is matched against both the groupId and artifactId
 * fields; hits are sorted by indexed date, newest first.
 *
 * @param text
 *            if the text is null or empty, null is returned
 * @param page
 *            the page number to retrieve. page is 1-indexed.
 * @param pageSize
 *            the number of elements to return for this page; a value
 *            <= 0 returns all hits from the offset onward
 * @param repositories
 *            a list of repositories to search. if no repositories are
 *            specified null is returned.
 * @return a list of SearchResults in order from highest to the lowest score
 *
 */
public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
if (StringUtils.isEmpty(text)) {
return null;
}
if (repositories == null || repositories.length == 0) {
return null;
}
// LinkedHashSet preserves hit order while de-duplicating equal results
Set<SearchResult> results = new LinkedHashSet<SearchResult>();
StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
try {
// default search checks groupId and artifactId (OR'd via SHOULD)
BooleanQuery query = new BooleanQuery();
QueryParser qp;
qp = new QueryParser(LUCENE_VERSION, FIELD_GROUPID, analyzer);
qp.setAllowLeadingWildcard(true);
query.add(qp.parse(text), Occur.SHOULD);
qp = new QueryParser(LUCENE_VERSION, FIELD_ARTIFACTID, analyzer);
qp.setAllowLeadingWildcard(true);
query.add(qp.parse(text), Occur.SHOULD);
IndexSearcher searcher;
if (repositories.length == 1) {
// single repository search
searcher = getIndexSearcher(repositories[0]);
} else {
// multiple repository search: combine the cached per-repository
// readers under a MultiSourceReader so each hit can later be
// traced back to its source repository
List<IndexReader> readers = new ArrayList<IndexReader>();
for (String repository : repositories) {
IndexSearcher repositoryIndex = getIndexSearcher(repository);
readers.add(repositoryIndex.getIndexReader());
}
IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]);
MultiSourceReader reader = new MultiSourceReader(rdrs);
searcher = new IndexSearcher(reader);
}
Query rewrittenQuery = searcher.rewrite(query);
// sort hits by the stored date string, descending (newest first)
Sort sort = new Sort(new SortField(FIELD_DATE, SortField.STRING, true));
TopFieldDocs topDocs = searcher.search(rewrittenQuery, 10000, sort);
int offset = Math.max(0, (page - 1) * pageSize);
ScoreDoc[] hits = topDocs.scoreDocs;
int totalHits = topDocs.totalHits;
if (pageSize <= 0) {
// non-positive page size means "everything from the offset on"
pageSize = totalHits;
}
if (totalHits > offset) {
for (int i = offset, len = Math.min(offset + pageSize, hits.length); i < len; i++) {
int docId = hits[i].doc;
Document doc = searcher.doc(docId);
SearchResult result = createSearchResult(doc, i + 1, totalHits);
if (repositories.length == 1) {
// single repository search
result.repository = repositories[0];
} else {
// multi-repository search: map the global doc id back to the
// sub-reader (and thus repository) it came from
MultiSourceReader reader = (MultiSourceReader) searcher.getIndexReader();
int index = reader.getSourceIndex(docId);
result.repository = repositories[index];
}
results.add(result);
}
}
} catch (Exception e) {
logger.log(Level.SEVERE, MessageFormat.format("Exception while searching for {0}", text), e);
}
return new ArrayList<SearchResult>(results);
}
/**
 * Simple class to track the results of an index update.
 */
private class IndexResult {
// start/end timestamps used to compute the indexing duration
long startTime = System.currentTimeMillis();
long endTime = startTime;
// true only after success() has been called
boolean success;
// number of artifacts indexed in this pass
int artifactCount;
// marks the operation successful and records the completion time
void success() {
success = true;
endTime = System.currentTimeMillis();
}
// elapsed time in fractional seconds
float duration() {
return (endTime - startTime) / 1000f;
}
}
/**
 * Custom subclass of MultiReader to identify the source index for a given
 * doc id. This would not be necessary if there were a public method to
 * obtain this information.
 */
private class MultiSourceReader extends MultiReader {
    // cached handle on the package-private MultiReader.readerIndex(int);
    // null if reflection failed (getSourceIndex then returns -1)
    final Method method;
    MultiSourceReader(IndexReader[] subReaders) {
        super(subReaders);
        Method readerIndex = null;
        try {
            readerIndex = MultiReader.class.getDeclaredMethod("readerIndex", int.class);
            readerIndex.setAccessible(true);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Error getting readerIndex method", e);
        }
        method = readerIndex;
    }
    /**
     * Maps a composite doc id to the ordinal of the sub-reader it
     * belongs to.
     *
     * @param docId a doc id valid for this MultiReader
     * @return the sub-reader index, or -1 on failure
     */
    int getSourceIndex(int docId) {
        try {
            return (Integer) method.invoke(this, docId);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Error getting source index", e);
            return -1;
        }
    }
}
// Queue entry pairing a pom file with its enqueue time; run() uses the
// timestamp to delay indexing until parent poms have likely arrived.
private class IndexPom {
final File file;
final Date date;
IndexPom(File file) {
this.file = file;
this.date = new Date();
}
}
}