package org.wyona.yarep.impl.search.lucene;

import java.util.ArrayList;
import java.util.List;

import org.apache.avalon.framework.configuration.Configuration;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.search.IndexSearcher;

import org.wyona.yarep.core.NoSuchNodeException;
import org.wyona.yarep.core.Node;
import org.wyona.yarep.core.Repository;
import org.wyona.yarep.core.search.SearchException;
import org.wyona.yarep.core.search.Searcher;

import java.io.File;

/**
 * Lucene implementation of searcher.
 *
 * Every search opens a fresh Lucene searcher on the configured index directory and closes it
 * again before returning (also when the search fails).
 *
 * TODO: Opening the index from the file system for each search is not nice re performance.
 * It would be nice to initialize the IndexSearcher at startup and reuse it, but then the
 * IndexSearcher would use the index as it was at startup and would not see changes made to
 * the index at runtime.
 */
public class LuceneSearcher implements Searcher {

    static Logger log = LogManager.getLogger(LuceneSearcher.class);

    /** Name of the index field holding the repository path of the indexed node or revision. */
    private static final String PATH_FIELD_NAME = "_PATH";

    /** Marker separating the node path from the revision name within an indexed path. */
    private static final String REVISION_MARKER = "#revision=";

    private LuceneConfig config;

    // TODO: Not configurable yet; intended to remove stale index entries while searching
    private boolean autoClean = false;

    /**
     * @see org.wyona.yarep.core.search.Searcher#configure(Configuration, File, Repository)
     */
    public void configure(Configuration searchIndexConfig, File configFile, Repository repo) throws SearchException {
        this.config = new LuceneConfig(searchIndexConfig, configFile.getParent(), repo);
    }

    /**
     * Run a query against the fulltext index and resolve the hits to repository nodes/revisions.
     * Hits which cannot be resolved inside the repository are logged and skipped.
     *
     * @param query Query in Lucene query parser syntax
     * @return Nodes and/or revisions matching the query (empty array if there are no hits)
     * @throws SearchException if the index cannot be opened, the query cannot be parsed or the repository access fails
     * @see org.wyona.yarep.core.search.Searcher#search(String)
     */
    public Node[] search(String query) throws SearchException {
        org.apache.lucene.search.Searcher searcher = null;
        try {
            searcher = new IndexSearcher(config.getFulltextSearchIndexFile().getAbsolutePath());

            org.apache.lucene.search.Query luceneQuery = new org.apache.lucene.queryParser.QueryParser(LuceneIndexer.INDEX_PROPERTY_FULL, config.getFulltextAnalyzer()).parse(query);
            org.apache.lucene.search.Hits hits = searcher.search(luceneQuery);
            if (hits.length() == 0) {
                log.info("Query \"" + query + "\" inside fulltext index '" + config.getFulltextSearchIndexFile().getAbsolutePath() + "' returned no hits.");
            } else {
                log.info("Query \"" + query + "\" returned " + hits.length() + " hits");
            }

            List<Node> results = new ArrayList<Node>();
            for (int i = 0; i < hits.length(); i++) {
                String path = hits.doc(i).getField(PATH_FIELD_NAME).stringValue();
                int markerIndex = path.lastIndexOf(REVISION_MARKER);
                if (markerIndex >= 0) {
                    Node revision = resolveRevision(path, markerIndex);
                    if (revision != null) {
                        results.add(revision);
                    }
                } else if (config.getRepo().existsNode(path)) {
                    results.add(config.getRepo().getNode(path));
                } else {
                    log.error("No such node '" + path + "'. Search index (Fulltext: '" + config.getFulltextSearchIndexFile() + "', Properties: '" + config.getPropertiesSearchIndexFile() + "') seems to be out of sync!");
                }
            }
            return results.toArray(new Node[results.size()]);
        } catch (Exception e) {
            log.error(e, e);
            throw new SearchException(e.getMessage(), e);
        } finally {
            closeQuietly(searcher);
        }
    }

    /**
     * Run a query against the properties index and resolve the hits to repository nodes/revisions.
     * Only hits whose indexed path starts with the scope path are returned. Hits which cannot be
     * resolved inside the repository are logged and skipped.
     *
     * @param pName Name of the property, which is used as default field of the query
     * @param query Query in Lucene query parser syntax
     * @param path Scope path (subtree) to which the results are restricted; if null, then "/" is used
     * @return Nodes and/or revisions matching the query within the scope (empty array if there are none)
     * @throws SearchException if the index cannot be opened, the query cannot be parsed or the repository access fails
     * @see org.wyona.yarep.core.search.Searcher#searchProperty(String, String, String)
     */
    public Node[] searchProperty(String pName, String query, String path) throws SearchException {
        org.apache.lucene.search.Searcher searcher = null;
        try {
            searcher = new IndexSearcher(config.getPropertiesSearchIndexFile().getAbsolutePath());

            log.debug("Search property '" + pName + "': " + query);
            org.apache.lucene.queryParser.QueryParser queryParser = new org.apache.lucene.queryParser.QueryParser(pName, config.getPropertyAnalyzer());
            org.apache.lucene.search.Query luceneQuery = queryParser.parse(query);
            org.apache.lucene.search.Hits hits = searcher.search(luceneQuery);
            log.info("Number of matching documents (Property: " + pName + ", Query: " + query + ", Path: " + path + "): " + hits.length());

            // Resolve the scope once instead of re-assigning the method parameter inside the loop
            String scopePath = path;
            if (scopePath == null) {
                scopePath = "/";
                log.warn("No scope path set, hence set it ROOT: " + scopePath);
            }

            List<Node> results = new ArrayList<Node>();
            for (int i = 0; i < hits.length(); i++) {
                String resultPath = hits.doc(i).getField(PATH_FIELD_NAME).stringValue();

                // Subtree filter (WARN: Performance/Scalability! Filtering happens after the index lookup)
                if (!resultPath.startsWith(scopePath)) {
                    log.warn("Scope path '" + scopePath + "' did not match result path: " + resultPath);
                    continue;
                }

                try {
                    int markerIndex = resultPath.lastIndexOf(REVISION_MARKER);
                    if (markerIndex >= 0) {
                        Node revision = resolveRevision(resultPath, markerIndex);
                        if (revision != null) {
                            results.add(revision);
                        }
                    } else if (config.getRepo().existsNode(resultPath)) {
                        results.add(config.getRepo().getNode(resultPath));
                    } else {
                        log.debug("Node found within search index, but no such node within repository: " + resultPath);
                        if (autoClean) {
                            // TODO: Remove entry from index
                        }
                    }
                } catch (NoSuchNodeException nsne) {
                    // INFO: Probably not really necessary anymore, because the existence is checked above,
                    // but the node could still disappear between the check and the retrieval
                    log.warn("Node found within search index, but no such node within repository: " + resultPath);
                    if (autoClean) {
                        // TODO: Remove entry from index
                    }
                }
            }
            return results.toArray(new Node[results.size()]);
        } catch (Exception e) {
            log.error(e, e);
            throw new SearchException(e.getMessage(), e);
        } finally {
            closeQuietly(searcher);
        }
    }

    /**
     * Get list of paths of nodes and/or revisions, which do not exist anymore inside repository
     * @param delete Flag to indicate whether nodes which are missing inside the repository should be deleted from the index
     * @param limitSize Limit the size of the returned list of missing nodes, because a search index can contain a huge amount of documents and hence also a huge amount of missing nodes, which means if one does not set a limit, then it might take a very long time to generate this list. If the limit of size is set to -1, then this means no limit.
     * @return List of paths of nodes and/or revisions, which do not exist anymore inside repository
     * @throws SearchException if the index cannot be read, the repository access fails or the deletion from the index fails
     */
    public String[] getMissingNodes(boolean delete, int limitSize) throws SearchException {
        try {
            File indexDirectory = config.getPropertiesSearchIndexFile();
            // The searcher is closed again before the index gets modified below
            List<String> missingPaths = findMissingPaths(indexDirectory, limitSize);
            if (delete) {
                deleteFromIndex(indexDirectory, missingPaths);
            }
            return missingPaths.toArray(new String[missingPaths.size()]);
        } catch (SearchException e) {
            // Already a SearchException, hence do not wrap it a second time
            log.error(e, e);
            throw e;
        } catch (Exception e) {
            log.error(e, e);
            throw new SearchException(e.getMessage(), e);
        }
    }

    /**
     * Resolve an indexed revision path (node path + revision marker + revision name) to the revision inside the repository
     * @param indexedPath Path as stored inside the index
     * @param markerIndex Position of the revision marker within the indexed path
     * @return Revision, or null if either the node or the revision does not exist inside the repository (which is logged as error)
     */
    private Node resolveRevision(String indexedPath, int markerIndex) throws Exception {
        String nodePath = indexedPath.substring(0, markerIndex);
        String revisionName = indexedPath.substring(markerIndex + REVISION_MARKER.length());
        if (!config.getRepo().existsNode(nodePath)) {
            log.error("Node found within search index, but no such node within repository: " + nodePath);
            return null;
        }
        try {
            return config.getRepo().getNode(nodePath).getRevision(revisionName);
        } catch (org.wyona.yarep.core.NoSuchRevisionException e) {
            log.error("Revision found within search index, but no such revision within repository: " + nodePath + "#" + revisionName);
            return null;
        }
    }

    /**
     * Walk through all documents of an index and collect the paths which do not exist inside the repository
     * @param indexDirectory Directory of the index which is checked
     * @param limitSize Maximum number of collected paths (values of 0 or less mean no limit)
     * @return Indexed paths of nodes and/or revisions which are missing inside the repository
     */
    private List<String> findMissingPaths(File indexDirectory, int limitSize) throws Exception {
        org.apache.lucene.search.Searcher searcher = new IndexSearcher(indexDirectory.getAbsolutePath());
        try {
            org.apache.lucene.search.Hits hits = searcher.search(new org.apache.lucene.search.MatchAllDocsQuery());
            log.info("Number of documents: " + hits.length() + " (Index directory: " + indexDirectory.getAbsolutePath() + ")");

            List<String> missingPaths = new ArrayList<String>();
            for (int i = 0; i < hits.length(); i++) {
                String resultPath = hits.doc(i).getField(PATH_FIELD_NAME).stringValue();
                if (isMissingInsideRepository(resultPath)) {
                    missingPaths.add(resultPath);
                }
                if (limitSize > 0 && missingPaths.size() == limitSize) {
                    log.warn("Size of returned list of missing nodes has been limited to '" + limitSize + "'");
                    break;
                }
            }
            return missingPaths;
        } finally {
            closeQuietly(searcher);
        }
    }

    /**
     * Check whether the node or revision, which an indexed path refers to, is missing inside the repository
     * @param indexedPath Path as stored inside the index (either a node path or a node path + revision marker + revision name)
     * @return true if the node or revision does not exist inside the repository
     */
    private boolean isMissingInsideRepository(String indexedPath) throws Exception {
        try {
            int markerIndex = indexedPath.lastIndexOf(REVISION_MARKER);
            if (markerIndex < 0) {
                if (config.getRepo().existsNode(indexedPath)) {
                    return false;
                }
                log.error("Node found within search index, but no such node within repository: " + indexedPath);
                return true;
            }

            String nodePath = indexedPath.substring(0, markerIndex);
            String revisionName = indexedPath.substring(markerIndex + REVISION_MARKER.length());
            if (!config.getRepo().existsNode(nodePath)) {
                log.error("Revision '" + indexedPath + "' found within search index, but no such node within repository: " + nodePath);
                return true;
            }
            try {
                config.getRepo().getNode(nodePath).getRevision(revisionName);
                return false;
            } catch (org.wyona.yarep.core.NoSuchRevisionException e) {
                log.error("Revision found within search index and node itself exists inside repository, but no such revision within repository: " + nodePath + "#" + revisionName);
                return true;
            }
        } catch (NoSuchNodeException nsne) {
            // INFO: Probably not really necessary anymore, because the existence is checked above,
            // but the node could still disappear between the check and the retrieval
            log.warn("Node found within search index, but no such node within repository: " + indexedPath);
            return true;
        }
    }

    /**
     * Delete the documents with the given paths from an index
     * @param indexDirectory Directory of the index from which the documents are deleted
     * @param indexedPaths Paths (values of the path field) of the documents which are deleted
     */
    private void deleteFromIndex(File indexDirectory, List<String> indexedPaths) throws Exception {
        log.warn("Delete missing documents from index...");
        // TODO: Use Indexer configured by repository!
        org.apache.lucene.index.IndexWriter indexWriter = LuceneIndexerV2.createIndexWriter(indexDirectory, config.getPropertyAnalyzer(), config.getWriteLockTimeout());
        if (indexWriter == null) {
            throw new SearchException("Could not init IndexWriter in order to delete missing documents!");
        }
        try {
            for (String indexedPath : indexedPaths) {
                log.debug("Try to delete document from index: " + indexedPath);
                indexWriter.deleteDocuments(new org.apache.lucene.index.Term(PATH_FIELD_NAME, indexedPath));
            }
        } finally {
            // Always close the writer, because otherwise the write lock of the index would not be released
            indexWriter.close();
        }
    }

    /**
     * Close a Lucene searcher without letting a failure of closing hide the actual search result or search failure
     * @param searcher Searcher to close (may be null if opening the index failed)
     */
    private static void closeQuietly(org.apache.lucene.search.Searcher searcher) {
        if (searcher == null) {
            return;
        }
        try {
            searcher.close();
        } catch (Exception e) {
            log.warn("Could not close searcher: " + e.getMessage(), e);
        }
    }
}