/* * Geotoolkit - An Open Source Java GIS Toolkit * http://www.geotoolkit.org * * (C) 2007 - 2010, Geomatys * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 3 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. */ package org.geotoolkit.lucene.index; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.logging.Level; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.queryparser.classic.QueryParser.Operator; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.geotoolkit.index.tree.manager.SQLRtreeManager; import org.geotoolkit.index.IndexingException; import org.geotoolkit.index.LogicalFilterType; import org.geotoolkit.nio.IOUtilities; import org.geotoolkit.lucene.LuceneUtils; import org.geotoolkit.index.SearchingException; import org.geotoolkit.lucene.filter.SerialChainFilter; import 
org.geotoolkit.index.SpatialQuery;
import java.nio.file.DirectoryStream;

/**
 * A Lucene index searcher, allowing to perform queries on the index.
 *
 * @author Guilhem legal (Geomatys)
 * @module
 */
public class LuceneIndexSearcher extends IndexLucene {

    /**
     * This is the index searcher of Lucene.
     */
    protected IndexSearcher searcher;

    /**
     * A default Query requesting all the document
     */
    private final static Query SIMPLE_QUERY = new TermQuery(new Term("metafile", "doc"));

    /**
     * A map of cached request
     */
    private final Map<SpatialQuery, Set<String>> cachedQueries = new ConcurrentHashMap<>();

    /**
     * The maximum size of the map of queries.
     */
    private static final int MAX_CACHED_QUERIES_SIZE = 50;

    /**
     * A flag indicating if the cache system for query is enabled.
     */
    private final boolean isCacheEnabled;

    /**
     * A Map of DocID -> metadata ID .
     */
    private final Map<Integer, String> identifiers = new HashMap<>();

    /**
     * A list of numeric fields names.
     */
    private Map<String, Character> numericFields;

    /**
     * A flag indicating if all the geometry indexed are envelope.
     * if set, no JTS filter will be applied on geometry search (only R-tree search)
     */
    private final boolean envelopeOnly;

    /**
     * Build a new index searcher.
     *
     * @param configDir The configuration directory where to build the index directory.
     * @param serviceID the "ID" of the service (allow multiple index in the same directory). The value "" is allowed.
     *
     * @throws IndexingException
     */
    public LuceneIndexSearcher(final Path configDir, final String serviceID) throws IndexingException {
        this(configDir, serviceID, null, false);
    }

    /**
     * Build a new index searcher.
     *
     * @param configDir The configuration directory where to build the index directory.
     * @param serviceID the "ID" of the service (allow multiple index in the same directory). The value "" is allowed.
     * @param analyzer A lucene Analyzer (Default is ClassicAnalyzer)
     *
     * @throws IndexingException
     */
    public LuceneIndexSearcher(final Path configDir, final String serviceID, final Analyzer analyzer) throws IndexingException {
        this(configDir, serviceID, analyzer, false);
    }

    /**
     * Build a new index searcher.
     *
     * The most recent index directory under {@code configDir} is selected by parsing the
     * timestamp suffix ("name-&lt;millis&gt;") of each child directory.
     *
     * @param configDir The configuration directory where to build the index directory.
     * @param serviceID the "ID" of the service (allow multiple index in the same directory). The value "" is allowed.
     * @param analyzer A lucene Analyzer (Default is ClassicAnalyzer)
     * @param envelopeOnly A flag indicating if all the geometry indexed are envelope.
     *
     * @throws IndexingException If no index directory can be found, or if the index is corrupted/unreadable.
     */
    public LuceneIndexSearcher(final Path configDir, final String serviceID, final Analyzer analyzer, final boolean envelopeOnly) throws IndexingException {
        super(analyzer);
        this.envelopeOnly = envelopeOnly;
        if (envelopeOnly) {
            LOGGER.info("envelope only mode activated");
        }
        try {
            // we get the last index directory (highest timestamp suffix wins)
            long maxTime = 0;
            Path currentIndexDirectory = null;
            if (configDir != null && Files.isDirectory(configDir)) {
                // try-with-resources: the DirectoryStream was previously never closed (resource leak)
                try (DirectoryStream<Path> children = Files.newDirectoryStream(configDir)) {
                    for (Path indexDirectory : children) {
                        String suffix = indexDirectory.getFileName().toString();
                        suffix = suffix.substring(suffix.lastIndexOf('-') + 1);
                        try {
                            long currentTime = Long.parseLong(suffix);
                            if (currentTime > maxTime) {
                                maxTime = currentTime;
                                currentIndexDirectory = indexDirectory;
                            }
                        } catch (NumberFormatException ex) {
                            // non-timestamped directories are simply skipped
                            LOGGER.log(Level.WARNING, "Unable to parse the timestamp:{0}", suffix);
                        }
                    }
                }
            }

            if (currentIndexDirectory != null && Files.exists(currentIndexDirectory)) {
                setFileDirectory(currentIndexDirectory);
                try {
                    // load the optional declaration of numeric fields (name -> type character)
                    this.numericFields = new HashMap<>();
                    final Path numericFieldFile = currentIndexDirectory.resolve("numericFields.properties");
                    if (Files.isRegularFile(numericFieldFile)) {
                        final Properties prop = IOUtilities.getPropertiesFromFile(numericFieldFile);
                        for (String fieldName : prop.stringPropertyNames()) {
                            this.numericFields.put(fieldName, ((String) prop.get(fieldName)).charAt(0));
                        }
                    }
                } catch (IOException ex) {
                    // best-effort: a missing/unreadable properties file does not prevent searching
                    LOGGER.log(Level.WARNING, "IO exception while reading numericFields file", ex);
                }
            } else {
                throw new IndexingException("The index searcher can't find a index directory.");
            }
            isCacheEnabled = true;
            initSearcher();
            initIdentifiersList();

        } catch (CorruptIndexException ex) {
            throw new IndexingException("Corruption encountered during index searcher creation", ex);
        } catch (IOException ex) {
            throw new IndexingException("IO Exception during index searcher creation", ex);
        }
    }

    /**
     * initialize the IndexSearcher of this index.
     */
    private void initSearcher() throws CorruptIndexException, IOException {
        final Path indexDirectory = getFileDirectory();
        this.rTree = SQLRtreeManager.get(indexDirectory, this);
        // NOTE(review): when called from refresh(), the previous IndexReader is never closed
        // (potential reader/file-handle leak) — confirm whether in-flight searches prevent closing it here.
        final IndexReader reader = DirectoryReader.open(LuceneUtils.getAppropriateDirectory(indexDirectory));
        searcher = new IndexSearcher(reader);
        LOGGER.log(Level.INFO, "Creating new Index Searcher with index directory:{0}", indexDirectory.toString());
    }

    /**
     * Fill the list of identifiers ordered by doc ID
     */
    private void initIdentifiersList() throws IOException {
        final Map<Integer, String> temp = new HashMap<>();
        final int nbValidDoc = searcher.getIndexReader().numDocs(); // do not take in count deleted document
        final long nbDoc = searcher.collectionStatistics("id").maxDoc(); // contains deleted document
        for (int i = 0; i < nbDoc; i++) {
            final String metadataID = getMatchingID(searcher.doc(i));
            temp.put(i, metadataID);
        }
        // swap in one pass so concurrent readers never see a half-built map content
        identifiers.clear();
        identifiers.putAll(temp);
        LOGGER.log(logLevel, "{0} records found.", nbValidDoc);
    }

    /**
     * Refresh the searcher (must be call after deleting document from the index for example)
     *
     * @throws IndexingException
     */
    public void refresh() throws IndexingException {
        try {
            initSearcher();
            initIdentifiersList();
            // cached results may reference deleted/changed documents: drop them all
            cachedQueries.clear();
            LOGGER.log(logLevel, "refreshing index searcher");
        } catch (CorruptIndexException ex) {
            throw new IndexingException("Corruption exception encountered during refreshing the index searcher", ex);
        } catch (IOException ex) {
            throw new IndexingException("IO Exception during refreshing the index searcher", ex);
        }
    }

    /**
     * Add the metadata id to the list of result if its present in the identifiers.
     * @param results
     * @param docID
     */
    private void addToResult(final Set<String> results, final int docID) {
        final String metadataID = identifiers.get(docID);
        if (metadataID != null) {
            results.add(metadataID);
        } else {
            LOGGER.log(Level.WARNING, "Unable to find a metadata ID for doc :{0}", docID);
        }
    }

    /**
     * This method proceed a lucene search and to verify that the identifier exist.
     * If it exist it return the database ID.
     *
     * @param id A simple Term query on "identifier field".
     *
     * @return A database id, or {@code null} if the identifier is absent or the index is empty.
     * @throws SearchingException
     */
    public String identifierQuery(final String id) throws SearchingException {
        try {
            final TermQuery query = new TermQuery(new Term(getIdentifierSearchField(), id));
            final Set<String> results = new LinkedHashSet<>();
            final int maxRecords = (int) searcher.collectionStatistics("id").maxDoc();
            if (maxRecords == 0) {
                LOGGER.warning("There is no document in the index");
                return null;
            }
            final TopDocs hits = searcher.search(query, maxRecords);
            for (ScoreDoc doc : hits.scoreDocs) {
                // load only the "id" stored field, the rest of the document is not needed
                final Set<String> fieldsToLoad = new HashSet<>();
                fieldsToLoad.add("id");
                results.add(searcher.doc(doc.doc, fieldsToLoad).get("id"));
            }
            if (results.size() > 1) {
                LOGGER.log(Level.WARNING, "multiple record in lucene index for identifier: {0}", id);
            }
            if (!results.isEmpty()) {
                return results.iterator().next();
            }
        } catch (IOException ex) {
            // fixed: the previous message wrongly said "Parse Exception" for an IOException
            throw new SearchingException("IO Exception while performing lucene request", ex);
        }
        return null;
    }

    /**
     * Return the name of the identifier field used in the identifierQuery method.
     *
     * @return the name of the identifier field.
     */
    public String getIdentifierSearchField() {
        return "id";
    }

    public Map<String, Character> getNumericFields() {
        return numericFields;
    }

    /**
     * This method return the database ID of a matching Document
     *
     * @param doc A matching document.
     *
     * @return A database id.
     */
    public String getMatchingID(final Document doc) {
        return doc.get("id");
    }

    /**
     * This method proceed a lucene search and returns a list of ID.
     *
     * @param spatialQueryI The lucene query string with spatials filters.
     *
     * @return A List of metadata identifiers.
     * @throws SearchingException
     */
    public Set<String> doSearch(final SpatialQuery spatialQueryI) throws SearchingException {
        org.geotoolkit.lucene.filter.SpatialQuery spatialQuery = (org.geotoolkit.lucene.filter.SpatialQuery) spatialQueryI;
        try {
            final long start = System.currentTimeMillis();
            final Set<String> results = new LinkedHashSet<>();
            spatialQuery.applyRtreeOnFilter(rTree, envelopeOnly);

            //we look for a cached Query
            if (isCacheEnabled && cachedQueries.containsKey(spatialQuery)) {
                final Set<String> cachedResults = cachedQueries.get(spatialQuery);
                // fixed: previously logged results.size() which is always 0 here (the fresh empty set)
                LOGGER.log(logLevel, "returning result from cache ({0} matching documents)", cachedResults.size());
                return cachedResults;
            }

            int maxRecords = (int) searcher.collectionStatistics("id").maxDoc();
            if (maxRecords == 0) {
                LOGGER.warning("The index seems to be empty.");
                maxRecords = 1;
            }

            final String field = "title";
            String stringQuery = spatialQuery.getQuery();
            final QueryParser parser = new ExtendedQueryParser(field, analyzer, numericFields);
            parser.setDefaultOperator(Operator.AND);

            // remove term:* query
            stringQuery = removeOnlyWildchar(stringQuery);
            // escape '/' character
            stringQuery = stringQuery.replace("/", "\\/");

            // we enable the leading wildcard mode if the first character of the query is a '*'
            if (stringQuery.indexOf(":*") != -1 || stringQuery.indexOf(":?") != -1 || stringQuery.indexOf(":(*") != -1
                    || stringQuery.indexOf(":(+*") != -1 || stringQuery.indexOf(":+*") != -1) {
                parser.setAllowLeadingWildcard(true);
                LOGGER.log(Level.FINER, "Allowing leading wildChar");
                BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
            }

            //we set off the mecanism setting all the character to lower case
            // we do that for range queries only for now. TODO see if we need to set it every time
            if (stringQuery.contains(" TO ")) {
                parser.setLowercaseExpandedTerms(false);
            }

            final Query query;
            if (!stringQuery.isEmpty()) {
                query = parser.parse(stringQuery);
            } else {
                query = SIMPLE_QUERY;
            }
            LOGGER.log(Level.FINER, "QueryType:{0}", query.getClass().getName());

            final Filter filter = spatialQuery.getSpatialFilter();
            final LogicalFilterType operator = spatialQuery.getLogicalOperator();
            final Sort sort = spatialQuery.getSort();
            String sorted = "";
            if (sort != null) {
                sorted = "\norder by: " + sort.toString();
            }
            String f = "";
            if (filter != null) {
                f = '\n' + filter.toString();
            }
            String operatorValue = "";
            if (!(operator == LogicalFilterType.AND || (operator == LogicalFilterType.OR && filter == null))) {
                operatorValue = '\n' + SerialChainFilter.valueOf(operator);
            }
            LOGGER.log(logLevel, "Searching for: " + query.toString(field) + operatorValue + f + sorted + "\nmax records: " + maxRecords);

            // simple query with an AND
            if (operator == LogicalFilterType.AND || (operator == LogicalFilterType.OR && filter == null)) {
                final TopDocs docs;
                if (sort != null) {
                    docs = searcher.search(query, filter, maxRecords, sort);
                } else {
                    docs = searcher.search(query, filter, maxRecords);
                }
                for (ScoreDoc doc : docs.scoreDocs) {
                    addToResult(results, doc.doc);
                }

            // for a OR we need to perform many request
            } else if (operator == LogicalFilterType.OR) {
                final TopDocs hits1;
                final TopDocs hits2;
                if (sort != null) {
                    hits1 = searcher.search(query, null, maxRecords, sort);
                    hits2 = searcher.search(SIMPLE_QUERY, spatialQuery.getSpatialFilter(), maxRecords, sort);
                } else {
                    hits1 = searcher.search(query, maxRecords);
                    hits2 = searcher.search(SIMPLE_QUERY, spatialQuery.getSpatialFilter(), maxRecords);
                }
                for (ScoreDoc doc : hits1.scoreDocs) {
                    addToResult(results, doc.doc);
                }
                for (ScoreDoc doc : hits2.scoreDocs) {
                    addToResult(results, doc.doc);
                }

            // for a NOT we need to perform many request
            } else if (operator == LogicalFilterType.NOT) {
                final TopDocs hits1;
                if (sort != null) {
                    hits1 = searcher.search(query, filter, maxRecords, sort);
                } else {
                    hits1 = searcher.search(query, filter, maxRecords);
                }
                final Set<String> unWanteds = new LinkedHashSet<>();
                for (ScoreDoc doc : hits1.scoreDocs) {
                    addToResult(unWanteds, doc.doc);
                }

                final TopDocs hits2;
                if (sort != null) {
                    hits2 = searcher.search(SIMPLE_QUERY, null, maxRecords, sort);
                } else {
                    hits2 = searcher.search(SIMPLE_QUERY, maxRecords);
                }
                for (ScoreDoc doc : hits2.scoreDocs) {
                    final String id = identifiers.get(doc.doc);
                    if (id != null && !unWanteds.contains(id)) {
                        results.add(id);
                    }
                }

            } else {
                throw new IllegalArgumentException("unsupported logical Operator");
            }

            // if we have some subQueries we execute it separely and merge the result
            if (!spatialQuery.getSubQueries().isEmpty()) {
                if (operator == LogicalFilterType.OR && query.equals(SIMPLE_QUERY)) {
                    results.clear();
                }
                for (SpatialQuery sub : spatialQuery.getSubQueries()) {
                    final Set<String> subResults = doSearch(sub);
                    if (operator == LogicalFilterType.AND) {
                        // intersect with the sub-query results (replaces the manual toRemove loop)
                        results.retainAll(subResults);
                    } else if (operator == LogicalFilterType.OR){
                        results.addAll(subResults);
                    } else {
                        LOGGER.warning("unimplemented case in doSearch");
                    }
                }
            }

            //we put the query in cache
            putInCache(spatialQuery, results);

            LOGGER.log(logLevel, results.size() + " total matching documents (" + (System.currentTimeMillis() - start) + "ms)");
            return results;
        } catch (ParseException ex) {
            throw new SearchingException("Parse Exception while performing lucene request", ex);
        } catch (IOException ex) {
            throw new SearchingException("IO Exception while performing lucene request", ex);
        }
    }

    /**
     * Replace "field:*" style sub-queries (matching everything) by the catch-all
     * "metafile:doc" term, which Lucene can evaluate without wildcard expansion.
     *
     * @param s the raw query string.
     * @return the query string with only-wildcard clauses rewritten.
     */
    public static String removeOnlyWildchar(String s) {
        final String pattern = "[^: +\\(]*:\\* ";
        s = s.replaceAll(pattern, "metafile:doc ");

        final String pattern2 = "[^: +\\(]*:\\*$";
        s = s.replaceAll(pattern2, "metafile:doc");

        final String pattern3 = "[^: +\\(]*:[(][*][)]";
        s = s.replaceAll(pattern3, "metafile:doc");

        final String pattern4 = "[^: +\\(]*:\\*[)]";
        s = s.replaceAll(pattern4, "metafile:doc)");

        return s;
    }

    /**
     * Add a query and its results to the cache.
     * if the map has reach the maximum size the older query is removed from the cache.
     *
     * @param query a Lucene spatial query.
     * @param results A list of metadataIdentifier.
     */
    private void putInCache(final SpatialQuery query, final Set<String> results) {
        if (isCacheEnabled) {
            // if we had reach the maximum cache size we remove the first request
            // NOTE(review): ConcurrentHashMap iteration order is arbitrary, so the evicted
            // entry is not necessarily the oldest — confirm whether strict LRU is required.
            if (cachedQueries.size() >= MAX_CACHED_QUERIES_SIZE) {
                cachedQueries.remove(cachedQueries.keySet().iterator().next());
            }
            cachedQueries.put(query, results);
        }
    }

    /**
     * Free the resources when closing the searcher.
     */
    @Override
    public void destroy() {
        super.destroy();
        // NOTE(review): the underlying IndexReader is not closed here — verify super.destroy()
        // releases it, otherwise file handles may leak on shutdown.
        LOGGER.info("shutting down index searcher");
        cachedQueries.clear();
    }
}