package org.cdlib.xtf.textEngine;

/**
 * Copyright (c) 2004, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Acknowledgements:
 *
 * A significant amount of new and/or modified code in this module
 * was made possible by a grant from the Andrew W. Mellon Foundation,
 * as part of the Melvyl Recommender Project.
 */
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.chunk.DocNumMap;
import org.apache.lucene.chunk.SpanChunkedNotQuery;
import org.apache.lucene.chunk.SparseStringComparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldSortedHitQueue;
import org.apache.lucene.search.FlippableStringComparator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RecordingSearcher;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SpanHitCollector;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.FieldSpanSource;
import org.apache.lucene.search.spans.SpanNotNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.spelt.SpellReader;
import org.apache.lucene.util.PriorityQueue;
import org.cdlib.xtf.textEngine.facet.DynamicGroupData;
import org.cdlib.xtf.textEngine.facet.FacetSpec;
import org.cdlib.xtf.textEngine.facet.GroupCounts;
import org.cdlib.xtf.textEngine.facet.GroupData;
import org.cdlib.xtf.textEngine.facet.ResultFacet;
import org.cdlib.xtf.textEngine.facet.ResultGroup;
import org.cdlib.xtf.textEngine.facet.StaticGroupData;
import org.cdlib.xtf.util.CharMap;
import org.cdlib.xtf.util.Trace;
import org.cdlib.xtf.util.WordMap;

/**
 * Takes a QueryRequest, rewrites the queries if necessary to remove
 * stop-words and form bi-grams, then consults the index(es), and produces a
 * QueryResult.
 *
 * @author Martin Haye
 */
public class DefaultQueryProcessor extends QueryProcessor {
  /** Map of all XtfSearchers, so we can re-use them */
  private static HashMap searchers = new HashMap();

  /** Lucene reader from which to read index data */
  private IndexReader indexReader;

  /** Fetches spelling suggestions */
  private SpellReader spellReader;

  /** Keeps track of which chunks belong to which documents */
  private DocNumMap docNumMap;

  /** Max size of a chunk (in words) */
  @SuppressWarnings("unused")
  private int chunkSize;

  /** Number of words a chunk shares with its successor */
  private int chunkOverlap;

  /** Stop-words to remove (e.g. "the", "a", "and", etc.) */
  private Set stopSet;

  /** Mapping of plural words to singular words */
  private WordMap pluralMap;

  /** Mapping of accented chars to chars without diacritics */
  private CharMap accentMap;

  /** Whether the index is "sparse" (i.e. more than 5 chunks per doc) */
  private boolean isSparse;

  /** Names of fields that are tokenized in this index */
  private Set tokFields;

  /** Total number of documents hit (not just those that scored high) */
  private int nDocsHit;

  /** Maximum document score (used to normalize scores) */
  private float maxDocScore;

  /** Document normalization factor (calculated from {@link #maxDocScore}) */
  private float docScoreNorm;

  /** Used to warm up indexes prior to use */
  private IndexWarmer indexWarmer;

  /** Comparator used for sorting strings in "sparse" indexes */
  private static final SparseStringComparator sparseStringComparator =
      new SparseStringComparator();

  /** Comparator used for sorting strings in "compact" indexes */
  private static final FlippableStringComparator compactStringComparator =
      new FlippableStringComparator();

  /** Comparator used to sort by total number of hits */
  private static final TotalHitsComparator totalHitsComparator =
      new TotalHitsComparator();

  /** Record an index warmer to use for background warming. */
  public void setIndexWarmer(IndexWarmer warmer) {
    indexWarmer = warmer;
  }

  /**
   * This is the main entry point. Takes a pre-parsed query request and
   * handles searching the index and forming the results.<br>
   *
   * This method is synchronized because it uses two instance variables,
   * so access by multiple threads would result in incorrect counting. For
   * maximum efficiency, each thread should really use its own instance.
   *
   * @param req The pre-parsed request to process
   * @return Zero or more document hits
   */
  public synchronized QueryResult processRequest(final QueryRequest req)
      throws IOException {
    // Clear out our counters.
    nDocsHit = 0;
    maxDocScore = 0;

    // Make a vector to store the hits (we'll make it into an array
    // later, when we know how many we have.)
    //
    Vector hitVec = new Vector(10);

    if (indexWarmer == null)
      throw new IOException(
          "Fatal: must call setIndexWarmer() before DefaultQueryProcessor.processRequest()");

    // Get a reader, searcher, and document number map that will all be
    // consistent with each other and up-to-date.
    //
    XtfSearcher xtfSearcher = indexWarmer.getSearcher(req.indexPath);
    synchronized (xtfSearcher) {
      xtfSearcher.update();
      indexReader = xtfSearcher.indexReader();
      docNumMap = xtfSearcher.docNumMap();
      chunkSize = xtfSearcher.chunkSize();
      chunkOverlap = xtfSearcher.chunkOverlap();
      stopSet = xtfSearcher.stopSet();
      pluralMap = xtfSearcher.pluralMap();
      accentMap = xtfSearcher.accentMap();
      spellReader = xtfSearcher.spellReader();
      isSparse = xtfSearcher.isSparse();
      tokFields = xtfSearcher.tokenizedFields();
    }

    // Apply a work limit to the query if we were requested to. If no
    // specific limit was set, use a limiter with an infinite limit
    // (because we still need it to check periodically if the thread
    // should kill itself.)
    //
    IndexReader limReader = new XtfLimIndexReader(indexReader,
        (req.workLimit > 0) ? req.workLimit : Integer.MAX_VALUE);

    // Translate -1 maxDocs to "essentially all"
    int maxDocs = req.maxDocs;
    if (maxDocs < 0)
      maxDocs = docNumMap.getDocCount();

    // Make a queue that will accumulate the hits and pick the first
    // load of them for us. If there is a sort field specification,
    // do it in field-sorted order; otherwise, sort by score.
    //
    final PriorityQueue docHitQueue = createHitQueue(indexReader,
        req.startDoc + req.maxDocs, req.sortMetaFields, isSparse);
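    // Illustrative example of a sort spec (the field names are hypothetical,
    // not part of any particular schema): a sortMetaFields value of
    // "-publicationDate,title:emptyLast" asks for hits ordered by descending
    // publicationDate, then ascending title with empty titles last, with
    // score and document ID as the final tie-breakers. See createHitQueue()
    // below for the exact parsing rules.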
    // Start making the result by filling in its context.
    QueryResult result = new QueryResult();
    result.context = new QueryContext();
    result.context.accentMap = accentMap;
    result.context.pluralMap = pluralMap;
    result.context.stopSet = stopSet;
    result.scoresNormalized = req.normalizeScores;

    // If no query was specified, then there will be no results.
    Query query = req.query;
    if (query == null) {
      result.docHits = new DocHit[0];
      return result;
    }

    // Perform standard tokenization tasks: change words to lowercase,
    // remove apostrophes, etc.
    //
    query = new StdTermRewriter(tokFields).rewriteQuery(query);

    // If an accent map is present, remove diacritics.
    if (accentMap != null)
      query = new AccentFoldingRewriter(accentMap, tokFields).rewriteQuery(query);

    // If a plural map is present, change plural words to non-plural.
    if (pluralMap != null)
      query = new PluralFoldingRewriter(pluralMap, tokFields).rewriteQuery(query);

    // Rewrite the query for bigrams (if we have stop-words to deal with.)
    if (stopSet != null)
      query = new XtfBigramQueryRewriter(stopSet, chunkOverlap,
          tokFields).rewriteQuery(query);

    // If there's nothing left (for instance if the query was all stop-words)
    // then there will be no results.
    //
    if (query == null) {
      result.docHits = new DocHit[0];
      return result;
    }

    // Fix up all the "infinite" slop entries to be actually limited to
    // the chunk overlap size. That way, we'll get consistent results and
    // the user won't be able to tell where the chunk boundaries are.
    // Also, attach the docNumMap to every SpanDechunkingQuery.
    //
    final Query finalQuery = new SlopFixupRewriter(docNumMap, stopSet,
        pluralMap, accentMap).rewriteQuery(query);

    // If debugging is enabled, print out the final rewritten and fixed
    // up query.
    //
    if (finalQuery != req.query)
      Trace.debug("Rewritten query: " + finalQuery.toString());

    // While processing the query, we want to lazily generate DocHits,
    // and only generate a DocHit once even if it's added to multiple
    // groups.
    //
    final DocHitMakerImpl docHitMaker = new DocHitMakerImpl();

    // If we're to apply a set of additional boost sets to the documents,
    // get the set now.
    //
    final BoostSet boostSet = (req.boostSetParams == null) ? null
        : BoostSet.getCachedSet(indexReader,
                                new File(req.boostSetParams.path),
                                req.boostSetParams.field);

    // Make a Lucene searcher that will access the index according to
    // our query.
    //
    RecordingSearcher searcher = new RecordingSearcher(limReader);

    // If grouping was specified by the query, read in all the group data.
    // Note that the GroupData class holds its own cache so we don't have
    // to read data for a given field more than once.
    //
    final GroupCounts[] groupCounts = (req.facetSpecs == null) ? null
        : prepGroups(req, boostSet, searcher, finalQuery);

    // Now for the big show... go get the hits!
    searcher.search(finalQuery, null, new SpanHitCollector() {
      public void collect(int doc, float score, FieldSpanSource spanSource) {
        // Apply a boost (if there's a boost set)
        score = applyBoost(doc, score, boostSet, req);

        // Ignore deleted entries, and entries boosted down to zero.
        if (score <= 0.0f)
          return;

        // Bump the count of documents hit, and update the max score.
        nDocsHit++;
        if (score > maxDocScore)
          maxDocScore = score;

        // Record the hit.
        docHitMaker.reset(doc, score, spanSource);
        if (req.maxDocs > 0)
          docHitMaker.insertInto(docHitQueue);

        // If grouping is enabled, add this document to the counts.
        if (groupCounts != null) {
          for (int i = 0; i < groupCounts.length; i++)
            groupCounts[i].addDoc(docHitMaker);
        }
      } // collect()
    });
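    // Worked example of the boost step applied in the collector above (the
    // numbers are hypothetical; see applyBoost() below): with a boost-set
    // entry of 0.5 for a document and boostSetParams.exponent of 2.0, a raw
    // score of 0.8 becomes 0.8 * pow(0.5, 2.0) = 0.2; a boost of 0 drives
    // the score to 0, so the collector drops the document via the
    // "score <= 0.0f" check.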
    // Take the high-ranking hits and add them to the hit vector.
    // Note that they come out of the hit queue in backwards order.
    //
    int nFound = docHitQueue.size();
    DocHitImpl[] hitArray = new DocHitImpl[nFound];
    for (int i = 0; i < nFound; i++) {
      int index = nFound - i - 1;
      hitArray[index] = (DocHitImpl)docHitQueue.pop();
    }

    // Calculate the document score normalization factor.
    docScoreNorm = 1.0f;
    if (req.normalizeScores && maxDocScore > 0.0f)
      docScoreNorm = 1.0f / maxDocScore;

    // We'll need a query weight if we're being asked to explain the
    // scores.
    //
    Weight weight = null;
    if (req.explainScores)
      weight = finalQuery.weight(searcher);

    // Finish off the hits (read in the fields, normalize, make snippets).
    SnippetMaker snippetMaker = new SnippetMaker(limReader, docNumMap,
        stopSet, pluralMap, accentMap, tokFields, req.maxContext,
        req.termMode, req.returnMetaFields);
    for (int i = req.startDoc; i < nFound; i++) {
      if (req.explainScores) {
        hitArray[i].finishWithExplain(snippetMaker, docScoreNorm, weight,
            boostSet, req.boostSetParams);
      }
      else
        hitArray[i].finish(snippetMaker, docScoreNorm);
      if (result.textTerms == null)
        result.textTerms = hitArray[i].textTerms();
      hitVec.add(hitArray[i]);
    }

    // If grouping was enabled, group the hits and finish all of them.
    if (groupCounts != null) {
      result.facets = new ResultFacet[groupCounts.length];
      for (int i = 0; i < groupCounts.length; i++) {
        result.facets[i] = groupCounts[i].getResult();
        finishGroup(result.facets[i].rootGroup, snippetMaker, req, weight,
            boostSet);
      } // for if
    }

    // Done with that searcher
    searcher.close();
    searcher = null;

    assert req.maxDocs < 0 || hitVec.size() <= req.maxDocs;

    // Pack up the results into a tidy array.
    result.totalDocs = nDocsHit;
    result.startDoc = req.startDoc;
    result.endDoc = req.startDoc + hitVec.size();
    result.docHits = (DocHit[])hitVec.toArray(new DocHit[hitVec.size()]);

    // Make spelling suggestions if applicable.
    if (spellReader != null && req.spellcheckParams != null)
      spellCheck(req, result, tokFields);

    // All done.
    return result;
  } // processReq()
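  // Sketch of typical use, based on how the request fields are read above
  // (the literal values are illustrative assumptions, not a documented
  // recipe):
  //
  //   DefaultQueryProcessor proc = new DefaultQueryProcessor();
  //   proc.setIndexWarmer(warmer);        // required before processRequest()
  //   QueryRequest req = ...;             // pre-parsed query request
  //   req.indexPath = "/path/to/index";   // index to consult
  //   req.startDoc = 0;                   // paging window start
  //   req.maxDocs = 20;                   // paging window size
  //   QueryResult res = proc.processRequest(req);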
  /**
   * Checks spelling of query terms, if spelling suggestion is enabled and
   * the result falls below the cutoff thresholds.
   *
   * @param req       Original query request
   * @param res       Results of the query
   * @param tokFields Set of tokenized fields (in case no field list was
   *                  specified in the query request.)
   */
  private void spellCheck(QueryRequest req, QueryResult res, Set tokFields)
      throws IOException {
    // We can use a handy reference to the spellcheck params, and to the
    // field list.
    //
    SpellcheckParams params = req.spellcheckParams;

    // When checking the cutoffs, account for the possibility that
    // the query might be faceted, in which case we want the document
    // count of the biggest facet.
    //
    int totalDocs = res.totalDocs;
    if (res.facets != null) {
      for (int i = 0; i < res.facets.length; i++) {
        if (res.facets[i].rootGroup != null)
          totalDocs = Math.max(totalDocs, res.facets[i].rootGroup.totalDocs);
      }
    }

    // Check the cutoffs. If the documents scored well, or there were
    // a lot of them, then suggestions aren't needed.
    //
    if (params.docScoreCutoff > 0 && maxDocScore > params.docScoreCutoff)
      return;
    if (params.totalDocsCutoff > 0 && totalDocs > params.totalDocsCutoff)
      return;

    // Gather the query terms, grouped by field set.
    Set spellFieldSet = params.fields != null ? params.fields : tokFields;
    LinkedHashMap fieldsMap = gatherKeywords(req.query, spellFieldSet);

    // Make suggestions for each field set.
    LinkedHashMap out = new LinkedHashMap();
    for (Iterator fi = fieldsMap.keySet().iterator(); fi.hasNext();) {
      // Make a list of fields and terms.
      LinkedHashSet fieldsSet = (LinkedHashSet)fi.next();
      String[] fields = (String[])fieldsSet.toArray(new String[fieldsSet.size()]);
      LinkedHashSet termsSet = (LinkedHashSet)fieldsMap.get(fieldsSet);
      String[] terms = (String[])termsSet.toArray(new String[termsSet.size()]);

      // Get some suggestions
      String[] suggested = spellReader.suggestKeywords(terms);

      // If no suggestions, skip these fields.
      if (suggested == null)
        continue;
      assert suggested.length == terms.length;

      // Record each suggestion.
      for (int i = 0; i < suggested.length; i++) {
        // Skip duplicates in different field sets... retain only the
        // first one.
        //
        if (out.containsKey(terms[i]))
          continue;

        // Skip suggestions that don't change anything.
        if (terms[i].equals(suggested[i]))
          continue;

        // Okay, record it.
        SpellingSuggestion sugg = new SpellingSuggestion();
        sugg.origTerm = terms[i];
        sugg.fields = fields;
        sugg.suggestedTerm = suggested[i];
        out.put(terms[i], sugg);
      }
    } // for fi

    // If no suggestions, we're done.
    if (out.size() == 0)
      return;

    // Make sure the suggestions result in better results.
    if (!spellingImprovesResults(req, res, spellFieldSet, out))
      return;

    // Record the final suggestions in an array.
    res.suggestions = (SpellingSuggestion[])out.values().toArray(
        new SpellingSuggestion[out.values().size()]);
  } // spellCheck()

  /**
   * Re-runs the original query, except with terms replaced by their
   * suggestions. Checks that the results are improved -- at present that
   * means there are at least as many of them, and their max score is at
   * least as high.
   *
   * @param origReq       Original query request
   * @param origRes       Results of the original query
   * @param spellFieldSet Set of fields to rewrite terms within
   * @param suggs         Map of terms to their suggested replacements
   * @return              true if the suggestions improve the results.
   * @throws IOException
   */
  private boolean spellingImprovesResults(QueryRequest origReq,
                                          QueryResult origRes,
                                          Set spellFieldSet,
                                          LinkedHashMap suggs)
      throws IOException {
    // First, clone the original request, and then turn off spellcheck for
    // the clone so that we don't get in an infinite recursive loop.
    //
    QueryRequest newReq = (QueryRequest)origReq.clone();
    newReq.spellcheckParams = null;

    // Before re-querying, save the max doc score.
    float origMaxDocScore = maxDocScore;

    // Now apply the spelling suggestions to the original query.
    newReq.query = new SpellSuggRewriter(suggs, spellFieldSet).rewriteQuery(
        newReq.query);
    QueryResult newRes = this.processRequest(newReq);

    // If the new query returns nothing and the old query also returned
    // nothing, it's a semi-failure. There's no use suggesting the new
    // words even if they are better, because it won't help the user.
    //
    if (newRes.totalDocs == 0 && origRes.totalDocs == 0) {
      //System.out.print("No docs before or after: " + newReq.query.toString() + "... ");
      return false;
    }

    // If the new query returns fewer results, consider it a failure.
    if (newRes.totalDocs < origRes.totalDocs) {
      //System.out.print("Fewer docs: " + newReq.query.toString() + "... ");
      return false;
    }

    // If the max doc score is lower, that's also a failure.
    if (maxDocScore < origMaxDocScore) {
      //System.out.print("Lower score: " + newReq.query.toString() + "... ");
      return false;
    }

    // Cool! We think this is a better query.
    return true;
  }
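  // Example of the structure gatherKeywords() below produces (the field
  // names and terms are hypothetical): for a query containing
  // title:"californa" and (title:"histry" OR creator:"histry"), the
  // returned map would be
  //
  //   { {title}          -> {californa},
  //     {title, creator} -> {histry} }
  //
  // i.e. terms are grouped by the exact set of fields they occur in, and
  // spellCheck() above asks the SpellReader about each group separately.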
  /**
   * Make a list of all the terms present in the given query,
   * grouped by field set.
   *
   * @param query         The query to traverse
   * @param desiredFields The set of fields to limit to. If null, all
   *                      fields are considered.
   *
   * @return An ordered map whose keys are ordered sets of field names,
   *         and whose values are ordered sets of words.
   */
  private LinkedHashMap gatherKeywords(Query query, final Set desiredFields) {
    // Make an ordered set of words, each with an ordered list of fields
    final LinkedHashMap termMap = new LinkedHashMap();
    XtfQueryTraverser trav = new XtfQueryTraverser() {
      private void add(Term t) {
        final String field = t.field();
        final String word = t.text();
        if (desiredFields != null && !desiredFields.contains(field))
          return;
        if (!termMap.containsKey(word))
          termMap.put(word, new LinkedHashSet());
        ((LinkedHashSet)termMap.get(word)).add(field);
      }

      public void traverseQuery(Query q) {
        // Skip queries boosted to nothing
        if (q.getBoost() > 0.001f)
          super.traverseQuery(q);
      }

      protected void traverse(TermQuery q) {
        add(q.getTerm());
      }

      protected void traverse(SpanTermQuery q) {
        add(q.getTerm());
      }

      protected void traverse(SpanExactQuery q) {
        // Do not correct inside "exact" queries
      }

      protected void traverse(BooleanQuery bq) {
        BooleanClause[] clauses = bq.getClauses();
        for (int i = 0; i < clauses.length; i++) {
          if (clauses[i].getOccur() != BooleanClause.Occur.MUST_NOT)
            traverseQuery(clauses[i].getQuery());
        }
      } // traverse()

      protected void traverse(SpanChunkedNotQuery nq) {
        traverseQuery(nq.getInclude());
        // No: traverseQuery(nq.getExclude());
      } // traverse()

      protected void traverse(SpanNotQuery nq) {
        traverseQuery(nq.getInclude());
        // No: traverseQuery(nq.getExclude());
      } // traverse()

      protected void traverse(SpanNotNearQuery nq) {
        traverseQuery(nq.getInclude());
        // No: traverseQuery(nq.getExclude());
      } // traverse()
    };
    trav.traverseQuery(query);

    // Now invert: for each unique set of fields, make an ordered list
    // of the keywords.
    //
    LinkedHashMap fieldsMap = new LinkedHashMap();
    for (Iterator ti = termMap.keySet().iterator(); ti.hasNext();) {
      String word = (String)ti.next();
      LinkedHashSet fieldsSet = (LinkedHashSet)termMap.get(word);
      if (!fieldsMap.containsKey(fieldsSet))
        fieldsMap.put(fieldsSet, new LinkedHashSet());
      ((LinkedHashSet)fieldsMap.get(fieldsSet)).add(word);
    }

    // All done.
    return fieldsMap;
  } // gatherKeywords()
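  // Note on the facet field specs handled by prepGroups() below: a field
  // name normally refers to static, precomputed group data, but the special
  // form "java:<className>(<params>)" creates a dynamic group instead. For
  // example (the class name is hypothetical),
  // "java:org.example.DateBucketer(decade)" is parsed by createDynamicGroup()
  // into className "org.example.DateBucketer" and params "decade", and the
  // named class must be derived from DynamicGroupData.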
  /**
   * Create the GroupCounts objects for the given query request. Also handles
   * creating the proper hit queue for each one.
   *
   * @param req      query request containing group specs
   * @param boostSet boost set for dynamic groups
   * @param searcher searcher for dynamic groups
   * @param query    query to use to form dynamic groups
   */
  private GroupCounts[] prepGroups(final QueryRequest req,
                                   final BoostSet boostSet,
                                   RecordingSearcher searcher,
                                   Query query)
      throws IOException {
    GroupData[] groupData = new GroupData[req.facetSpecs.length];
    Vector dynamicGroupVec = new Vector();

    // First get data for each group
    for (int i = 0; i < req.facetSpecs.length; i++) {
      FacetSpec spec = req.facetSpecs[i];
      if (spec.field.startsWith("java:")) {
        groupData[i] = createDynamicGroup(indexReader, spec.field);
        dynamicGroupVec.add(groupData[i]);
      }
      else
        groupData[i] = StaticGroupData.getCachedData(indexReader, spec.field);
    }

    // If there are dynamic groups, pre-scan the query and hand them the
    // documents and scores.
    //
    if (!dynamicGroupVec.isEmpty()) {
      final DynamicGroupData[] dynGroups =
          (DynamicGroupData[])dynamicGroupVec.toArray(
              new DynamicGroupData[dynamicGroupVec.size()]);
      searcher.search(query, null, new SpanHitCollector() {
        public void collect(int doc, float score, FieldSpanSource spanSource) {
          // Apply a boost (if there's a boost set)
          score = applyBoost(doc, score, boostSet, req);

          // If document isn't deleted, collect it.
          if (score > 0.0f) {
            for (int i = 0; i < dynGroups.length; i++)
              dynGroups[i].collect(doc, score);
          }
        } // collect()
      });

      // Finish off the dynamic group data.
      for (int i = 0; i < dynGroups.length; i++)
        dynGroups[i].finish();
    } // if

    // Now make a GroupCount object around each data object.
    GroupCounts[] groupCounts = new GroupCounts[req.facetSpecs.length];
    for (int i = 0; i < req.facetSpecs.length; i++) {
      FacetSpec spec = req.facetSpecs[i];
      HitQueueMakerImpl maker = new HitQueueMakerImpl(indexReader,
          spec.sortDocsBy, isSparse);
      groupCounts[i] = new GroupCounts(groupData[i], spec, maker);
    }

    // All done.
    return groupCounts;
  } // prepGroups()

  /**
   * Create a dynamic group based on a field specification.
   *
   * @param indexReader Where to get the data from
   * @param field       Special field name starting with "java:"
   * @return            Dynamic group data
   * @throws IOException
   */
  private GroupData createDynamicGroup(IndexReader indexReader, String field)
      throws IOException {
    // Parse out the class name and parameters
    Pattern pat = Pattern.compile("java:([\\w.]+)\\((.*)\\)");
    Matcher matcher = pat.matcher(field);
    if (!matcher.matches())
      throw new RuntimeException(
          "Unrecognized dynamic facet field '" + field + "'");
    String className = matcher.group(1);
    String params = matcher.group(2);

    // Create an instance of the given class.
    DynamicGroupData dynData = null;
    try {
      Class c = Class.forName(className);
      dynData = (DynamicGroupData)c.newInstance();
    }
    catch (ClassNotFoundException e) {
      throw new RuntimeException(
          "Dynamic facet class '" + className + "' not found");
    }
    catch (InstantiationException e) {
      throw new RuntimeException(
          "Cannot instantiate dynamic facet class '" + className + "'", e);
    }
    catch (IllegalAccessException e) {
      throw new RuntimeException(
          "Cannot instantiate dynamic facet class '" + className + "'", e);
    }
    catch (ClassCastException e) {
      throw new RuntimeException(
          "Class '" + className + "' must be derived from DynamicGroupData");
    }

    // Initialize the new instance, and we're done.
    dynData.init(indexReader, tokFields, params);
    return dynData;
  } // createDynamicGroup()

  /**
   * Finishes DocHits within a single group (also processes all its
   * descendant groups.)
   *
   * @param group        Group to finish
   * @param snippetMaker Used to make snippets for any DocHits inside the
   *                     group.
   * @param req          Determines whether to finish with 'explain' or not
   * @param weight       Used for score explanations
   * @param boostSet     Used for score explanations
   */
  private void finishGroup(ResultGroup group, SnippetMaker snippetMaker,
                           QueryRequest req, Weight weight, BoostSet boostSet)
      throws IOException {
    // Finish DocHits for this group
    if (group.docHits != null) {
      for (int k = 0; k < group.docHits.length; k++) {
        DocHitImpl hit = (DocHitImpl)group.docHits[k];
        if (req.explainScores) {
          hit.finishWithExplain(snippetMaker, docScoreNorm, weight, boostSet,
              req.boostSetParams);
        }
        else
          hit.finish(snippetMaker, docScoreNorm);
      } // for k
    }

    // Now finish all the descendants.
    if (group.subGroups != null) {
      for (int j = 0; j < group.subGroups.length; j++)
        finishGroup(group.subGroups[j], snippetMaker, req, weight, boostSet);
    }
  } // finishGroup()

  /**
   * QueryProcessor maintains a static cache of Lucene searchers, one for
   * each index directory. If data is changed, normally it's not recognized
   * until a periodic (every 30 seconds) check.
   *
   * Calling this method forces new changes to an index to be immediately
   * recognized.
   */
  public void resetCache() {
    searchers.clear();
  } // resetCache()

  /**
   * If a boost set was specified, boost the given document's score according
   * to the set.
   */
  private float applyBoost(int doc, float score, BoostSet boostSet,
                           QueryRequest req) {
    // If we're boosting, apply that factor.
    if (score > 0 && boostSet != null) {
      float boost = boostSet.getBoost(doc, req.boostSetParams.defaultBoost);
      if (req.boostSetParams.exponent != 1.0f)
        boost = (float)Math.pow(boost, req.boostSetParams.exponent);
      score *= boost;
    }
    return score;
  }

  /**
   * Creates either a standard score-sorting hit queue, or a field-sorting
   * hit queue, depending on whether the query is to be sorted.
   *
   * @param reader     will be used to read the field contents
   * @param inSize     size of the queue (typically startDoc + maxDocs). If
   *                   this number is >= 999999, an infinitely resizing
   *                   queue will be created.
   * @param sortFields space or comma delimited list of fields to sort by
   * @param isSparse   if index is sparse (i.e. more than 5 chunks per doc)
   * @return           an appropriate hit queue
   */
  private static PriorityQueue createHitQueue(IndexReader reader, int inSize,
                                              String sortFields,
                                              boolean isSparse)
      throws IOException {
    // If a large size is requested, start with a small queue and expand
    // later, if necessary.
    //
    int size = (inSize >= 999999) ? 1 : inSize;

    // If no sort fields, do a simple score sort.
    PriorityQueue ret;
    if (sortFields == null)
      ret = new HitQueue(size);
    else {
      // Parse out the list of fields to sort by.
      Vector fieldNames = new Vector();
      StringTokenizer st = new StringTokenizer(sortFields, " \t\r\n,;");
      while (st.hasMoreTokens())
        fieldNames.add(st.nextToken());

      // If there were none, do a simple score sort.
      if (fieldNames.size() == 0)
        ret = new HitQueue(size);
      else {
        // Okay, make a SortField out of each one, in priority order from
        // highest to lowest. After all the fields, an implicit score sorter
        // is added so that documents which match in all other respects
        // will come out ordered by score.
        //
        // Each name can be optionally prefixed with "-" to sort in reverse,
        // or "+" to sort in normal order (but "+" is unnecessary, since
        // normal order is the default.)
        //
        // There's also a more verbose and powerful way to affect sort order:
        // modifiers. Possible modifiers are ":ascending", ":descending",
        // ":emptyFirst", and ":emptyLast".
        //
        SortField[] fields = new SortField[fieldNames.size() + 2];
        for (int i = 0; i < fieldNames.size(); i++) {
          String name = (String)fieldNames.elementAt(i);
          boolean ascending = false;
          boolean descending = false;
          boolean emptyFirst = false;
          boolean emptyLast = false;

          // Check for the short-hand "-" and "+" prefixes
          if (name.startsWith("-")) {
            descending = true;
            name = name.substring(1);
          }
          else if (name.startsWith("+")) {
            ascending = true;
            name = name.substring(1);
          }

          // Check for more verbose ":" modifiers after the field name
          String[] parts = name.split(":");
          name = parts[0];
          for (int j = 1; j < parts.length; j++) {
            if (parts[j].equalsIgnoreCase("ascending"))
              ascending = true;
            else if (parts[j].equalsIgnoreCase("descending"))
              descending = true;
            else if (parts[j].equalsIgnoreCase("emptyFirst"))
              emptyFirst = true;
            else if (parts[j].equalsIgnoreCase("emptyLast"))
              emptyLast = true;
            else
              throw new IOException(
                  "Unknown sort modifier: '" + parts[j] + "'");
          }

          // Check for conflicting modifiers.
          if ((ascending && descending) || (emptyFirst && emptyLast))
            throw new IOException("Conflicting sort modifiers");

          // Interpret the modifiers.
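          // For example (hypothetical field name), a spec of
          // "-title:emptyFirst" arrives here with descending=true and
          // emptyFirst=true, and is interpreted below as reverse=true,
          // flipEmpty=false, so the SortField is built on "title" rather
          // than "title:flipEmpty".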
          boolean reverse;
          if (ascending)
            reverse = false;
          else if (descending)
            reverse = true;
          else
            reverse = false; // default

          boolean flipEmpty;
          if (!reverse) {
            if (emptyFirst)
              flipEmpty = true;
            else if (emptyLast)
              flipEmpty = false;
            else
              flipEmpty = false; // default
          }
          else {
            if (emptyFirst)
              flipEmpty = false;
            else if (emptyLast)
              flipEmpty = true;
            else
              flipEmpty = true; // default
          }

          String finalName = flipEmpty ? (name + ":flipEmpty") : name;

          // Though not strictly necessary, allow the user to specify "score"
          // or "relevance" to sort by those. That way, automated programs can
          // always give a "sortDocsBy" field.
          //
          if (name.equals("score") || name.equals("relevance")) {
            if (reverse || flipEmpty)
              throw new RuntimeException(
                  "Illegal modifier on sortDocsBy 'score'");
            fields[i] = SortField.FIELD_SCORE;
          }
          else if (name.equals("totalHits"))
            fields[i] = new SortField(finalName, totalHitsComparator, reverse);
          else if (isSparse)
            fields[i] = new SortField(finalName, sparseStringComparator, reverse);
          else
            fields[i] = new SortField(finalName, compactStringComparator, reverse);
        }

        // Default tie-breakers: first, score. If score is equal, sort by doc ID.
        fields[fieldNames.size()] = SortField.FIELD_SCORE;
        fields[fieldNames.size() + 1] = SortField.FIELD_DOC;

        // And make the final hit queue.
        ret = new FieldSortedHitQueue(reader, fields, size);
      }
    }

    // If a ton of hits is requested, make the queue into a resizing one.
    if (inSize >= 999999)
      ret.setExpandable();

    // All done.
    return ret;
  } // createHitQueue()

  private static class DocHitMakerImpl implements GroupCounts.DocHitMaker {
    private int doc;
    private float score;
    private FieldSpanSource spanSrc;
    private DocHitImpl docHit;

    public final void reset(int doc, float score, FieldSpanSource spanSrc) {
      this.doc = doc;
      this.score = score;
      this.spanSrc = spanSrc;
      docHit = null;
    }

    public final int getDocNum() {
      return doc;
    }

    public final float getScore() {
      return score;
    }

    public final boolean insertInto(PriorityQueue queue) {
      if (docHit == null)
        docHit = new DocHitImpl(doc, score);

      try {
        docHit.setSpanSource(spanSrc);
        boolean inserted = queue.insert(docHit);

        // If we're keeping this hit, make sure spans have been grabbed.
        if (inserted)
          docHit.totalSnippets();
        return inserted;
      }
      finally {
        docHit.setSpanSource(null); // prevent memory leaks
      }
    }
  } // class DocHitMakerImpl

  private static class HitQueueMakerImpl implements GroupCounts.HitQueueMaker {
    private IndexReader reader;
    private String sortFields;
    private boolean isSparse;

    public HitQueueMakerImpl(IndexReader reader, String sortFields,
                             boolean isSparse) {
      this.reader = reader;
      this.sortFields = sortFields;
      this.isSparse = isSparse;
    }

    public PriorityQueue makeQueue(int size) {
      try {
        return DefaultQueryProcessor.createHitQueue(reader, size, sortFields,
            isSparse);
      }
      catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  } // class HitQueueMakerImpl
} // class DefaultQueryProcessor