/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.searcher;

import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.*;
import org.apache.lucene.index.Term;
import org.apache.lucene.misc.ChainedFilter;

import org.apache.hadoop.conf.Configuration;

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.ArrayList;
import java.io.IOException;

/** Utility which converts certain query clauses into {@link QueryFilter}s and
 * caches these.  Only required clauses whose boost is zero are converted to
 * cached filters.  Range queries are converted to range filters.  This
 * accelerates query constraints like date, language, document format, etc.,
 * which do not affect ranking but might otherwise slow search considerably. */
class LuceneQueryOptimizer {
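
  // Illustrative example (hypothetical fields and values, added for
  // documentation): in a query shaped roughly like
  //
  //   +content:nutch  +lang:en^0  +date:[20060101 TO 20061231]^0
  //
  // the two required, zero-boost clauses constrain the result set without
  // affecting ranking.  optimize() pulls them out of the scored query: the
  // range clause becomes a RangeFilter, and the term clause is folded into a
  // QueryWrapperFilter (unless the term occurs in less than
  // searcher.filter.cache.threshold of all documents, in which case it stays
  // in the query).  The resulting filter is cached and reused by later
  // searches that carry the same constraints.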

  // This thread provides a pseudo-clock service to all searching
  // threads, so that they can count elapsed time with less overhead than
  // repeatedly calling System.currentTimeMillis.
  private TimerThread timerThread = null;

  private static class TimerThread extends Thread {
    private int tick;
    // NOTE: we can avoid explicit synchronization here for several reasons:
    // * updates to 32-bit-sized variables are atomic
    // * only a single thread modifies this value
    // * use of the volatile keyword ensures that the value does not reside in
    //   a register, but in main memory (so that changes are visible to
    //   other threads).
    // * visibility of changes does not need to be instantaneous; we can
    //   afford losing a tick or two.
    //
    // See section 17 of the Java Language Specification for details.
    public volatile int timeCounter = 0;

    boolean running = true;

    public TimerThread(int tick) {
      super("LQO timer thread");
      this.tick = tick;
      this.setDaemon(true);
    }

    public void run() {
      while (running) {
        timeCounter++;
        try {
          Thread.sleep(tick);
        } catch (InterruptedException ie) {}
      }
    }
  }

  private void initTimerThread(int p) {
    if (timerThread == null || !timerThread.isAlive()) {
      timerThread = new TimerThread(p);
      timerThread.start();
    }
  }

  // Thrown by LimitedCollector when the time limit is exceeded; maxDoc records
  // how far through the index the search had progressed.
  @SuppressWarnings("serial")
  private static class TimeExceeded extends RuntimeException {
    public long maxTime;
    private int maxDoc;
    public TimeExceeded(long maxTime, int maxDoc) {
      super("Exceeded search time: " + maxTime + " ms.");
      this.maxTime = maxTime;
      this.maxDoc = maxDoc;
    }
  }

  // Collector that aborts the search by throwing LimitExceeded once maxHits
  // documents have been collected, or TimeExceeded once more than maxTicks
  // timer ticks have elapsed.
  private static class LimitedCollector extends TopDocCollector {
    private int maxHits;
    private int maxTicks;
    private int startTicks;
    private TimerThread timer;
    private int curTicks;

    public LimitedCollector(int numHits, int maxHits, int maxTicks,
                            TimerThread timer) {
      super(numHits);
      this.maxHits = maxHits;
      this.maxTicks = maxTicks;
      if (timer != null) {
        this.timer = timer;
        this.startTicks = timer.timeCounter;
      }
    }

    public void collect(int doc, float score) {
      if (maxHits > 0 && getTotalHits() >= maxHits) {
        throw new LimitExceeded(doc);
      }
      if (timer != null) {
        curTicks = timer.timeCounter;
        // overflow check
        if (curTicks < startTicks) curTicks += Integer.MAX_VALUE;
        if (curTicks - startTicks > maxTicks) {
          throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc);
        }
      }
      super.collect(doc, score);
    }
  }

  // Thrown by LimitedCollector when the hit limit is exceeded.
  @SuppressWarnings("serial")
  private static class LimitExceeded extends RuntimeException {
    private int maxDoc;
    public LimitExceeded(int maxDoc) {
      this.maxDoc = maxDoc;
    }
  }
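
  // The fields below are set in the constructor from the Nutch configuration:
  // searcher.filter.cache.size bounds the LRU filter cache,
  // searcher.filter.cache.threshold is the fraction of documents a term must
  // occur in before its clause is converted to a cached filter,
  // searcher.max.hits caps the number of hits collected (non-positive for no
  // cap), and searcher.max.time.tick_length / searcher.max.time.tick_count
  // bound the search time in timer ticks (tick length in milliseconds; a
  // non-positive tick count disables the timer).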

  private LinkedHashMap<BooleanQuery, Filter> cache; // an LRU cache of QueryFilter

  private float threshold;

  private int searcherMaxHits;

  private int tickLength;

  private int maxTickCount;

  /**
   * Construct an optimizer that caches and uses filters for required clauses
   * whose boost is zero.
   *
   * @param conf the Nutch configuration supplying the filter cache size, the
   *             term-frequency threshold, and the hit and time limits
   */
  @SuppressWarnings("serial")
  public LuceneQueryOptimizer(Configuration conf) {
    final int cacheSize = conf.getInt("searcher.filter.cache.size", 16);
    this.threshold = conf.getFloat("searcher.filter.cache.threshold", 0.05f);
    this.searcherMaxHits = conf.getInt("searcher.max.hits", -1);
    this.cache = new LinkedHashMap<BooleanQuery, Filter>(cacheSize, 0.75f, true) {
      protected boolean removeEldestEntry(Map.Entry<BooleanQuery, Filter> eldest) {
        return size() > cacheSize;                 // limit size of cache
      }
    };
    this.tickLength = conf.getInt("searcher.max.time.tick_length", 200);
    this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1);
    if (this.maxTickCount > 0) {
      initTimerThread(this.tickLength);
    }
  }

  public TopDocs optimize(BooleanQuery original, Searcher searcher, int numHits,
                          String sortField, boolean reverse) throws IOException {
    BooleanQuery query = new BooleanQuery();
    BooleanQuery cacheQuery = new BooleanQuery();
    BooleanQuery filterQuery = new BooleanQuery();
    ArrayList<Filter> filters = new ArrayList<Filter>();

    BooleanClause[] clauses = original.getClauses();
    for (int i = 0; i < clauses.length; i++) {
      BooleanClause c = clauses[i];
      if (c.isRequired()                           // required
          && c.getQuery().getBoost() == 0.0f) {    // boost is zero

        if (c.getQuery() instanceof TermQuery      // TermQuery
            && (searcher.docFreq(((TermQuery)c.getQuery()).getTerm())
                / (float)searcher.maxDoc()) < threshold) { // beneath threshold
          query.add(c);                            // don't filterize
          continue;
        }

        if (c.getQuery() instanceof RangeQuery) {  // RangeQuery
          RangeQuery range = (RangeQuery)c.getQuery();
          boolean inclusive = range.isInclusive(); // convert to RangeFilter
          Term lower = range.getLowerTerm();
          Term upper = range.getUpperTerm();
          filters.add(new RangeFilter(lower != null ? lower.field() : upper.field(),
                                      lower != null ? lower.text() : null,
                                      upper != null ? upper.text() : null,
                                      inclusive, inclusive));
          cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
          continue;
        }

        // all other query types
        filterQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // filter it
        cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST);  // cache it
        continue;
      }

      query.add(c);                                // query it
    }
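
    // Build (or fetch) a single Filter covering all constraint clauses.
    // cacheQuery is never executed; it serves only as the key into the LRU
    // filter cache, so later searches with the same constraints reuse the
    // cached filter bits instead of re-running the constraint queries.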
    Filter filter = null;
    if (cacheQuery.getClauses().length != 0) {
      synchronized (cache) {                       // check cache
        filter = cache.get(cacheQuery);
      }
      if (filter == null) {                        // miss

        if (filterQuery.getClauses().length != 0)  // add filterQuery to filters
          filters.add(new CachingWrapperFilter(new QueryWrapperFilter(filterQuery)));

        if (filters.size() == 1) {                 // convert filters to filter
          filter = (Filter)filters.get(0);
        } else {
          filter = new ChainedFilter((Filter[])filters.toArray
                                     (new Filter[filters.size()]),
                                     ChainedFilter.AND);
        }
        if (!(filter instanceof CachingWrapperFilter)) // make sure bits are cached
          filter = new CachingWrapperFilter(filter);
        synchronized (cache) {
          cache.put(cacheQuery, filter);           // cache the filter
        }
      }
    }

    if (sortField == null && !reverse) {

      // no hit limit
      if (this.searcherMaxHits <= 0 && timerThread == null) {
        return searcher.search(query, filter, numHits);
      }

      // hits limited in time or in count -- use a LimitedCollector
      LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits,
                                                        maxTickCount, timerThread);
      LimitExceeded exceeded = null;
      TimeExceeded timeExceeded = null;
      try {
        searcher.search(query, filter, collector);
      } catch (LimitExceeded le) {
        exceeded = le;
      } catch (TimeExceeded te) {
        timeExceeded = te;
      }
      TopDocs results = collector.topDocs();
      if (exceeded != null) {                      // limit was exceeded
        results.totalHits = (int)                  // must estimate totalHits
          (results.totalHits * (searcher.maxDoc() / (float)exceeded.maxDoc));
      } else if (timeExceeded != null) {
        // Estimate total hits.
        results.totalHits = (int)(results.totalHits *
          (searcher.maxDoc() / (float)timeExceeded.maxDoc));
      }
      return results;

    } else {
      return searcher.search(query, filter, numHits,
                             new Sort(sortField, reverse));
    }
  }
}