/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.grouping; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Map; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.CachingCollector; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.Weight; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.mutable.MutableValue; /** * Convenience class to perform grouping in a non distributed environment. * * @lucene.experimental */ public class GroupingSearch { private final GroupSelector grouper; private final Query groupEndDocs; private Sort groupSort = Sort.RELEVANCE; private Sort sortWithinGroup = Sort.RELEVANCE; private int groupDocsOffset; private int groupDocsLimit = 1; private boolean fillSortFields; private boolean includeScores = true; private boolean includeMaxScore = true; private Double maxCacheRAMMB; private Integer maxDocsToCache; private boolean cacheScores; private boolean allGroups; private boolean allGroupHeads; private Collection<?> matchingGroups; private Bits matchingGroupHeads; /** * Constructs a <code>GroupingSearch</code> instance that groups documents by index terms using DocValues. * The group field can only have one token per document. This means that the field must not be analysed. * * @param groupField The name of the field to group by. */ public GroupingSearch(String groupField) { this(new TermGroupSelector(groupField), null); } /** * Constructs a <code>GroupingSearch</code> instance that groups documents by function using a {@link ValueSource} * instance. * * @param groupFunction The function to group by specified as {@link ValueSource} * @param valueSourceContext The context of the specified groupFunction */ public GroupingSearch(ValueSource groupFunction, Map<?, ?> valueSourceContext) { this(new ValueSourceGroupSelector(groupFunction, valueSourceContext), null); } /** * Constructor for grouping documents by doc block. * This constructor can only be used when documents belonging in a group are indexed in one block. * * @param groupEndDocs The query that marks the last document in all doc blocks */ public GroupingSearch(Query groupEndDocs) { this(null, groupEndDocs); } private GroupingSearch(GroupSelector grouper, Query groupEndDocs) { this.grouper = grouper; this.groupEndDocs = groupEndDocs; } /** * Executes a grouped search. Both the first pass and second pass are executed on the specified searcher. * * @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the grouped search on. * @param query The query to execute with the grouping * @param groupOffset The group offset * @param groupLimit The number of groups to return from the specified group offset * @return the grouped result as a {@link TopGroups} instance * @throws IOException If any I/O related errors occur */ @SuppressWarnings("unchecked") public <T> TopGroups<T> search(IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException { if (grouper != null) { return groupByFieldOrFunction(searcher, query, groupOffset, groupLimit); } else if (groupEndDocs != null) { return (TopGroups<T>) groupByDocBlock(searcher, query, groupOffset, groupLimit); } else { throw new IllegalStateException("Either groupField, groupFunction or groupEndDocs must be set."); // This can't happen... } } @SuppressWarnings({"unchecked", "rawtypes"}) protected TopGroups groupByFieldOrFunction(IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException { int topN = groupOffset + groupLimit; final FirstPassGroupingCollector firstPassCollector = new FirstPassGroupingCollector(grouper, groupSort, topN); final AllGroupsCollector allGroupsCollector = allGroups ? new AllGroupsCollector(grouper) : null; final AllGroupHeadsCollector allGroupHeadsCollector = allGroupHeads ? AllGroupHeadsCollector.newCollector(grouper, sortWithinGroup) : null; final Collector firstRound = MultiCollector.wrap(firstPassCollector, allGroupsCollector, allGroupHeadsCollector); CachingCollector cachedCollector = null; if (maxCacheRAMMB != null || maxDocsToCache != null) { if (maxCacheRAMMB != null) { cachedCollector = CachingCollector.create(firstRound, cacheScores, maxCacheRAMMB); } else { cachedCollector = CachingCollector.create(firstRound, cacheScores, maxDocsToCache); } searcher.search(query, cachedCollector); } else { searcher.search(query, firstRound); } matchingGroups = allGroups ? allGroupsCollector.getGroups() : Collections.emptyList(); matchingGroupHeads = allGroupHeads ? allGroupHeadsCollector.retrieveGroupHeads(searcher.getIndexReader().maxDoc()) : new Bits.MatchNoBits(searcher.getIndexReader().maxDoc()); Collection<SearchGroup> topSearchGroups = firstPassCollector.getTopGroups(groupOffset, fillSortFields); if (topSearchGroups == null) { return new TopGroups(new SortField[0], new SortField[0], 0, 0, new GroupDocs[0], Float.NaN); } int topNInsideGroup = groupDocsOffset + groupDocsLimit; TopGroupsCollector secondPassCollector = new TopGroupsCollector(grouper, topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields); if (cachedCollector != null && cachedCollector.isCached()) { cachedCollector.replay(secondPassCollector); } else { searcher.search(query, secondPassCollector); } if (allGroups) { return new TopGroups(secondPassCollector.getTopGroups(groupDocsOffset), matchingGroups.size()); } else { return secondPassCollector.getTopGroups(groupDocsOffset); } } protected TopGroups<?> groupByDocBlock(IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException { int topN = groupOffset + groupLimit; final Weight groupEndDocs = searcher.createNormalizedWeight(this.groupEndDocs, false); BlockGroupingCollector c = new BlockGroupingCollector(groupSort, topN, includeScores, groupEndDocs); searcher.search(query, c); int topNInsideGroup = groupDocsOffset + groupDocsLimit; return c.getTopGroups(sortWithinGroup, groupOffset, groupDocsOffset, topNInsideGroup, fillSortFields); } /** * Enables caching for the second pass search. The cache will not grow over a specified limit in MB. * The cache is filled during the first pass searched and then replayed during the second pass searched. * If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search. * * @param maxCacheRAMMB The maximum amount in MB the cache is allowed to hold * @param cacheScores Whether to cache the scores * @return <code>this</code> */ public GroupingSearch setCachingInMB(double maxCacheRAMMB, boolean cacheScores) { this.maxCacheRAMMB = maxCacheRAMMB; this.maxDocsToCache = null; this.cacheScores = cacheScores; return this; } /** * Enables caching for the second pass search. The cache will not contain more than the maximum specified documents. * The cache is filled during the first pass searched and then replayed during the second pass searched. * If the cache grows beyond the specified limit, then the cache is purged and not used in the second pass search. * * @param maxDocsToCache The maximum number of documents the cache is allowed to hold * @param cacheScores Whether to cache the scores * @return <code>this</code> */ public GroupingSearch setCaching(int maxDocsToCache, boolean cacheScores) { this.maxDocsToCache = maxDocsToCache; this.maxCacheRAMMB = null; this.cacheScores = cacheScores; return this; } /** * Disables any enabled cache. * * @return <code>this</code> */ public GroupingSearch disableCaching() { this.maxCacheRAMMB = null; this.maxDocsToCache = null; return this; } /** * Specifies how groups are sorted. * Defaults to {@link Sort#RELEVANCE}. * * @param groupSort The sort for the groups. * @return <code>this</code> */ public GroupingSearch setGroupSort(Sort groupSort) { this.groupSort = groupSort; return this; } /** * Specified how documents inside a group are sorted. * Defaults to {@link Sort#RELEVANCE}. * * @param sortWithinGroup The sort for documents inside a group * @return <code>this</code> */ public GroupingSearch setSortWithinGroup(Sort sortWithinGroup) { this.sortWithinGroup = sortWithinGroup; return this; } /** * Specifies the offset for documents inside a group. * * @param groupDocsOffset The offset for documents inside a * @return <code>this</code> */ public GroupingSearch setGroupDocsOffset(int groupDocsOffset) { this.groupDocsOffset = groupDocsOffset; return this; } /** * Specifies the number of documents to return inside a group from the specified groupDocsOffset. * * @param groupDocsLimit The number of documents to return inside a group * @return <code>this</code> */ public GroupingSearch setGroupDocsLimit(int groupDocsLimit) { this.groupDocsLimit = groupDocsLimit; return this; } /** * Whether to also fill the sort fields per returned group and groups docs. * * @param fillSortFields Whether to also fill the sort fields per returned group and groups docs * @return <code>this</code> */ public GroupingSearch setFillSortFields(boolean fillSortFields) { this.fillSortFields = fillSortFields; return this; } /** * Whether to include the scores per doc inside a group. * * @param includeScores Whether to include the scores per doc inside a group * @return <code>this</code> */ public GroupingSearch setIncludeScores(boolean includeScores) { this.includeScores = includeScores; return this; } /** * Whether to include the score of the most relevant document per group. * * @param includeMaxScore Whether to include the score of the most relevant document per group * @return <code>this</code> */ public GroupingSearch setIncludeMaxScore(boolean includeMaxScore) { this.includeMaxScore = includeMaxScore; return this; } /** * Whether to also compute all groups matching the query. * This can be used to determine the number of groups, which can be used for accurate pagination. * <p> * When grouping by doc block the number of groups are automatically included in the {@link TopGroups} and this * option doesn't have any influence. * * @param allGroups to also compute all groups matching the query * @return <code>this</code> */ public GroupingSearch setAllGroups(boolean allGroups) { this.allGroups = allGroups; return this; } /** * If {@link #setAllGroups(boolean)} was set to <code>true</code> then all matching groups are returned, otherwise * an empty collection is returned. * * @param <T> The group value type. This can be a {@link BytesRef} or a {@link MutableValue} instance. If grouping * by doc block this the group value is always <code>null</code>. * @return all matching groups are returned, or an empty collection */ @SuppressWarnings({"unchecked", "rawtypes"}) public <T> Collection<T> getAllMatchingGroups() { return (Collection<T>) matchingGroups; } /** * Whether to compute all group heads (most relevant document per group) matching the query. * <p> * This feature isn't enabled when grouping by doc block. * * @param allGroupHeads Whether to compute all group heads (most relevant document per group) matching the query * @return <code>this</code> */ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) { this.allGroupHeads = allGroupHeads; return this; } /** * Returns the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set. * * @return The matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an empty bit set */ public Bits getAllGroupHeads() { return matchingGroupHeads; } }