package org.apache.lucene.search.grouping.term;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.grouping.AbstractGroupFacetCollector;
import org.apache.lucene.util.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* An implementation of {@link AbstractGroupFacetCollector} that computes grouped facets based on the indexed terms
* from the {@link FieldCache}.
*
* @lucene.experimental
*/
public abstract class TermGroupFacetCollector extends AbstractGroupFacetCollector {
final List<GroupedFacetHit> groupedFacetHits;
final SentinelIntSet segmentGroupedFacetHits;
final BytesRef spare = new BytesRef();
FieldCache.DocTermsIndex groupFieldTermsIndex;
/**
* Factory method for creating the right implementation based on the fact whether the facet field contains
* multiple tokens per documents.
*
* @param groupField The group field
* @param facetField The facet field
* @param facetFieldMultivalued Whether the facet field has multiple tokens per document
* @param facetPrefix The facet prefix a facet entry should start with to be included.
* @param initialSize The initial allocation size of the internal int set and group facet list which should roughly
* match the total number of expected unique groups. Be aware that the heap usage is
* 4 bytes * initialSize.
* @return <code>TermGroupFacetCollector</code> implementation
*/
public static TermGroupFacetCollector createTermGroupFacetCollector(String groupField,
String facetField,
boolean facetFieldMultivalued,
BytesRef facetPrefix,
int initialSize) {
if (facetFieldMultivalued) {
return new MV(groupField, facetField, facetPrefix, initialSize);
} else {
return new SV(groupField, facetField, facetPrefix, initialSize);
}
}
TermGroupFacetCollector(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
super(groupField, facetField, facetPrefix);
groupedFacetHits = new ArrayList<GroupedFacetHit>(initialSize);
segmentGroupedFacetHits = new SentinelIntSet(initialSize, -1);
}
// Implementation for single valued facet fields.
static class SV extends TermGroupFacetCollector {
private FieldCache.DocTermsIndex facetFieldTermsIndex;
SV(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
super(groupField, facetField, facetPrefix, initialSize);
}
public void collect(int doc) throws IOException {
int facetOrd = facetFieldTermsIndex.getOrd(doc);
if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
return;
}
int groupOrd = groupFieldTermsIndex.getOrd(doc);
int segmentGroupedFacetsIndex = (groupOrd * facetFieldTermsIndex.numOrd()) + facetOrd;
if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
return;
}
segmentTotalCount++;
segmentFacetCounts[facetOrd]++;
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
groupedFacetHits.add(
new GroupedFacetHit(
groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()),
facetOrd == 0 ? null : facetFieldTermsIndex.lookup(facetOrd, new BytesRef())
)
);
}
public void setNextReader(AtomicReaderContext context) throws IOException {
if (segmentFacetCounts != null) {
segmentResults.add(createSegmentResult());
}
groupFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
facetFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), facetField);
segmentFacetCounts = new int[facetFieldTermsIndex.numOrd()];
segmentTotalCount = 0;
segmentGroupedFacetHits.clear();
for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
int facetOrd = facetFieldTermsIndex.binarySearchLookup(groupedFacetHit.facetValue, spare);
if (facetOrd < 0) {
continue;
}
int groupOrd = groupFieldTermsIndex.binarySearchLookup(groupedFacetHit.groupValue, spare);
if (groupOrd < 0) {
continue;
}
int segmentGroupedFacetsIndex = (groupOrd * facetFieldTermsIndex.numOrd()) + facetOrd;
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
}
if (facetPrefix != null) {
startFacetOrd = facetFieldTermsIndex.binarySearchLookup(facetPrefix, spare);
if (startFacetOrd < 0) {
// Points to the ord one higher than facetPrefix
startFacetOrd = -startFacetOrd - 1;
}
BytesRef facetEndPrefix = BytesRef.deepCopyOf(facetPrefix);
facetEndPrefix.append(UnicodeUtil.BIG_TERM);
endFacetOrd = facetFieldTermsIndex.binarySearchLookup(facetEndPrefix, spare);
endFacetOrd = -endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
} else {
startFacetOrd = 0;
endFacetOrd = facetFieldTermsIndex.numOrd();
}
}
protected SegmentResult createSegmentResult() throws IOException {
return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldTermsIndex.getTermsEnum(), startFacetOrd, endFacetOrd);
}
private static class SegmentResult extends AbstractGroupFacetCollector.SegmentResult {
final TermsEnum tenum;
SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
super(counts, total - counts[0], counts[0], endFacetOrd);
this.tenum = tenum;
this.mergePos = startFacetOrd == 0 ? 1 : startFacetOrd;
if (mergePos < maxTermPos) {
tenum.seekExact(mergePos);
mergeTerm = tenum.term();
}
}
protected void nextTerm() throws IOException {
mergeTerm = tenum.next();
}
}
}
// Implementation for multi valued facet fields.
static class MV extends TermGroupFacetCollector {
private DocTermOrds facetFieldDocTermOrds;
private TermsEnum facetOrdTermsEnum;
private DocTermOrds.TermOrdsIterator reuse;
MV(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
super(groupField, facetField, facetPrefix, initialSize);
}
public void collect(int doc) throws IOException {
int groupOrd = groupFieldTermsIndex.getOrd(doc);
if (facetFieldDocTermOrds.isEmpty()) {
int segmentGroupedFacetsIndex = groupOrd * (facetFieldDocTermOrds.numTerms() + 1);
if (facetPrefix != null || segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
return;
}
segmentTotalCount++;
segmentFacetCounts[facetFieldDocTermOrds.numTerms()]++;
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
groupedFacetHits.add(
new GroupedFacetHit(groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()), null)
);
return;
}
if (facetOrdTermsEnum != null) {
reuse = facetFieldDocTermOrds.lookup(doc, reuse);
}
int chunk;
boolean first = true;
int[] buffer = new int[5];
do {
chunk = reuse != null ? reuse.read(buffer) : 0;
if (first && chunk == 0) {
chunk = 1;
buffer[0] = facetFieldDocTermOrds.numTerms(); // this facet ord is reserved for docs not containing facet field.
}
first = false;
for (int pos = 0; pos < chunk; pos++) {
int facetOrd = buffer[pos];
if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
continue;
}
int segmentGroupedFacetsIndex = (groupOrd * (facetFieldDocTermOrds.numTerms() + 1)) + facetOrd;
if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
continue;
}
segmentTotalCount++;
segmentFacetCounts[facetOrd]++;
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
groupedFacetHits.add(
new GroupedFacetHit(
groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()),
facetOrd == facetFieldDocTermOrds.numTerms() ? null : BytesRef.deepCopyOf(facetFieldDocTermOrds.lookupTerm(facetOrdTermsEnum, facetOrd))
)
);
}
} while (chunk >= buffer.length);
}
public void setNextReader(AtomicReaderContext context) throws IOException {
if (segmentFacetCounts != null) {
segmentResults.add(createSegmentResult());
}
reuse = null;
groupFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
facetFieldDocTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), facetField);
facetOrdTermsEnum = facetFieldDocTermOrds.getOrdTermsEnum(context.reader());
// [facetFieldDocTermOrds.numTerms() + 1] for all possible facet values and docs not containing facet field
segmentFacetCounts = new int[facetFieldDocTermOrds.numTerms() + 1];
segmentTotalCount = 0;
segmentGroupedFacetHits.clear();
for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
int groupOrd = groupFieldTermsIndex.binarySearchLookup(groupedFacetHit.groupValue, spare);
if (groupOrd < 0) {
continue;
}
int facetOrd;
if (groupedFacetHit.facetValue != null) {
if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue, true)) {
continue;
}
facetOrd = (int) facetOrdTermsEnum.ord();
} else {
facetOrd = facetFieldDocTermOrds.numTerms();
}
// (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
int segmentGroupedFacetsIndex = (groupOrd * (facetFieldDocTermOrds.numTerms() + 1)) + facetOrd;
segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
}
if (facetPrefix != null) {
TermsEnum.SeekStatus seekStatus;
if (facetOrdTermsEnum != null) {
seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix, true);
} else {
seekStatus = TermsEnum.SeekStatus.END;
}
if (seekStatus != TermsEnum.SeekStatus.END) {
startFacetOrd = (int) facetOrdTermsEnum.ord();
} else {
startFacetOrd = 0;
endFacetOrd = 0;
return;
}
BytesRef facetEndPrefix = BytesRef.deepCopyOf(facetPrefix);
facetEndPrefix.append(UnicodeUtil.BIG_TERM);
seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix, true);
if (seekStatus != TermsEnum.SeekStatus.END) {
endFacetOrd = (int) facetOrdTermsEnum.ord();
} else {
endFacetOrd = facetFieldDocTermOrds.numTerms(); // Don't include null...
}
} else {
startFacetOrd = 0;
endFacetOrd = facetFieldDocTermOrds.numTerms() + 1;
}
}
protected SegmentResult createSegmentResult() throws IOException {
return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldDocTermOrds.numTerms(), facetOrdTermsEnum, startFacetOrd, endFacetOrd);
}
private static class SegmentResult extends AbstractGroupFacetCollector.SegmentResult {
final TermsEnum tenum;
SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
super(counts, total - counts[missingCountIndex], counts[missingCountIndex],
endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd);
this.tenum = tenum;
this.mergePos = startFacetOrd;
if (tenum != null) {
tenum.seekExact(mergePos);
mergeTerm = tenum.term();
}
}
protected void nextTerm() throws IOException {
mergeTerm = tenum.next();
}
}
}
}
class GroupedFacetHit {
final BytesRef groupValue;
final BytesRef facetValue;
GroupedFacetHit(BytesRef groupValue, BytesRef facetValue) {
this.groupValue = groupValue;
this.facetValue = facetValue;
}
}