BestDocsDeferringCollector.java example

Explorer
elasticsearch-master
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.aggregations.bucket.sampler;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.ObjectArray;
import org.elasticsearch.search.aggregations.BucketCollector;
import org.elasticsearch.search.aggregations.LeafBucketCollector;
import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * A specialization of {@link DeferringBucketCollector} that collects all
 * matches and then replays only the top scoring documents to child
 * aggregations. The method
 * {@link BestDocsDeferringCollector#createTopDocsCollector(int)} is designed to
 * be overridden and allows subclasses to choose a custom collector
 * implementation for determining the top N matches.
 * <p>
 * During collection every document is fed into a per-parent-bucket
 * {@link TopDocsCollector}. In {@link #postCollection()} the winning documents
 * of all buckets are gathered, sorted by doc ID and replayed, segment by
 * segment, into the deferred (child) collectors.
 */
public class BestDocsDeferringCollector extends DeferringBucketCollector implements Releasable {
    /** One entry per segment visited, in the order the segments were visited. */
    private final List<PerSegmentCollects> entries = new ArrayList<>();
    private BucketCollector deferred;
    /** Samplers indexed by parent bucket ordinal; slots are null until the bucket sees a doc. */
    private ObjectArray<PerParentBucketSamples> perBucketSamples;
    private final int shardSize;
    /** Collector state for the segment most recently passed to {@link #getLeafCollector}. */
    private PerSegmentCollects perSegCollector;
    private final BigArrays bigArrays;

    /**
     * Sole constructor.
     *
     * @param shardSize
     *            The number of top-scoring docs to collect for each bucket
     * @param bigArrays
     *            Used to allocate and grow the per-bucket sampler array
     */
    BestDocsDeferringCollector(int shardSize, BigArrays bigArrays) {
        this.shardSize = shardSize;
        this.bigArrays = bigArrays;
        perBucketSamples = bigArrays.newObjectArray(1);
    }

    /** Always {@code true}. */
    @Override
    public boolean needsScores() {
        return true;
    }

    /** Set the deferred collectors. */
    @Override
    public void setDeferredCollector(Iterable<BucketCollector> deferredCollectors) {
        this.deferred = BucketCollector.wrap(deferredCollectors);
    }

    /**
     * Starts collection on a new segment: registers a fresh
     * {@link PerSegmentCollects} for {@code ctx} and returns a leaf collector
     * that forwards scorer changes and collected docs to it.
     */
    @Override
    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws IOException {
        perSegCollector = new PerSegmentCollects(ctx);
        entries.add(perSegCollector);

        // Deferring collector
        return new LeafBucketCollector() {
            @Override
            public void setScorer(Scorer scorer) throws IOException {
                perSegCollector.setScorer(scorer);
            }

            @Override
            public void collect(int doc, long bucket) throws IOException {
                perSegCollector.collect(doc, bucket);
            }
        };
    }

    /**
     * Creates the collector used to pick the best docs of a single parent
     * bucket. Designed to be overridden by subclasses that may score docs by
     * criteria other than Lucene score.
     *
     * @param size
     *            the maximum number of docs the collector should retain
     */
    protected TopDocsCollector<? extends ScoreDoc> createTopDocsCollector(int size) throws IOException {
        return TopScoreDocCollector.create(size);
    }

    @Override
    public void preCollection() throws IOException {
        deferred.preCollection();
    }

    /** Replays the sampled docs into the deferred collectors. */
    @Override
    public void postCollection() throws IOException {
        runDeferredAggs();
    }

    @Override
    public void prepareSelectedBuckets(long... selectedBuckets) throws IOException {
        // no-op - deferred aggs processed in postCollection call
    }

    /**
     * Gathers the top docs of every parent bucket, orders them by doc ID and
     * replays them through each visited segment into the deferred collector.
     */
    private void runDeferredAggs() throws IOException {
        List<ScoreDoc> allDocs = new ArrayList<>(shardSize);
        for (long i = 0; i < perBucketSamples.size(); i++) {
            PerParentBucketSamples perBucketSample = perBucketSamples.get(i);
            if (perBucketSample == null) {
                continue;
            }
            perBucketSample.getMatches(allDocs);
        }

        // Sort the top matches by docID for the benefit of deferred collector;
        // ties are ordered by the bucket ID stashed in shardIndex.
        // Integer.compare is used rather than subtraction so that the result
        // cannot be corrupted by int overflow.
        ScoreDoc[] docsArr = allDocs.toArray(new ScoreDoc[allDocs.size()]);
        Arrays.sort(docsArr, (o1, o2) -> {
            if (o1.doc == o2.doc) {
                return Integer.compare(o1.shardIndex, o2.shardIndex);
            }
            return Integer.compare(o1.doc, o2.doc);
        });
        try {
            for (PerSegmentCollects perSegDocs : entries) {
                perSegDocs.replayRelatedMatches(docsArr);
            }
        } catch (IOException e) {
            throw new ElasticsearchException("IOException collecting best scoring results", e);
        }
        deferred.postCollection();
    }

    /**
     * Holds the {@link TopDocsCollector} that samples the best docs of one
     * parent bucket, together with its leaf collector for the current segment.
     */
    class PerParentBucketSamples {
        private LeafCollector currentLeafCollector;
        private TopDocsCollector<? extends ScoreDoc> tdc;
        private final long parentBucket;
        /** Number of docs returned by the last {@link #getMatches} call; 0 before that. */
        private int matchedDocs;

        PerParentBucketSamples(long parentBucket, Scorer scorer, LeafReaderContext readerContext) {
            this.parentBucket = parentBucket;
            try {
                tdc = createTopDocsCollector(shardSize);
                currentLeafCollector = tdc.getLeafCollector(readerContext);
                setScorer(scorer);
            } catch (IOException e) {
                throw new ElasticsearchException("IO error creating collector", e);
            }
        }

        /**
         * Appends this bucket's top docs to {@code allDocs}, tagging each with
         * the bucket ID, and records how many there were.
         */
        public void getMatches(List<ScoreDoc> allDocs) {
            TopDocs topDocs = tdc.topDocs();
            ScoreDoc[] sd = topDocs.scoreDocs;
            matchedDocs = sd.length;
            for (ScoreDoc scoreDoc : sd) {
                // A bit of a hack to (ab)use shardIndex property here to
                // hold a bucket ID but avoids allocating extra data structures
                // and users should have bigger concerns if bucket IDs
                // exceed int capacity..
                scoreDoc.shardIndex = (int) parentBucket;
            }
            allDocs.addAll(Arrays.asList(sd));
        }

        public void collect(int doc) throws IOException {
            currentLeafCollector.collect(doc);
        }

        public void setScorer(Scorer scorer) throws IOException {
            currentLeafCollector.setScorer(scorer);
        }

        /** Switches the underlying top-docs collector to a new segment. */
        public void changeSegment(LeafReaderContext readerContext) throws IOException {
            currentLeafCollector = tdc.getLeafCollector(readerContext);
        }

        public int getDocCount() {
            return matchedDocs;
        }
    }

    /**
     * Per-segment collection state. During collection it routes docs to the
     * per-bucket samplers; during replay it acts as the {@link Scorer} handed
     * to the deferred collector, exposing the score and doc ID of the doc
     * currently being replayed.
     */
    class PerSegmentCollects extends Scorer {
        private final LeafReaderContext readerContext;
        /** Highest segment-local doc ID collected in this segment; negative if none. */
        int maxDocId = Integer.MIN_VALUE;
        private float currentScore;
        private int currentDocId = -1;
        private Scorer currentScorer;

        PerSegmentCollects(LeafReaderContext readerContext) throws IOException {
            // The publisher behaviour for Reader/Scorer listeners triggers a
            // call to this constructor with a null scorer so we can't call
            // scorer.getWeight() and pass the Weight to our base class.
            // However, passing null seems to have no adverse effects here...
            super(null);
            this.readerContext = readerContext;
            // Move every existing sampler on to the new segment.
            for (long i = 0; i < perBucketSamples.size(); i++) {
                PerParentBucketSamples perBucketSample = perBucketSamples.get(i);
                if (perBucketSample == null) {
                    continue;
                }
                perBucketSample.changeSegment(readerContext);
            }
        }

        /**
         * Remembers the scorer for samplers created later in this segment and
         * passes it on to every sampler that already exists.
         */
        public void setScorer(Scorer scorer) throws IOException {
            this.currentScorer = scorer;
            for (long i = 0; i < perBucketSamples.size(); i++) {
                PerParentBucketSamples perBucketSample = perBucketSamples.get(i);
                if (perBucketSample == null) {
                    continue;
                }
                perBucketSample.setScorer(scorer);
            }
        }

        /**
         * Replays into the deferred collector those entries of {@code sd}
         * whose doc ID falls inside the range collected for this segment.
         *
         * @param sd
         *            the sampled docs of all buckets, with the bucket ID
         *            stored in {@code shardIndex}
         */
        public void replayRelatedMatches(ScoreDoc[] sd) throws IOException {
            final LeafBucketCollector leafCollector = deferred.getLeafCollector(readerContext);
            leafCollector.setScorer(this);

            currentScore = 0;
            currentDocId = -1;
            if (maxDocId < 0) {
                // Nothing was collected in this segment
                return;
            }
            for (ScoreDoc scoreDoc : sd) {
                // Doc ids from TopDocCollector are root-level Reader so
                // need rebasing
                int rebased = scoreDoc.doc - readerContext.docBase;
                if ((rebased >= 0) && (rebased <= maxDocId)) {
                    currentScore = scoreDoc.score;
                    currentDocId = rebased;
                    // We stored the bucket ID in Lucene's shardIndex property
                    // for convenience.
                    leafCollector.collect(rebased, scoreDoc.shardIndex);
                }
            }

        }

        @Override
        public float score() throws IOException {
            return currentScore;
        }

        @Override
        public int freq() throws IOException {
            throw new ElasticsearchException("This caching scorer implementation only implements score() and docID()");
        }

        @Override
        public int docID() {
            return currentDocId;
        }

        @Override
        public DocIdSetIterator iterator() {
            throw new ElasticsearchException("This caching scorer implementation only implements score() and docID()");
        }

        /**
         * Offers {@code docId} to the sampler of {@code parentBucket},
         * creating the sampler (and growing the sampler array) on first use.
         */
        public void collect(int docId, long parentBucket) throws IOException {
            perBucketSamples = bigArrays.grow(perBucketSamples, parentBucket + 1);
            // Index with the long ordinal directly: narrowing it to int would
            // silently wrap large ordinals onto an unrelated slot.
            PerParentBucketSamples sampler = perBucketSamples.get(parentBucket);
            if (sampler == null) {
                sampler = new PerParentBucketSamples(parentBucket, currentScorer, readerContext);
                perBucketSamples.set(parentBucket, sampler);
            }
            sampler.collect(docId);
            maxDocId = Math.max(maxDocId, docId);
        }
    }

    /**
     * Returns the number of docs sampled for {@code parentBucket}.
     *
     * @param parentBucket
     *            the parent bucket ordinal
     * @return the sampled doc count, or 0 if no doc was ever collected for the
     *         bucket
     */
    public int getDocCount(long parentBucket) {
        if (parentBucket >= perBucketSamples.size()) {
            // The sampler array is only grown when a doc is collected, so a
            // bucket beyond its end cannot have collected anything.
            return 0;
        }
        PerParentBucketSamples sampler = perBucketSamples.get(parentBucket);
        if (sampler == null) {
            // There are conditions where no docs are collected and the aggs
            // framework still asks for doc count.
            return 0;
        }
        return sampler.getDocCount();
    }

    /** Releases the per-bucket sampler array. */
    @Override
    public void close() throws ElasticsearchException {
        Releasables.close(perBucketSamples);
    }

}