/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.aggregations.bucket.significant;

import com.google.common.collect.Maps;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.InternalAggregations;
import org.elasticsearch.search.aggregations.InternalMultiBucketAggregation;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic;
import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Base class for the results of a {@code significant_terms} aggregation. Holds
 * the buckets produced on each shard together with the subset ("foreground")
 * and superset ("background") sizes needed to score them.
 */
public abstract class InternalSignificantTerms<A extends InternalSignificantTerms, B extends InternalSignificantTerms.Bucket>
        extends InternalMultiBucketAggregation<A, B> implements SignificantTerms, ToXContent, Streamable {

    protected SignificanceHeuristic significanceHeuristic;
    protected int requiredSize;
    protected long minDocCount;
    protected List<? extends Bucket> buckets;
    protected Map<String, Bucket> bucketMap;
    protected long subsetSize;
    protected long supersetSize;

    protected InternalSignificantTerms() {} // for serialization

    @SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
    public static abstract class Bucket extends SignificantTerms.Bucket {

        long bucketOrd;
        protected InternalAggregations aggregations;
        double score;

        protected Bucket(long subsetSize, long supersetSize) {
            // for serialization
            super(subsetSize, supersetSize);
        }

        protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) {
            super(subsetDf, subsetSize, supersetDf, supersetSize);
            this.aggregations = aggregations;
        }

        @Override
        public long getSubsetDf() {
            return subsetDf;
        }

        @Override
        public long getSupersetDf() {
            return supersetDf;
        }

        @Override
        public long getSupersetSize() {
            return supersetSize;
        }

        @Override
        public long getSubsetSize() {
            return subsetSize;
        }

        public void updateScore(SignificanceHeuristic significanceHeuristic) {
            score = significanceHeuristic.getScore(subsetDf, subsetSize, supersetDf, supersetSize);
        }

        @Override
        public long getDocCount() {
            return subsetDf;
        }

        @Override
        public Aggregations getAggregations() {
            return aggregations;
        }
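        /**
         * Merges the per-shard copies of this bucket during the reduce phase:
         * document frequencies are summed and sub-aggregations are reduced
         * recursively.
         */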
        public Bucket reduce(List<? extends Bucket> buckets, ReduceContext context) {
            long subsetDf = 0;
            long supersetDf = 0;
            List<InternalAggregations> aggregationsList = new ArrayList<>(buckets.size());
            for (Bucket bucket : buckets) {
                subsetDf += bucket.subsetDf;
                supersetDf += bucket.supersetDf;
                aggregationsList.add(bucket.aggregations);
            }
            InternalAggregations aggs = InternalAggregations.reduce(aggregationsList, context);
            return newBucket(subsetDf, subsetSize, supersetDf, supersetSize, aggs);
        }

        abstract Bucket newBucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize,
                InternalAggregations aggregations);

        @Override
        public double getSignificanceScore() {
            return score;
        }
    }

    protected InternalSignificantTerms(long subsetSize, long supersetSize, String name, int requiredSize, long minDocCount,
            SignificanceHeuristic significanceHeuristic, List<? extends Bucket> buckets,
            List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) {
        super(name, pipelineAggregators, metaData);
        this.requiredSize = requiredSize;
        this.minDocCount = minDocCount;
        this.buckets = buckets;
        this.subsetSize = subsetSize;
        this.supersetSize = supersetSize;
        this.significanceHeuristic = significanceHeuristic;
    }

    @Override
    public Iterator<SignificantTerms.Bucket> iterator() {
        // Cast through Object to bypass the compiler's generics check; Bucket
        // extends SignificantTerms.Bucket, so read access through the
        // interface type is safe.
        Object o = buckets.iterator();
        return (Iterator<SignificantTerms.Bucket>) o;
    }

    @Override
    public List<SignificantTerms.Bucket> getBuckets() {
        Object o = buckets;
        return (List<SignificantTerms.Bucket>) o;
    }

    @Override
    public SignificantTerms.Bucket getBucketByKey(String term) {
        // Lazily build the term -> bucket lookup on first access
        if (bucketMap == null) {
            bucketMap = Maps.newHashMapWithExpectedSize(buckets.size());
            for (Bucket bucket : buckets) {
                bucketMap.put(bucket.getKeyAsString(), bucket);
            }
        }
        return bucketMap.get(term);
    }

    @Override
    public InternalAggregation doReduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
        long globalSubsetSize = 0;
        long globalSupersetSize = 0;
        // Compute the overall result set size and the corpus size using the
        // top-level Aggregations from each shard
        for (InternalAggregation aggregation : aggregations) {
            InternalSignificantTerms<A, B> terms = (InternalSignificantTerms<A, B>) aggregation;
            globalSubsetSize += terms.subsetSize;
            globalSupersetSize += terms.supersetSize;
        }
        // Group the shard-level buckets by term
        Map<String, List<InternalSignificantTerms.Bucket>> buckets = new HashMap<>();
        for (InternalAggregation aggregation : aggregations) {
            InternalSignificantTerms<A, B> terms = (InternalSignificantTerms<A, B>) aggregation;
            for (Bucket bucket : terms.buckets) {
                List<Bucket> existingBuckets = buckets.get(bucket.getKeyAsString());
                if (existingBuckets == null) {
                    existingBuckets = new ArrayList<>(aggregations.size());
                    buckets.put(bucket.getKeyAsString(), existingBuckets);
                }
                // Adjust the buckets with the global stats representing the
                // total size of the pots from which the stats are drawn
                existingBuckets.add(bucket.newBucket(bucket.getSubsetDf(), globalSubsetSize, bucket.getSupersetDf(),
                        globalSupersetSize, bucket.aggregations));
            }
        }

        significanceHeuristic.initialize(reduceContext);
        // Merge each term's buckets, score the result, and keep only buckets
        // that score above zero and meet the min_doc_count threshold
        final int size = Math.min(requiredSize, buckets.size());
        BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
        for (Map.Entry<String, List<Bucket>> entry : buckets.entrySet()) {
            List<Bucket> sameTermBuckets = entry.getValue();
            final Bucket b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext);
            b.updateScore(significanceHeuristic);
            if ((b.score > 0) && (b.subsetDf >= minDocCount)) {
                ordered.insertWithOverflow(b);
            }
        }
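        // The priority queue pops the least significant bucket first, so fill
        // the result array from the back to produce a list sorted by
        // descending significance score.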
        Bucket[] list = new Bucket[ordered.size()];
        for (int i = ordered.size() - 1; i >= 0; i--) {
            list[i] = (Bucket) ordered.pop();
        }
        return create(globalSubsetSize, globalSupersetSize, Arrays.asList(list), this);
    }

    protected abstract A create(long subsetSize, long supersetSize, List<InternalSignificantTerms.Bucket> buckets,
            InternalSignificantTerms prototype);
}