/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lucene.index.FilterableTermsEnum;
import org.elasticsearch.common.lucene.index.FreqTermsEnum;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.NonCollectingAggregator;
import org.elasticsearch.search.aggregations.bucket.BucketUtils;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
import java.util.List;
import java.util.Map;
public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource, SignificantTermsAggregatorFactory>
implements Releasable {
private final IncludeExclude includeExclude;
private final String executionHint;
private String indexedFieldName;
private MappedFieldType fieldType;
private FilterableTermsEnum termsEnum;
private int numberOfAggregatorsCreated;
final Query filter;
private final int supersetNumDocs;
private final TermsAggregator.BucketCountThresholds bucketCountThresholds;
private final SignificanceHeuristic significanceHeuristic;
public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig<ValuesSource> config, IncludeExclude includeExclude,
String executionHint, QueryBuilder filterBuilder, TermsAggregator.BucketCountThresholds bucketCountThresholds,
SignificanceHeuristic significanceHeuristic, SearchContext context, AggregatorFactory<?> parent,
AggregatorFactories.Builder subFactoriesBuilder, Map<String, Object> metaData) throws IOException {
super(name, config, context, parent, subFactoriesBuilder, metaData);
this.includeExclude = includeExclude;
this.executionHint = executionHint;
this.filter = filterBuilder == null
? null
: filterBuilder.toFilter(context.getQueryShardContext());
IndexSearcher searcher = context.searcher();
this.supersetNumDocs = filter == null
// Important - need to use the doc count that includes deleted docs
// or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
? searcher.getIndexReader().maxDoc()
: searcher.count(filter);
this.bucketCountThresholds = bucketCountThresholds;
this.significanceHeuristic = significanceHeuristic;
setFieldInfo(context);
}
private void setFieldInfo(SearchContext context) {
if (!config.unmapped()) {
this.indexedFieldName = config.fieldContext().field();
fieldType = context.smartNameFieldType(indexedFieldName);
}
}
/**
* Get the number of docs in the superset.
*/
public long getSupersetNumDocs() {
return supersetNumDocs;
}
private FilterableTermsEnum getTermsEnum(String field) throws IOException {
if (termsEnum != null) {
return termsEnum;
}
IndexReader reader = context.searcher().getIndexReader();
if (numberOfAggregatorsCreated > 1) {
termsEnum = new FreqTermsEnum(reader, field, true, false, filter, context.bigArrays());
} else {
termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
}
return termsEnum;
}
private long getBackgroundFrequency(String value) throws IOException {
Query query = fieldType.termQuery(value, context.getQueryShardContext());
if (query instanceof TermQuery) {
// for types that use the inverted index, we prefer using a caching terms
// enum that will do a better job at reusing index inputs
Term term = ((TermQuery) query).getTerm();
FilterableTermsEnum termsEnum = getTermsEnum(term.field());
if (termsEnum.seekExact(term.bytes())) {
return termsEnum.docFreq();
} else {
return 0;
}
}
// otherwise do it the naive way
if (filter != null) {
query = new BooleanQuery.Builder()
.add(query, Occur.FILTER)
.add(filter, Occur.FILTER)
.build();
}
return context.searcher().count(query);
}
public long getBackgroundFrequency(BytesRef termBytes) throws IOException {
String value = config.format().format(termBytes);
return getBackgroundFrequency(value);
}
public long getBackgroundFrequency(long termNum) throws IOException {
String value = config.format().format(termNum);
return getBackgroundFrequency(value);
}
@Override
protected Aggregator createUnmapped(Aggregator parent, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData)
throws IOException {
final InternalAggregation aggregation = new UnmappedSignificantTerms(name, bucketCountThresholds.getRequiredSize(),
bucketCountThresholds.getMinDocCount(), pipelineAggregators, metaData);
return new NonCollectingAggregator(name, context, parent, pipelineAggregators, metaData) {
@Override
public InternalAggregation buildEmptyAggregation() {
return aggregation;
}
};
}
@Override
protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator parent, boolean collectsFromSingleBucket,
List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
if (collectsFromSingleBucket == false) {
return asMultiBucketAggregator(this, context, parent);
}
numberOfAggregatorsCreated++;
BucketCountThresholds bucketCountThresholds = new BucketCountThresholds(this.bucketCountThresholds);
if (bucketCountThresholds.getShardSize() == SignificantTermsAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS.getShardSize()) {
// The user has not made a shardSize selection .
// Use default heuristic to avoid any wrong-ranking caused by
// distributed counting
// but request double the usual amount.
// We typically need more than the number of "top" terms requested
// by other aggregations
// as the significance algorithm is in less of a position to
// down-select at shard-level -
// some of the things we want to find have only one occurrence on
// each shard and as
// such are impossible to differentiate from non-significant terms
// at that early stage.
bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize(),
context.numberOfShards()));
}
if (valuesSource instanceof ValuesSource.Bytes) {
ExecutionMode execution = null;
if (executionHint != null) {
execution = ExecutionMode.fromString(executionHint);
}
if (!(valuesSource instanceof ValuesSource.Bytes.WithOrdinals)) {
execution = ExecutionMode.MAP;
}
if (execution == null) {
if (Aggregator.descendsFromBucketAggregator(parent)) {
execution = ExecutionMode.GLOBAL_ORDINALS_HASH;
} else {
execution = ExecutionMode.GLOBAL_ORDINALS;
}
}
assert execution != null;
DocValueFormat format = config.format();
if ((includeExclude != null) && (includeExclude.isRegexBased()) && format != DocValueFormat.RAW) {
throw new AggregationExecutionException("Aggregation [" + name + "] cannot support regular expression style include/exclude "
+ "settings as they can only be applied to string fields. Use an array of values for include/exclude clauses");
}
return execution.create(name, factories, valuesSource, format, bucketCountThresholds, includeExclude, context, parent,
significanceHeuristic, this, pipelineAggregators, metaData);
}
if ((includeExclude != null) && (includeExclude.isRegexBased())) {
throw new AggregationExecutionException("Aggregation [" + name + "] cannot support regular expression style include/exclude "
+ "settings as they can only be applied to string fields. Use an array of numeric values for include/exclude clauses used to filter numeric fields");
}
if (valuesSource instanceof ValuesSource.Numeric) {
if (((ValuesSource.Numeric) valuesSource).isFloatingPoint()) {
throw new UnsupportedOperationException("No support for examining floating point numerics");
}
IncludeExclude.LongFilter longFilter = null;
if (includeExclude != null) {
longFilter = includeExclude.convertToLongFilter(config.format());
}
return new SignificantLongTermsAggregator(name, factories, (ValuesSource.Numeric) valuesSource, config.format(),
bucketCountThresholds, context, parent, significanceHeuristic, this, longFilter, pipelineAggregators,
metaData);
}
throw new AggregationExecutionException("significant_terms aggregation cannot be applied to field ["
+ config.fieldContext().field() + "]. It can only be applied to numeric or string fields.");
}
public enum ExecutionMode {
MAP(new ParseField("map")) {
@Override
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, DocValueFormat format,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
SearchContext aggregationContext, Aggregator parent, SignificanceHeuristic significanceHeuristic,
SignificantTermsAggregatorFactory termsAggregatorFactory, List<PipelineAggregator> pipelineAggregators,
Map<String, Object> metaData) throws IOException {
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(format);
return new SignificantStringTermsAggregator(name, factories, valuesSource, format, bucketCountThresholds, filter,
aggregationContext, parent, significanceHeuristic, termsAggregatorFactory, pipelineAggregators, metaData);
}
},
GLOBAL_ORDINALS(new ParseField("global_ordinals")) {
@Override
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, DocValueFormat format,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
SearchContext aggregationContext, Aggregator parent, SignificanceHeuristic significanceHeuristic,
SignificantTermsAggregatorFactory termsAggregatorFactory, List<PipelineAggregator> pipelineAggregators,
Map<String, Object> metaData) throws IOException {
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(format);
return new GlobalOrdinalsSignificantTermsAggregator(name, factories,
(ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, format, bucketCountThresholds, filter,
aggregationContext, parent, significanceHeuristic, termsAggregatorFactory, pipelineAggregators, metaData);
}
},
GLOBAL_ORDINALS_HASH(new ParseField("global_ordinals_hash")) {
@Override
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, DocValueFormat format,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
SearchContext aggregationContext, Aggregator parent, SignificanceHeuristic significanceHeuristic,
SignificantTermsAggregatorFactory termsAggregatorFactory, List<PipelineAggregator> pipelineAggregators,
Map<String, Object> metaData) throws IOException {
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(format);
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories,
(ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, format, bucketCountThresholds, filter,
aggregationContext, parent, significanceHeuristic, termsAggregatorFactory, pipelineAggregators, metaData);
}
};
public static ExecutionMode fromString(String value) {
for (ExecutionMode mode : values()) {
if (mode.parseField.match(value)) {
return mode;
}
}
throw new IllegalArgumentException("Unknown `execution_hint`: [" + value + "], expected any of " + values());
}
private final ParseField parseField;
ExecutionMode(ParseField parseField) {
this.parseField = parseField;
}
abstract Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, DocValueFormat format,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
SearchContext aggregationContext, Aggregator parent, SignificanceHeuristic significanceHeuristic,
SignificantTermsAggregatorFactory termsAggregatorFactory, List<PipelineAggregator> pipelineAggregators,
Map<String, Object> metaData) throws IOException;
@Override
public String toString() {
return parseField.getPreferredName();
}
}
@Override
public void close() {
try {
if (termsEnum instanceof Releasable) {
((Releasable) termsEnum).close();
}
} finally {
termsEnum = null;
}
}
}