/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.search.aggregations.metrics.cardinality; import com.carrotsearch.hppc.BitMixer; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.RamUsageEstimator; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.lease.Releasables; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.ObjectArray; import org.elasticsearch.index.fielddata.SortedBinaryDocValues; import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.LeafBucketCollector; import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregator; import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; import org.elasticsearch.search.aggregations.support.ValuesSource; import org.elasticsearch.search.internal.SearchContext; import java.io.IOException; import java.util.List; import java.util.Map; /** * An aggregator that computes approximate counts of unique values. */ public class CardinalityAggregator extends NumericMetricsAggregator.SingleValue { private final int precision; private final ValuesSource valuesSource; // Expensive to initialize, so we only initialize it when we have an actual value source @Nullable private HyperLogLogPlusPlus counts; private Collector collector; public CardinalityAggregator(String name, ValuesSource valuesSource, int precision, SearchContext context, Aggregator parent, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException { super(name, context, parent, pipelineAggregators, metaData); this.valuesSource = valuesSource; this.precision = precision; this.counts = valuesSource == null ? null : new HyperLogLogPlusPlus(precision, context.bigArrays(), 1); } @Override public boolean needsScores() { return valuesSource != null && valuesSource.needsScores(); } private Collector pickCollector(LeafReaderContext ctx) throws IOException { if (valuesSource == null) { return new EmptyCollector(); } if (valuesSource instanceof ValuesSource.Numeric) { ValuesSource.Numeric source = (ValuesSource.Numeric) valuesSource; MurmurHash3Values hashValues = source.isFloatingPoint() ? MurmurHash3Values.hash(source.doubleValues(ctx)) : MurmurHash3Values.hash(source.longValues(ctx)); return new DirectCollector(counts, hashValues); } if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals) { ValuesSource.Bytes.WithOrdinals source = (ValuesSource.Bytes.WithOrdinals) valuesSource; final SortedSetDocValues ordinalValues = source.ordinalsValues(ctx); final long maxOrd = ordinalValues.getValueCount(); if (maxOrd == 0) { return new EmptyCollector(); } final long ordinalsMemoryUsage = OrdinalsCollector.memoryOverhead(maxOrd); final long countsMemoryUsage = HyperLogLogPlusPlus.memoryUsage(precision); // only use ordinals if they don't increase memory usage by more than 25% if (ordinalsMemoryUsage < countsMemoryUsage / 4) { return new OrdinalsCollector(counts, ordinalValues, context.bigArrays()); } } return new DirectCollector(counts, MurmurHash3Values.hash(valuesSource.bytesValues(ctx))); } @Override public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub) throws IOException { postCollectLastCollector(); collector = pickCollector(ctx); return collector; } private void postCollectLastCollector() throws IOException { if (collector != null) { try { collector.postCollect(); collector.close(); } finally { collector = null; } } } @Override protected void doPostCollection() throws IOException { postCollectLastCollector(); } @Override public double metric(long owningBucketOrd) { return counts == null ? 0 : counts.cardinality(owningBucketOrd); } @Override public InternalAggregation buildAggregation(long owningBucketOrdinal) { if (counts == null || owningBucketOrdinal >= counts.maxBucket() || counts.cardinality(owningBucketOrdinal) == 0) { return buildEmptyAggregation(); } // We need to build a copy because the returned Aggregation needs remain usable after // this Aggregator (and its HLL++ counters) is released. HyperLogLogPlusPlus copy = new HyperLogLogPlusPlus(precision, BigArrays.NON_RECYCLING_INSTANCE, 1); copy.merge(0, counts, owningBucketOrdinal); return new InternalCardinality(name, copy, pipelineAggregators(), metaData()); } @Override public InternalAggregation buildEmptyAggregation() { return new InternalCardinality(name, null, pipelineAggregators(), metaData()); } @Override protected void doClose() { Releasables.close(counts, collector); } private abstract static class Collector extends LeafBucketCollector implements Releasable { public abstract void postCollect() throws IOException; } private static class EmptyCollector extends Collector { @Override public void collect(int doc, long bucketOrd) { // no-op } @Override public void postCollect() { // no-op } @Override public void close() { // no-op } } private static class DirectCollector extends Collector { private final MurmurHash3Values hashes; private final HyperLogLogPlusPlus counts; DirectCollector(HyperLogLogPlusPlus counts, MurmurHash3Values values) { this.counts = counts; this.hashes = values; } @Override public void collect(int doc, long bucketOrd) throws IOException { if (hashes.advanceExact(doc)) { final int valueCount = hashes.count(); for (int i = 0; i < valueCount; ++i) { counts.collect(bucketOrd, hashes.nextValue()); } } } @Override public void postCollect() { // no-op } @Override public void close() { // no-op } } private static class OrdinalsCollector extends Collector { private static final long SHALLOW_FIXEDBITSET_SIZE = RamUsageEstimator.shallowSizeOfInstance(FixedBitSet.class); /** * Return an approximate memory overhead per bucket for this collector. */ public static long memoryOverhead(long maxOrd) { return RamUsageEstimator.NUM_BYTES_OBJECT_REF + SHALLOW_FIXEDBITSET_SIZE + (maxOrd + 7) / 8; // 1 bit per ord } private final BigArrays bigArrays; private final SortedSetDocValues values; private final int maxOrd; private final HyperLogLogPlusPlus counts; private ObjectArray<FixedBitSet> visitedOrds; OrdinalsCollector(HyperLogLogPlusPlus counts, SortedSetDocValues values, BigArrays bigArrays) { if (values.getValueCount() > Integer.MAX_VALUE) { throw new IllegalArgumentException(); } maxOrd = (int) values.getValueCount(); this.bigArrays = bigArrays; this.counts = counts; this.values = values; visitedOrds = bigArrays.newObjectArray(1); } @Override public void collect(int doc, long bucketOrd) throws IOException { visitedOrds = bigArrays.grow(visitedOrds, bucketOrd + 1); FixedBitSet bits = visitedOrds.get(bucketOrd); if (bits == null) { bits = new FixedBitSet(maxOrd); visitedOrds.set(bucketOrd, bits); } if (values.advanceExact(doc)) { for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { bits.set((int) ord); } } } @Override public void postCollect() throws IOException { final FixedBitSet allVisitedOrds = new FixedBitSet(maxOrd); for (long bucket = visitedOrds.size() - 1; bucket >= 0; --bucket) { final FixedBitSet bits = visitedOrds.get(bucket); if (bits != null) { allVisitedOrds.or(bits); } } final org.elasticsearch.common.hash.MurmurHash3.Hash128 hash = new org.elasticsearch.common.hash.MurmurHash3.Hash128(); try (LongArray hashes = bigArrays.newLongArray(maxOrd, false)) { for (int ord = allVisitedOrds.nextSetBit(0); ord < DocIdSetIterator.NO_MORE_DOCS; ord = ord + 1 < maxOrd ? allVisitedOrds.nextSetBit(ord + 1) : DocIdSetIterator.NO_MORE_DOCS) { final BytesRef value = values.lookupOrd(ord); org.elasticsearch.common.hash.MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, hash); hashes.set(ord, hash.h1); } for (long bucket = visitedOrds.size() - 1; bucket >= 0; --bucket) { final FixedBitSet bits = visitedOrds.get(bucket); if (bits != null) { for (int ord = bits.nextSetBit(0); ord < DocIdSetIterator.NO_MORE_DOCS; ord = ord + 1 < maxOrd ? bits.nextSetBit(ord + 1) : DocIdSetIterator.NO_MORE_DOCS) { counts.collect(bucket, hashes.get(ord)); } } } } } @Override public void close() { Releasables.close(visitedOrds); } } /** * Representation of a list of hash values. There might be dups and there is no guarantee on the order. */ abstract static class MurmurHash3Values { public abstract boolean advanceExact(int docId) throws IOException; public abstract int count(); public abstract long nextValue() throws IOException; /** * Return a {@link MurmurHash3Values} instance that computes hashes on the fly for each double value. */ public static MurmurHash3Values hash(SortedNumericDoubleValues values) { return new Double(values); } /** * Return a {@link MurmurHash3Values} instance that computes hashes on the fly for each long value. */ public static MurmurHash3Values hash(SortedNumericDocValues values) { return new Long(values); } /** * Return a {@link MurmurHash3Values} instance that computes hashes on the fly for each binary value. */ public static MurmurHash3Values hash(SortedBinaryDocValues values) { return new Bytes(values); } private static class Long extends MurmurHash3Values { private final SortedNumericDocValues values; Long(SortedNumericDocValues values) { this.values = values; } @Override public boolean advanceExact(int docId) throws IOException { return values.advanceExact(docId); } @Override public int count() { return values.docValueCount(); } @Override public long nextValue() throws IOException { return BitMixer.mix64(values.nextValue()); } } private static class Double extends MurmurHash3Values { private final SortedNumericDoubleValues values; Double(SortedNumericDoubleValues values) { this.values = values; } @Override public boolean advanceExact(int docId) throws IOException { return values.advanceExact(docId); } @Override public int count() { return values.docValueCount(); } @Override public long nextValue() throws IOException { return BitMixer.mix64(java.lang.Double.doubleToLongBits(values.nextValue())); } } private static class Bytes extends MurmurHash3Values { private final org.elasticsearch.common.hash.MurmurHash3.Hash128 hash = new org.elasticsearch.common.hash.MurmurHash3.Hash128(); private final SortedBinaryDocValues values; Bytes(SortedBinaryDocValues values) { this.values = values; } @Override public boolean advanceExact(int docId) throws IOException { return values.advanceExact(docId); } @Override public int count() { return values.docValueCount(); } @Override public long nextValue() throws IOException { final BytesRef bytes = values.nextValue(); org.elasticsearch.common.hash.MurmurHash3.hash128(bytes.bytes, bytes.offset, bytes.length, 0, hash); return hash.h1; } } } }