// PackedArrayIndexFieldData.java example
//
// Explorer
// elassandra-master
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.fielddata.plain;

import com.google.common.base.Preconditions;
import org.apache.lucene.index.*;
import org.apache.lucene.util.*;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.breaker.CircuitBreaker;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.*;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.fieldcomparator.LongValuesComparatorSource;
import org.elasticsearch.index.fielddata.ordinals.Ordinals;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.MultiValueMode;

import java.io.IOException;
import java.util.*;

/**
 * Stores numeric data into bit-packed arrays for better memory efficiency.
 */
public class PackedArrayIndexFieldData extends AbstractIndexFieldData<AtomicNumericFieldData> implements IndexNumericFieldData {

    public static class Builder implements IndexFieldData.Builder {

        private NumericType numericType;

        public Builder setNumericType(NumericType numericType) {
            this.numericType = numericType;
            return this;
        }

        @Override
        public IndexFieldData<AtomicNumericFieldData> build(Index index, Settings indexSettings, MappedFieldType fieldType,
                                                            IndexFieldDataCache cache, CircuitBreakerService breakerService, MapperService mapperService) {
            return new PackedArrayIndexFieldData(index, indexSettings, fieldType.names(), fieldType.fieldDataType(), cache, numericType, breakerService);
        }
    }

    private final NumericType numericType;
    private final CircuitBreakerService breakerService;

    /**
     * Creates field data for an integer-typed numeric field.
     *
     * @param numericType    must be non-null and one of BOOLEAN, BYTE, SHORT, INT or LONG
     * @param breakerService source of the fielddata circuit breaker used while loading
     * @throws NullPointerException     if {@code numericType} is null
     * @throws IllegalArgumentException if {@code numericType} is not one of the supported integer types
     */
    public PackedArrayIndexFieldData(Index index, Settings indexSettings, MappedFieldType.Names fieldNames,
                                     FieldDataType fieldDataType, IndexFieldDataCache cache, NumericType numericType,
                                     CircuitBreakerService breakerService) {
        super(index, indexSettings, fieldNames, fieldDataType, cache);
        Preconditions.checkNotNull(numericType);
        final EnumSet<NumericType> integerTypes =
                EnumSet.of(NumericType.BOOLEAN, NumericType.BYTE, NumericType.SHORT, NumericType.INT, NumericType.LONG);
        final boolean supported = integerTypes.contains(numericType);
        Preconditions.checkArgument(supported, getClass().getSimpleName() + " only supports integer types, not " + numericType);
        this.breakerService = breakerService;
        this.numericType = numericType;
    }

    /**
     * @return the numeric type this field data was constructed with
     */
    @Override
    public NumericType getNumericType() {
        return this.numericType;
    }

    /**
     * Builds the field data of one segment from the terms of the indexed field.
     * <p>
     * If the segment has no terms for the field, an empty instance is returned and its size is added to the
     * fielddata breaker. Otherwise every term is decoded to a long, per-document ordinals are built, and one of
     * the following layouts is produced:
     * <ul>
     * <li>ordinals plus the unique values, for multi-valued fields or when the ORDINALS hint is configured
     * (or chosen);</li>
     * <li>PACKED: a single packed array storing {@code value - min} per document, with a reserved delta for
     * documents without a value;</li>
     * <li>PAGED: delta-packed pages storing one value per document plus a bitset of documents with values.</li>
     * </ul>
     * When no format hint is configured for a single-valued field, {@link #chooseStorageFormat} decides.
     *
     * @param context the segment to load
     * @return the loaded field data, never {@code null}
     * @throws Exception if reading the terms or building the structures fails; breaker estimations made so far
     *                   are unwound in that case
     */
    @Override
    public AtomicNumericFieldData loadDirect(LeafReaderContext context) throws Exception {
        final LeafReader reader = context.reader();
        Terms terms = reader.terms(getFieldNames().indexName());
        AtomicNumericFieldData data = null;
        PackedArrayEstimator estimator = new PackedArrayEstimator(breakerService.getBreaker(CircuitBreaker.FIELDDATA), getNumericType(), getFieldNames().fullName());
        if (terms == null) {
            data = AtomicLongFieldData.empty(reader.maxDoc());
            estimator.adjustForNoTerms(data.ramBytesUsed());
            return data;
        }
        // TODO: how can we guess the number of terms? numerics end up creating more terms per value...
        // Lucene encodes numeric data so that the lexicographical (encoded) order matches the integer order so we know the sequence of
        // longs is going to be monotonically increasing
        final PackedLongValues.Builder valuesBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);

        final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
        TermsEnum termsEnum = estimator.beforeLoad(terms);
        assert !getNumericType().isFloatingPoint();
        boolean success = false;
        try (OrdinalsBuilder builder = new OrdinalsBuilder(-1, reader.maxDoc(), acceptableTransientOverheadRatio)) {
            // Collect the unique values in term order; their position is the ordinal used below.
            BytesRefIterator iter = builder.buildFromTerms(termsEnum);
            BytesRef term;
            while ((term = iter.next()) != null) {
                final long value = numericType.toLong(term);
                valuesBuilder.add(value);
            }
            final PackedLongValues values = valuesBuilder.build();
            final Ordinals build = builder.build(fieldDataType.getSettings());
            CommonSettings.MemoryStorageFormat formatHint = CommonSettings.getMemoryStorageHint(fieldDataType);

            RandomAccessOrds ordinals = build.ordinals();
            if (FieldData.isMultiValued(ordinals) || formatHint == CommonSettings.MemoryStorageFormat.ORDINALS) {
                final long ramBytesUsed = build.ramBytesUsed() + values.ramBytesUsed();
                data = new AtomicLongFieldData(ramBytesUsed) {

                    @Override
                    public SortedNumericDocValues getLongValues() {
                        return withOrdinals(build, values, reader.maxDoc());
                    }

                    @Override
                    public Collection<Accountable> getChildResources() {
                        List<Accountable> resources = new ArrayList<>();
                        resources.add(Accountables.namedAccountable("ordinals", build));
                        resources.add(Accountables.namedAccountable("values", values));
                        return Collections.unmodifiableList(resources);
                    }
                };
            } else {
                final BitSet docsWithValues = builder.buildDocsWithValuesSet();

                // values are sorted, so the first and last entries are the minimum and maximum
                long minV, maxV;
                minV = maxV = 0;
                if (values.size() > 0) {
                    minV = values.get(0);
                    maxV = values.get(values.size() - 1);
                }

                final float acceptableOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
                final int pageSize = fieldDataType.getSettings().getAsInt("single_value_page_size", 1024);

                if (formatHint == null) {
                    formatHint = chooseStorageFormat(reader, values, build, ordinals, minV, maxV, acceptableOverheadRatio, pageSize);
                }

                logger.trace("single value format for field [{}] set to [{}]", getFieldNames().fullName(), formatHint);

                switch (formatHint) {
                    case PACKED:
                        // Encode document without a value with a special value
                        long missingV = 0;
                        if (docsWithValues != null) {
                            if ((maxV - minV + 1) == values.size()) {
                                // values are dense: widen the range by one to make room for the missing marker
                                if (minV > Long.MIN_VALUE) {
                                    missingV = --minV;
                                } else {
                                    assert maxV != Long.MAX_VALUE;
                                    missingV = ++maxV;
                                }
                            } else {
                                // values have a gap: use the first unused value as the missing marker
                                for (long i = 1; i < values.size(); ++i) {
                                    if (values.get(i) > values.get(i - 1) + 1) {
                                        missingV = values.get(i - 1) + 1;
                                        break;
                                    }
                                }
                            }
                            // the packed array stores deltas relative to minV, so store the marker as a delta too
                            missingV -= minV;
                        }
                        final long missingValue = missingV;
                        final long minValue = minV;
                        final long maxValue = maxV;

                        final long valuesDelta = maxValue - minValue;
                        // a negative delta means the subtraction overflowed, so all 64 bits are needed
                        int bitsRequired = valuesDelta < 0 ? 64 : PackedInts.bitsRequired(valuesDelta);
                        final PackedInts.Mutable sValues = PackedInts.getMutable(reader.maxDoc(), bitsRequired, acceptableOverheadRatio);

                        if (docsWithValues != null) {
                            sValues.fill(0, sValues.size(), missingV);
                        }

                        for (int i = 0; i < reader.maxDoc(); i++) {
                            ordinals.setDocument(i);
                            if (ordinals.cardinality() > 0) {
                                final long ord = ordinals.ordAt(0);
                                long value = values.get(ord);
                                sValues.set(i, value - minValue);
                            }
                        }
                        // Account for the structures this instance actually retains (the per-document packed array
                        // and the optional bitset), matching what getChildResources() reports.
                        long ramBytesUsed = sValues.ramBytesUsed() + (docsWithValues == null ? 0 : docsWithValues.ramBytesUsed());
                        data = new AtomicLongFieldData(ramBytesUsed) {

                            @Override
                            public SortedNumericDocValues getLongValues() {
                                if (docsWithValues == null) {
                                    return singles(sValues, minValue);
                                } else {
                                    return sparseSingles(sValues, minValue, missingValue, reader.maxDoc());
                                }
                            }

                            @Override
                            public Collection<Accountable> getChildResources() {
                                List<Accountable> resources = new ArrayList<>();
                                resources.add(Accountables.namedAccountable("values", sValues));
                                if (docsWithValues != null) {
                                    resources.add(Accountables.namedAccountable("missing bitset", docsWithValues));
                                }
                                return Collections.unmodifiableList(resources);
                            }

                        };
                        break;
                    case PAGED:
                        final PackedLongValues.Builder dpValues = PackedLongValues.deltaPackedBuilder(pageSize, acceptableOverheadRatio);

                        // Documents without a value repeat the previous document's value; pagedSingles() masks
                        // them out again using docsWithValues.
                        long lastValue = 0;
                        for (int i = 0; i < reader.maxDoc(); i++) {
                            ordinals.setDocument(i);
                            if (ordinals.cardinality() > 0) {
                                // single-valued field: the only ordinal of the document is at index 0
                                final long ord = ordinals.ordAt(0);
                                lastValue = values.get(ord);
                            }
                            dpValues.add(lastValue);
                        }
                        final PackedLongValues pagedValues = dpValues.build();
                        ramBytesUsed = pagedValues.ramBytesUsed();
                        if (docsWithValues != null) {
                            ramBytesUsed += docsWithValues.ramBytesUsed();
                        }
                        data = new AtomicLongFieldData(ramBytesUsed) {

                            @Override
                            public SortedNumericDocValues getLongValues() {
                                return pagedSingles(pagedValues, docsWithValues);
                            }

                            @Override
                            public Collection<Accountable> getChildResources() {
                                List<Accountable> resources = new ArrayList<>();
                                resources.add(Accountables.namedAccountable("values", pagedValues));
                                if (docsWithValues != null) {
                                    resources.add(Accountables.namedAccountable("missing bitset", docsWithValues));
                                }
                                return Collections.unmodifiableList(resources);
                            }

                        };
                        break;
                    case ORDINALS:
                        ramBytesUsed = build.ramBytesUsed() + values.ramBytesUsed();
                        data = new AtomicLongFieldData(ramBytesUsed) {

                            @Override
                            public SortedNumericDocValues getLongValues() {
                                return withOrdinals(build, values, reader.maxDoc());
                            }

                            @Override
                            public Collection<Accountable> getChildResources() {
                                List<Accountable> resources = new ArrayList<>();
                                resources.add(Accountables.namedAccountable("ordinals", build));
                                resources.add(Accountables.namedAccountable("values", values));
                                return Collections.unmodifiableList(resources);
                            }

                        };
                        break;
                    default:
                        throw new ElasticsearchException("unknown memory format: " + formatHint);
                }

            }

            success = true;
            return data;
        } finally {
            if (!success) {
                // If something went wrong, unwind any current estimations we've made
                estimator.afterLoad(termsEnum, 0);
            } else {
                // Adjust as usual, based on the actual size of the field data
                estimator.afterLoad(termsEnum, data.ramBytesUsed());
            }

        }

    }

    /**
     * Estimates the memory footprint of the three single-value layouts and returns the cheapest one.
     * <p>
     * The estimates are: a single packed array sized for {@code maxValue - minValue + 1}, the ordinals
     * structure plus the unique values, and delta-packed pages of {@code pageSize} documents each.
     *
     * @param reader                  segment reader, used for its {@code maxDoc()}
     * @param values                  unique values, indexed by ordinal
     * @param build                   the built ordinals, used for their memory usage
     * @param ordinals                per-document ordinals; repositioned on every document by this method
     * @param minValue                smallest value of the field in this segment
     * @param maxValue                largest value of the field in this segment
     * @param acceptableOverheadRatio overhead ratio passed to {@link PackedInts}
     * @param pageSize                number of documents per page for the paged estimate
     * @return ORDINALS, PAGED or PACKED, whichever estimate is smallest (ties go to PAGED over ORDINALS and to
     *         PACKED over PAGED)
     */
    protected CommonSettings.MemoryStorageFormat chooseStorageFormat(LeafReader reader, PackedLongValues values, Ordinals build, RandomAccessOrds ordinals,
                                                                     long minValue, long maxValue, float acceptableOverheadRatio, int pageSize) {

        // estimate memory usage for a single packed array
        long packedDelta = maxValue - minValue + 1; // allow for a missing value
        // packedDelta can be negative if the difference between max and min values overflows the positive side of longs.
        int bitsRequired = packedDelta < 0 ? 64 : PackedInts.bitsRequired(packedDelta);
        PackedInts.FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(reader.maxDoc(), bitsRequired, acceptableOverheadRatio);
        final long singleValuesSize = formatAndBits.format.longCount(PackedInts.VERSION_CURRENT, reader.maxDoc(), formatAndBits.bitsPerValue) * 8L;

        // ordinal memory usage
        final long ordinalsSize = build.ramBytesUsed() + values.ramBytesUsed();

        // estimate the memory signature of paged packing
        long pagedSingleValuesSize = (reader.maxDoc() / pageSize + 1) * RamUsageEstimator.NUM_BYTES_OBJECT_REF; // array of pages
        int pageIndex = 0;
        long pageMinOrdinal = Long.MAX_VALUE;
        long pageMaxOrdinal = Long.MIN_VALUE;
        // Start at document 0 so that every document is counted and page boundaries fall on multiples of pageSize.
        for (int i = 0; i < reader.maxDoc(); ++i, pageIndex = (pageIndex + 1) % pageSize) {
            ordinals.setDocument(i);
            if (ordinals.cardinality() > 0) {
                long ordinal = ordinals.ordAt(0);
                pageMaxOrdinal = Math.max(ordinal, pageMaxOrdinal);
                pageMinOrdinal = Math.min(ordinal, pageMinOrdinal);
            }
            if (pageIndex == pageSize - 1) {
                // end of page, we now know enough to estimate memory usage
                pagedSingleValuesSize += getPageMemoryUsage(values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal);

                pageMinOrdinal = Long.MAX_VALUE;
                pageMaxOrdinal = Long.MIN_VALUE;
            }
        }

        if (pageIndex > 0) {
            // last, partially filled page
            pagedSingleValuesSize += getPageMemoryUsage(values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal);
        }

        if (ordinalsSize < singleValuesSize) {
            // ordinals beat the single packed array; only paged packing can still be cheaper
            if (ordinalsSize < pagedSingleValuesSize) {
                return CommonSettings.MemoryStorageFormat.ORDINALS;
            }
            return CommonSettings.MemoryStorageFormat.PAGED;
        }
        if (pagedSingleValuesSize < singleValuesSize) {
            return CommonSettings.MemoryStorageFormat.PAGED;
        }
        return CommonSettings.MemoryStorageFormat.PACKED;
    }

    /**
     * Estimates the number of bytes one page of {@code pageSize} documents would take when delta-packed.
     *
     * @param values         unique values, indexed by ordinal
     * @param pageMinOrdinal smallest ordinal seen on the page
     * @param pageMaxOrdinal largest ordinal seen on the page, or {@code Long.MIN_VALUE} if no document on the
     *                       page had a value
     * @return the estimated size in bytes
     */
    private long getPageMemoryUsage(PackedLongValues values, float acceptableOverheadRatio, int pageSize, long pageMinOrdinal, long pageMaxOrdinal) {
        final boolean pageHasNoValues = pageMaxOrdinal == Long.MIN_VALUE;
        if (pageHasNoValues) {
            // empty page - will use the null reader which just stores size
            return RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT);
        }

        final long lowest = values.get(pageMinOrdinal);
        final long highest = values.get(pageMaxOrdinal);
        final long spread = highest - lowest;
        if (spread == 0) {
            // every value on the page is identical, estimated the same way as an empty page
            return RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT);
        }

        // a negative spread means the subtraction overflowed, so all 64 bits are required
        final int bits = spread < 0 ? 64 : PackedInts.bitsRequired(spread);
        final PackedInts.FormatAndBits chosen = PackedInts.fastestFormatAndBits(pageSize, bits, acceptableOverheadRatio);
        long bytes = chosen.format.longCount(PackedInts.VERSION_CURRENT, pageSize, chosen.bitsPerValue) * RamUsageEstimator.NUM_BYTES_LONG;
        bytes += RamUsageEstimator.NUM_BYTES_LONG; // min value per page storage
        return bytes;
    }

    /**
     * Returns the result of {@code AtomicLongFieldData.empty(maxDoc)}.
     *
     * @param maxDoc value forwarded to {@code AtomicLongFieldData.empty}
     */
    @Override
    protected AtomicNumericFieldData empty(int maxDoc) {
        final AtomicNumericFieldData noValues = AtomicLongFieldData.empty(maxDoc);
        return noValues;
    }

    /**
     * Creates a {@link LongValuesComparatorSource} over this field data.
     *
     * @param missingValue passed through to the comparator source; may be {@code null}
     * @param sortMode     passed through to the comparator source
     * @param nested       passed through to the comparator source
     */
    @Override
    public XFieldComparatorSource comparatorSource(@Nullable Object missingValue, MultiValueMode sortMode, Nested nested) {
        final XFieldComparatorSource source = new LongValuesComparatorSource(this, missingValue, sortMode, nested);
        return source;
    }

    /**
     * {@link PerValueEstimator} for numeric field data: wraps the terms in a
     * {@link RamAccountingTermsEnum} before loading and corrects the breaker
     * once loading has finished.
     */
    public class PackedArrayEstimator implements PerValueEstimator {

        private final CircuitBreaker breaker;
        private final NumericType type;
        private final String fieldName;

        /**
         * @param breaker   breaker that receives the adjustments
         * @param type      numeric type of the field being loaded
         * @param fieldName field name handed to the {@link RamAccountingTermsEnum}
         */
        public PackedArrayEstimator(CircuitBreaker breaker, NumericType type, String fieldName) {
            this.fieldName = fieldName;
            this.type = type;
            this.breaker = breaker;
        }

        /**
         * @return the estimated number of bytes per term: a tenth of the type's
         *         required bits, but never less than 4. The term itself is not inspected.
         */
        @Override
        public long bytesPerValue(BytesRef term) {
            // Estimate about 0.8 (8 / 10) compression ratio for numbers,
            // with a floor of 4 bytes
            final long estimate = Math.max(type.requiredBits() / 10, 4);
            return estimate;
        }

        /**
         * @return the terms iterator, wrapped first by the numeric type and then
         *         in a {@link RamAccountingTermsEnum}
         */
        @Override
        public TermsEnum beforeLoad(Terms terms) throws IOException {
            final TermsEnum numericTerms = type.wrapTermsEnum(terms.iterator());
            return new RamAccountingTermsEnum(numericTerms, breaker, this, fieldName);
        }

        /**
         * Removes from the breaker the difference between the total bytes
         * reported by the {@link RamAccountingTermsEnum} and the bytes actually used.
         *
         * @param termsEnum  the enum returned by {@link #beforeLoad(Terms)}
         * @param actualUsed actual field data memory usage
         */
        @Override
        public void afterLoad(TermsEnum termsEnum, long actualUsed) {
            assert termsEnum instanceof RamAccountingTermsEnum;
            final RamAccountingTermsEnum accounted = (RamAccountingTermsEnum) termsEnum;
            final long overEstimation = accounted.getTotalBytes() - actualUsed;
            breaker.addWithoutBreaking(-overEstimation);
        }

        /**
         * Adds {@code actualUsed} bytes to the breaker without tripping it; used
         * when no terms were loaded but the resulting field data still
         * occupies memory.
         *
         * @param actualUsed bytes actually used
         */
        public void adjustForNoTerms(long actualUsed) {
            breaker.addWithoutBreaking(actualUsed);
        }
    }

    /**
     * Exposes ordinal-based storage as {@link SortedNumericDocValues} by resolving each ordinal against
     * {@code values}.
     *
     * @param ordinals per-document ordinals
     * @param values   values indexed by ordinal
     * @param maxDoc   number of documents, used to build the docs-with-value bits in the single-valued case
     * @return a singleton view when the ordinals unwrap to single-valued ordinals, otherwise a multi-valued view
     */
    private static SortedNumericDocValues withOrdinals(Ordinals ordinals, final LongValues values, int maxDoc) {
        final RandomAccessOrds ords = ordinals.ordinals();
        final SortedDocValues singleOrds = DocValues.unwrapSingleton(ords);
        if (singleOrds != null) {
            final NumericDocValues singleValues = new NumericDocValues() {
                @Override
                public long get(int docID) {
                    // look the ordinal up once and reuse it; a negative ordinal means the document has no value
                    final int ord = singleOrds.getOrd(docID);
                    return ord >= 0 ? values.get(ord) : 0;
                }
            };
            return DocValues.singleton(singleValues, DocValues.docsWithValue(ords, maxDoc));
        } else {
            return new SortedNumericDocValues() {
                @Override
                public long valueAt(int index) {
                    return values.get(ords.ordAt(index));
                }

                @Override
                public void setDocument(int doc) {
                    ords.setDocument(doc);
                }

                @Override
                public int count() {
                    return ords.cardinality();
                }
            };
        }
    }

    /**
     * Wraps per-document deltas as single-valued doc values, adding {@code minValue} back to every delta.
     * The result is created with a {@code null} docs-with-field argument.
     *
     * @param deltas   per-document values stored relative to {@code minValue}
     * @param minValue offset added to each delta; when it is 0 the deltas are used as they are
     */
    private static SortedNumericDocValues singles(final NumericDocValues deltas, final long minValue) {
        if (minValue == 0) {
            // nothing to add back, no wrapper needed
            return DocValues.singleton(deltas, null);
        }
        final NumericDocValues shifted = new NumericDocValues() {
            @Override
            public long get(int docID) {
                return minValue + deltas.get(docID);
            }
        };
        return DocValues.singleton(shifted, null);
    }

    /**
     * Wraps per-document deltas as single-valued doc values where a reserved delta marks documents without a
     * value.
     *
     * @param deltas       per-document values stored relative to {@code minValue}
     * @param minValue     offset added to each delta
     * @param missingValue delta that marks a document without a value; such documents read as 0
     * @param maxDoc       length reported by the docs-with-field bits
     */
    private static SortedNumericDocValues sparseSingles(final NumericDocValues deltas, final long minValue,  final long missingValue, final int maxDoc) {
        final Bits hasValue = new Bits() {
            @Override
            public boolean get(int index) {
                return deltas.get(index) != missingValue;
            }

            @Override
            public int length() {
                return maxDoc;
            }
        };
        final NumericDocValues decoded = new NumericDocValues() {
            @Override
            public long get(int docID) {
                final long stored = deltas.get(docID);
                return stored == missingValue ? 0 : minValue + stored;
            }
        };
        return DocValues.singleton(decoded, hasValue);
    }

    /**
     * Wraps paged per-document values as single-valued doc values.
     *
     * @param values        one value per document
     * @param docsWithValue documents that have a value, or {@code null}, in which case every document's stored
     *                      value is returned
     */
    private static SortedNumericDocValues pagedSingles(final PackedLongValues values, final Bits docsWithValue) {
        // the wrapper is needed because NumericDocValues must return 0 for a document without a value
        final NumericDocValues masked = new NumericDocValues() {
            @Override
            public long get(int docID) {
                if (docsWithValue != null && !docsWithValue.get(docID)) {
                    return 0;
                }
                return values.get(docID);
            }
        };
        return DocValues.singleton(masked, docsWithValue);
    }
}