/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.fielddata.ordinals;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PagedGrowableWriter;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;

/**
 * Simple class to build document ID <-> ordinal mapping. Note: Ordinals are
 * <tt>1</tt> based monotonically increasing positive integers. <tt>0</tt>
 * denotes the missing value in this context.
 */
public final class OrdinalsBuilder implements Closeable {

    /**
     * Whether to force the use of {@link MultiOrdinals} to store the ordinals for testing purposes.
     */
    public static final String FORCE_MULTI_ORDINALS = "force_multi_ordinals";

    /**
     * Default acceptable overhead ratio. {@link OrdinalsBuilder} memory usage is mostly transient so it is likely a better trade-off to
     * trade memory for speed in order to resize less often.
     */
    public static final float DEFAULT_ACCEPTABLE_OVERHEAD_RATIO = PackedInts.FAST;

    /**
     * The following structure is used to store ordinals. The idea is to store ords on levels of increasing sizes. Level 0 stores
     * 1 value and 1 pointer to level 1. Level 1 stores 2 values and 1 pointer to level 2, ..., Level n stores 2**n values and
     * 1 pointer to level n+1. If at some point an ordinal or a pointer has 0 as a value, this means that there are no remaining
     * values. On the first level, ordinals.get(docId) is the first ordinal for docId or 0 if the document has no ordinals. On
     * subsequent levels, the first 2^level slots are reserved and all have 0 as a value.
     * <pre>
     * Example for an index of 3 docs (O = ordinal, P = pointer)
     * Level 0:
     *   ordinals          [1] [4] [2]
     *   nextLevelSlices    1   0   2
     * Level 1:
     *   ordinals          [0 0] [2 0] [3 4]
     *   nextLevelSlices     0     0     1
     * Level 2:
     *   ordinals          [0 0 0 0] [5 0 0 0]
     *   nextLevelSlices       0         0
     * </pre>
     * On level 0, all documents have an ordinal: 0 has 1, 1 has 4 and 2 has 2 as a first ordinal, which means that we need to read
     * nextLevelSlices to get the index of their remaining ordinals on the next level. The entry for document 1 is 0, meaning that
     * we have already read all its ordinals. On the contrary, 0 and 2 have more ordinals, which are stored at indices 1 and 2.
     * Let's continue with document 2: it has 2 more ordinals on level 1: 3 and 4, and its next level index is 1, meaning that there
     * are remaining ordinals on the next level. On level 2 at index 1, we can read [5 0 0 0], meaning that 5 is an ordinal as well,
     * but the fact that it is followed by zeros means that there are no more ordinals. In the end, document 2 has 2, 3, 4 and 5 as
     * ordinals.
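     * <p>
     * The same walk for document 2, written as an illustrative trace against the fields
     * declared below (values taken from the example above):
     * <pre>
     * firstOrdinals.get(2)        == 2           // first ordinal of document 2
     * firstNextLevelSlices.get(2) == 2           // continue at level 1, slice 2
     * // level 1, slice 2 holds      [3 4]       // two more ordinals
     * nextLevelSlices[1].get(2)   == 1           // continue at level 2, slice 1
     * // level 2, slice 1 holds      [5 0 0 0]   // ordinal 5, then 0: stop
     * </pre>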
     * <p>
     * In addition to these structures, there is another array which stores the current position (level + slice + offset in the slice)
     * in order to be able to append data in constant time.
     */
    private static class OrdinalsStore {

        private static final int PAGE_SIZE = 1 << 12;

        /**
         * Number of slots at <code>level</code>
         */
        private static int numSlots(int level) {
            return 1 << level;
        }

        private static int slotsMask(int level) {
            return numSlots(level) - 1;
        }

        /**
         * Encode the position for the given level and offset. The idea is to encode the level using unary coding in the lower bits and
         * then the offset in the higher bits.
         */
        private static long position(int level, long offset) {
            assert level >= 1;
            return (1 << (level - 1)) | (offset << level);
        }

        /**
         * Decode the level from an encoded position.
         */
        private static int level(long position) {
            return 1 + Long.numberOfTrailingZeros(position);
        }

        /**
         * Decode the offset from the position.
         */
        private static long offset(long position, int level) {
            return position >>> level;
        }

        /**
         * Get the ID of the slice given an offset.
         */
        private static long sliceID(int level, long offset) {
            return offset >>> level;
        }

        /**
         * Compute the first offset of the given slice.
         */
        private static long startOffset(int level, long slice) {
            return slice << level;
        }

        /**
         * Compute the number of ordinals stored for a value given its current position.
         */
        private static int numOrdinals(int level, long offset) {
            return (1 << level) + (int) (offset & slotsMask(level));
        }
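        // Why numOrdinals works: a document whose current position is at (level, offset)
        // has filled every slice on the previous levels, i.e. 1 + 2 + ... + 2^(level-1)
        // = 2^level - 1 ordinals, plus (offset & slotsMask(level)) + 1 slots in its current
        // slice, which sums to the (1 << level) + (offset & slotsMask(level)) returned above.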

        // Current position
        private PagedGrowableWriter positions;
        // First level (0) of ordinals and pointers to the next level
        private final GrowableWriter firstOrdinals;
        private PagedGrowableWriter firstNextLevelSlices;
        // Ordinals and pointers for levels >= 1
        private final PagedGrowableWriter[] ordinals;
        private final PagedGrowableWriter[] nextLevelSlices;
        private final int[] sizes;

        private final int startBitsPerValue;
        private final float acceptableOverheadRatio;

        OrdinalsStore(int maxDoc, int startBitsPerValue, float acceptableOverheadRatio) {
            this.startBitsPerValue = startBitsPerValue;
            this.acceptableOverheadRatio = acceptableOverheadRatio;
            positions = new PagedGrowableWriter(maxDoc, PAGE_SIZE, startBitsPerValue, acceptableOverheadRatio);
            firstOrdinals = new GrowableWriter(startBitsPerValue, maxDoc, acceptableOverheadRatio);
            // over-allocate so that we never need to worry about the array sizes:
            // 24 levels allow storing several million ordinals per doc
            ordinals = new PagedGrowableWriter[24];
            nextLevelSlices = new PagedGrowableWriter[24];
            sizes = new int[24];
            Arrays.fill(sizes, 1); // reserve the 1st slice on every level
        }

        /**
         * Allocate a new slice and return its ID.
         */
        private long newSlice(int level) {
            final long newSlice = sizes[level]++;
            // Lazily allocate ordinals
            if (ordinals[level] == null) {
                ordinals[level] = new PagedGrowableWriter(8L * numSlots(level), PAGE_SIZE, startBitsPerValue, acceptableOverheadRatio);
            } else {
                ordinals[level] = ordinals[level].grow(sizes[level] * numSlots(level));
                if (nextLevelSlices[level] != null) {
                    nextLevelSlices[level] = nextLevelSlices[level].grow(sizes[level]);
                }
            }
            return newSlice;
        }

        public int addOrdinal(int docID, long ordinal) {
            final long position = positions.get(docID);
            if (position == 0L) { // on the first level
                return firstLevel(docID, ordinal);
            } else {
                return nonFirstLevel(docID, ordinal, position);
            }
        }

        private int firstLevel(int docID, long ordinal) {
            // 0 or 1 ordinal
            if (firstOrdinals.get(docID) == 0L) {
                firstOrdinals.set(docID, ordinal + 1);
                return 1;
            } else {
                final long newSlice = newSlice(1);
                if (firstNextLevelSlices == null) {
                    firstNextLevelSlices = new PagedGrowableWriter(firstOrdinals.size(), PAGE_SIZE, 3, acceptableOverheadRatio);
                }
                firstNextLevelSlices.set(docID, newSlice);
                final long offset = startOffset(1, newSlice);
                ordinals[1].set(offset, ordinal + 1);
                positions.set(docID, position(1, offset)); // current position is on the 1st level and not allocated yet
                return 2;
            }
        }

        private int nonFirstLevel(int docID, long ordinal, long position) {
            int level = level(position);
            long offset = offset(position, level);
            assert offset != 0L;
            if (((offset + 1) & slotsMask(level)) == 0L) {
                // reached the end of the slice, allocate a new one on the next level
                final long newSlice = newSlice(level + 1);
                if (nextLevelSlices[level] == null) {
                    nextLevelSlices[level] = new PagedGrowableWriter(sizes[level], PAGE_SIZE, 1, acceptableOverheadRatio);
                }
                nextLevelSlices[level].set(sliceID(level, offset), newSlice);
                ++level;
                offset = startOffset(level, newSlice);
                assert (offset & slotsMask(level)) == 0L;
            } else {
                // just go to the next slot
                ++offset;
            }
            ordinals[level].set(offset, ordinal + 1);
            final long newPosition = position(level, offset);
            positions.set(docID, newPosition);
            return numOrdinals(level, offset);
        }

        public void appendOrdinals(int docID, LongsRef ords) {
            // First level
            final long firstOrd = firstOrdinals.get(docID);
            if (firstOrd == 0L) {
                return;
            }
            ords.longs = ArrayUtil.grow(ords.longs, ords.offset + ords.length + 1);
            ords.longs[ords.offset + ords.length++] = firstOrd - 1;
            if (firstNextLevelSlices == null) {
                return;
            }
            long sliceID = firstNextLevelSlices.get(docID);
            if (sliceID == 0L) {
                return;
            }
            // Other levels
            for (int level = 1; ; ++level) {
                final int numSlots = numSlots(level);
                ords.longs = ArrayUtil.grow(ords.longs, ords.offset + ords.length + numSlots);
                final long offset = startOffset(level, sliceID);
                for (int j = 0; j < numSlots; ++j) {
                    final long ord = ordinals[level].get(offset + j);
                    if (ord == 0L) {
                        return;
                    }
                    ords.longs[ords.offset + ords.length++] = ord - 1;
                }
                if (nextLevelSlices[level] == null) {
                    return;
                }
                sliceID = nextLevelSlices[level].get(sliceID);
                if (sliceID == 0L) {
                    return;
                }
            }
        }

    }
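    /*
     * Illustrative sketch of how the store is driven ("doc" stands for any document ID;
     * the concrete values are only an example):
     *
     *   OrdinalsStore store = new OrdinalsStore(maxDoc, 8, PackedInts.FAST);
     *   store.addOrdinal(doc, 0);         // first ordinal, kept in firstOrdinals
     *   store.addOrdinal(doc, 3);         // second ordinal, spills to a level-1 slice
     *   LongsRef ords = new LongsRef();
     *   store.appendOrdinals(doc, ords);  // ords now holds [0, 3]
     */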

    private final int maxDoc;
    private long currentOrd = -1;
    private int numDocsWithValue = 0;
    private int numMultiValuedDocs = 0;
    private int totalNumOrds = 0;

    private OrdinalsStore ordinals;
    private final LongsRef spare;

    public OrdinalsBuilder(int maxDoc, float acceptableOverheadRatio) throws IOException {
        this.maxDoc = maxDoc;
        int startBitsPerValue = 8;
        ordinals = new OrdinalsStore(maxDoc, startBitsPerValue, acceptableOverheadRatio);
        spare = new LongsRef();
    }

    public OrdinalsBuilder(int maxDoc) throws IOException {
        this(maxDoc, DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    }

    /**
     * Returns a shared {@link LongsRef} instance for the given doc ID holding all ordinals associated with it.
     */
    public LongsRef docOrds(int docID) {
        spare.offset = spare.length = 0;
        ordinals.appendOrdinals(docID, spare);
        return spare;
    }

    /**
     * Return a {@link org.apache.lucene.util.packed.PackedInts.Reader} instance mapping every doc ID to its first ordinal + 1 if it
     * exists and 0 otherwise.
     */
    public PackedInts.Reader getFirstOrdinals() {
        return ordinals.firstOrdinals;
    }

    /**
     * Advances the {@link OrdinalsBuilder} to the next ordinal and
     * returns the current ordinal.
     */
    public long nextOrdinal() {
        return ++currentOrd;
    }

    /**
     * Returns the current ordinal, or <tt>-1</tt> if this builder has not been advanced via
     * {@link #nextOrdinal()} yet.
     */
    public long currentOrdinal() {
        return currentOrd;
    }

    /**
     * Associates the given document id with the current ordinal.
     */
    public OrdinalsBuilder addDoc(int doc) {
        totalNumOrds++;
        final int numValues = ordinals.addOrdinal(doc, currentOrd);
        if (numValues == 1) {
            ++numDocsWithValue;
        } else if (numValues == 2) {
            ++numMultiValuedDocs;
        }
        return this;
    }

    /**
     * Returns <code>true</code> iff this builder contains a document ID that is associated with more than one ordinal,
     * <code>false</code> otherwise.
     */
    public boolean isMultiValued() {
        return numMultiValuedDocs > 0;
    }

    /**
     * Returns the number of distinct document IDs with one or more values.
     */
    public int getNumDocsWithValue() {
        return numDocsWithValue;
    }

    /**
     * Returns the number of distinct document IDs associated with exactly one value.
     */
    public int getNumSingleValuedDocs() {
        return numDocsWithValue - numMultiValuedDocs;
    }

    /**
     * Returns the number of distinct document IDs associated with two or more values.
     */
    public int getNumMultiValuesDocs() {
        return numMultiValuedDocs;
    }

    /**
     * Returns the number of document ID to ordinal pairs in this builder.
     */
    public int getTotalNumOrds() {
        return totalNumOrds;
    }

    /**
     * Returns the number of distinct ordinals in this builder.
     */
    public long getValueCount() {
        return currentOrd + 1;
    }

    /**
     * Builds a {@link BitSet} where a document's bit is set iff the document has one or more ordinals associated with it.
     * If every document has an ordinal associated with it, this method returns <code>null</code>.
     */
    public BitSet buildDocsWithValuesSet() {
        if (numDocsWithValue == maxDoc) {
            return null;
        }
        final FixedBitSet bitSet = new FixedBitSet(maxDoc);
        for (int docID = 0; docID < maxDoc; ++docID) {
            if (ordinals.firstOrdinals.get(docID) != 0) {
                bitSet.set(docID);
            }
        }
        return bitSet;
    }

    /**
     * Builds an {@link Ordinals} instance from the builder's current state.
     */
    public Ordinals build() {
        final float acceptableOverheadRatio = PackedInts.DEFAULT;
        if (numMultiValuedDocs > 0
                || MultiOrdinals.significantlySmallerThanSinglePackedOrdinals(maxDoc, numDocsWithValue, getValueCount(), acceptableOverheadRatio)) {
            // MultiOrdinals can be smaller than SinglePackedOrdinals for sparse fields
            return new MultiOrdinals(this, acceptableOverheadRatio);
        } else {
            return new SinglePackedOrdinals(this, acceptableOverheadRatio);
        }
    }

    /**
     * Returns the maximum document ID this builder can associate with an ordinal.
     */
    public int maxDoc() {
        return maxDoc;
    }

    /**
     * This method iterates all terms in the given {@link TermsEnum} and
     * associates each term's ordinal with the term's documents. The caller must
     * exhaust the returned {@link BytesRefIterator}, which returns all values
     * where the first returned value is associated with the ordinal <tt>1</tt>
     * etc.
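     * <p>
     * A typical consumption loop looks as follows (an illustrative sketch; {@code termsEnum}
     * is assumed to come from the field's {@link org.apache.lucene.index.Terms}):
     * <pre>
     * BytesRefIterator iter = builder.buildFromTerms(termsEnum);
     * while (iter.next() != null) {
     *     // one ordinal has been assigned to all documents of the current term
     * }
     * Ordinals ordinals = builder.build();
     * </pre>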
     */
    public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
        return new BytesRefIterator() {
            private PostingsEnum docsEnum = null;

            @Override
            public BytesRef next() throws IOException {
                BytesRef ref;
                if ((ref = termsEnum.next()) != null) {
                    docsEnum = termsEnum.postings(docsEnum, PostingsEnum.NONE);
                    nextOrdinal();
                    int docId;
                    while ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                        addDoc(docId);
                    }
                }
                return ref;
            }
        };
    }

    /**
     * Closes this builder and releases all resources.
     */
    @Override
    public void close() throws IOException {
        ordinals = null;
    }

}