package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.DocValues.SortedSource;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.packed.PackedInts;
/**
* Utility class for merging SortedBytes DocValues
* instances.
*
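* <p>A typical merge looks roughly like this (a sketch; {@code docValues},
* {@code docBases}, {@code docMaps}, {@code mergeDocCount} and {@code datOut}
* are assumed to be supplied by the surrounding merge):
* <pre>
* MergeContext ctx = SortedBytesMergeUtils.init(Type.BYTES_VAR_SORTED,
*     docValues, BytesRef.getUTF8SortedAsUnicodeComparator(), mergeDocCount);
* List&lt;SortedSourceSlice&gt; slices =
*     SortedBytesMergeUtils.buildSlices(docBases, docMaps, docValues, ctx);
* int numMergedValues = SortedBytesMergeUtils.mergeRecords(ctx,
*     new IndexOutputBytesRefConsumer(datOut), slices);
* // each slice can now resolve its documents' global ords, e.g. via
* // SortedSourceSlice#writeOrds or SortedSourceSlice#toAbsolutOrds
* </pre>
*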
* @lucene.internal
*/
public final class SortedBytesMergeUtils {
private SortedBytesMergeUtils() {
// no instance
}
/** Creates the {@link MergeContext} necessary for merging
* the ordinals. */
public static MergeContext init(Type type, DocValues[] docValues,
Comparator<BytesRef> comp, int mergeDocCount) {
int size = -1;
if (type == Type.BYTES_FIXED_SORTED) {
for (DocValues indexDocValues : docValues) {
if (indexDocValues != null) {
size = indexDocValues.getValueSize();
break;
}
}
assert size >= 0;
}
return new MergeContext(comp, mergeDocCount, size, type);
}
/**
* Encapsulates contextual information about the merge.
* This class holds document-id-to-ordinal mappings, offsets for
* variable-length values, and the comparator used to sort the
* merged bytes.
*
* @lucene.internal
*/
public static final class MergeContext {
private final Comparator<BytesRef> comp;
private final BytesRef missingValue = new BytesRef();
/** How many bytes each value occupies, or -1 if it
* varies. */
public final int sizePerValues; // -1 if var length
final Type type;
/** Maps each document to the ordinal for its value. */
public final int[] docToEntry;
/** File-offset for each document; will be null if it's
* not needed (eg fixed-size values). */
public long[] offsets; // if non-null #mergeRecords collects byte offsets here
/** Sole constructor. */
public MergeContext(Comparator<BytesRef> comp, int mergeDocCount,
int size, Type type) {
assert type == Type.BYTES_FIXED_SORTED || type == Type.BYTES_VAR_SORTED;
this.comp = comp;
this.sizePerValues = size;
this.type = type;
if (size > 0) {
missingValue.grow(size);
missingValue.length = size;
}
docToEntry = new int[mergeDocCount];
}
/** Returns number of documents merged. */
public int getMergeDocCount() {
return docToEntry.length;
}
}
/** Creates the {@link SortedSourceSlice}s for
* merging. */
public static List<SortedSourceSlice> buildSlices(
int[] docBases, MergeState.DocMap[] docMaps,
DocValues[] docValues, MergeContext ctx) throws IOException {
final List<SortedSourceSlice> slices = new ArrayList<SortedSourceSlice>();
for (int i = 0; i < docValues.length; i++) {
final SortedSourceSlice nextSlice;
final Source directSource;
if (docValues[i] != null
&& (directSource = docValues[i].getDirectSource()) != null) {
final SortedSourceSlice slice = new SortedSourceSlice(i,
directSource.asSortedSource(), docBases, ctx.getMergeDocCount(),
ctx.docToEntry);
nextSlice = slice;
} else {
nextSlice = new SortedSourceSlice(i, new MissingValueSource(ctx),
docBases, ctx.getMergeDocCount(), ctx.docToEntry);
}
createOrdMapping(docBases, docMaps, nextSlice);
slices.add(nextSlice);
}
return Collections.unmodifiableList(slices);
}
/*
* In order to merge we need to map the ords used in each segment to the new
* global ords in the merged segment. Additionally, we need to drop values
* that are no longer referenced due to deleted documents. This method walks
* all live documents, fetches their current ordinal, marks that ordinal as
* referenced in the per-slice SortedSourceSlice#ordMapping, and remembers
* the doc-to-ord mapping in docIDToRelativeOrd. After the merge,
* SortedSourceSlice#ordMapping maps each relative ordinal to its new global
* ordinal.
*/
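/*
* Example (a sketch): suppose a slice's source holds {"a", "b", "c"} with
* segment ords {0, 1, 2}, and every document referencing "b" was deleted.
* createOrdMapping then sets ordMapping[0] = 1 and ordMapping[2] = 3
* (ord + 1), while ordMapping[1] stays 0; "b" is therefore skipped by the
* slice's next() and never reaches the merged segment.
*/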
private static void createOrdMapping(int[] docBases, MergeState.DocMap[] docMaps,
SortedSourceSlice currentSlice) {
final int readerIdx = currentSlice.readerIdx;
final MergeState.DocMap currentDocMap = docMaps[readerIdx];
final int docBase = currentSlice.docToOrdStart;
assert docBase == docBases[readerIdx];
if (currentDocMap != null && currentDocMap.hasDeletions()) { // we have deletes
for (int i = 0; i < currentDocMap.maxDoc(); i++) {
final int doc = currentDocMap.get(i);
if (doc != -1) { // not deleted
final int ord = currentSlice.source.ord(i); // collect ords, strictly increasing
currentSlice.docIDToRelativeOrd[docBase + doc] = ord;
// use ord + 1 to identify unreferenced values (i.e. == 0)
currentSlice.ordMapping[ord] = ord + 1;
}
}
} else { // no deletes
final int numDocs = currentSlice.docToOrdEnd - currentSlice.docToOrdStart;
for (int doc = 0; doc < numDocs; doc++) {
final int ord = currentSlice.source.ord(doc);
currentSlice.docIDToRelativeOrd[docBase + doc] = ord;
// use ord + 1 to identify unreferenced values (i.e. == 0)
currentSlice.ordMapping[ord] = ord + 1;
}
}
}
/** Does the "real work" of merging the slices and
* computing the ord mapping. */
public static int mergeRecords(MergeContext ctx, BytesRefConsumer consumer,
List<SortedSourceSlice> slices) throws IOException {
final RecordMerger merger = new RecordMerger(new MergeQueue(slices.size(),
ctx.comp), slices.toArray(new SortedSourceSlice[0]));
long[] offsets = ctx.offsets;
final boolean recordOffsets = offsets != null;
long offset = 0;
BytesRef currentMergedBytes;
merger.pushTop();
while (merger.queue.size() > 0) {
merger.pullTop();
currentMergedBytes = merger.current;
assert ctx.sizePerValues == -1 || ctx.sizePerValues == currentMergedBytes.length : "size: "
+ ctx.sizePerValues + " spare: " + currentMergedBytes.length;
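// offset is advanced before it is recorded and handed to the consumer, so
// offsets[ord] holds the end offset of value ord (the start of the next one)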
offset += currentMergedBytes.length;
if (recordOffsets) {
if (merger.currentOrd >= offsets.length) {
offsets = ArrayUtil.grow(offsets, merger.currentOrd + 1);
}
offsets[merger.currentOrd] = offset;
}
consumer.consume(currentMergedBytes, merger.currentOrd, offset);
merger.pushTop();
}
ctx.offsets = offsets;
assert offsets == null || offsets[merger.currentOrd - 1] == offset;
return merger.currentOrd;
}
/**
* Implementations of this interface consume the merged bytes together with
* their corresponding ordinal and byte offset. The offset is the byte offset
* in the target sorted source at which the currently merged {@link BytesRef}
* instance should be stored.
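* <p>A minimal sketch of a custom consumer that collects deep copies of the
* merged values (the merger reuses the incoming {@link BytesRef}):
* <pre>
* final List&lt;BytesRef&gt; values = new ArrayList&lt;BytesRef&gt;();
* BytesRefConsumer collector = new BytesRefConsumer() {
*   public void consume(BytesRef ref, int ord, long offset) {
*     assert ord == values.size(); // ords arrive as 0, 1, 2, ...
*     BytesRef copy = new BytesRef();
*     copy.copyBytes(ref);
*     values.add(copy);
*   }
* };
* </pre>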
*/
public static interface BytesRefConsumer {
/**
* Consumes a single {@link BytesRef}. The provided {@link BytesRef}
* instances are passed in strictly increasing order with respect to the
* {@link Comparator} used for merging.
*
* @param ref
* the {@link BytesRef} to consume
* @param ord
* the ordinal of the given {@link BytesRef} in the merge target
* @param offset
* the byte offset of the given {@link BytesRef} in the merge
* target
* @throws IOException
* if an {@link IOException} occurs
*/
public void consume(BytesRef ref, int ord, long offset) throws IOException;
}
/**
* A simple {@link BytesRefConsumer} that writes the merged {@link BytesRef}
* instances sequentially to an {@link IndexOutput}.
*/
public static final class IndexOutputBytesRefConsumer implements BytesRefConsumer {
private final IndexOutput datOut;
/** Sole constructor. */
public IndexOutputBytesRefConsumer(IndexOutput datOut) {
this.datOut = datOut;
}
@Override
public void consume(BytesRef currentMergedBytes, int ord, long offset) throws IOException {
datOut.writeBytes(currentMergedBytes.bytes, currentMergedBytes.offset,
currentMergedBytes.length);
}
}
/**
* {@link RecordMerger} merges a list of {@link SortedSourceSlice}s lazily by
* consuming the sorted source records one by one and de-duplicating records
* that are shared across slices. The algorithm is based on a lazy priority
* queue that avoids loading the merge sources into heap memory all at once.
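* <p>For example (a sketch): if one slice holds {"a", "c"} and another holds
* {"b", "c"}, the queue yields "a" (global ord 0), then "b" (ord 1), and
* finally pulls "c" from both slices at once, assigning both occurrences the
* shared global ord 2.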
*
* @lucene.internal
*/
private static final class RecordMerger {
private final MergeQueue queue;
private final SortedSourceSlice[] top;
private int numTop;
BytesRef current;
int currentOrd = -1;
RecordMerger(MergeQueue queue, SortedSourceSlice[] top) {
this.queue = queue;
this.top = top;
this.numTop = top.length;
}
private void pullTop() {
// extract all subs from the queue that have the same
// top record
assert numTop == 0;
assert currentOrd >= 0;
while (true) {
final SortedSourceSlice popped = top[numTop++] = queue.pop();
// use ord + 1 to identify unreferenced values (i.e. == 0)
popped.ordMapping[popped.relativeOrd] = currentOrd + 1;
if (queue.size() == 0
|| !(queue.top()).current.bytesEquals(top[0].current)) {
break;
}
}
current = top[0].current;
}
private void pushTop() {
// call next() on each top, and put back into queue
for (int i = 0; i < numTop; i++) {
top[i].current = top[i].next();
if (top[i].current != null) {
queue.add(top[i]);
}
}
currentOrd++;
numTop = 0;
}
}
/**
* {@link SortedSourceSlice} represents a single {@link SortedSource} merge candidate.
* It encapsulates the ordinal mapping and the pre-calculated target
* doc-id-to-ordinal mappings.
* This class also holds state private to the merge process.
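* <p>A sketch of writing the merged ords after
* {@link SortedBytesMergeUtils#mergeRecords}, assuming a
* {@link PackedInts.Writer} sized for {@code ctx.getMergeDocCount()} values
* (writer construction is codec-specific and omitted here):
* <pre>
* for (SortedSourceSlice slice : slices) {
*   slice.writeOrds(ordsWriter);
* }
* ordsWriter.finish();
* </pre>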
* @lucene.internal
*/
public static class SortedSourceSlice {
final SortedSource source;
final int readerIdx;
/* global array indexed by docID containing the relative ord for the doc */
final int[] docIDToRelativeOrd;
/*
* Maps relative (per-segment) ords to merged global ords: the index is the
* relative ord and the value is the new global ord + 1 (0 marks a value
* that is no longer referenced). The map is filled in as ords are merged;
* afterwards docIDToRelativeOrd yields a document's previous relative ord,
* which this map translates into the new global ord.
*/
final int[] ordMapping;
/* start index into docIDToRelativeOrd */
final int docToOrdStart;
/* end index into docIDToRelativeOrd */
final int docToOrdEnd;
BytesRef current = new BytesRef();
/* the currently merged relative ordinal */
int relativeOrd = -1;
SortedSourceSlice(int readerIdx, SortedSource source, int[] docBase, int mergeDocCount,
int[] docToOrd) {
this.readerIdx = readerIdx;
this.source = source;
this.docIDToRelativeOrd = docToOrd;
this.ordMapping = new int[source.getValueCount()];
this.docToOrdStart = docBase[readerIdx];
this.docToOrdEnd = this.docToOrdStart + numDocs(docBase, mergeDocCount, readerIdx);
}
private static int numDocs(int[] docBase, int mergedDocCount, int readerIndex) {
if (readerIndex == docBase.length - 1) {
return mergedDocCount - docBase[readerIndex];
}
return docBase[readerIndex + 1] - docBase[readerIndex];
}
BytesRef next() {
for (int i = relativeOrd + 1; i < ordMapping.length; i++) {
if (ordMapping[i] != 0) { // skip ords that are not referenced anymore
source.getByOrd(i, current);
relativeOrd = i;
return current;
}
}
return null;
}
/** Fills in the absolute ords for this slice.
*
* @return the provided {@code docToOrd} */
public int[] toAbsolutOrds(int[] docToOrd) {
for (int i = docToOrdStart; i < docToOrdEnd; i++) {
final int mappedOrd = docIDToRelativeOrd[i];
assert mappedOrd < ordMapping.length;
assert ordMapping[mappedOrd] > 0 : "illegal mapping ord maps to an unreferenced value";
docToOrd[i] = ordMapping[mappedOrd] - 1;
}
return docToOrd;
}
/** Writes ords for this slice. */
public void writeOrds(PackedInts.Writer writer) throws IOException {
for (int i = docToOrdStart; i < docToOrdEnd; i++) {
final int mappedOrd = docIDToRelativeOrd[i];
assert mappedOrd < ordMapping.length;
assert ordMapping[mappedOrd] > 0 : "illegal mapping ord maps to an unreferenced value";
writer.add(ordMapping[mappedOrd] - 1);
}
}
}
/*
* If a segment has no values at all, we use this source to fill in the
* missing value at the right position (depending on the comparator used)
*/
private static final class MissingValueSource extends SortedSource {
private BytesRef missingValue;
public MissingValueSource(MergeContext ctx) {
super(ctx.type, ctx.comp);
this.missingValue = ctx.missingValue;
}
@Override
public int ord(int docID) {
return 0;
}
@Override
public BytesRef getByOrd(int ord, BytesRef bytesRef) {
bytesRef.copyBytes(missingValue);
return bytesRef;
}
@Override
public PackedInts.Reader getDocToOrd() {
return null;
}
@Override
public int getValueCount() {
return 1;
}
}
/*
* merge queue
*/
private static final class MergeQueue extends
PriorityQueue<SortedSourceSlice> {
final Comparator<BytesRef> comp;
public MergeQueue(int maxSize, Comparator<BytesRef> comp) {
super(maxSize);
this.comp = comp;
}
@Override
protected boolean lessThan(SortedSourceSlice a, SortedSourceSlice b) {
int cmp = comp.compare(a.current, b.current);
if (cmp != 0) {
return cmp < 0;
} else { // just a tie-breaker
return a.docToOrdStart < b.docToOrdStart;
}
}
}
}