package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.index.DocValues.SortedSource;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Utility class for merging SortedBytes DocValues instances.
 *
 * @lucene.internal
 */
public final class SortedBytesMergeUtils {

  private SortedBytesMergeUtils() {
    // no instance
  }

  /** Creates the {@link MergeContext} necessary for merging the ordinals. */
  public static MergeContext init(Type type, DocValues[] docValues,
      Comparator<BytesRef> comp, int mergeDocCount) {
    int size = -1;
    if (type == Type.BYTES_FIXED_SORTED) {
      for (DocValues indexDocValues : docValues) {
        if (indexDocValues != null) {
          size = indexDocValues.getValueSize();
          break;
        }
      }
      assert size >= 0;
    }
    return new MergeContext(comp, mergeDocCount, size, type);
  }

  /**
   * Encapsulates contextual information about the merge. This class holds
   * document id to ordinal mappings, offsets for variable length values and
   * the comparator to sort the merged bytes.
   *
   * @lucene.internal
   */
  public static final class MergeContext {
    private final Comparator<BytesRef> comp;
    private final BytesRef missingValue = new BytesRef();

    /** How many bytes each value occupies, or -1 if it varies. */
    public final int sizePerValues; // -1 if var length

    final Type type;

    /** Maps each document to the ordinal for its value. */
    public final int[] docToEntry;

    /** File end-offset for each merged value, indexed by ordinal; will be
     *  null if it's not needed (eg fixed-size values). */
    public long[] offsets; // if non-null, #mergeRecords collects byte offsets here

    /** Sole constructor. */
    public MergeContext(Comparator<BytesRef> comp, int mergeDocCount, int size,
        Type type) {
      assert type == Type.BYTES_FIXED_SORTED || type == Type.BYTES_VAR_SORTED;
      this.comp = comp;
      this.sizePerValues = size;
      this.type = type;
      if (size > 0) {
        missingValue.grow(size);
        missingValue.length = size;
      }
      docToEntry = new int[mergeDocCount];
    }

    /** Returns number of documents merged. */
    public int getMergeDocCount() {
      return docToEntry.length;
    }
  }
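  /*
   * Illustrative sketch only, not part of the original class: shows how the
   * pieces in this file typically compose during a segment merge. The
   * docValues, docMaps, docBases and datOut arguments are assumed to come
   * from the codec's merge state; the exact wiring varies per consumer.
   */
  @SuppressWarnings("unused")
  private static int exampleMerge(Type type, DocValues[] docValues,
      MergeState.DocMap[] docMaps, int[] docBases, int mergeDocCount,
      Comparator<BytesRef> comp, IndexOutput datOut) throws IOException {
    final MergeContext ctx = init(type, docValues, comp, mergeDocCount);
    if (type == Type.BYTES_VAR_SORTED) {
      // var-length values: a non-null array asks mergeRecords to record the
      // end offset of every merged value (the array is grown on demand)
      ctx.offsets = new long[1];
    }
    final List<SortedSourceSlice> slices = buildSlices(docBases, docMaps,
        docValues, ctx);
    // streams the de-duplicated, sorted values to the data output and
    // returns the number of distinct merged values
    return mergeRecords(ctx, new IndexOutputBytesRefConsumer(datOut), slices);
  }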
  /** Creates the {@link SortedSourceSlice}s for merging. */
  public static List<SortedSourceSlice> buildSlices(int[] docBases,
      MergeState.DocMap[] docMaps, DocValues[] docValues, MergeContext ctx)
      throws IOException {
    final List<SortedSourceSlice> slices = new ArrayList<SortedSourceSlice>();
    for (int i = 0; i < docValues.length; i++) {
      final SortedSourceSlice nextSlice;
      final Source directSource;
      if (docValues[i] != null
          && (directSource = docValues[i].getDirectSource()) != null) {
        final SortedSourceSlice slice = new SortedSourceSlice(i,
            directSource.asSortedSource(), docBases, ctx.getMergeDocCount(),
            ctx.docToEntry);
        nextSlice = slice;
      } else {
        nextSlice = new SortedSourceSlice(i, new MissingValueSource(ctx),
            docBases, ctx.getMergeDocCount(), ctx.docToEntry);
      }
      createOrdMapping(docBases, docMaps, nextSlice);
      slices.add(nextSlice);
    }
    return Collections.unmodifiableList(slices);
  }

  /*
   * In order to merge we need to map the ords used in each segment to the new
   * global ords in the new segment. Additionally we need to drop values that
   * are no longer referenced due to deleted documents. This method walks all
   * live documents and fetches their current ordinal. We store this ordinal
   * per slice (SortedSourceSlice#ordMapping) and remember the doc-to-ord
   * mapping in docIDToRelativeOrd. After the merge, SortedSourceSlice#ordMapping
   * maps each referenced relative ordinal to its new global ordinal plus one
   * (0 marks an unreferenced value).
   */
  private static void createOrdMapping(int[] docBases,
      MergeState.DocMap[] docMaps, SortedSourceSlice currentSlice) {
    final int readerIdx = currentSlice.readerIdx;
    final MergeState.DocMap currentDocMap = docMaps[readerIdx];
    final int docBase = currentSlice.docToOrdStart;
    assert docBase == docBases[readerIdx];
    if (currentDocMap != null && currentDocMap.hasDeletions()) { // we have deletes
      for (int i = 0; i < currentDocMap.maxDoc(); i++) {
        final int doc = currentDocMap.get(i);
        if (doc != -1) { // not deleted
          final int ord = currentSlice.source.ord(i); // ords are collected in
                                                      // strictly increasing order
          currentSlice.docIDToRelativeOrd[docBase + doc] = ord;
          // use ord + 1 to identify unreferenced values (i.e. == 0)
          currentSlice.ordMapping[ord] = ord + 1;
        }
      }
    } else { // no deletes
      final int numDocs = currentSlice.docToOrdEnd - currentSlice.docToOrdStart;
      for (int doc = 0; doc < numDocs; doc++) {
        final int ord = currentSlice.source.ord(doc);
        currentSlice.docIDToRelativeOrd[docBase + doc] = ord;
        // use ord + 1 to identify unreferenced values (i.e. == 0)
        currentSlice.ordMapping[ord] = ord + 1;
      }
    }
  }
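  /*
   * Worked example of the mapping above (hypothetical values): suppose a
   * segment holds the sorted values {"a", "c", "d"} with segment-relative
   * ords {0, 1, 2}, and every document that referenced "c" was deleted.
   * createOrdMapping then never touches ordMapping[1], so it stays 0 and
   * SortedSourceSlice#next() skips "c" entirely; the merged segment never
   * stores that value. "a" and "d" later receive their new global ords
   * (plus one) in RecordMerger#pullTop().
   */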
  /** Does the "real work" of merging the slices and computing the ord mapping. */
  public static int mergeRecords(MergeContext ctx, BytesRefConsumer consumer,
      List<SortedSourceSlice> slices) throws IOException {
    final RecordMerger merger = new RecordMerger(new MergeQueue(slices.size(),
        ctx.comp), slices.toArray(new SortedSourceSlice[0]));
    long[] offsets = ctx.offsets;
    final boolean recordOffsets = offsets != null;
    long offset = 0;
    BytesRef currentMergedBytes;
    merger.pushTop();
    while (merger.queue.size() > 0) {
      merger.pullTop();
      currentMergedBytes = merger.current;
      assert ctx.sizePerValues == -1
          || ctx.sizePerValues == currentMergedBytes.length : "size: "
          + ctx.sizePerValues + " actual: " + currentMergedBytes.length;
      offset += currentMergedBytes.length;
      if (recordOffsets) {
        if (merger.currentOrd >= offsets.length) {
          offsets = ArrayUtil.grow(offsets, merger.currentOrd + 1);
        }
        offsets[merger.currentOrd] = offset;
      }
      consumer.consume(currentMergedBytes, merger.currentOrd, offset);
      merger.pushTop();
    }
    ctx.offsets = offsets;
    assert offsets == null || offsets[merger.currentOrd - 1] == offset;
    return merger.currentOrd;
  }

  /**
   * Implementations of this interface consume the merged bytes with their
   * corresponding ordinal and byte offset. The offset is the byte offset in
   * the merge target at which the currently merged {@link BytesRef} instance
   * ends, i.e. its start offset plus its length.
   */
  public static interface BytesRefConsumer {

    /**
     * Consumes a single {@link BytesRef}. The provided {@link BytesRef}
     * instances are strictly increasing with respect to the
     * {@link Comparator} used for merging.
     *
     * @param ref
     *          the {@link BytesRef} to consume
     * @param ord
     *          the ordinal of the given {@link BytesRef} in the merge target
     * @param offset
     *          the byte offset at which the given {@link BytesRef} ends in
     *          the merge target
     * @throws IOException
     *           if an {@link IOException} occurs
     */
    public void consume(BytesRef ref, int ord, long offset) throws IOException;
  }

  /**
   * A simple {@link BytesRefConsumer} that writes the merged {@link BytesRef}
   * instances sequentially to an {@link IndexOutput}.
   */
  public static final class IndexOutputBytesRefConsumer implements BytesRefConsumer {
    private final IndexOutput datOut;

    /** Sole constructor. */
    public IndexOutputBytesRefConsumer(IndexOutput datOut) {
      this.datOut = datOut;
    }

    @Override
    public void consume(BytesRef currentMergedBytes, int ord, long offset)
        throws IOException {
      datOut.writeBytes(currentMergedBytes.bytes, currentMergedBytes.offset,
          currentMergedBytes.length);
    }
  }
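  /*
   * Illustrative alternative consumer, not part of the original class: keeps
   * deep copies of the merged values in memory, e.g. for tests. Note that the
   * BytesRef passed to consume() is reused by the merger, so it must be
   * copied if it is retained beyond the callback.
   */
  @SuppressWarnings("unused")
  private static final class CollectingBytesRefConsumer implements BytesRefConsumer {
    final List<BytesRef> values = new ArrayList<BytesRef>();

    @Override
    public void consume(BytesRef ref, int ord, long offset) {
      assert ord == values.size(); // ords arrive as 0, 1, 2, ...
      final BytesRef copy = new BytesRef();
      copy.copyBytes(ref);
      values.add(copy);
    }
  }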
  /**
   * {@link RecordMerger} merges a list of {@link SortedSourceSlice}s lazily by
   * consuming the sorted source records one at a time, de-duplicating records
   * that are shared across slices. The algorithm is based on a lazy priority
   * queue so that merge sources are not read into heap memory up-front.
   *
   * @lucene.internal
   */
  private static final class RecordMerger {
    private final MergeQueue queue;
    private final SortedSourceSlice[] top;
    private int numTop;
    BytesRef current;
    int currentOrd = -1;

    RecordMerger(MergeQueue queue, SortedSourceSlice[] top) {
      super();
      this.queue = queue;
      this.top = top;
      this.numTop = top.length;
    }

    private void pullTop() {
      // extract all subs from the queue that have the same top record
      assert numTop == 0;
      assert currentOrd >= 0;
      while (true) {
        final SortedSourceSlice popped = top[numTop++] = queue.pop();
        // use ord + 1 to identify unreferenced values (i.e. == 0)
        popped.ordMapping[popped.relativeOrd] = currentOrd + 1;
        if (queue.size() == 0
            || !(queue.top()).current.bytesEquals(top[0].current)) {
          break;
        }
      }
      current = top[0].current;
    }

    private void pushTop() {
      // call next() on each top, and put back into queue
      for (int i = 0; i < numTop; i++) {
        top[i].current = top[i].next();
        if (top[i].current != null) {
          queue.add(top[i]);
        }
      }
      currentOrd++;
      numTop = 0;
    }
  }
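  /*
   * Example of the de-duplication above (hypothetical values): if slice A's
   * current value is "foo" at relative ord 3 and slice B's current value is
   * also "foo" at relative ord 7, pullTop() pops both slices in one pass and
   * maps A.ordMapping[3] and B.ordMapping[7] to the same global ord (plus
   * one), so "foo" is written to the merged segment exactly once.
   */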
  /**
   * {@link SortedSourceSlice} represents a single {@link SortedSource} merge
   * candidate. It encapsulates ordinal and pre-calculated target doc id to
   * ordinal mappings. This class also holds state private to the merge
   * process.
   *
   * @lucene.internal
   */
  public static class SortedSourceSlice {
    final SortedSource source;
    final int readerIdx;
    /* global array indexed by docID containing the relative ord for the doc */
    final int[] docIDToRelativeOrd;
    /*
     * Maps relative ords to merged global ords: the index is the relative
     * ordinal and the value is the new global ordinal plus one (0 marks an
     * unreferenced value). This map is updated as we merge ords. Later we use
     * docIDToRelativeOrd to look up each doc's previous relative ord, and this
     * map to translate it to the new global ord.
     */
    final int[] ordMapping;
    /* start index into docIDToRelativeOrd */
    final int docToOrdStart;
    /* end index into docIDToRelativeOrd */
    final int docToOrdEnd;
    BytesRef current = new BytesRef();
    /* the currently merged relative ordinal */
    int relativeOrd = -1;

    SortedSourceSlice(int readerIdx, SortedSource source, int[] docBase,
        int mergeDocCount, int[] docToOrd) {
      super();
      this.readerIdx = readerIdx;
      this.source = source;
      this.docIDToRelativeOrd = docToOrd;
      this.ordMapping = new int[source.getValueCount()];
      this.docToOrdStart = docBase[readerIdx];
      this.docToOrdEnd = this.docToOrdStart
          + numDocs(docBase, mergeDocCount, readerIdx);
    }

    private static int numDocs(int[] docBase, int mergedDocCount, int readerIndex) {
      if (readerIndex == docBase.length - 1) {
        return mergedDocCount - docBase[readerIndex];
      }
      return docBase[readerIndex + 1] - docBase[readerIndex];
    }

    BytesRef next() {
      for (int i = relativeOrd + 1; i < ordMapping.length; i++) {
        if (ordMapping[i] != 0) { // skip ords that are no longer referenced
          source.getByOrd(i, current);
          relativeOrd = i;
          return current;
        }
      }
      return null;
    }

    /**
     * Fills in the absolute ords for this slice.
     *
     * @return the provided {@code docToOrd}
     */
    public int[] toAbsolutOrds(int[] docToOrd) {
      for (int i = docToOrdStart; i < docToOrdEnd; i++) {
        final int mappedOrd = docIDToRelativeOrd[i];
        assert mappedOrd < ordMapping.length;
        assert ordMapping[mappedOrd] > 0 : "illegal mapping ord maps to an unreferenced value";
        docToOrd[i] = ordMapping[mappedOrd] - 1;
      }
      return docToOrd;
    }

    /** Writes ords for this slice. */
    public void writeOrds(PackedInts.Writer writer) throws IOException {
      for (int i = docToOrdStart; i < docToOrdEnd; i++) {
        final int mappedOrd = docIDToRelativeOrd[i];
        assert mappedOrd < ordMapping.length;
        assert ordMapping[mappedOrd] > 0 : "illegal mapping ord maps to an unreferenced value";
        writer.add(ordMapping[mappedOrd] - 1);
      }
    }
  }

  /*
   * If a segment has no values at all, we use this source to fill in the
   * missing value in the right place (depending on the comparator used).
   */
  private static final class MissingValueSource extends SortedSource {
    private BytesRef missingValue;

    public MissingValueSource(MergeContext ctx) {
      super(ctx.type, ctx.comp);
      this.missingValue = ctx.missingValue;
    }

    @Override
    public int ord(int docID) {
      return 0;
    }

    @Override
    public BytesRef getByOrd(int ord, BytesRef bytesRef) {
      bytesRef.copyBytes(missingValue);
      return bytesRef;
    }

    @Override
    public PackedInts.Reader getDocToOrd() {
      return null;
    }

    @Override
    public int getValueCount() {
      return 1;
    }
  }

  /*
   * The merge queue: orders slices by their current value, breaking ties by
   * slice start so the merge is stable.
   */
  private static final class MergeQueue extends PriorityQueue<SortedSourceSlice> {
    final Comparator<BytesRef> comp;

    public MergeQueue(int maxSize, Comparator<BytesRef> comp) {
      super(maxSize);
      this.comp = comp;
    }

    @Override
    protected boolean lessThan(SortedSourceSlice a, SortedSourceSlice b) {
      int cmp = comp.compare(a.current, b.current);
      if (cmp != 0) {
        return cmp < 0;
      } else { // just a tie-breaker
        return a.docToOrdStart < b.docToOrdStart;
      }
    }
  }
}
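/*
 * Illustrative sketch only, not part of the original file: shows the final
 * step of a sorted-bytes merge, where each slice's doc-to-ord mapping is
 * written with a PackedInts.Writer after mergeRecords(...) has returned the
 * number of distinct merged values (maxOrd). The idxOut output and the
 * helper class name are assumptions for illustration.
 */
final class SortedBytesMergeUtilsUsageSketch {
  static void writeMergedOrds(IndexOutput idxOut, int maxOrd,
      int mergeDocCount, List<SortedBytesMergeUtils.SortedSourceSlice> slices)
      throws IOException {
    // one packed entry per merged document, wide enough for the largest ord
    final PackedInts.Writer ordsWriter = PackedInts.getWriter(idxOut,
        mergeDocCount, PackedInts.bitsRequired(maxOrd - 1), PackedInts.DEFAULT);
    for (SortedBytesMergeUtils.SortedSourceSlice slice : slices) {
      slice.writeOrds(ordsWriter);
    }
    ordsWriter.finish();
  }
}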