package org.apache.lucene.index.sorter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer; // javadocs
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.MergeTrigger;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.search.Sort;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;

/** A {@link MergePolicy} that reorders documents according to a {@link Sort}
 * before merging them. As a consequence, all segments resulting from a merge
 * will be sorted, while segments resulting from a flush will be in the order
 * in which documents have been added.
 * <p><b>NOTE</b>: Never use this policy if you rely on
 * {@link IndexWriter#addDocuments(Iterable, Analyzer) IndexWriter.addDocuments}
 * to have sequentially-assigned doc IDs; this policy will scatter doc IDs.
 * <p><b>NOTE</b>: This policy should only be used with idempotent {@code Sort}s
 * so that the order of segments is predictable. For example, using
 * {@link Sort#INDEXORDER} in reverse (which is not idempotent) will make
 * the order of documents in a segment depend on the number of times the segment
 * has been merged.
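 * <p>A minimal configuration sketch; the {@code "timestamp"} field and the
 * {@code matchVersion}, {@code analyzer} and {@code dir} variables below are
 * placeholders, not part of this class:
 * <pre class="prettyprint">
 *   Sort sort = new Sort(new SortField("timestamp", SortField.Type.LONG, true));
 *   IndexWriterConfig conf = new IndexWriterConfig(matchVersion, analyzer);
 *   conf.setMergePolicy(new SortingMergePolicy(conf.getMergePolicy(), sort));
 *   IndexWriter writer = new IndexWriter(dir, conf);
 * </pre>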
 * @lucene.experimental */
public final class SortingMergePolicy extends MergePolicy {

  /**
   * Put in the {@link SegmentInfo#getDiagnostics() diagnostics} to denote that
   * this segment is sorted.
   */
  public static final String SORTER_ID_PROP = "sorter";

  class SortingOneMerge extends OneMerge {

    List<AtomicReader> unsortedReaders;
    Sorter.DocMap docMap;
    AtomicReader sortedView;

    SortingOneMerge(List<SegmentCommitInfo> segments) {
      super(segments);
    }

    @Override
    public List<AtomicReader> getMergeReaders() throws IOException {
      if (unsortedReaders == null) {
        unsortedReaders = super.getMergeReaders();
        // Expose the readers to merge as a single atomic view so that they can
        // be sorted globally rather than per segment.
        final AtomicReader atomicView;
        if (unsortedReaders.size() == 1) {
          atomicView = unsortedReaders.get(0);
        } else {
          final IndexReader multiReader = new MultiReader(unsortedReaders.toArray(new AtomicReader[unsortedReaders.size()]));
          atomicView = SlowCompositeReaderWrapper.wrap(multiReader);
        }
        docMap = sorter.sort(atomicView);
        sortedView = SortingAtomicReader.wrap(atomicView, docMap);
      }
      // a null doc map means that the readers are already sorted
      return docMap == null ? unsortedReaders : Collections.singletonList(sortedView);
    }

    @Override
    public void setInfo(SegmentCommitInfo info) {
      Map<String,String> diagnostics = info.info.getDiagnostics();
      diagnostics.put(SORTER_ID_PROP, sorter.getID());
      super.setInfo(info);
    }

    private MonotonicAppendingLongBuffer getDeletes(List<AtomicReader> readers) {
      // For every live doc, record how many deleted docs precede it in the
      // concatenated (pre-merge) doc ID space.
      MonotonicAppendingLongBuffer deletes = new MonotonicAppendingLongBuffer();
      int deleteCount = 0;
      for (AtomicReader reader : readers) {
        final int maxDoc = reader.maxDoc();
        final Bits liveDocs = reader.getLiveDocs();
        for (int i = 0; i < maxDoc; ++i) {
          if (liveDocs != null && !liveDocs.get(i)) {
            ++deleteCount;
          } else {
            deletes.add(deleteCount);
          }
        }
      }
      deletes.freeze();
      return deletes;
    }

    @Override
    public MergePolicy.DocMap getDocMap(final MergeState mergeState) {
      if (unsortedReaders == null) {
        throw new IllegalStateException();
      }
      if (docMap == null) {
        return super.getDocMap(mergeState);
      }
      assert mergeState.docMaps.length == 1; // we returned a singleton reader
      final MonotonicAppendingLongBuffer deletes = getDeletes(unsortedReaders);
      return new MergePolicy.DocMap() {
        @Override
        public int map(int old) {
          final int oldWithDeletes = old + (int) deletes.get(old);
          final int newWithDeletes = docMap.oldToNew(oldWithDeletes);
          return mergeState.docMaps[0].get(newWithDeletes);
        }
      };
    }

  }

  class SortingMergeSpecification extends MergeSpecification {

    @Override
    public void add(OneMerge merge) {
      super.add(new SortingOneMerge(merge.segments));
    }

    @Override
    public String segString(Directory dir) {
      return "SortingMergeSpec(" + super.segString(dir) + ", sorter=" + sorter + ")";
    }

  }

  /** Returns {@code true} if the given {@code reader} is sorted by the specified {@code sort}. */
  public static boolean isSorted(AtomicReader reader, Sort sort) {
    if (reader instanceof SegmentReader) {
      final SegmentReader segReader = (SegmentReader) reader;
      final Map<String, String> diagnostics = segReader.getSegmentInfo().info.getDiagnostics();
      if (diagnostics != null && sort.toString().equals(diagnostics.get(SORTER_ID_PROP))) {
        return true;
      }
    }
    return false;
  }

  // Re-wrap every merge selected by the wrapped policy so that its readers are
  // sorted before being merged.
  private MergeSpecification sortedMergeSpecification(MergeSpecification specification) {
    if (specification == null) {
      return null;
    }
    MergeSpecification sortingSpec = new SortingMergeSpecification();
    for (OneMerge merge : specification.merges) {
      sortingSpec.add(merge);
    }
    return sortingSpec;
  }

  final MergePolicy in;
  final Sorter sorter;
  final Sort sort;

  /** Create a new {@code MergePolicy} that sorts documents with the given {@code sort}.
   */
  public SortingMergePolicy(MergePolicy in, Sort sort) {
    this.in = in;
    this.sorter = new Sorter(sort);
    this.sort = sort;
  }

  @Override
  public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos) throws IOException {
    return sortedMergeSpecification(in.findMerges(mergeTrigger, segmentInfos));
  }

  @Override
  public MergeSpecification findForcedMerges(SegmentInfos segmentInfos, int maxSegmentCount,
      Map<SegmentCommitInfo,Boolean> segmentsToMerge) throws IOException {
    return sortedMergeSpecification(in.findForcedMerges(segmentInfos, maxSegmentCount, segmentsToMerge));
  }

  @Override
  public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos) throws IOException {
    return sortedMergeSpecification(in.findForcedDeletesMerges(segmentInfos));
  }

  @Override
  public MergePolicy clone() {
    return new SortingMergePolicy(in.clone(), sort);
  }

  @Override
  public void close() {
    in.close();
  }

  @Override
  public boolean useCompoundFile(SegmentInfos segments, SegmentCommitInfo newSegment) throws IOException {
    return in.useCompoundFile(segments, newSegment);
  }

  @Override
  public void setIndexWriter(IndexWriter writer) {
    in.setIndexWriter(writer);
  }

  @Override
  public String toString() {
    return "SortingMergePolicy(" + in + ", sorter=" + sorter + ")";
  }

}
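
// A rough sketch of how the SORTER_ID_PROP diagnostic written above might be consumed:
// since only segments produced by a merge under this policy are sorted, a caller can
// check each leaf with isSorted before relying on the sort order. Here "reader" and
// "sort" are assumed to be an open IndexReader and the Sort passed to this policy:
//
//   for (AtomicReaderContext ctx : reader.leaves()) {
//     if (SortingMergePolicy.isSorted(ctx.reader(), sort)) {
//       // documents in this segment are stored in sort order
//     }
//   }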