package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
/** <p>This class implements a {@link MergePolicy} that tries
* to merge segments into levels of exponentially
* increasing size, where each level has fewer segments than
* the value of the merge factor. Whenever extra segments
* (beyond the merge factor upper bound) are encountered,
* all segments within the level are merged. You can get or
* set the merge factor using {@link #getMergeFactor()} and
* {@link #setMergeFactor(int)} respectively.</p>
*
* <p>This class is abstract and requires a subclass to
* define the {@link #size} method which specifies how a
* segment's size is determined. {@link LogDocMergePolicy}
* is one subclass that measures size by document count in
* the segment. {@link LogByteSizeMergePolicy} is another
* subclass that measures size as the total byte size of the
* file(s) for the segment.</p>
*/
public abstract class LogMergePolicy extends MergePolicy {
/** Defines the allowed range of log(size) for each
* level. A level is computed by taking the max segment
* log size, minus LEVEL_LOG_SPAN, and finding all
* segments falling within that range. */
public static final double LEVEL_LOG_SPAN = 0.75;
/** Default merge factor, which is how many segments are
* merged at a time */
public static final int DEFAULT_MERGE_FACTOR = 10;
/** Default maximum segment size. A segment of this size
* or larger will never be merged. @see setMaxMergeDocs */
public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
/** Default noCFSRatio. If a merge's size is >= 10% of
* the index, then we disable compound file for it.
* @see #setNoCFSRatio */
public static final double DEFAULT_NO_CFS_RATIO = 0.1;
protected int mergeFactor = DEFAULT_MERGE_FACTOR;
protected long minMergeSize;
protected long maxMergeSize;
// Although the core MPs set it explicitly, we must default in case someone
// out there wrote his own LMP ...
protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
protected boolean requireContiguousMerge = false;
protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
protected boolean calibrateSizeByDeletes = true;
protected boolean useCompoundFile = true;
public LogMergePolicy() {
super();
}
protected boolean verbose() {
IndexWriter w = writer.get();
return w != null && w.verbose();
}
/** @see #setNoCFSRatio */
public double getNoCFSRatio() {
return noCFSRatio;
}
/** If a merged segment will be more than this percentage
* of the total size of the index, leave the segment as
* non-compound file even if compound file is enabled.
* Set to 1.0 to always use CFS regardless of merge
* size. */
public void setNoCFSRatio(double noCFSRatio) {
if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
}
this.noCFSRatio = noCFSRatio;
}
protected void message(String message) {
if (verbose())
writer.get().message("LMP: " + message);
}
/** If true, merges must be in-order slice of the
* segments. If false, then the merge policy is free to
* pick any segments. The default is false, which is
* in general more efficient than true since it gives the
* merge policy more freedom to pick closely sized
* segments. */
public void setRequireContiguousMerge(boolean v) {
requireContiguousMerge = v;
}
/** See {@link #setRequireContiguousMerge}. */
public boolean getRequireContiguousMerge() {
return requireContiguousMerge;
}
/** <p>Returns the number of segments that are merged at
* once and also controls the total number of segments
* allowed to accumulate in the index.</p> */
public int getMergeFactor() {
return mergeFactor;
}
/** Determines how often segment indices are merged by
* addDocument(). With smaller values, less RAM is used
* while indexing, and searches on unoptimized indices are
* faster, but indexing speed is slower. With larger
* values, more RAM is used during indexing, and while
* searches on unoptimized indices are slower, indexing is
* faster. Thus larger values (> 10) are best for batch
* index creation, and smaller values (< 10) for indices
* that are interactively maintained. */
public void setMergeFactor(int mergeFactor) {
if (mergeFactor < 2)
throw new IllegalArgumentException("mergeFactor cannot be less than 2");
this.mergeFactor = mergeFactor;
}
// Javadoc inherited
@Override
public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
final boolean doCFS;
if (!useCompoundFile) {
doCFS = false;
} else if (noCFSRatio == 1.0) {
doCFS = true;
} else {
long totalSize = 0;
for (SegmentInfo info : infos)
totalSize += size(info);
doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
}
return doCFS;
}
/** Sets whether compound file format should be used for
* newly flushed and newly merged segments. */
public void setUseCompoundFile(boolean useCompoundFile) {
this.useCompoundFile = useCompoundFile;
}
/** Returns true if newly flushed and newly merge segments
* are written in compound file format. @see
* #setUseCompoundFile */
public boolean getUseCompoundFile() {
return useCompoundFile;
}
/** Sets whether the segment size should be calibrated by
* the number of deletes when choosing segments for merge. */
public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
this.calibrateSizeByDeletes = calibrateSizeByDeletes;
}
/** Returns true if the segment size should be calibrated
* by the number of deletes when choosing segments for merge. */
public boolean getCalibrateSizeByDeletes() {
return calibrateSizeByDeletes;
}
@Override
public void close() {}
abstract protected long size(SegmentInfo info) throws IOException;
protected long sizeDocs(SegmentInfo info) throws IOException {
if (calibrateSizeByDeletes) {
int delCount = writer.get().numDeletedDocs(info);
assert delCount <= info.docCount;
return (info.docCount - (long)delCount);
} else {
return info.docCount;
}
}
protected long sizeBytes(SegmentInfo info) throws IOException {
long byteSize = info.sizeInBytes(true);
if (calibrateSizeByDeletes) {
int delCount = writer.get().numDeletedDocs(info);
double delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount));
assert delRatio <= 1.0;
return (info.docCount <= 0 ? byteSize : (long)(byteSize * (1.0 - delRatio)));
} else {
return byteSize;
}
}
protected boolean isOptimized(SegmentInfos infos, int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
final int numSegments = infos.size();
int numToOptimize = 0;
SegmentInfo optimizeInfo = null;
for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
final SegmentInfo info = infos.info(i);
if (segmentsToOptimize.contains(info)) {
numToOptimize++;
optimizeInfo = info;
}
}
return numToOptimize <= maxNumSegments &&
(numToOptimize != 1 || isOptimized(optimizeInfo));
}
/** Returns true if this single info is optimized (has no
* pending norms or deletes, is in the same dir as the
* writer, and matches the current compound file setting */
protected boolean isOptimized(SegmentInfo info)
throws IOException {
IndexWriter w = writer.get();
assert w != null;
boolean hasDeletions = w.numDeletedDocs(info) > 0;
return !hasDeletions &&
!info.hasSeparateNorms() &&
info.dir == w.getDirectory() &&
(info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
}
/**
* Returns the merges necessary to optimize the index, taking the max merge
* size or max merge docs into consideration. This method attempts to respect
* the {@code maxNumSegments} parameter, however it might be, due to size
* constraints, that more than that number of segments will remain in the
* index. Also, this method does not guarantee that exactly {@code
* maxNumSegments} will remain, but <= that number.
*/
private MergeSpecification findMergesForOptimizeSizeLimit(
SegmentInfos infos, int maxNumSegments, int last) throws IOException {
MergeSpecification spec = new MergeSpecification();
int start = last - 1;
while (start >= 0) {
SegmentInfo info = infos.info(start);
if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
if (verbose()) {
message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
}
// need to skip that segment + add a merge for the 'right' segments,
// unless there is only 1 which is optimized.
if (last - start - 1 > 1 || (start != last - 1 && !isOptimized(infos.info(start + 1)))) {
// there is more than 1 segment to the right of this one, or an unoptimized single segment.
spec.add(new OneMerge(infos.range(start + 1, last)));
}
last = start;
} else if (last - start == mergeFactor) {
// mergeFactor eligible segments were found, add them as a merge.
spec.add(new OneMerge(infos.range(start, last)));
last = start;
}
--start;
}
// Add any left-over segments, unless there is just 1 already optimized.
if (last > 0 && (++start + 1 < last || !isOptimized(infos.info(start)))) {
spec.add(new OneMerge(infos.range(start, last)));
}
return spec.merges.size() == 0 ? null : spec;
}
/**
* Returns the merges necessary to optimize the index. This method constraints
* the returned merges only by the {@code maxNumSegments} parameter, and
* guaranteed that exactly that number of segments will remain in the index.
*/
private MergeSpecification findMergesForOptimizeMaxNumSegments(SegmentInfos infos, int maxNumSegments, int last) throws IOException {
MergeSpecification spec = new MergeSpecification();
// First, enroll all "full" merges (size
// mergeFactor) to potentially be run concurrently:
while (last - maxNumSegments + 1 >= mergeFactor) {
spec.add(new OneMerge(infos.range(last - mergeFactor, last)));
last -= mergeFactor;
}
// Only if there are no full merges pending do we
// add a final partial (< mergeFactor segments) merge:
if (0 == spec.merges.size()) {
if (maxNumSegments == 1) {
// Since we must optimize down to 1 segment, the
// choice is simple:
if (last > 1 || !isOptimized(infos.info(0))) {
spec.add(new OneMerge(infos.range(0, last)));
}
} else if (last > maxNumSegments) {
// Take care to pick a partial merge that is
// least cost, but does not make the index too
// lopsided. If we always just picked the
// partial tail then we could produce a highly
// lopsided index over time:
// We must merge this many segments to leave
// maxNumSegments in the index (from when
// optimize was first kicked off):
final int finalMergeSize = last - maxNumSegments + 1;
// Consider all possible starting points:
long bestSize = 0;
int bestStart = 0;
for(int i=0;i<last-finalMergeSize+1;i++) {
long sumSize = 0;
for(int j=0;j<finalMergeSize;j++)
sumSize += size(infos.info(j+i));
if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
bestStart = i;
bestSize = sumSize;
}
}
spec.add(new OneMerge(infos.range(bestStart, bestStart + finalMergeSize)));
}
}
return spec.merges.size() == 0 ? null : spec;
}
/** Returns the merges necessary to optimize the index.
* This merge policy defines "optimized" to mean only the
* requested number of segments is left in the index, and
* respects the {@link #maxMergeSizeForOptimize} setting.
* By default, and assuming {@code maxNumSegments=1}, only
* one segment will be left in the index, where that segment
* has no deletions pending nor separate norms, and it is in
* compound file format if the current useCompoundFile
* setting is true. This method returns multiple merges
* (mergeFactor at a time) so the {@link MergeScheduler}
* in use may make use of concurrency. */
@Override
public MergeSpecification findMergesForOptimize(SegmentInfos infos,
int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
assert maxNumSegments > 0;
if (verbose()) {
message("findMergesForOptimize: maxNumSegs=" + maxNumSegments + " segsToOptimize= "+ segmentsToOptimize);
}
// If the segments are already optimized (e.g. there's only 1 segment), or
// there are <maxNumSegements, all optimized, nothing to do.
if (isOptimized(infos, maxNumSegments, segmentsToOptimize)) {
if (verbose()) {
message("already optimized; skip");
}
return null;
}
// TODO: handle non-contiguous merge case differently?
// Find the newest (rightmost) segment that needs to
// be optimized (other segments may have been flushed
// since optimize started):
int last = infos.size();
while (last > 0) {
final SegmentInfo info = infos.info(--last);
if (segmentsToOptimize.contains(info)) {
last++;
break;
}
}
if (last == 0) {
if (verbose()) {
message("last == 0; skip");
}
return null;
}
// There is only one segment already, and it is optimized
if (maxNumSegments == 1 && last == 1 && isOptimized(infos.info(0))) {
if (verbose()) {
message("already 1 seg; skip");
}
return null;
}
// Check if there are any segments above the threshold
boolean anyTooLarge = false;
for (int i = 0; i < last; i++) {
SegmentInfo info = infos.info(i);
if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
anyTooLarge = true;
break;
}
}
if (anyTooLarge) {
return findMergesForOptimizeSizeLimit(infos, maxNumSegments, last);
} else {
return findMergesForOptimizeMaxNumSegments(infos, maxNumSegments, last);
}
}
/**
* Finds merges necessary to expunge all deletes from the
* index. We simply merge adjacent segments that have
* deletes, up to mergeFactor at a time.
*/
@Override
public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos)
throws CorruptIndexException, IOException {
final int numSegments = segmentInfos.size();
if (verbose())
message("findMergesToExpungeDeletes: " + numSegments + " segments");
MergeSpecification spec = new MergeSpecification();
int firstSegmentWithDeletions = -1;
IndexWriter w = writer.get();
assert w != null;
for(int i=0;i<numSegments;i++) {
final SegmentInfo info = segmentInfos.info(i);
int delCount = w.numDeletedDocs(info);
if (delCount > 0) {
if (verbose())
message(" segment " + info.name + " has deletions");
if (firstSegmentWithDeletions == -1)
firstSegmentWithDeletions = i;
else if (i - firstSegmentWithDeletions == mergeFactor) {
// We've seen mergeFactor segments in a row with
// deletions, so force a merge now:
if (verbose())
message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i)));
firstSegmentWithDeletions = i;
}
} else if (firstSegmentWithDeletions != -1) {
// End of a sequence of segments with deletions, so,
// merge those past segments even if it's fewer than
// mergeFactor segments
if (verbose())
message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i)));
firstSegmentWithDeletions = -1;
}
}
if (firstSegmentWithDeletions != -1) {
if (verbose())
message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, numSegments)));
}
return spec;
}
private static class SegmentInfoAndLevel implements Comparable<SegmentInfoAndLevel> {
SegmentInfo info;
float level;
int index;
public SegmentInfoAndLevel(SegmentInfo info, float level, int index) {
this.info = info;
this.level = level;
this.index = index;
}
// Sorts largest to smallest
public int compareTo(SegmentInfoAndLevel other) {
if (level < other.level)
return 1;
else if (level > other.level)
return -1;
else
return 0;
}
}
private static class SortByIndex implements Comparator<SegmentInfoAndLevel> {
public int compare(SegmentInfoAndLevel o1, SegmentInfoAndLevel o2) {
return o1.index - o2.index;
}
}
private static final SortByIndex sortByIndex = new SortByIndex();
/** Checks if any merges are now necessary and returns a
* {@link MergePolicy.MergeSpecification} if so. A merge
* is necessary when there are more than {@link
* #setMergeFactor} segments at a given level. When
* multiple levels have too many segments, this method
* will return multiple merges, allowing the {@link
* MergeScheduler} to use concurrency. */
@Override
public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
final int numSegments = infos.size();
if (verbose())
message("findMerges: " + numSegments + " segments");
// Compute levels, which is just log (base mergeFactor)
// of the size of each segment
final List<SegmentInfoAndLevel> levels = new ArrayList<SegmentInfoAndLevel>();
final float norm = (float) Math.log(mergeFactor);
final Collection<SegmentInfo> mergingSegments = writer.get().getMergingSegments();
for(int i=0;i<numSegments;i++) {
final SegmentInfo info = infos.info(i);
long size = size(info);
// When we require contiguous merge, we still add the
// segment to levels to avoid merging "across" a set
// of segment being merged:
if (!requireContiguousMerge && mergingSegments.contains(info)) {
if (verbose()) {
message("seg " + info.name + " already being merged; skip");
}
continue;
}
// Floor tiny segments
if (size < 1) {
size = 1;
}
final SegmentInfoAndLevel infoLevel = new SegmentInfoAndLevel(info, (float) Math.log(size)/norm, i);
levels.add(infoLevel);
if (verbose()) {
message("seg " + info.name + " level=" + infoLevel.level + " size=" + size);
}
}
if (!requireContiguousMerge) {
Collections.sort(levels);
}
final float levelFloor;
if (minMergeSize <= 0)
levelFloor = (float) 0.0;
else
levelFloor = (float) (Math.log(minMergeSize)/norm);
// Now, we quantize the log values into levels. The
// first level is any segment whose log size is within
// LEVEL_LOG_SPAN of the max size, or, who has such as
// segment "to the right". Then, we find the max of all
// other segments and use that to define the next level
// segment, etc.
MergeSpecification spec = null;
final int numMergeableSegments = levels.size();
int start = 0;
while(start < numMergeableSegments) {
// Find max level of all segments not already
// quantized.
float maxLevel = levels.get(start).level;
for(int i=1+start;i<numMergeableSegments;i++) {
final float level = levels.get(i).level;
if (level > maxLevel)
maxLevel = level;
}
// Now search backwards for the rightmost segment that
// falls into this level:
float levelBottom;
if (maxLevel <= levelFloor)
// All remaining segments fall into the min level
levelBottom = -1.0F;
else {
levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
// Force a boundary at the level floor
if (levelBottom < levelFloor && maxLevel >= levelFloor)
levelBottom = levelFloor;
}
int upto = numMergeableSegments-1;
while(upto >= start) {
if (levels.get(upto).level >= levelBottom) {
break;
}
upto--;
}
if (verbose())
message(" level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments");
// Finally, record all merges that are viable at this level:
int end = start + mergeFactor;
while(end <= 1+upto) {
boolean anyTooLarge = false;
for(int i=start;i<end;i++) {
final SegmentInfo info = levels.get(i).info;
anyTooLarge |= (size(info) >= maxMergeSize || sizeDocs(info) >= maxMergeDocs);
}
if (!anyTooLarge) {
if (spec == null)
spec = new MergeSpecification();
if (verbose()) {
message(" " + start + " to " + end + ": add this merge");
}
Collections.sort(levels.subList(start, end), sortByIndex);
final SegmentInfos mergeInfos = new SegmentInfos();
for(int i=start;i<end;i++) {
mergeInfos.add(levels.get(i).info);
assert infos.contains(levels.get(i).info);
}
spec.add(new OneMerge(mergeInfos));
} else if (verbose()) {
message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
}
start = end;
end = start + mergeFactor;
}
start = 1+upto;
}
return spec;
}
/** <p>Determines the largest segment (measured by
* document count) that may be merged with other segments.
* Small values (e.g., less than 10,000) are best for
* interactive indexing, as this limits the length of
* pauses while indexing to a few seconds. Larger values
* are best for batched indexing and speedier
* searches.</p>
*
* <p>The default value is {@link Integer#MAX_VALUE}.</p>
*
* <p>The default merge policy ({@link
* LogByteSizeMergePolicy}) also allows you to set this
* limit by net size (in MB) of the segment, using {@link
* LogByteSizeMergePolicy#setMaxMergeMB}.</p>
*/
public void setMaxMergeDocs(int maxMergeDocs) {
this.maxMergeDocs = maxMergeDocs;
}
/** Returns the largest segment (measured by document
* count) that may be merged with other segments.
* @see #setMaxMergeDocs */
public int getMaxMergeDocs() {
return maxMergeDocs;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
sb.append("minMergeSize=").append(minMergeSize).append(", ");
sb.append("mergeFactor=").append(mergeFactor).append(", ");
sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
sb.append("requireContiguousMerge=").append(requireContiguousMerge);
sb.append("]");
return sb.toString();
}
}