package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

/**
 * Merge policy that tries to balance not doing large
 * segment merges with not accumulating too many segments in
 * the index, to provide for better performance in a near
 * real-time setting.
 *
 * <p>This is based on code from zoie, described in more detail
 * at http://code.google.com/p/zoie/wiki/ZoieMergePolicy.</p>
 */
public class BalancedSegmentMergePolicy extends LogByteSizeMergePolicy {

  public static final int DEFAULT_NUM_LARGE_SEGMENTS = 10;

  private boolean _partialExpunge = false;
  private int _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
  private int _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
  private int _maxSegments = _numLargeSegments + _maxSmallSegments;

  public BalancedSegmentMergePolicy() {
  }
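  // Usage sketch (illustrative only; assumes a Lucene 3.1+ setup where merge
  // policies are configured through IndexWriterConfig -- the config object
  // below is the caller's, not part of this class):
  //
  //   BalancedSegmentMergePolicy mp = new BalancedSegmentMergePolicy();
  //   BalancedSegmentMergePolicy.MergePolicyParams params =
  //       new BalancedSegmentMergePolicy.MergePolicyParams();
  //   params.setNumLargeSegments(12);   // keep roughly 12 large segments
  //   params.setPartialExpunge(true);   // opportunistically reclaim deletes
  //   mp.setMergePolicyParams(params);
  //   indexWriterConfig.setMergePolicy(mp);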
  public void setMergePolicyParams(MergePolicyParams params) {
    if (params != null) {
      setPartialExpunge(params._doPartialExpunge);
      setNumLargeSegments(params._numLargeSegments);
      setMaxSmallSegments(params._maxSmallSegments);
      setMergeFactor(params._mergeFactor);
      setUseCompoundFile(params._useCompoundFile);
      setMaxMergeDocs(params._maxMergeDocs);
    }
  }

  @Override
  protected long size(SegmentInfo info) throws IOException {
    long byteSize = info.sizeInBytes();
    float delRatio = (info.docCount <= 0 ? 0.0f
        : ((float) info.getDelCount() / (float) info.docCount));
    return (info.docCount <= 0 ? byteSize
        : (long) ((1.0f - delRatio) * byteSize));
  }

  public void setPartialExpunge(boolean doPartialExpunge) {
    _partialExpunge = doPartialExpunge;
  }

  public boolean getPartialExpunge() {
    return _partialExpunge;
  }

  public void setNumLargeSegments(int numLargeSegments) {
    if (numLargeSegments < 2) {
      throw new IllegalArgumentException("numLargeSegments cannot be less than 2");
    }
    _numLargeSegments = numLargeSegments;
    _maxSegments = _numLargeSegments + 2 * getMergeFactor();
  }

  public int getNumLargeSegments() {
    return _numLargeSegments;
  }

  public void setMaxSmallSegments(int maxSmallSegments) {
    if (maxSmallSegments < getMergeFactor()) {
      throw new IllegalArgumentException("maxSmallSegments cannot be less than mergeFactor");
    }
    _maxSmallSegments = maxSmallSegments;
    _maxSegments = _numLargeSegments + _maxSmallSegments;
  }

  public int getMaxSmallSegments() {
    return _maxSmallSegments;
  }

  @Override
  public void setMergeFactor(int mergeFactor) {
    super.setMergeFactor(mergeFactor);
    if (_maxSmallSegments < getMergeFactor()) {
      _maxSmallSegments = getMergeFactor();
      _maxSegments = _numLargeSegments + _maxSmallSegments;
    }
  }

  private boolean isOptimized(SegmentInfos infos, IndexWriter writer,
      int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
    final int numSegments = infos.size();
    int numToOptimize = 0;
    SegmentInfo optimizeInfo = null;
    for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++) {
      final SegmentInfo info = infos.info(i);
      if (segmentsToOptimize.contains(info)) {
        numToOptimize++;
        optimizeInfo = info;
      }
    }
    return numToOptimize <= maxNumSegments &&
        (numToOptimize != 1 || isOptimized(writer, optimizeInfo));
  }

  private boolean isOptimized(IndexWriter writer, SegmentInfo info) throws IOException {
    assert writer != null;
    return !info.hasDeletions() &&
        !info.hasSeparateNorms() &&
        info.dir == writer.getDirectory() &&
        info.getUseCompoundFile() == getUseCompoundFile();
  }

  @Override
  public MergeSpecification findMergesForOptimize(SegmentInfos infos,
      int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
    assert maxNumSegments > 0;

    MergeSpecification spec = null;

    if (!isOptimized(infos, writer.get(), maxNumSegments, segmentsToOptimize)) {
      // Find the newest (rightmost) segment that needs to
      // be optimized (other segments may have been flushed
      // since optimize started):
      int last = infos.size();
      while (last > 0) {
        final SegmentInfo info = infos.info(--last);
        if (segmentsToOptimize.contains(info)) {
          last++;
          break;
        }
      }

      if (last > 0) {
        if (maxNumSegments == 1) {
          // Since we must optimize down to 1 segment, the
          // choice is simple:
          boolean useCompoundFile = getUseCompoundFile();
          if (last > 1 || !isOptimized(writer.get(), infos.info(0))) {
            spec = new MergeSpecification();
            spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
          }
        } else if (last > maxNumSegments) {
          // find the most balanced merges
          spec = findBalancedMerges(infos, last, maxNumSegments, _partialExpunge);
        }
      }
    }
    return spec;
  }

  private MergeSpecification findBalancedMerges(SegmentInfos infos, int infoLen,
      int maxNumSegments, boolean partialExpunge) throws IOException {
    if (infoLen <= maxNumSegments) return null;

    MergeSpecification spec = new MergeSpecification();
    boolean useCompoundFile = getUseCompoundFile();

    // use the Viterbi algorithm to find the best segmentation;
    // we try to minimize the size variance of the resulting segments.
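    // Table semantics for the dynamic program below: variance[i][j] is the
    // squared relative deviation from the optimal segment size when segments
    // i .. i+j are merged into one (see createVarianceTable). After row i is
    // processed, sumVariance[j] holds the minimal total variance of packing
    // segments 0 .. i+j into i+1 merged segments, and backLink[i][j] records
    // the split point needed to recover that packing in the trace-back.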
    double[][] variance = createVarianceTable(infos, infoLen, maxNumSegments);

    final int maxMergeSegments = infoLen - maxNumSegments + 1;
    double[] sumVariance = new double[maxMergeSegments];
    int[][] backLink = new int[maxNumSegments][maxMergeSegments];

    for (int i = (maxMergeSegments - 1); i >= 0; i--) {
      sumVariance[i] = variance[0][i];
      backLink[0][i] = 0;
    }
    for (int i = 1; i < maxNumSegments; i++) {
      for (int j = (maxMergeSegments - 1); j >= 0; j--) {
        double minV = Double.MAX_VALUE;
        int minK = 0;
        for (int k = j; k >= 0; k--) {
          double v = sumVariance[k] + variance[i + k][j - k];
          if (v < minV) {
            minV = v;
            minK = k;
          }
        }
        sumVariance[j] = minV;
        backLink[i][j] = minK;
      }
    }

    // now, trace back the back links to find all merges;
    // also find a candidate for partial expunge if requested
    int mergeEnd = infoLen;
    int prev = maxMergeSegments - 1;
    int expungeCandidate = -1;
    int maxDelCount = 0;
    for (int i = maxNumSegments - 1; i >= 0; i--) {
      prev = backLink[i][prev];
      int mergeStart = i + prev;
      if ((mergeEnd - mergeStart) > 1) {
        spec.add(new OneMerge(infos.range(mergeStart, mergeEnd), useCompoundFile));
      } else {
        if (partialExpunge) {
          SegmentInfo info = infos.info(mergeStart);
          int delCount = info.getDelCount();
          if (delCount > maxDelCount) {
            expungeCandidate = mergeStart;
            maxDelCount = delCount;
          }
        }
      }
      mergeEnd = mergeStart;
    }

    if (partialExpunge && maxDelCount > 0) {
      // expunge deletes
      spec.add(new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), useCompoundFile));
    }

    return spec;
  }

  private double[][] createVarianceTable(SegmentInfos infos, int last,
      int maxNumSegments) throws IOException {
    int maxMergeSegments = last - maxNumSegments + 1;
    double[][] variance = new double[last][maxMergeSegments];

    // compute the optimal segment size
    long optSize = 0;
    long[] sizeArr = new long[last];
    for (int i = 0; i < sizeArr.length; i++) {
      sizeArr[i] = size(infos.info(i));
      optSize += sizeArr[i];
    }
    optSize = (optSize / maxNumSegments);

    for (int i = 0; i < last; i++) {
      long size = 0;
      for (int j = 0; j < maxMergeSegments; j++) {
        if ((i + j) < last) {
          size += sizeArr[i + j];
          double residual = ((double) size / (double) optSize) - 1.0d;
          variance[i][j] = residual * residual;
        } else {
          variance[i][j] = Double.NaN;
        }
      }
    }
    return variance;
  }
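  // Worked example for the variance table and the dynamic program above
  // (hypothetical sizes, not from the original source): given segment sizes
  // {40, 30, 20, 10} and maxNumSegments = 2, optSize = 100 / 2 = 50. The
  // split {40 | 30+20+10} scores (-0.2)^2 + (0.2)^2 = 0.08, beating
  // {40+30 | 20+10} at (0.4)^2 + (-0.4)^2 = 0.32, so the trace-back emits a
  // single OneMerge over the last three segments and leaves the first alone.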
  @Override
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos)
      throws CorruptIndexException, IOException {
    final int numSegs = infos.size();
    final int numLargeSegs = (numSegs < _numLargeSegments ? numSegs : _numLargeSegments);
    MergeSpecification spec = null;

    if (numLargeSegs < numSegs) {
      SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
      spec = super.findMergesToExpungeDeletes(smallSegments);
    }

    if (spec == null) spec = new MergeSpecification();
    for (int i = 0; i < numLargeSegs; i++) {
      SegmentInfo info = infos.info(i);
      if (info.hasDeletions()) {
        spec.add(new OneMerge(infos.range(i, i + 1), getUseCompoundFile()));
      }
    }
    return spec;
  }
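  // Sizing example for findMerges below (hypothetical numbers): with
  // numLargeSegs = 10 large segments totalling 900 MB, targetSegSize =
  // 900 / (10 - 1) = 100 MB. Once the small segments also total at least
  // 100 MB they are promoted into the large tier: if they total less than
  // 2 * targetSegSize, the large tier is rebalanced down to 9 segments and
  // all small segments are merged into one new large segment; otherwise all
  // segments are rebalanced down to numLargeSegs.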
  @Override
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
    final int numSegs = infos.size();
    final int numLargeSegs = _numLargeSegments;

    if (numSegs <= numLargeSegs) {
      return null;
    }

    long totalLargeSegSize = 0;
    long totalSmallSegSize = 0;
    SegmentInfo info;

    // compute the total size of large segments
    for (int i = 0; i < numLargeSegs; i++) {
      info = infos.info(i);
      totalLargeSegSize += size(info);
    }
    // compute the total size of small segments
    for (int i = numLargeSegs; i < numSegs; i++) {
      info = infos.info(i);
      totalSmallSegSize += size(info);
    }

    long targetSegSize = (totalLargeSegSize / (numLargeSegs - 1));
    if (targetSegSize <= totalSmallSegSize) {
      // the total size of the small segments is big enough:
      // promote the small segments to a large segment and do a balanced merge
      if (totalSmallSegSize < targetSegSize * 2) {
        MergeSpecification spec =
            findBalancedMerges(infos, numLargeSegs, (numLargeSegs - 1), _partialExpunge);
        if (spec == null) spec = new MergeSpecification(); // should not happen
        spec.add(new OneMerge(infos.range(numLargeSegs, numSegs), getUseCompoundFile()));
        return spec;
      } else {
        return findBalancedMerges(infos, numSegs, numLargeSegs, _partialExpunge);
      }
    } else if (_maxSegments < numSegs) {
      // we have more than _maxSegments segments: merge the tail of small
      // segments, starting at the first one smaller than targetSegSize/4
      MergeSpecification spec = new MergeSpecification();
      int startSeg = numLargeSegs;
      long sizeThreshold = (targetSegSize / 4);
      while (startSeg < numSegs) {
        info = infos.info(startSeg);
        if (size(info) < sizeThreshold) break;
        startSeg++;
      }
      spec.add(new OneMerge(infos.range(startSeg, numSegs), getUseCompoundFile()));
      return spec;
    } else {
      // apply the log merge policy to the small segments
      SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
      MergeSpecification spec = super.findMerges(smallSegments);

      if (_partialExpunge) {
        OneMerge expunge = findOneSegmentToExpunge(infos, numLargeSegs);
        if (expunge != null) {
          if (spec == null) spec = new MergeSpecification();
          spec.add(expunge);
        }
      }
      return spec;
    }
  }

  private OneMerge findOneSegmentToExpunge(SegmentInfos infos, int maxNumSegments)
      throws IOException {
    int expungeCandidate = -1;
    int maxDelCount = 0;

    for (int i = maxNumSegments - 1; i >= 0; i--) {
      SegmentInfo info = infos.info(i);
      int delCount = info.getDelCount();
      if (delCount > maxDelCount) {
        expungeCandidate = i;
        maxDelCount = delCount;
      }
    }
    if (maxDelCount > 0) {
      return new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), getUseCompoundFile());
    }
    return null;
  }

  public static class MergePolicyParams {
    private int _numLargeSegments;
    private int _maxSmallSegments;
    private boolean _doPartialExpunge;
    private int _mergeFactor;
    private boolean _useCompoundFile;
    private int _maxMergeDocs;

    public MergePolicyParams() {
      _useCompoundFile = true;
      _doPartialExpunge = false;
      _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
      _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
      _mergeFactor = LogMergePolicy.DEFAULT_MERGE_FACTOR;
      _maxMergeDocs = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS;
    }

    public void setNumLargeSegments(int numLargeSegments) {
      _numLargeSegments = numLargeSegments;
    }

    public int getNumLargeSegments() {
      return _numLargeSegments;
    }

    public void setMaxSmallSegments(int maxSmallSegments) {
      _maxSmallSegments = maxSmallSegments;
    }

    public int getMaxSmallSegments() {
      return _maxSmallSegments;
    }

    public void setPartialExpunge(boolean doPartialExpunge) {
      _doPartialExpunge = doPartialExpunge;
    }

    public boolean getPartialExpunge() {
      return _doPartialExpunge;
    }

    public void setMergeFactor(int mergeFactor) {
      _mergeFactor = mergeFactor;
    }

    public int getMergeFactor() {
      return _mergeFactor;
    }

    public void setMaxMergeDocs(int maxMergeDocs) {
      _maxMergeDocs = maxMergeDocs;
    }

    public int getMaxMergeDocs() {
      return _maxMergeDocs;
    }

    public void setUseCompoundFile(boolean useCompoundFile) {
      _useCompoundFile = useCompoundFile;
    }

    public boolean isUseCompoundFile() {
      return _useCompoundFile;
    }
  }
}