/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.FilterCodecReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.common.cloud.CompositeIdRouter;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.HashBasedRouter;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.BitsFilteredPostingsEnum;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
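/**
 * Splits the index of a parent core into multiple partitions. When hash ranges are supplied,
 * each document is assigned to the partition(s) whose range covers the hash of its route field
 * (the unique key field by default, optionally restricted to a single split key); otherwise
 * documents are distributed round-robin. Each partition is written either into an existing
 * target {@link SolrCore} or to a separate index directory path.
 */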
public class SolrIndexSplitter {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
SolrIndexSearcher searcher;
SchemaField field;
List<DocRouter.Range> ranges;
DocRouter.Range[] rangesArr; // same as ranges list, but an array for extra speed in inner loops
List<String> paths;
List<SolrCore> cores;
DocRouter router;
HashBasedRouter hashRouter;
int numPieces;
int currPartition = 0;
String routeFieldName;
String splitKey;
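  /**
   * Creates a splitter from the split command: the route field defaults to the schema's unique
   * key field, the hash router is used when the configured {@link DocRouter} is hash based, and
   * the number of partitions comes from the supplied ranges, target paths, or target cores.
   */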
public SolrIndexSplitter(SplitIndexCommand cmd) {
searcher = cmd.getReq().getSearcher();
ranges = cmd.ranges;
paths = cmd.paths;
cores = cmd.cores;
router = cmd.router;
hashRouter = router instanceof HashBasedRouter ? (HashBasedRouter)router : null;
if (ranges == null) {
numPieces = paths != null ? paths.size() : cores.size();
} else {
numPieces = ranges.size();
rangesArr = ranges.toArray(new DocRouter.Range[ranges.size()]);
}
routeFieldName = cmd.routeFieldName;
if (routeFieldName == null) {
field = searcher.getSchema().getUniqueKeyField();
} else {
field = searcher.getSchema().getField(routeFieldName);
}
if (cmd.splitKey != null) {
splitKey = getRouteKey(cmd.splitKey);
}
}
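  /**
   * Performs the split: first computes, per segment, one {@link FixedBitSet} of matching documents
   * for every partition, then for each partition adds all segments (filtered through
   * {@link LiveDocsReader}) to that partition's IndexWriter and commits it.
   */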
public void split() throws IOException {
List<LeafReaderContext> leaves = searcher.getRawReader().leaves();
List<FixedBitSet[]> segmentDocSets = new ArrayList<>(leaves.size());
log.info("SolrIndexSplitter: partitions=" + numPieces + " segments="+leaves.size());
for (LeafReaderContext readerContext : leaves) {
assert readerContext.ordInParent == segmentDocSets.size(); // make sure we're going in order
FixedBitSet[] docSets = split(readerContext);
segmentDocSets.add( docSets );
}
// would it be more efficient to write segment-at-a-time to each new index?
// - need to worry about number of open descriptors
// - need to worry about if IW.addIndexes does a sync or not...
// - would be more efficient on the read side, but prob less efficient merging
for (int partitionNumber=0; partitionNumber<numPieces; partitionNumber++) {
log.info("SolrIndexSplitter: partition #" + partitionNumber + " partitionCount=" + numPieces + (ranges != null ? " range=" + ranges.get(partitionNumber) : ""));
boolean success = false;
RefCounted<IndexWriter> iwRef = null;
IndexWriter iw = null;
if (cores != null) {
SolrCore subCore = cores.get(partitionNumber);
iwRef = subCore.getUpdateHandler().getSolrCoreState().getIndexWriter(subCore);
iw = iwRef.get();
} else {
SolrCore core = searcher.getCore();
String path = paths.get(partitionNumber);
iw = SolrIndexWriter.create(core, "SplittingIndexWriter"+partitionNumber + (ranges != null ? " " + ranges.get(partitionNumber) : ""), path,
core.getDirectoryFactory(), true, core.getLatestSchema(),
core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
}
try {
        // This removes deletions, but an optimize may still be needed because each sub-shard will
        // have the same number of segments as the parent shard.
for (int segmentNumber = 0; segmentNumber<leaves.size(); segmentNumber++) {
log.info("SolrIndexSplitter: partition #" + partitionNumber + " partitionCount=" + numPieces + (ranges != null ? " range=" + ranges.get(partitionNumber) : "") + " segment #"+segmentNumber + " segmentCount=" + leaves.size());
CodecReader subReader = SlowCodecReaderWrapper.wrap(leaves.get(segmentNumber).reader());
iw.addIndexes(new LiveDocsReader(subReader, segmentDocSets.get(segmentNumber)[partitionNumber]));
}
        // We commit explicitly instead of sending a CommitUpdateCommand through the processor chain,
        // because the sub-shard cores would simply ignore such a commit while their update log is
        // not yet in an active state.
        // TODO: no commitUpdateCommand
SolrIndexWriter.setCommitData(iw, -1);
iw.commit();
success = true;
} finally {
if (iwRef != null) {
iwRef.decref();
} else {
if (success) {
iw.close();
} else {
IOUtils.closeWhileHandlingException(iw);
}
}
}
}
}
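  /**
   * For a single segment, walks the terms of the route field and builds one bitset per partition
   * marking the live documents that belong to it (by hash range, or round-robin when no ranges
   * were supplied). Also tallies how many documents matched zero, one, or multiple ranges.
   */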
FixedBitSet[] split(LeafReaderContext readerContext) throws IOException {
LeafReader reader = readerContext.reader();
FixedBitSet[] docSets = new FixedBitSet[numPieces];
for (int i=0; i<docSets.length; i++) {
docSets[i] = new FixedBitSet(reader.maxDoc());
}
Bits liveDocs = reader.getLiveDocs();
    Terms terms = reader.terms(field.getName());
TermsEnum termsEnum = terms==null ? null : terms.iterator();
if (termsEnum == null) return docSets;
BytesRef term = null;
PostingsEnum postingsEnum = null;
int[] docsMatchingRanges = null;
if (ranges != null) {
// +1 because documents can belong to *zero*, one, several or all ranges in rangesArr
docsMatchingRanges = new int[rangesArr.length+1];
}
CharsRefBuilder idRef = new CharsRefBuilder();
for (;;) {
term = termsEnum.next();
if (term == null) break;
// figure out the hash for the term
// FUTURE: if conversion to strings costs too much, we could
// specialize and use the hash function that can work over bytes.
field.getType().indexedToReadable(term, idRef);
String idString = idRef.toString();
if (splitKey != null) {
        // TODO: have composite routers support this kind of thing instead
String part1 = getRouteKey(idString);
if (part1 == null)
continue;
if (!splitKey.equals(part1)) {
continue;
}
}
int hash = 0;
if (hashRouter != null) {
hash = hashRouter.sliceHash(idString, null, null, null);
}
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, liveDocs);
for (;;) {
int doc = postingsEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) break;
if (ranges == null) {
docSets[currPartition].set(doc);
currPartition = (currPartition + 1) % numPieces;
} else {
int matchingRangesCount = 0;
for (int i=0; i<rangesArr.length; i++) { // inner-loop: use array here for extra speed.
if (rangesArr[i].includes(hash)) {
docSets[i].set(doc);
++matchingRangesCount;
}
}
docsMatchingRanges[matchingRangesCount]++;
}
}
}
if (docsMatchingRanges != null) {
for (int ii = 0; ii < docsMatchingRanges.length; ii++) {
if (0 == docsMatchingRanges[ii]) continue;
switch (ii) {
case 0:
// document loss
log.error("Splitting {}: {} documents belong to no shards and will be dropped",
reader, docsMatchingRanges[ii]);
break;
case 1:
// normal case, each document moves to one of the sub-shards
log.info("Splitting {}: {} documents will move into a sub-shard",
reader, docsMatchingRanges[ii]);
break;
default:
// document duplication
log.error("Splitting {}: {} documents will be moved to multiple ({}) sub-shards",
reader, docsMatchingRanges[ii], ii);
break;
}
}
}
return docSets;
}
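  /**
   * Extracts the route key from a composite id: the part of the id before
   * {@link CompositeIdRouter#SEPARATOR}, with any trailing {@code bitsSeparator} plus digit
   * suffix (an explicit bits spec, e.g. {@code "key/2!doc1"} yields {@code "key"}) stripped off.
   * Returns null when the id has no route key prefix.
   */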
public static String getRouteKey(String idString) {
int idx = idString.indexOf(CompositeIdRouter.SEPARATOR);
if (idx <= 0) return null;
String part1 = idString.substring(0, idx);
    int bitsSepIdx = part1.indexOf(CompositeIdRouter.bitsSeparator);
    if (bitsSepIdx > 0) {
      // strip off an explicit numeric "bits" suffix, e.g. "key/2" -> "key"
      if (bitsSepIdx + 1 < part1.length()) {
        char ch = part1.charAt(bitsSepIdx + 1);
        if (ch >= '0' && ch <= '9') {
          part1 = part1.substring(0, bitsSepIdx);
        }
      }
    }
return part1;
}
  /**
   * Wraps a segment's {@link CodecReader}, replacing its live docs with the partition's bitset so
   * that only the documents selected for that partition are visible to IndexWriter.addIndexes.
   */
static class LiveDocsReader extends FilterCodecReader {
final FixedBitSet liveDocs;
final int numDocs;
public LiveDocsReader(CodecReader in, FixedBitSet liveDocs) throws IOException {
super(in);
this.liveDocs = liveDocs;
this.numDocs = liveDocs.cardinality();
}
@Override
public int numDocs() {
return numDocs;
}
@Override
public Bits getLiveDocs() {
return liveDocs;
}
@Override
public CacheHelper getCoreCacheHelper() {
return in.getCoreCacheHelper();
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
}
}