/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.update;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterAtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OpenBitSet;
import org.apache.solr.common.cloud.HashPartitioner;
import org.apache.solr.common.util.Hash;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Splits an index into one or more partitions. Each document is assigned to a partition by
 * hashing its unique key field; documents whose hash falls within a partition's range are
 * copied into that partition's target core or index directory.
 */
public class SolrIndexSplitter {
  public static Logger log = LoggerFactory.getLogger(SolrIndexSplitter.class);

  SolrIndexSearcher searcher;
  SchemaField field;
  List<HashPartitioner.Range> ranges;
  HashPartitioner.Range[] rangesArr; // same as ranges list, but an array for extra speed in inner loops
  List<String> paths;
  List<SolrCore> cores;

  public SolrIndexSplitter(SplitIndexCommand cmd) {
    field = cmd.getReq().getSchema().getUniqueKeyField();
    searcher = cmd.getReq().getSearcher();
    ranges = cmd.ranges;
    rangesArr = ranges.toArray(new HashPartitioner.Range[ranges.size()]);
    paths = cmd.paths;
    cores = cmd.cores;
  }

  public void split() throws IOException {

    List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
    List<OpenBitSet[]> segmentDocSets = new ArrayList<OpenBitSet[]>(leaves.size());

    log.info("SolrIndexSplitter: partitions=" + ranges.size() + " segments=" + leaves.size());

    for (AtomicReaderContext readerContext : leaves) {
      assert readerContext.ordInParent == segmentDocSets.size();  // make sure we're going in order
      OpenBitSet[] docSets = split(readerContext);
      segmentDocSets.add(docSets);
    }

    // would it be more efficient to write segment-at-a-time to each new index?
    // - need to worry about number of open descriptors
    // - need to worry about if IW.addIndexes does a sync or not...
    // - would be more efficient on the read side, but prob less efficient merging

    IndexReader[] subReaders = new IndexReader[leaves.size()];
    for (int partitionNumber = 0; partitionNumber < ranges.size(); partitionNumber++) {
      log.info("SolrIndexSplitter: partition #" + partitionNumber + " range=" + ranges.get(partitionNumber));

      for (int segmentNumber = 0; segmentNumber < subReaders.length; segmentNumber++) {
        subReaders[segmentNumber] = new LiveDocsReader(leaves.get(segmentNumber), segmentDocSets.get(segmentNumber)[partitionNumber]);
      }

      boolean success = false;

      RefCounted<IndexWriter> iwRef = null;
      IndexWriter iw = null;
      if (cores != null) {
        SolrCore subCore = cores.get(partitionNumber);
        iwRef = subCore.getUpdateHandler().getSolrCoreState().getIndexWriter(subCore);
        iw = iwRef.get();
      } else {
        SolrCore core = searcher.getCore();
        String path = paths.get(partitionNumber);
        iw = SolrIndexWriter.create("SplittingIndexWriter" + partitionNumber + " " + ranges.get(partitionNumber), path,
                                    core.getDirectoryFactory(), true, core.getSchema(),
                                    core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), true);
      }

      try {
        // This merges the subreaders and will thus remove deletions (i.e. no optimize needed)
        iw.addIndexes(subReaders);
        success = true;
      } finally {
        if (iwRef != null) {
          iwRef.decref();
        } else {
          if (success) {
            IOUtils.close(iw);
          } else {
            IOUtils.closeWhileHandlingException(iw);
          }
        }
      }
    }

  }

  OpenBitSet[] split(AtomicReaderContext readerContext) throws IOException {
    AtomicReader reader = readerContext.reader();
    OpenBitSet[] docSets = new OpenBitSet[ranges.size()];
    for (int i = 0; i < docSets.length; i++) {
      docSets[i] = new OpenBitSet(reader.maxDoc());
    }
    Bits liveDocs = reader.getLiveDocs();

    Fields fields = reader.fields();
    Terms terms = fields == null ? null : fields.terms(field.getName());
    TermsEnum termsEnum = terms == null ? null : terms.iterator(null);
    if (termsEnum == null) return docSets;

    BytesRef term = null;
    DocsEnum docsEnum = null;

    for (;;) {
      term = termsEnum.next();
      if (term == null) break;

      // figure out the hash for the term
      // TODO: hook in custom hashes (or store hashes)
      int hash = Hash.murmurhash3_x86_32(term.bytes, term.offset, term.length, 0);

      docsEnum = termsEnum.docs(liveDocs, docsEnum, 0x0);
      for (;;) {
        int doc = docsEnum.nextDoc();
        if (doc == DocsEnum.NO_MORE_DOCS) break;
        for (int i = 0; i < rangesArr.length; i++) {  // inner-loop: use array here for extra speed.
          if (rangesArr[i].includes(hash)) {
            docSets[i].fastSet(doc);
          }
        }
      }
    }

    return docSets;
  }


  // change livedocs on the reader to delete those docs we don't want
  static class LiveDocsReader extends FilterAtomicReader {
    final OpenBitSet liveDocs;
    final int numDocs;

    public LiveDocsReader(AtomicReaderContext context, OpenBitSet liveDocs) throws IOException {
      super(context.reader());
      this.liveDocs = liveDocs;
      this.numDocs = (int) liveDocs.cardinality();
    }

    @Override
    public int numDocs() {
      return numDocs;
    }

    @Override
    public boolean hasDeletions() {
      return (in.maxDoc() != numDocs);
    }

    @Override
    public Bits getLiveDocs() {
      return liveDocs;
    }
  }

}