/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.update;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterAtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OpenBitSet;
import org.apache.solr.common.cloud.HashPartitioner;
import org.apache.solr.common.util.Hash;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Splits an index into one or more partitions. Each document is assigned to a partition by
 * hashing its unique key field; documents whose hash falls within a partition's range are
 * copied into that partition's target core or index directory.
 */
public class SolrIndexSplitter {
  public static Logger log = LoggerFactory.getLogger(SolrIndexSplitter.class);

  SolrIndexSearcher searcher;
  SchemaField field;
  List<HashPartitioner.Range> ranges;
  HashPartitioner.Range[] rangesArr; // same as ranges list, but an array for extra speed in inner loops
  List<String> paths;
  List<SolrCore> cores;

  public SolrIndexSplitter(SplitIndexCommand cmd) {
    field = cmd.getReq().getSchema().getUniqueKeyField();
    searcher = cmd.getReq().getSearcher();
    ranges = cmd.ranges;
    rangesArr = ranges.toArray(new HashPartitioner.Range[ranges.size()]);
    paths = cmd.paths;
    cores = cmd.cores;
  }

  public void split() throws IOException {

    List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
    List<OpenBitSet[]> segmentDocSets = new ArrayList<OpenBitSet[]>(leaves.size());

    log.info("SolrIndexSplitter: partitions=" + ranges.size() + " segments=" + leaves.size());

    for (AtomicReaderContext readerContext : leaves) {
      assert readerContext.ordInParent == segmentDocSets.size();  // make sure we're going in order
      OpenBitSet[] docSets = split(readerContext);
      segmentDocSets.add(docSets);
    }

    // would it be more efficient to write segment-at-a-time to each new index?
    // - need to worry about number of open descriptors
    // - need to worry about if IW.addIndexes does a sync or not...
    // - would be more efficient on the read side, but prob less efficient merging

    IndexReader[] subReaders = new IndexReader[leaves.size()];
    for (int partitionNumber = 0; partitionNumber < ranges.size(); partitionNumber++) {
      log.info("SolrIndexSplitter: partition #" + partitionNumber + " range=" + ranges.get(partitionNumber));

      for (int segmentNumber = 0; segmentNumber < subReaders.length; segmentNumber++) {
        subReaders[segmentNumber] = new LiveDocsReader(leaves.get(segmentNumber), segmentDocSets.get(segmentNumber)[partitionNumber]);
      }

      boolean success = false;

      RefCounted<IndexWriter> iwRef = null;
      IndexWriter iw = null;
      if (cores != null) {
        SolrCore subCore = cores.get(partitionNumber);
        iwRef = subCore.getUpdateHandler().getSolrCoreState().getIndexWriter(subCore);
        iw = iwRef.get();
      } else {
        SolrCore core = searcher.getCore();
        String path = paths.get(partitionNumber);
        iw = SolrIndexWriter.create("SplittingIndexWriter" + partitionNumber + " " + ranges.get(partitionNumber), path,
                                    core.getDirectoryFactory(), true, core.getSchema(),
                                    core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec(), true);
      }

      try {
        // This merges the subreaders and will thus remove deletions (i.e. no optimize needed)
        iw.addIndexes(subReaders);
        success = true;
      } finally {
        if (iwRef != null) {
          iwRef.decref();
        } else {
          if (success) {
            IOUtils.close(iw);
          } else {
            IOUtils.closeWhileHandlingException(iw);
          }
        }
      }
    }

  }

  OpenBitSet[] split(AtomicReaderContext readerContext) throws IOException {
    AtomicReader reader = readerContext.reader();
    OpenBitSet[] docSets = new OpenBitSet[ranges.size()];
    for (int i = 0; i < docSets.length; i++) {
      docSets[i] = new OpenBitSet(reader.maxDoc());
    }
    Bits liveDocs = reader.getLiveDocs();

    Fields fields = reader.fields();
    Terms terms = fields == null ? null : fields.terms(field.getName());
    TermsEnum termsEnum = terms == null ? null : terms.iterator(null);
    if (termsEnum == null) return docSets;

    BytesRef term = null;
    DocsEnum docsEnum = null;

    for (;;) {
      term = termsEnum.next();
      if (term == null) break;

      // figure out the hash for the term
      // TODO: hook in custom hashes (or store hashes)
      int hash = Hash.murmurhash3_x86_32(term.bytes, term.offset, term.length, 0);

      docsEnum = termsEnum.docs(liveDocs, docsEnum, 0x0);
      for (;;) {
        int doc = docsEnum.nextDoc();
        if (doc == DocsEnum.NO_MORE_DOCS) break;
        for (int i = 0; i < rangesArr.length; i++) {  // inner-loop: use array here for extra speed.
          if (rangesArr[i].includes(hash)) {
            docSets[i].fastSet(doc);
          }
        }
      }
    }

    return docSets;
  }


  // change livedocs on the reader to delete those docs we don't want
  static class LiveDocsReader extends FilterAtomicReader {
    final OpenBitSet liveDocs;
    final int numDocs;

    public LiveDocsReader(AtomicReaderContext context, OpenBitSet liveDocs) throws IOException {
      super(context.reader());
      this.liveDocs = liveDocs;
      this.numDocs = (int) liveDocs.cardinality();
    }

    @Override
    public int numDocs() {
      return numDocs;
    }

    @Override
    public boolean hasDeletions() {
      return (in.maxDoc() != numDocs);
    }

    @Override
    public Bits getLiveDocs() {
      return liveDocs;
    }
  }

}