package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PerDocConsumer;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;

/**
 * The SegmentMerger class combines two or more Segments, represented by an
 * IndexReader ({@link #add}), into a single Segment.  After adding the
 * appropriate readers, call the merge method to combine the segments.
 *
 * @see #merge
 * @see #add
 */
final class SegmentMerger {
  private final Directory directory;
  private final int termIndexInterval;
  private final Codec codec;
  private final IOContext context;

  private final MergeState mergeState = new MergeState();
  private final FieldInfos.Builder fieldInfosBuilder;

  // note, just like in codec apis Directory 'dir' is NOT the same as segmentInfo.dir!!
  SegmentMerger(SegmentInfo segmentInfo, InfoStream infoStream, Directory dir, int termIndexInterval,
                MergeState.CheckAbort checkAbort, FieldInfos.FieldNumbers fieldNumbers, IOContext context) {
    mergeState.segmentInfo = segmentInfo;
    mergeState.infoStream = infoStream;
    mergeState.readers = new ArrayList<AtomicReader>();
    mergeState.checkAbort = checkAbort;
    directory = dir;
    this.termIndexInterval = termIndexInterval;
    this.codec = segmentInfo.getCodec();
    this.context = context;
    this.fieldInfosBuilder = new FieldInfos.Builder(fieldNumbers);
  }

  /**
   * Add an IndexReader to the collection of readers that are to be merged.
   */
  final void add(IndexReader reader) {
    for (final AtomicReaderContext ctx : reader.leaves()) {
      final AtomicReader r = ctx.reader();
      mergeState.readers.add(r);
    }
  }

  final void add(SegmentReader reader) {
    mergeState.readers.add(reader);
  }

  /**
   * Merges the readers specified by the {@link #add} method
   * into the directory passed to the constructor.
   * @return The number of documents that were merged
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  final MergeState merge() throws IOException {
    // NOTE: it's important to add calls to
    // checkAbort.work(...) if you make any changes to this
    // method that will spend a lot of time.  The frequency
    // of this check impacts how long
    // IndexWriter.close(false) takes to actually stop the
    // threads.
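    // Descriptive overview (not in the original source): the merge proceeds in
    // stages: remap docIDs per reader (setDocMaps), merge FieldInfos plus doc
    // values/norms metadata, detect readers eligible for bulk copy, then merge
    // stored fields, postings, per-document values, norms and term vectors,
    // and finally write the merged FieldInfos.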
    mergeState.segmentInfo.setDocCount(setDocMaps());
    mergeDocValuesAndNormsFieldInfos();
    setMatchingSegmentReaders();

    int numMerged = mergeFields();
    assert numMerged == mergeState.segmentInfo.getDocCount();

    final SegmentWriteState segmentWriteState = new SegmentWriteState(mergeState.infoStream, directory, mergeState.segmentInfo,
                                                                      mergeState.fieldInfos, termIndexInterval, null, context);
    mergeTerms(segmentWriteState);
    mergePerDoc(segmentWriteState);

    if (mergeState.fieldInfos.hasNorms()) {
      mergeNorms(segmentWriteState);
    }

    if (mergeState.fieldInfos.hasVectors()) {
      numMerged = mergeVectors();
      assert numMerged == mergeState.segmentInfo.getDocCount();
    }

    // write the merged infos
    FieldInfosWriter fieldInfosWriter = codec.fieldInfosFormat().getFieldInfosWriter();
    fieldInfosWriter.write(directory, mergeState.segmentInfo.name, mergeState.fieldInfos, context);

    return mergeState;
  }

  private void setMatchingSegmentReaders() {
    // If the i'th reader is a SegmentReader and has
    // identical fieldName -> number mapping, then this
    // array will be non-null at position i:
    int numReaders = mergeState.readers.size();
    mergeState.matchingSegmentReaders = new SegmentReader[numReaders];

    // If this reader is a SegmentReader, and all of its
    // field name -> number mappings match the "merged"
    // FieldInfos, then we can do a bulk copy of the
    // stored fields:
    for (int i = 0; i < numReaders; i++) {
      AtomicReader reader = mergeState.readers.get(i);
      // TODO: we may be able to broaden this to
      // non-SegmentReaders, since FieldInfos is now
      // required?  But... this'd also require exposing
      // bulk-copy (TVs and stored fields) API in foreign
      // readers..
      if (reader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) reader;
        boolean same = true;
        FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
        for (FieldInfo fi : segmentFieldInfos) {
          FieldInfo other = mergeState.fieldInfos.fieldInfo(fi.number);
          if (other == null || !other.name.equals(fi.name)) {
            same = false;
            break;
          }
        }
        if (same) {
          mergeState.matchingSegmentReaders[i] = segmentReader;
          mergeState.matchedCount++;
        }
      }
    }

    if (mergeState.infoStream.isEnabled("SM")) {
      mergeState.infoStream.message("SM", "merge store matchedCount=" + mergeState.matchedCount + " vs " + mergeState.readers.size());
      if (mergeState.matchedCount != mergeState.readers.size()) {
        mergeState.infoStream.message("SM", "" + (mergeState.readers.size() - mergeState.matchedCount) + " non-bulk merges");
      }
    }
  }

  // returns an updated typepromoter (tracking type and size) given a previous one,
  // and a newly encountered docvalues
  private TypePromoter mergeDocValuesType(TypePromoter previous, DocValues docValues) {
    TypePromoter incoming = TypePromoter.create(docValues.getType(), docValues.getValueSize());
    if (previous == null) {
      previous = TypePromoter.getIdentityPromoter();
    }
    return previous.promote(incoming);
  }

  // NOTE: this is actually merging all the fieldinfos
  public void mergeDocValuesAndNormsFieldInfos() throws IOException {
    // mapping from all docvalues fields found to their promoted types
    // this is because FieldInfos does not store the valueSize
    Map<FieldInfo,TypePromoter> docValuesTypes = new HashMap<FieldInfo,TypePromoter>();
    Map<FieldInfo,TypePromoter> normValuesTypes = new HashMap<FieldInfo,TypePromoter>();

    for (AtomicReader reader : mergeState.readers) {
      FieldInfos readerFieldInfos = reader.getFieldInfos();
      for (FieldInfo fi : readerFieldInfos) {
        FieldInfo merged = fieldInfosBuilder.add(fi);
        // update the type promotion mapping for this reader
        if (fi.hasDocValues()) {
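          // Descriptive note (not in the original source): track the promoted
          // doc values type for this field, i.e. a type/size wide enough to
          // hold the values seen in every reader merged so far.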
          TypePromoter previous = docValuesTypes.get(merged);
          docValuesTypes.put(merged, mergeDocValuesType(previous, reader.docValues(fi.name)));
        }
        if (fi.hasNorms()) {
          TypePromoter previous = normValuesTypes.get(merged);
          normValuesTypes.put(merged, mergeDocValuesType(previous, reader.normValues(fi.name)));
        }
      }
    }
    updatePromoted(normValuesTypes, true);
    updatePromoted(docValuesTypes, false);
    mergeState.fieldInfos = fieldInfosBuilder.finish();
  }

  protected void updatePromoted(Map<FieldInfo,TypePromoter> infoAndPromoter, boolean norms) {
    // update any promoted doc values types:
    for (Map.Entry<FieldInfo,TypePromoter> e : infoAndPromoter.entrySet()) {
      FieldInfo fi = e.getKey();
      TypePromoter promoter = e.getValue();
      if (promoter == null) {
        if (norms) {
          fi.setNormValueType(null);
        } else {
          fi.setDocValuesType(null);
        }
      } else {
        assert promoter != TypePromoter.getIdentityPromoter();
        if (norms) {
          if (fi.getNormType() != promoter.type() && !fi.omitsNorms()) {
            // reset the type if we got promoted
            fi.setNormValueType(promoter.type());
          }
        } else {
          if (fi.getDocValuesType() != promoter.type()) {
            // reset the type if we got promoted
            fi.setDocValuesType(promoter.type());
          }
        }
      }
    }
  }

  /**
   * Merge the stored fields from each of the segments into the new one.
   * @return The number of documents in all of the readers
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  private int mergeFields() throws IOException {
    final StoredFieldsWriter fieldsWriter = codec.storedFieldsFormat().fieldsWriter(directory, mergeState.segmentInfo, context);

    try {
      return fieldsWriter.merge(mergeState);
    } finally {
      fieldsWriter.close();
    }
  }

  /**
   * Merge the TermVectors from each of the segments into the new one.
   * @throws IOException if there is a low-level IO error
   */
  private final int mergeVectors() throws IOException {
    final TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, mergeState.segmentInfo, context);

    try {
      return termVectorsWriter.merge(mergeState);
    } finally {
      termVectorsWriter.close();
    }
  }

  // NOTE: removes any "all deleted" readers from mergeState.readers
  private int setDocMaps() throws IOException {
    final int numReaders = mergeState.readers.size();

    // Remap docIDs
    mergeState.docMaps = new MergeState.DocMap[numReaders];
    mergeState.docBase = new int[numReaders];

    int docBase = 0;

    int i = 0;
    while (i < mergeState.readers.size()) {
      final AtomicReader reader = mergeState.readers.get(i);

      mergeState.docBase[i] = docBase;
      final MergeState.DocMap docMap = MergeState.DocMap.build(reader);
      mergeState.docMaps[i] = docMap;
      docBase += docMap.numDocs();

      i++;
    }

    return docBase;
  }

  private final void mergeTerms(SegmentWriteState segmentWriteState) throws IOException {
    final List<Fields> fields = new ArrayList<Fields>();
    final List<ReaderSlice> slices = new ArrayList<ReaderSlice>();

    int docBase = 0;

    for (int readerIndex = 0; readerIndex < mergeState.readers.size(); readerIndex++) {
      final AtomicReader reader = mergeState.readers.get(readerIndex);
      final Fields f = reader.fields();
      final int maxDoc = reader.maxDoc();
      if (f != null) {
        slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
        fields.add(f);
      }
      docBase += maxDoc;
    }

    final FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState);
    boolean success = false;
    try {
      consumer.merge(mergeState,
                     new MultiFields(fields.toArray(Fields.EMPTY_ARRAY),
                                     slices.toArray(ReaderSlice.EMPTY_ARRAY)));
      success = true;
    } finally {
      if (success) {
        IOUtils.close(consumer);
      } else {
        IOUtils.closeWhileHandlingException(consumer);
      }
    }
  }

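  // Descriptive note (not in the original source): mergePerDoc and mergeNorms
  // below follow the same pattern: ask the codec for a PerDocConsumer, let it
  // merge the readers, then close it, suppressing secondary exceptions if the
  // merge itself already failed.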
  private void mergePerDoc(SegmentWriteState segmentWriteState) throws IOException {
    final PerDocConsumer docsConsumer = codec.docValuesFormat()
        .docsConsumer(new PerDocWriteState(segmentWriteState));
    // TODO: remove this check when 3.x indexes are no longer supported
    // (3.x indexes don't have docvalues)
    if (docsConsumer == null) {
      return;
    }
    boolean success = false;
    try {
      docsConsumer.merge(mergeState);
      success = true;
    } finally {
      if (success) {
        IOUtils.close(docsConsumer);
      } else {
        IOUtils.closeWhileHandlingException(docsConsumer);
      }
    }
  }

  private void mergeNorms(SegmentWriteState segmentWriteState) throws IOException {
    final PerDocConsumer docsConsumer = codec.normsFormat()
        .docsConsumer(new PerDocWriteState(segmentWriteState));
    // TODO: remove this check when 3.x indexes are no longer supported
    // (3.x indexes don't have docvalues)
    if (docsConsumer == null) {
      return;
    }
    boolean success = false;
    try {
      docsConsumer.merge(mergeState);
      success = true;
    } finally {
      if (success) {
        IOUtils.close(docsConsumer);
      } else {
        IOUtils.closeWhileHandlingException(docsConsumer);
      }
    }
  }
}
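
// A minimal, hypothetical usage sketch (not part of the original source).
// SegmentMerger is package-private and normally driven by IndexWriter; the
// variables below (segmentInfo, infoStream, mergeDirectory, termIndexInterval,
// checkAbort, fieldNumbers, readersToMerge) are placeholders:
//
//   SegmentMerger merger = new SegmentMerger(segmentInfo, infoStream, mergeDirectory,
//       termIndexInterval, checkAbort, fieldNumbers, IOContext.DEFAULT);
//   for (IndexReader reader : readersToMerge) {
//     merger.add(reader);
//   }
//   MergeState mergeState = merger.merge(); // writes the merged segment files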