// SimpleStandardTermsIndexWriter.java example
//
// Explorer
// solrcene-master
package org.apache.lucene.index.codecs.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;

import java.util.List;
import java.util.ArrayList;
import java.io.IOException;

/**
 * Writes a terms index file in which every {@code termIndexInterval}'th term
 * of each field is recorded as an "index term", together with the position in
 * the terms dictionary output at which that term was seen.
 *
 * <p>Layout of the file, in the order this class writes it:
 * <ol>
 *   <li>codec header ({@link #CODEC_NAME}, {@link #VERSION_CURRENT});</li>
 *   <li>a {@code long} slot for the directory offset, written as 0 and
 *       back-filled by {@link #writeTrailer(long)} on close;</li>
 *   <li>an {@code int} holding the term index interval;</li>
 *   <li>per field: the concatenated index-term prefixes, then a packed array
 *       of terms-dictionary offsets, then a packed array of offsets into the
 *       prefix bytes;</li>
 *   <li>the directory: field count, then for each field its number, index
 *       term count and the four start offsets recorded while writing.</li>
 * </ol>
 *
 * @lucene.experimental
 */
public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
  /** Output for the terms index file; opened in the constructor, closed in {@link #close()}. */
  protected final IndexOutput out;

  static final String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  /** Every termIndexInterval'th term (starting with the first) becomes an index term. */
  private final int termIndexInterval;

  /** One writer per field added, in the order {@link #addField} was called. */
  private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
  private final FieldInfos fieldInfos; // unread
  /** Terms dictionary output; only its file pointer is read by this class. */
  private IndexOutput termsOut;

  /**
   * Creates the terms index file for the segment described by {@code state},
   * registers the file name in {@code state.flushedFiles}, and writes the
   * header followed by the term index interval.
   *
   * @param state segment write state supplying the directory, segment name,
   *              field infos and term index interval
   * @throws IOException if the file cannot be created or the header cannot be
   *                     written; in the latter case the output is closed
   *                     before the exception propagates
   */
  public SimpleStandardTermsIndexWriter(SegmentWriteState state) throws IOException {
    final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.TERMS_INDEX_EXTENSION);
    state.flushedFiles.add(indexFileName);
    termIndexInterval = state.termIndexInterval;
    fieldInfos = state.fieldInfos;
    out = state.directory.createOutput(indexFileName);

    // Nobody else holds a reference to 'out' if construction fails, so we
    // must release it ourselves rather than leak the open file.
    boolean success = false;
    try {
      writeHeader(out);
      out.writeInt(termIndexInterval);
      success = true;
    } finally {
      if (!success) {
        closeOutputQuietly();
      }
    }
  }

  /**
   * Writes the codec header followed by an 8-byte placeholder that
   * {@link #writeTrailer(long)} later overwrites with the directory offset.
   *
   * @param out output to write the header to
   * @throws IOException if writing fails
   */
  protected void writeHeader(IndexOutput out) throws IOException {
    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
    // Placeholder for dir offset
    out.writeLong(0);
  }

  /**
   * Sets the terms dictionary output whose file pointer is sampled for each
   * index term. Must be called before {@link #addField(FieldInfo)}, because
   * the field writer reads this output's file pointer when it is constructed.
   *
   * @param termsOut terms dictionary output
   */
  @Override
  public void setTermsOutput(IndexOutput termsOut) {
    this.termsOut = termsOut;
  }

  /**
   * Starts a new field and returns the writer that collects its index terms.
   * The writer is also remembered so that {@link #close()} can emit its
   * directory entry.
   *
   * @param field the field about to be written
   * @return writer to be fed every term of the field
   */
  @Override
  public FieldWriter addField(FieldInfo field) {
    SimpleFieldWriter writer = new SimpleFieldWriter(field);
    fields.add(writer);
    return writer;
  }

  /** Collects and writes the index terms of a single field. */
  private class SimpleFieldWriter extends FieldWriter {
    final FieldInfo fieldInfo;
    /** Number of index terms recorded so far. */
    int numIndexTerms;
    /** Position in {@code out} where this field's prefix bytes begin. */
    final long indexStart;
    /** Position in {@code termsOut} when this field was started. */
    final long termsStart;
    /** Position in {@code out} of the packed terms-dictionary offsets. */
    long packedIndexStart;
    /** Position in {@code out} of the packed offsets into the prefix bytes. */
    long packedOffsetsStart;
    /** Number of terms (indexed or not) passed to {@link #checkIndexTerm}. */
    private long numTerms;

    // TODO: we could conceivably make a PackedInts wrapper
    // that auto-grows... then we wouldn't force 6 bytes RAM
    // per index term:
    private short[] termLengths;
    private int[] termsPointerDeltas;
    /** {@code termsOut} file pointer at the previous index term (initially termsStart). */
    private long lastTermsPointer;
    /** Sum of all written prefix lengths, in bytes. */
    private long totTermLength;

    /** The term against which the next index term's prefix is computed. */
    private final BytesRef lastTerm = new BytesRef();

    SimpleFieldWriter(FieldInfo fieldInfo) {
      this.fieldInfo = fieldInfo;
      indexStart = out.getFilePointer();
      termsStart = lastTermsPointer = termsOut.getFilePointer();
      termLengths = new short[0];
      termsPointerDeltas = new int[0];
    }

    /**
     * Called for each term of the field. Every termIndexInterval'th term,
     * starting with the first, is recorded as an index term: the shortest
     * prefix distinguishing it from the preceding term is written to
     * {@code out}, and the current {@code termsOut} position is remembered
     * as a delta from the previous index term's position.
     *
     * @param text the term bytes
     * @param docFreq document frequency of the term (not used here)
     * @return {@code true} if the term was recorded as an index term
     * @throws IOException if writing the prefix fails
     */
    @Override
    public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
      // First term is first indexed term:
      if (0 == (numTerms++ % termIndexInterval)) {

        // we can safely strip off the non-distinguishing
        // suffix to save RAM in the loaded terms index.
        final int limit = Math.min(lastTerm.length, text.length);
        // Default when one term is a prefix of the other: one byte past the
        // prior term, capped at the full length of this term.
        int minPrefixDiff = Math.min(1+lastTerm.length, text.length);
        for(int byteIdx=0;byteIdx<limit;byteIdx++) {
          if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
            minPrefixDiff = byteIdx+1;
            break;
          }
        }

        // write only the min prefix that shows the diff
        // against prior term
        out.writeBytes(text.bytes, text.offset, minPrefixDiff);

        if (termLengths.length == numIndexTerms) {
          termLengths = ArrayUtil.grow(termLengths);
        }
        if (termsPointerDeltas.length == numIndexTerms) {
          termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
        }

        // save delta terms pointer; it is stored as an int, so guard the
        // narrowing the same way the short narrowing below is guarded
        final long fp = termsOut.getFilePointer();
        final long delta = fp - lastTermsPointer;
        assert delta >= 0 && delta <= Integer.MAX_VALUE : "terms pointer delta out of int range: " + delta;
        termsPointerDeltas[numIndexTerms] = (int) delta;
        lastTermsPointer = fp;

        // save term length (in bytes)
        assert minPrefixDiff <= Short.MAX_VALUE;
        termLengths[numIndexTerms] = (short) minPrefixDiff;
        totTermLength += minPrefixDiff;

        // When the interval is 1 the else-branch below never runs, so this
        // copy is what keeps lastTerm current.
        lastTerm.copy(text);
        numIndexTerms++;
        return true;
      } else {
        if (0 == numTerms % termIndexInterval) {
          // save last term just before next index term so we
          // can compute wasted suffix
          lastTerm.copy(text);
        }
        return false;
      }
    }

    /**
     * Writes the two packed arrays for this field and releases the in-memory
     * buffers: first the terms-dictionary offset of each index term, then
     * {@code numIndexTerms + 1} offsets delimiting each prefix within the
     * prefix bytes written by {@link #checkIndexTerm}.
     *
     * @throws IOException if writing fails
     */
    @Override
    public void finish() throws IOException {

      // write primary terms dict offsets
      packedIndexStart = out.getFilePointer();

      // The absolute file pointer is an upper bound for every relative
      // offset written below, so it is a safe value to size the bits from.
      final long maxValue = termsOut.getFilePointer();
      PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));

      // relative to our termsStart (the first delta was taken against it)
      long upto = 0;
      for(int i=0;i<numIndexTerms;i++) {
        upto += termsPointerDeltas[i];
        w.add(upto);
      }
      w.finish();

      packedOffsetsStart = out.getFilePointer();

      // write offsets into the byte[] terms
      w = PackedInts.getWriter(out, 1+numIndexTerms, PackedInts.bitsRequired(totTermLength));
      upto = 0;
      for(int i=0;i<numIndexTerms;i++) {
        w.add(upto);
        upto += termLengths[i];
      }
      // final entry marks the end of the last prefix
      w.add(upto);
      w.finish();

      // our referrer holds onto us, while other fields are
      // being written, so don't tie up this RAM:
      termLengths = null;
      termsPointerDeltas = null;
    }
  }

  /**
   * Writes the field directory, back-fills the directory offset in the
   * header, and closes the output. The output is closed even if writing the
   * directory fails; in that case the write failure is the exception that
   * propagates.
   *
   * @throws IOException if writing the directory or closing the output fails
   */
  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      final long dirStart = out.getFilePointer();
      final int fieldCount = fields.size();

      out.writeInt(fieldCount);
      for(int i=0;i<fieldCount;i++) {
        SimpleFieldWriter field = fields.get(i);
        out.writeInt(field.fieldInfo.number);
        out.writeInt(field.numIndexTerms);
        out.writeLong(field.termsStart);
        out.writeLong(field.indexStart);
        out.writeLong(field.packedIndexStart);
        out.writeLong(field.packedOffsetsStart);
      }
      writeTrailer(dirStart);
      success = true;
    } finally {
      if (success) {
        // Normal path: a failure to close must be reported to the caller.
        out.close();
      } else {
        closeOutputQuietly();
      }
    }
  }

  /**
   * Seeks back to the position returned by
   * {@code CodecUtil.headerLength(CODEC_NAME)} and writes the directory
   * offset there, replacing the placeholder left by
   * {@link #writeHeader(IndexOutput)}.
   *
   * @param dirStart file position at which the directory begins
   * @throws IOException if seeking or writing fails
   */
  protected void writeTrailer(long dirStart) throws IOException {
    out.seek(CodecUtil.headerLength(CODEC_NAME));
    out.writeLong(dirStart);
  }

  /**
   * Closes {@code out} while another exception is already propagating,
   * discarding any I/O failure from the close itself so that it does not
   * mask the original error.
   */
  private void closeOutputQuietly() {
    try {
      out.close();
    } catch (IOException ignored) {
      // intentionally dropped: the caller is already failing with the
      // exception that matters
    }
  }
}