Lucene40TermVectorsFormat.java example

Explorer
heliosearch-master
- lucene
- solr
package org.apache.lucene.codecs.lucene40;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

/**
 * Lucene 4.0 Term Vectors format.
 * <p>Term Vector support is optional on a field-by-field basis. It consists of
 * 3 files.</p>
 * <ol>
 * <li><a name="tvx" id="tvx"></a>
 * <p>The Document Index or .tvx file.</p>
 * <p>For each document, this stores the offset into the document data (.tvd) and
 * field data (.tvf) files.</p>
 * <p>DocumentIndex (.tvx) --&gt; Header,&lt;DocumentPosition,FieldPosition&gt;
 * <sup>NumDocs</sup></p>
 * <ul>
 *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>DocumentPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
 *   <li>FieldPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
 * </ul>
 * </li>
 * <li><a name="tvd" id="tvd"></a>
 * <p>The Document or .tvd file.</p>
 * <p>This contains, for each document, the number of fields, a list of the fields
 * with term vector info and finally a list of pointers to the field information
 * in the .tvf (Term Vector Fields) file.</p>
 * <p>The .tvd file is used to map out the fields that have term vectors stored
 * and where the field information is in the .tvf file.</p>
 * <p>Document (.tvd) --&gt; Header,&lt;NumFields, FieldNums,
 * FieldPositions&gt; <sup>NumDocs</sup></p>
 * <ul>
 *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>NumFields --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></li>
 *   <li>FieldNumDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>FieldPositions --&gt; &lt;FieldPositionDelta&gt; <sup>NumFields-1</sup></li>
 *   <li>FieldPositionDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
 * </ul>
 * </li>
 * <li><a name="tvf" id="tvf"></a>
 * <p>The Field or .tvf file.</p>
 * <p>This file contains, for each field that has a term vector stored, a list of
 * the terms, their frequencies and, optionally, position, offset, and payload
 * information.</p>
 * <p>Field (.tvf) --&gt; Header,&lt;NumTerms, Flags, TermFreqs&gt;
 * <sup>NumFields</sup></p>
 * <ul>
 *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>NumTerms --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>Flags --&gt; {@link DataOutput#writeByte Byte}</li>
 *   <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, PayloadData?, Offsets?&gt;
 *       <sup>NumTerms</sup></li>
 *   <li>TermText --&gt; &lt;PrefixLength, Suffix&gt;</li>
 *   <li>PrefixLength --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>Suffix --&gt; {@link DataOutput#writeString String}</li>
 *   <li>TermFreq --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>Positions --&gt; &lt;PositionDelta PayloadLength?&gt;<sup>TermFreq</sup></li>
 *   <li>PositionDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>PayloadLength --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>PayloadData --&gt; {@link DataOutput#writeByte Byte}<sup>NumPayloadBytes</sup></li>
 *   <li>Offsets --&gt; &lt;{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 * <li>Flags byte stores whether this term vector has position, offset, and payload
 * information stored.</li>
 * <li>Term byte prefixes are shared. The PrefixLength is the number of initial
 * bytes from the previous term which must be pre-pended to a term's suffix
 * in order to form the term's bytes. Thus, if the previous term's text was "bone"
 * and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
 * <li>PositionDelta is, if payloads are disabled for the term's field, the
 * difference between the position of the current occurrence in the document and
 * the previous occurrence (or zero, if this is the first occurrence in this
 * document). If payloads are enabled for the term's field, then PositionDelta/2
 * is the difference between the current and the previous position. If payloads
 * are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
 * the length of the payload at the current term position.</li>
 * <li>PayloadData is metadata associated with a term position. If
 * PayloadLength is stored at the current position, then it indicates the length
 * of this payload. If PayloadLength is not stored, then this payload has the same
 * length as the payload at the previous position. PayloadData encodes the 
 * concatenated bytes for all of a term's occurrences.</li>
 * <li>Offsets are stored as delta encoded VInts. The first VInt is the
 * startOffset, the second is the endOffset.</li>
 * </ul>
 * </li>
 * </ol>
 */
public class Lucene40TermVectorsFormat extends TermVectorsFormat {

  /** Creates a new format instance; no arguments are required. */
  public Lucene40TermVectorsFormat() {
    super();
  }

  /**
   * Creates a {@link Lucene40TermVectorsReader} from the supplied arguments.
   *
   * @param directory   passed through unchanged to the reader
   * @param segmentInfo passed through unchanged to the reader
   * @param fieldInfos  passed through unchanged to the reader
   * @param context     passed through unchanged to the reader
   * @return a newly constructed {@link Lucene40TermVectorsReader}
   * @throws IOException declared by the overridden method; propagated from reader construction
   */
  @Override
  public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
    final TermVectorsReader reader =
        new Lucene40TermVectorsReader(directory, segmentInfo, fieldInfos, context);
    return reader;
  }

  /**
   * Creates a {@link Lucene40TermVectorsWriter} from the supplied arguments.
   *
   * @param directory   passed through unchanged to the writer
   * @param segmentInfo only its {@code name} field is read and handed to the writer
   * @param context     passed through unchanged to the writer
   * @return a newly constructed {@link Lucene40TermVectorsWriter}
   * @throws IOException declared by the overridden method; propagated from writer construction
   */
  @Override
  public TermVectorsWriter vectorsWriter(Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException {
    // The writer takes the segment name rather than the whole SegmentInfo.
    final String segmentName = segmentInfo.name;
    final TermVectorsWriter writer =
        new Lucene40TermVectorsWriter(directory, segmentName, context);
    return writer;
  }
}