package org.apache.lucene.codecs.lucene40; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.DataOutput; // javadocs import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; /** * Lucene 4.0 Term Vectors format. * <p>Term Vector support is an optional on a field by field basis. It consists of * 3 files.</p> * <ol> * <li><a name="tvx" id="tvx"></a> * <p>The Document Index or .tvx file.</p> * <p>For each document, this stores the offset into the document data (.tvd) and * field data (.tvf) files.</p> * <p>DocumentIndex (.tvx) --> Header,<DocumentPosition,FieldPosition> * <sup>NumDocs</sup></p> * <ul> * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> * <li>DocumentPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li> * <li>FieldPosition --> {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li> * </ul> * </li> * <li><a name="tvd" id="tvd"></a> * <p>The Document or .tvd file.</p> * <p>This contains, for each document, the number of fields, a list of the fields * with term vector info and finally a list of pointers to the field information * in the .tvf (Term Vector Fields) file.</p> * <p>The .tvd file is used to map out the fields that have term vectors stored * and where the field information is in the .tvf file.</p> * <p>Document (.tvd) --> Header,<NumFields, FieldNums, * FieldPositions> <sup>NumDocs</sup></p> * <ul> * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> * <li>NumFields --> {@link DataOutput#writeVInt VInt}</li> * <li>FieldNums --> <FieldNumDelta> <sup>NumFields</sup></li> * <li>FieldNumDelta --> {@link DataOutput#writeVInt VInt}</li> * <li>FieldPositions --> <FieldPositionDelta> <sup>NumFields-1</sup></li> * <li>FieldPositionDelta --> {@link DataOutput#writeVLong VLong}</li> * </ul> * </li> * <li><a name="tvf" id="tvf"></a> * <p>The Field or .tvf file.</p> * <p>This file contains, for each field that has a term vector stored, a list of * the terms, their frequencies and, optionally, position, offset, and payload * information.</p> * <p>Field (.tvf) --> Header,<NumTerms, Flags, TermFreqs> * <sup>NumFields</sup></p> * <ul> * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li> * <li>NumTerms --> {@link DataOutput#writeVInt VInt}</li> * <li>Flags --> {@link DataOutput#writeByte Byte}</li> * <li>TermFreqs --> <TermText, TermFreq, Positions?, PayloadData?, Offsets?> * <sup>NumTerms</sup></li> * <li>TermText --> <PrefixLength, Suffix></li> * <li>PrefixLength --> {@link DataOutput#writeVInt VInt}</li> * <li>Suffix --> {@link DataOutput#writeString String}</li> * <li>TermFreq --> {@link DataOutput#writeVInt VInt}</li> * <li>Positions --> <PositionDelta PayloadLength?><sup>TermFreq</sup></li> * <li>PositionDelta --> {@link DataOutput#writeVInt VInt}</li> * <li>PayloadLength --> {@link DataOutput#writeVInt VInt}</li> * <li>PayloadData --> {@link DataOutput#writeByte Byte}<sup>NumPayloadBytes</sup></li> * <li>Offsets --> <{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}><sup>TermFreq</sup></li> * </ul> * <p>Notes:</p> * <ul> * <li>Flags byte stores whether this term vector has position, offset, payload. * information stored.</li> * <li>Term byte prefixes are shared. The PrefixLength is the number of initial * bytes from the previous term which must be pre-pended to a term's suffix * in order to form the term's bytes. Thus, if the previous term's text was "bone" * and the term is "boy", the PrefixLength is two and the suffix is "y".</li> * <li>PositionDelta is, if payloads are disabled for the term's field, the * difference between the position of the current occurrence in the document and * the previous occurrence (or zero, if this is the first occurrence in this * document). If payloads are enabled for the term's field, then PositionDelta/2 * is the difference between the current and the previous position. If payloads * are enabled and PositionDelta is odd, then PayloadLength is stored, indicating * the length of the payload at the current term position.</li> * <li>PayloadData is metadata associated with a term position. If * PayloadLength is stored at the current position, then it indicates the length * of this payload. If PayloadLength is not stored, then this payload has the same * length as the payload at the previous position. PayloadData encodes the * concatenated bytes for all of a terms occurrences.</li> * <li>Offsets are stored as delta encoded VInts. The first VInt is the * startOffset, the second is the endOffset.</li> * </ul> * </li> * </ol> */ public class Lucene40TermVectorsFormat extends TermVectorsFormat { /** Sole constructor. */ public Lucene40TermVectorsFormat() { } @Override public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException { return new Lucene40TermVectorsReader(directory, segmentInfo, fieldInfos, context); } @Override public TermVectorsWriter vectorsWriter(Directory directory, SegmentInfo segmentInfo, IOContext context) throws IOException { return new Lucene40TermVectorsWriter(directory, segmentInfo.name, context); } }