/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.memory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

/**
 * FST-based term dictionary, using ord as the FST output.
 *
 * The FST holds the mapping between &lt;term, ord&gt;, and the term's metadata is
 * delta-encoded into a single byte block.
 *
 * Typically the byte block consists of four parts:
 * 1. term statistics: docFreq, totalTermFreq;
 * 2. monotonic long[], e.g. the pointer to the postings list for that term;
 * 3. generic byte[], e.g. other information customized by the postings base;
 * 4. single-level skip list to speed up metadata decoding by ord.
 *
 * <p>
 * Files:
 * <ul>
 *   <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
 *   <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
 * </ul>
 *
 * <a name="Termindex"></a>
 * <h3>Term Index</h3>
 * <p>
 * The .tix contains a list of FSTs, one for each field.
 * The FST maps a term to its ordinal (ord) in the current field.
 * </p>
 *
 * <ul>
 *   <li>TermIndex(.tix) --> Header, TermFST<sup>NumFields</sup>, Footer</li>
 *   <li>TermFST --> {@link FST FST&lt;long&gt;}</li>
 *   <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 *
 * <p>Notes:</p>
 * <ul>
 *   <li>
 *   Since terms are already sorted before being written to the <a href="#Termblock">Term Block</a>,
 *   their ords can be used directly to seek term metadata from the term block.
 *   </li>
 * </ul>
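 *
 * <p>
 * For illustration only, looking up a term's ord from a loaded index FST might look like
 * the following sketch (the {@code fst} variable is hypothetical; a {@code null} result
 * means the term is absent):
 * </p>
 * <pre class="prettyprint">
 * FST&lt;Long&gt; fst = ...; // one field's TermFST, loaded from the .tix file
 * Long ord = Util.get(fst, new BytesRef("lucene"));
 * </pre>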
 *
 * <a name="Termblock"></a>
 * <h3>Term Block</h3>
 * <p>
 * The .tbk contains all the statistics and metadata for terms, along with a field summary
 * (e.g. per-field data like the number of documents in the current field).
 * For each field, there are four blocks:
 * <ul>
 *   <li>statistics bytes block: contains term statistics;</li>
 *   <li>metadata longs block: delta-encodes the monotonic part of the metadata;</li>
 *   <li>metadata bytes block: encodes other parts of the metadata;</li>
 *   <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
 * </ul>
 *
 * <p>File Format:</p>
 * <ul>
 *   <li>TermBlock(.tbk) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
 *   <li>FieldSummary --> NumFields, &lt;FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
 *       DocCount, LongsSize, DataBlock &gt; <sup>NumFields</sup>, Footer</li>
 *   <li>DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
 *       SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock</li>
 *   <li>SkipBlock --> &lt; StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
 *       MetaLongsSkipDelta<sup>LongsSize</sup> &gt;<sup>NumTerms</sup></li>
 *   <li>StatsBlock --> &lt; DocFreq[Same?], (TotalTermFreq-DocFreq) ? &gt; <sup>NumTerms</sup></li>
 *   <li>MetaLongsBlock --> &lt; LongDelta<sup>LongsSize</sup>, BytesSize &gt; <sup>NumTerms</sup></li>
 *   <li>MetaBytesBlock --> Byte <sup>MetaBytesBlockLength</sup></li>
 *   <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *   <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
 *   <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize --> {@link DataOutput#writeVInt VInt}</li>
 *   <li>NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
 *       StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipDelta, TotalTermFreq,
 *       LongDelta --> {@link DataOutput#writeVLong VLong}</li>
 *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 * <p>Notes: </p>
 * <ul>
 *   <li>
 *   The format of the PostingsHeader and MetaBytes is customized by the specific postings implementation:
 *   they contain arbitrary per-file data (such as parameters or versioning information) and per-term data
 *   (non-monotonic data, like pulsed postings data).
 *   </li>
 *   <li>
 *   During initialization the reader loads all the blocks into memory. The SkipBlock is decoded, so that
 *   during a seek the term dict can look up file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc.
 *   are the file-pointer deltas recorded for every SkipInterval-th term. MetaLongsSkipDelta is the difference
 *   from the previous value, giving the metadata longs in effect at every SkipInterval-th term.
 *   </li>
 *   <li>
 *   DocFreq is the count of documents containing the term. TotalTermFreq is the total number of occurrences
 *   of the term. These two values are usually the same for long-tail terms, so one bit is stolen from DocFreq
 *   to flag this case, allowing the encoding of TotalTermFreq to be omitted.
 *   </li>
 * </ul>
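 *
 * <p>
 * For illustration, a reader could decode one StatsBlock entry for a field that indexes
 * frequencies roughly as follows ({@code statsIn} is a hypothetical {@code DataInput}
 * positioned at the entry):
 * </p>
 * <pre class="prettyprint">
 * int code = statsIn.readVInt();
 * int docFreq = code &gt;&gt;&gt; 1;
 * long totalTermFreq = (code &amp; 1) != 0
 *     ? docFreq                          // stolen bit set: both stats are equal
 *     : docFreq + statsIn.readVLong();   // otherwise the delta follows
 * </pre>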
 *
 * @lucene.experimental
 */
public class FSTOrdTermsWriter extends FieldsConsumer {
  static final String TERMS_INDEX_EXTENSION = "tix";
  static final String TERMS_BLOCK_EXTENSION = "tbk";
  static final String TERMS_CODEC_NAME = "FSTOrdTerms";
  static final String TERMS_INDEX_CODEC_NAME = "FSTOrdIndex";

  public static final int VERSION_START = 2;
  public static final int VERSION_CURRENT = VERSION_START;
  public static final int SKIP_INTERVAL = 8;

  final PostingsWriterBase postingsWriter;
  final FieldInfos fieldInfos;
  final int maxDoc;
  final List<FieldMetaData> fields = new ArrayList<>();
  IndexOutput blockOut = null;
  IndexOutput indexOut = null;

  public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
    final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
    final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);

    this.postingsWriter = postingsWriter;
    this.fieldInfos = state.fieldInfos;
    this.maxDoc = state.segmentInfo.maxDoc();

    boolean success = false;
    try {
      this.indexOut = state.directory.createOutput(termsIndexFileName, state.context);
      this.blockOut = state.directory.createOutput(termsBlockFileName, state.context);
      CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);
      CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);
      this.postingsWriter.init(blockOut, state);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(indexOut, blockOut);
      }
    }
  }

  @Override
  public void write(Fields fields) throws IOException {
    for (String field : fields) {
      Terms terms = fields.terms(field);
      if (terms == null) {
        continue;
      }
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
      TermsEnum termsEnum = terms.iterator();
      TermsWriter termsWriter = new TermsWriter(fieldInfo);

      long sumTotalTermFreq = 0;
      long sumDocFreq = 0;
      FixedBitSet docsSeen = new FixedBitSet(maxDoc);
      while (true) {
        BytesRef term = termsEnum.next();
        if (term == null) {
          break;
        }
        BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
        if (termState != null) {
          termsWriter.finishTerm(term, termState);
          sumTotalTermFreq += termState.totalTermFreq;
          sumDocFreq += termState.docFreq;
        }
      }
      termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
    }
  }
  @Override
  public void close() throws IOException {
    if (blockOut != null) {
      boolean success = false;
      try {
        final long blockDirStart = blockOut.getFilePointer();

        // write field summary
        blockOut.writeVInt(fields.size());
        for (FieldMetaData field : fields) {
          blockOut.writeVInt(field.fieldInfo.number);
          blockOut.writeVLong(field.numTerms);
          if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
            blockOut.writeVLong(field.sumTotalTermFreq);
          }
          blockOut.writeVLong(field.sumDocFreq);
          blockOut.writeVInt(field.docCount);
          blockOut.writeVInt(field.longsSize);
          blockOut.writeVLong(field.statsOut.getFilePointer());
          blockOut.writeVLong(field.metaLongsOut.getFilePointer());
          blockOut.writeVLong(field.metaBytesOut.getFilePointer());
          field.skipOut.writeTo(blockOut);
          field.statsOut.writeTo(blockOut);
          field.metaLongsOut.writeTo(blockOut);
          field.metaBytesOut.writeTo(blockOut);
          field.dict.save(indexOut);
        }
        writeTrailer(blockOut, blockDirStart);
        CodecUtil.writeFooter(indexOut);
        CodecUtil.writeFooter(blockOut);
        success = true;
      } finally {
        if (success) {
          IOUtils.close(blockOut, indexOut, postingsWriter);
        } else {
          IOUtils.closeWhileHandlingException(blockOut, indexOut, postingsWriter);
        }
        blockOut = null;
      }
    }
  }

  private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
    out.writeLong(dirStart);
  }

  private static class FieldMetaData {
    public FieldInfo fieldInfo;
    public long numTerms;
    public long sumTotalTermFreq;
    public long sumDocFreq;
    public int docCount;
    public int longsSize;
    public FST<Long> dict;

    // TODO: block encode each part

    // vint encode next skip point (fully decoded when reading)
    public RAMOutputStream skipOut;
    // vint encode df, (ttf-df)
    public RAMOutputStream statsOut;
    // vint encode monotonic long[] and length for corresponding byte[]
    public RAMOutputStream metaLongsOut;
    // generic byte[]
    public RAMOutputStream metaBytesOut;
  }
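  // A sketch of the reader-side arithmetic that the skip data buffered below enables
  // (not code from this class; 'ord' is a hypothetical target ordinal). Metadata for a
  // term is located by jumping to the nearest preceding skip entry, then decoding the
  // remaining terms sequentially:
  //
  //   long skipPoint = ord / SKIP_INTERVAL;  // index of the skip entry to restore
  //   long toScan    = ord % SKIP_INTERVAL;  // terms still to decode linearly
  //
  // Each skip entry restores the stats/metaLongs/metaBytes file pointers plus the running
  // metadata longs at that term, so at most SKIP_INTERVAL - 1 entries need to be scanned.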
  final class TermsWriter {
    private final Builder<Long> builder;
    private final PositiveIntOutputs outputs;
    private final FieldInfo fieldInfo;
    private final int longsSize;
    private long numTerms;

    private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
    private final RAMOutputStream statsOut = new RAMOutputStream();
    private final RAMOutputStream metaLongsOut = new RAMOutputStream();
    private final RAMOutputStream metaBytesOut = new RAMOutputStream();

    private final RAMOutputStream skipOut = new RAMOutputStream();
    private long lastBlockStatsFP;
    private long lastBlockMetaLongsFP;
    private long lastBlockMetaBytesFP;
    private long[] lastBlockLongs;

    private long[] lastLongs;
    private long lastMetaBytesFP;

    TermsWriter(FieldInfo fieldInfo) {
      this.numTerms = 0;
      this.fieldInfo = fieldInfo;
      this.longsSize = postingsWriter.setField(fieldInfo);
      this.outputs = PositiveIntOutputs.getSingleton();
      this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      this.lastBlockStatsFP = 0;
      this.lastBlockMetaLongsFP = 0;
      this.lastBlockMetaBytesFP = 0;
      this.lastBlockLongs = new long[longsSize];

      this.lastLongs = new long[longsSize];
      this.lastMetaBytesFP = 0;
    }

    public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
      // buffer a skip entry at every SKIP_INTERVAL-th term
      if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
        bufferSkip();
      }
      // write term metadata into the fst
      final long[] longs = new long[longsSize];
      final long delta = state.totalTermFreq - state.docFreq;
      if (state.totalTermFreq > 0) {
        if (delta == 0) {
          statsOut.writeVInt((state.docFreq << 1) | 1);  // stolen bit set: ttf == df
        } else {
          statsOut.writeVInt(state.docFreq << 1);
          statsOut.writeVLong(state.totalTermFreq - state.docFreq);
        }
      } else {
        statsOut.writeVInt(state.docFreq);
      }
      postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
      for (int i = 0; i < longsSize; i++) {
        metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
        lastLongs[i] = longs[i];
      }
      metaLongsOut.writeVLong(metaBytesOut.getFilePointer() - lastMetaBytesFP);

      builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
      numTerms++;

      lastMetaBytesFP = metaBytesOut.getFilePointer();
    }

    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
      if (numTerms > 0) {
        final FieldMetaData metadata = new FieldMetaData();
        metadata.fieldInfo = fieldInfo;
        metadata.numTerms = numTerms;
        metadata.sumTotalTermFreq = sumTotalTermFreq;
        metadata.sumDocFreq = sumDocFreq;
        metadata.docCount = docCount;
        metadata.longsSize = longsSize;
        metadata.skipOut = skipOut;
        metadata.statsOut = statsOut;
        metadata.metaLongsOut = metaLongsOut;
        metadata.metaBytesOut = metaBytesOut;
        metadata.dict = builder.finish();
        fields.add(metadata);
      }
    }

    private void bufferSkip() throws IOException {
      // record file-pointer deltas against the previous skip entry
      skipOut.writeVLong(statsOut.getFilePointer() - lastBlockStatsFP);
      skipOut.writeVLong(metaLongsOut.getFilePointer() - lastBlockMetaLongsFP);
      skipOut.writeVLong(metaBytesOut.getFilePointer() - lastBlockMetaBytesFP);
      for (int i = 0; i < longsSize; i++) {
        skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]);
      }
      lastBlockStatsFP = statsOut.getFilePointer();
      lastBlockMetaLongsFP = metaLongsOut.getFilePointer();
      lastBlockMetaBytesFP = metaBytesOut.getFilePointer();
      System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize);
    }
  }
}