/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.index.codecs.siren10;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.MappingMultiDocsAndPositionsEnum;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.analysis.filter.VIntPayloadCodec;
import org.sindice.siren.index.MappingMultiDocsNodesAndPositionsEnum;
import org.sindice.siren.index.codecs.block.BlockIndexOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Writes the document identifiers, node frequencies, node labels, term
* frequencies, term positions and block skip data.
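* <p>
* The postings data is spread over four per-segment files, one per extension
* defined below: document identifiers and node frequencies ({@code .doc}),
* node labels and term frequencies ({@code .nod}), term positions
* ({@code .pos}) and block skip data ({@code .skp}).
* <p>
* A rough sketch of the call sequence driven by the terms dictionary (the
* {@code state}, {@code factory} and other identifiers are illustrative
* placeholders):
* <pre>{@code
* Siren10PostingsWriter writer = new Siren10PostingsWriter(state, factory);
* writer.start(termsOut);                    // terms dictionary header
* writer.setField(fieldInfo);                // once per field
* writer.startTerm();                        // once per term
* writer.startDoc(docID, -1);                // once per matching document
* writer.addPosition(pos, payload, -1, -1); // once per occurrence
* writer.finishDoc();
* writer.finishTerm(stats);
* writer.close();
* }</pre>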
*/
public class Siren10PostingsWriter extends PostingsWriterBase {
final static String CODEC = "Siren10PostingsWriter";
final static String DOC_EXTENSION = "doc";
final static String SKIP_EXTENSION = "skp";
final static String NOD_EXTENSION = "nod";
final static String POS_EXTENSION = "pos";
// Increment version to change it:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
DocsFreqBlockIndexOutput docOut;
DocsFreqBlockIndexOutput.DocsFreqBlockWriter docWriter;
DocsFreqBlockIndexOutput.Index docIndex;
NodBlockIndexOutput nodOut;
NodBlockIndexOutput.NodBlockWriter nodWriter;
NodBlockIndexOutput.Index nodIndex;
PosBlockIndexOutput posOut;
PosBlockIndexOutput.PosBlockWriter posWriter;
PosBlockIndexOutput.Index posIndex;
IndexOutput skipOut;
IndexOutput termsOut;
final Siren10SkipListWriter skipWriter;
/**
* Expert: The interval, in blocks, at which skip data is stored in the skip
* tables, used to accelerate {@link DocsEnum#advance(int)}. Larger values
* result in smaller indexes and greater acceleration, but fewer accelerable
* cases, while smaller values result in bigger indexes, less acceleration
* and more accelerable cases.
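* <p>
* For example, with the default interval of 2, a skip entry is buffered for
* every second flushed block (see {@link #startDoc}).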
*/
final int blockSkipInterval;
static final int DEFAULT_BLOCK_SKIP_INTERVAL = 2;
/**
* Expert: minimum number of blocks to write any skip data at all
*/
final int blockSkipMinimum;
/**
* Expert: maximum block size allowed.
*/
final int maxBlockSize;
/**
* Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*/
final int maxSkipLevels = 10;
final int totalNumDocs;
IndexOptions indexOptions;
FieldInfo fieldInfo;
int blockCount;
// Holds pending byte[] blob for the current terms block
private final RAMOutputStream indexBytesWriter = new RAMOutputStream();
protected static final Logger logger = LoggerFactory.getLogger(Siren10PostingsWriter.class);
public Siren10PostingsWriter(final SegmentWriteState state,
final Siren10BlockStreamFactory factory)
throws IOException {
this(state, DEFAULT_BLOCK_SKIP_INTERVAL, factory);
}
public Siren10PostingsWriter(final SegmentWriteState state,
final int blockSkipInterval,
final Siren10BlockStreamFactory factory)
throws IOException {
nodOut = null;
nodIndex = null;
posOut = null;
posIndex = null;
boolean success = false;
try {
this.blockSkipInterval = blockSkipInterval;
this.blockSkipMinimum = blockSkipInterval; /* set to the same for now */
final String docFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, DOC_EXTENSION);
docOut = factory.createDocsFreqOutput(state.directory, docFileName, state.context);
docWriter = docOut.getBlockWriter();
docIndex = docOut.index();
this.maxBlockSize = docWriter.getMaxBlockSize();
final String nodFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, NOD_EXTENSION);
nodOut = factory.createNodOutput(state.directory, nodFileName, state.context);
nodWriter = nodOut.getBlockWriter();
nodIndex = nodOut.index();
final String posFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, POS_EXTENSION);
posOut = factory.createPosOutput(state.directory, posFileName, state.context);
posWriter = posOut.getBlockWriter();
posIndex = posOut.index();
final String skipFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, SKIP_EXTENSION);
skipOut = state.directory.createOutput(skipFileName, state.context);
totalNumDocs = state.segmentInfo.getDocCount();
// Estimate the number of blocks that will be written
final int numBlocks = (int) Math.ceil(totalNumDocs / (double) docWriter.getMaxBlockSize());
skipWriter = new Siren10SkipListWriter(blockSkipInterval, maxSkipLevels,
numBlocks, docOut);
docWriter.setNodeBlockIndex(nodIndex);
docWriter.setPosBlockIndex(posIndex);
success = true;
}
finally {
if (!success) {
IOUtils.closeWhileHandlingException(docOut, skipOut, nodOut, posOut);
}
}
}
@Override
public void start(final IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
termsOut.writeInt(blockSkipInterval); // write skipInterval
termsOut.writeInt(maxSkipLevels); // write maxSkipLevels
termsOut.writeInt(blockSkipMinimum); // write skipMinimum
termsOut.writeInt(maxBlockSize); // write maxBlockSize
}
@Override
public void startTerm() throws IOException {
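// record the current file pointer of each output so that the term
// dictionary can later locate the start of this term's postings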
docIndex.mark();
nodIndex.mark();
posIndex.mark();
skipWriter.resetSkip(docIndex);
}
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public void setField(final FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
this.indexOptions = fieldInfo.getIndexOptions();
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
skipWriter.setIndexOptions(indexOptions);
}
/**
* Adds a new doc in this term.
* <p>
* The {@code termDocFreq} parameter is ignored, as the term frequency in
* document is not used.
*/
@Override
public void startDoc(final int docID, final int termDocFreq)
throws IOException {
if (docID < 0) {
throw new CorruptIndexException("docs out of order (" + docID + ") (docOut: " + docOut + ")");
}
if (docWriter.isFull()) {
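// the doc block is full: buffer skip data for every blockSkipInterval-th
// flushed block, then flush the doc, node and pos blocks together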
if ((++blockCount % blockSkipInterval) == 0) {
skipWriter.setSkipData(docWriter.getFirstDocId());
skipWriter.bufferSkip(blockCount);
}
docWriter.flush();
nodWriter.flush(); // flush node block to synchronise it with doc block
posWriter.flush(); // flush pos block to synchronise it with doc block
}
docWriter.write(docID);
// reset current node for delta computation
nodWriter.resetCurrentNode();
// reset payload hash to sentinel value
lastNodeHash = Long.MAX_VALUE;
}
/**
* Sentinel value {@link Long#MAX_VALUE} is necessary in order to avoid
* equality with nodes composed of '0' values.
* <p>
* A long is used to avoid collision between the sentinel value and the
* payload hashcode: since the hashcode is an int, it can never be equal to
* {@link Long#MAX_VALUE}.
* <p>
* Using the payload hashcode seems to be the fastest way to test node
* equality. See the micro-benchmark {@link NodeEqualityBenchmark}.
*/
private long lastNodeHash = Long.MAX_VALUE;
private final VIntPayloadCodec sirenPayload = new VIntPayloadCodec();
private int nodeFreqInDoc = 0;
private int termFreqInNode = 0;
@Override
public void addPosition(final int position, final BytesRef payload,
final int startOffset, final int endOffset)
throws IOException {
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
// we always receive node ids in the payload
assert payload != null;
// decode payload
sirenPayload.decode(payload);
final IntsRef node = sirenPayload.getNode();
// check if we received the same node
// TODO: we pay the cost of decoding the node before testing the equality
// we could instead directly compute the node hash based on the byte array
final int nodeHash = node.hashCode();
if (lastNodeHash != nodeHash) { // if different node
// add term freq for previous node if not first payload.
if (lastNodeHash != Long.MAX_VALUE) {
this.addTermFreqInNode();
}
// add new node
this.addNode(node);
}
lastNodeHash = nodeHash;
// add position
this.addPosition(sirenPayload.getPosition());
}
private void addNode(final IntsRef node) {
nodWriter.write(node);
nodeFreqInDoc++;
// reset current position for delta computation
posWriter.resetCurrentPosition();
}
private void addPosition(final int position) {
posWriter.write(position);
termFreqInNode++;
}
private void addNodeFreqInDoc() {
docWriter.writeNodeFreq(nodeFreqInDoc);
nodeFreqInDoc = 0;
}
private void addTermFreqInNode() {
nodWriter.writeTermFreq(termFreqInNode);
termFreqInNode = 0;
}
@Override
public void finishDoc() {
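// write the buffered statistics of the current document: its node
// frequency and the term frequency within its last node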
this.addNodeFreqInDoc();
this.addTermFreqInNode();
}
private static class PendingTerm {
public final BlockIndexOutput.Index docIndex;
public final long skipFP;
public final int blockCount;
public PendingTerm(final BlockIndexOutput.Index docIndex,
final long skipFP, final int blockCount) {
this.docIndex = docIndex;
this.skipFP = skipFP;
this.blockCount = blockCount;
}
}
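// terms buffered until the terms dictionary requests them in flushTermsBlock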
private final List<PendingTerm> pendingTerms = new ArrayList<PendingTerm>();
/**
* Called when we are done adding docs to this term
*/
@Override
public void finishTerm(final TermStats stats) throws IOException {
assert stats.docFreq > 0;
// if block flush pending, write last skip data
if (!docWriter.isEmpty() && (++blockCount % blockSkipInterval) == 0) {
skipWriter.setSkipData(docWriter.getFirstDocId());
skipWriter.bufferSkip(blockCount);
}
// flush doc block
docWriter.flush();
final BlockIndexOutput.Index docIndexCopy = docOut.index();
docIndexCopy.copyFrom(docIndex, false);
// flush node block
nodWriter.flush();
final BlockIndexOutput.Index nodIndexCopy = nodOut.index();
nodIndexCopy.copyFrom(nodIndex, false);
// flush pos block
posWriter.flush();
final BlockIndexOutput.Index posIndexCopy = posOut.index();
posIndexCopy.copyFrom(posIndex, false);
// Write skip data to the output file
final long skipFP;
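// write skip data only if enough blocks were flushed for this term;
// otherwise, a file pointer of -1 marks the term as having no skip data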
if (blockCount >= blockSkipMinimum) {
skipFP = skipOut.getFilePointer();
skipWriter.writeSkip(skipOut);
}
else {
skipFP = -1;
}
pendingTerms.add(new PendingTerm(docIndexCopy, skipFP, blockCount));
// reset block counter
blockCount = 0;
}
@Override
public void flushTermsBlock(final int start, final int count) throws IOException {
// logger.debug("flushTermsBlock: {}", this.hashCode());
assert indexBytesWriter.getFilePointer() == 0;
final int absStart = pendingTerms.size() - start;
final List<PendingTerm> slice = pendingTerms.subList(absStart, absStart+count);
long lastSkipFP = 0;
if (count == 0) {
termsOut.writeByte((byte) 0);
return;
}
final PendingTerm firstTerm = slice.get(0);
final BlockIndexOutput.Index docIndexFlush = firstTerm.docIndex;
for (int idx = 0; idx < slice.size(); idx++) {
final boolean isFirstTerm = idx == 0;
final PendingTerm t = slice.get(idx);
// write block count stat
// logger.debug("Write blockCount: {}", t.blockCount);
indexBytesWriter.writeVInt(t.blockCount);
docIndexFlush.copyFrom(t.docIndex, false);
// logger.debug("Write docIndex: {}", docIndexFlush);
docIndexFlush.write(indexBytesWriter, isFirstTerm);
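// skip file pointers are delta-encoded against the previous term's
// pointer, except for the first term of the block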
if (t.skipFP != -1) {
if (isFirstTerm) {
indexBytesWriter.writeVLong(t.skipFP);
}
else {
indexBytesWriter.writeVLong(t.skipFP - lastSkipFP);
}
lastSkipFP = t.skipFP;
}
}
termsOut.writeVLong(indexBytesWriter.getFilePointer());
indexBytesWriter.writeTo(termsOut);
indexBytesWriter.reset();
slice.clear();
}
@Override
public void close() throws IOException {
IOUtils.close(docOut, skipOut, nodOut, posOut);
}
private final MappingMultiDocsNodesAndPositionsEnum postingsEnum = new MappingMultiDocsNodesAndPositionsEnum();
/**
* Default merge impl: append documents, nodes and positions, mapping around
* deletes.
* <p>
* Bypasses the {@link Siren10PostingsWriter} methods and works directly with
* the BlockWriters for maximum efficiency.
* <p>
* TODO - Optimisation: if the document blocks match the block size and no
* document is deleted, it would be possible to copy the blocks directly as
* byte arrays, avoiding decoding and encoding.
**/
@Override
public TermStats merge(final MergeState mergeState, final DocsEnum postings,
final FixedBitSet visitedDocs)
throws IOException {
int df = 0;
long totTF = 0;
postingsEnum.setMergeState(mergeState);
postingsEnum.reset((MappingMultiDocsAndPositionsEnum) postings);
while (postingsEnum.nextDocument()) {
final int doc = postingsEnum.doc();
visitedDocs.set(doc);
this.startDoc(doc, -1);
final int nodeFreq = postingsEnum.nodeFreqInDoc();
docWriter.writeNodeFreq(nodeFreq);
while (postingsEnum.nextNode()) {
final IntsRef node = postingsEnum.node();
nodWriter.write(node);
final int termFreqInNode = postingsEnum.termFreqInNode();
nodWriter.writeTermFreq(termFreqInNode);
// reset current position for delta computation
posWriter.resetCurrentPosition();
while (postingsEnum.nextPosition()) {
final int position = postingsEnum.pos();
posWriter.write(position);
totTF++;
}
}
df++;
}
return new TermStats(df, totTF);
}
}