/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.index.codecs.siren10;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.DocsNodesAndPositionsEnum;
import org.sindice.siren.index.SirenDocsEnum;
import org.sindice.siren.index.codecs.block.BlockIndexInput;
import org.sindice.siren.search.node.NodeScorer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Reads the document identifiers, node frequencies, node labels, term
* frequencies, term positions and block skip data.
*/
public class Siren10PostingsReader extends PostingsReaderBase {
// Block-based inputs over the doc/freq, node and position files of the
// segment; all three are opened in the constructor and closed in close().
final DocsFreqBlockIndexInput docIn;
final NodBlockIndexInput nodIn;
final PosBlockIndexInput posIn;
// Raw input over the skip file; a clone of it is handed to each skip list
// reader created in Siren10DocsNodesAndPositionsEnum#skipTo.
final IndexInput skipIn;
// Skip and block parameters; they are not known at construction time and are
// read from the terms dictionary input in init(IndexInput).
int blockSkipInterval;
int maxSkipLevels;
int blockSkipMinimum;
int maxBlockSize;
protected static final Logger logger = LoggerFactory.getLogger(Siren10PostingsReader.class);
/**
 * Opens the doc, node, skip and position files of a segment, in that order.
 * <p>
 * If any of the files cannot be opened, the inputs that were already opened
 * are closed through {@link #close()} before the exception is propagated.
 *
 * @param dir the directory holding the segment files
 * @param segmentInfo the segment whose name is used to build the file names
 * @param context the IO context passed to every open call
 * @param segmentSuffix the suffix used to build the file names
 * @param factory the factory creating the block-based inputs
 * @throws IOException if one of the files cannot be opened
 */
public Siren10PostingsReader(final Directory dir, final SegmentInfo segmentInfo,
                             final IOContext context, final String segmentSuffix,
                             final Siren10BlockStreamFactory factory)
throws IOException {
  boolean allOpened = false;
  try {
    final String segmentName = segmentInfo.name;

    docIn = factory.openDocsFreqInput(dir,
      IndexFileNames.segmentFileName(segmentName, segmentSuffix,
        Siren10PostingsWriter.DOC_EXTENSION),
      context);

    nodIn = factory.openNodInput(dir,
      IndexFileNames.segmentFileName(segmentName, segmentSuffix,
        Siren10PostingsWriter.NOD_EXTENSION),
      context);

    skipIn = dir.openInput(
      IndexFileNames.segmentFileName(segmentName, segmentSuffix,
        Siren10PostingsWriter.SKIP_EXTENSION),
      context);

    posIn = factory.openPosInput(dir,
      IndexFileNames.segmentFileName(segmentName, segmentSuffix,
        Siren10PostingsWriter.POS_EXTENSION),
      context);

    allOpened = true;
  }
  finally {
    // release whatever was opened before the failure
    if (!allOpened) {
      this.close();
    }
  }
}
/**
 * Validates the codec header of the terms dictionary input and then reads,
 * in this order, the block skip interval, the maximum number of skip levels,
 * the block skip minimum and the maximum block size.
 *
 * @param termsIn the terms dictionary input, positioned on the codec header
 * @throws IOException if the header does not match or the input cannot be read
 */
@Override
public void init(final IndexInput termsIn) throws IOException {
  // Only one format version exists: it is both the minimum and the maximum
  // version accepted.
  final int version = Siren10PostingsWriter.VERSION_START;
  CodecUtil.checkHeader(termsIn, Siren10PostingsWriter.CODEC, version, version);

  this.blockSkipInterval = termsIn.readInt();
  this.maxSkipLevels = termsIn.readInt();
  this.blockSkipMinimum = termsIn.readInt();
  this.maxBlockSize = termsIn.readInt();
}
/**
 * Closes the node, doc, skip and position inputs, in that order.
 * <p>
 * Inputs that were never opened (still {@code null}) are ignored. Every input
 * is attempted even if closing a previous one failed; if several of them
 * fail, the exception of the last failing one is the one propagated.
 *
 * @throws IOException if closing one of the inputs fails
 */
@Override
public void close() throws IOException {
  try {
    try {
      try {
        if (nodIn != null) {
          nodIn.close();
        }
      }
      finally {
        if (docIn != null) {
          docIn.close();
        }
      }
    }
    finally {
      if (skipIn != null) {
        skipIn.close();
      }
    }
  }
  finally {
    if (posIn != null) {
      posIn.close();
    }
  }
}
/**
 * Per-term state of the SIREn 1.0 postings: the seek point into the docs
 * file, the file pointer into the skip file and the number of blocks.
 */
private static final class Siren10TermState extends BlockTermState {

  // Only the seek point to the docs file is stored here: the remaining
  // information (freqIndex, posIndex, etc.) lives in the docs file itself.
  BlockIndexInput.Index docIndex;
  long skipFP;
  int blockCount;

  // Used by the "primary" term state only; copyFrom does not touch them, so
  // they are never carried over to a clone.
  // TODO: these should somehow be stored per-TermsEnum
  // not per TermState; maybe somehow the terms dict
  // should load/manage the byte[]/DataReader for us?
  byte[] bytes;
  ByteArrayDataInput bytesReader;

  /**
   * Returns a new term state initialised from this one through
   * {@link #copyFrom(TermState)}.
   */
  @Override
  public Siren10TermState clone() {
    final Siren10TermState copy = new Siren10TermState();
    copy.copyFrom(this);
    return copy;
  }

  /**
   * Copies the block count, the doc index and the skip file pointer of the
   * given state, in addition to what the parent class copies.
   * <p>
   * The doc index of this state is updated in place when it exists, and
   * cloned from the other state otherwise.
   */
  @Override
  public void copyFrom(final TermState _other) {
    super.copyFrom(_other);
    final Siren10TermState source = (Siren10TermState) _other;

    this.blockCount = source.blockCount;
    if (this.docIndex != null) {
      this.docIndex.set(source.docIndex);
    }
    else {
      this.docIndex = (BlockIndexInput.Index) source.docIndex.clone();
    }
    this.skipFP = source.skipFP;
  }

  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder();
    sb.append(super.toString());
    sb.append(" docIndex=").append(docIndex);
    sb.append(" skipFP=").append(skipFP);
    sb.append(" blockCount=").append(blockCount);
    return sb.toString();
  }

}
/**
 * Creates an empty term state whose doc index is a fresh index over the
 * docs file.
 */
@Override
public BlockTermState newTermState() throws IOException {
  final Siren10TermState fresh = new Siren10TermState();
  fresh.docIndex = docIn.index();
  return fresh;
}
/**
 * Loads the metadata bytes of a terms block into the buffer of the term
 * state.
 * <p>
 * The block starts with its length as a VInt. The buffer of the term state is
 * created on first use and replaced by a larger one when it is too small;
 * the byte reader of the term state is then positioned on the loaded bytes.
 *
 * @param termsIn the terms dictionary input, positioned on the block length
 * @param fieldInfo unused
 * @param _termState the {@link Siren10TermState} receiving the bytes
 */
@Override
public void readTermsBlock(final IndexInput termsIn, final FieldInfo fieldInfo,
                           final BlockTermState _termState) throws IOException {
  final Siren10TermState state = (Siren10TermState) _termState;
  final int numBytes = termsIn.readVInt();

  final boolean firstUse = state.bytes == null;
  if (firstUse || state.bytes.length < numBytes) {
    state.bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
  }
  if (firstUse) {
    // the reader is created once, together with the first buffer
    state.bytesReader = new ByteArrayDataInput(state.bytes);
  }

  state.bytesReader.reset(state.bytes, 0, numBytes);
  termsIn.readBytes(state.bytes, 0, numBytes);
}
/**
 * Decodes the metadata of the next term from the bytes previously loaded by
 * {@link #readTermsBlock(IndexInput, FieldInfo, BlockTermState)}.
 * <p>
 * Reads the block count and the doc index. The skip file pointer is only
 * present when the block count reaches {@code blockSkipMinimum}; it is
 * absolute for the first term of a terms block and a delta to the previous
 * value otherwise. Without skip data, the pointer is reset to 0 on the first
 * term of a terms block and left untouched on the following ones.
 *
 * @param fieldInfo unused
 * @param _termState the {@link Siren10TermState} to update
 */
@Override
public void nextTerm(final FieldInfo fieldInfo, final BlockTermState _termState) throws IOException {
  final Siren10TermState state = (Siren10TermState) _termState;
  final ByteArrayDataInput in = state.bytesReader;
  final boolean firstInBlock = (state.termBlockOrd == 0);

  state.blockCount = in.readVInt();
  state.docIndex.read(in, firstInBlock);

  if (state.blockCount < blockSkipMinimum) {
    // no skip data for this term
    if (firstInBlock) {
      state.skipFP = 0;
    }
    return;
  }

  final long encoded = in.readVLong();
  state.skipFP = firstInBlock ? encoded : state.skipFP + encoded;
}
/**
 * Returns a {@link Siren10DocsEnum} initialised on the postings of the given
 * term. The {@code reuse} instance is recycled when {@link #canReuse}
 * accepts it; a new enum is created otherwise.
 *
 * @param fieldInfo the field of the term
 * @param termState the {@link Siren10TermState} of the term
 * @param liveDocs the live documents, or {@code null}
 * @param reuse a previously returned enum, or {@code null}
 * @param flags unused
 */
@Override
public DocsEnum docs(final FieldInfo fieldInfo, final BlockTermState termState,
                     final Bits liveDocs, final DocsEnum reuse,
                     final int flags) throws IOException {
  final Siren10DocsEnum target = this.canReuse(reuse, liveDocs)
                               ? (Siren10DocsEnum) reuse
                               : new Siren10DocsEnum();
  return target.init(fieldInfo, (Siren10TermState) termState, liveDocs);
}
/**
 * Tells whether the given enum can be recycled by this reader: it must be a
 * {@link Siren10DocsEnum} created over the same docs input as this reader
 * and over the very same {@code liveDocs} instance.
 */
private boolean canReuse(final DocsEnum reuse, final Bits liveDocs) {
  // also rejects a null enum
  if (!(reuse instanceof Siren10DocsEnum)) {
    return false;
  }
  final Siren10DocsNodesAndPositionsEnum wrapped =
    ((Siren10DocsEnum) reuse).getDocsNodesAndPositionsEnum();
  // If you are using ParallelReader and pass in a reused DocsEnum, it could
  // have come from another reader also using this codec. On top of that, we
  // only reuse if the incoming enum has the same liveDocs as the given one.
  return wrapped.startDocIn == docIn && wrapped.liveDocs == liveDocs;
}
/**
 * Delegates to {@link #docs(FieldInfo, BlockTermState, Bits, DocsEnum, int)}
 * and casts the enum it returns to a {@link DocsAndPositionsEnum}.
 */
@Override
public DocsAndPositionsEnum docsAndPositions(final FieldInfo fieldInfo,
                                             final BlockTermState termState,
                                             final Bits liveDocs,
                                             final DocsAndPositionsEnum reuse,
                                             final int flags)
throws IOException {
  final DocsEnum postings = this.docs(fieldInfo, termState, liveDocs, reuse, flags);
  return (DocsAndPositionsEnum) postings;
}
/**
 * This {@link DocsAndPositionsEnum} implementation is a decorator over a
 * {@link DocsNodesAndPositionsEnum} which:
 * <ul>
 * <li> is used to supply the {@link DocsNodesAndPositionsEnum} in
 * {@link Siren10PostingsWriter#merge(org.apache.lucene.index.MergeState, DocsEnum, org.apache.lucene.util.FixedBitSet)}
 * and in {@link NodeScorer}.
 * <li> emulates a {@link DocsAndPositionsEnum} to be compatible with Lucene's
 * internal mechanism, especially with {@link CheckIndex}.
 * </ul>
 * <p>
 * This implementation is very inefficient and should not be used outside
 * unit tests.
 * <p>
 * Positions in {@link DocsNodesAndPositionsEnum} are local to a node.
 * This implementation emulates {@link #nextPosition()} by scaling
 * up positions with a position gap that is relative to the node id.
 * Therefore, the positions returned by this enum are not the real positions.
 * <p>
 * The position gap is based on a hash of the node id. The hash of the
 * node id is computed by normalising the node order between 0 and
 * {@link Integer#MAX_VALUE}.
 * <p>
 * If this enum is used with Lucene's Positional Scorers, there is a chance
 * of false-positive results.
 * <p>
 * There is a chance that the position returned is negative, in case the
 * number of nodes is close to {@link Integer#MAX_VALUE}.
 * <p>
 * This position is only used in {@link CheckIndex} after unit tests, where
 * the node tree structure is relatively simple.
 * <p>
 * The term frequency and the emulated positions of a document are computed
 * once, by consuming all the nodes and positions of the document in the
 * wrapped enum, and are then cached until the enum moves to another document.
 */
class Siren10DocsEnum extends SirenDocsEnum {

  private final Siren10DocsNodesAndPositionsEnum docEnum;

  /** Emulated positions of the current document; only the first {@link #positionCount} entries are valid. */
  private int[] positions = new int[16];

  /** Number of emulated positions cached for the current document. */
  private int positionCount = 0;

  /** Index in {@link #positions} of the next position to return. */
  private int positionUpto = 0;

  /** Cached term frequency; only meaningful when {@link #freqLoaded} is set. */
  private int cachedFreq = 0;

  /** Document for which {@link #cachedFreq} and {@link #positions} were computed. */
  private int cachedDoc = -1;

  /** Whether the frequency and positions have been computed since the last move. */
  private boolean freqLoaded = false;

  Siren10DocsEnum() throws IOException {
    docEnum = new Siren10DocsNodesAndPositionsEnum();
  }

  /**
   * Positions this enum on the postings of the given term and drops any
   * frequency and position cached for the previous term.
   */
  Siren10DocsEnum init(final FieldInfo fieldInfo, final Siren10TermState termState, final Bits liveDocs)
  throws IOException {
    this.invalidateCache();
    docEnum.init(fieldInfo, termState, liveDocs);
    return this;
  }

  @Override
  public Siren10DocsNodesAndPositionsEnum getDocsNodesAndPositionsEnum() {
    return docEnum;
  }

  @Override
  public int nextDoc() throws IOException {
    this.invalidateCache();
    docEnum.nextDocument();
    return docEnum.doc();
  }

  /**
   * Returns the number of positions of the term in the current document,
   * summed over all its nodes.
   * <p>
   * The first call for a document iterates over all the nodes and positions
   * of the wrapped enum and caches the emulated positions. Subsequent calls
   * for the same document return the cached value: the wrapped enum cannot be
   * iterated a second time, so recomputing would wrongly return 0 and discard
   * the positions.
   */
  @Override
  public int freq() throws IOException {
    if (this.isCacheValid()) {
      return cachedFreq;
    }

    // clear position cache
    positionCount = 0;
    positionUpto = 0;

    // compute increment for scaling up node hash
    final int inc = Integer.MAX_VALUE / (docEnum.nodeFreqInDoc() + 1);

    int freq = 0;
    int hash = 0;
    while (docEnum.nextNode()) {
      // scale up hash node based on hash increment
      hash += inc;
      while (docEnum.nextPosition()) {
        // cache position
        if (freq == positions.length) {
          positions = ArrayUtil.grow(positions, freq + 1);
        }
        positions[freq++] = hash + docEnum.pos();
      }
    }

    positionCount = freq;
    cachedFreq = freq;
    cachedDoc = docEnum.doc();
    freqLoaded = true;
    return freq;
  }

  @Override
  public int docID() {
    return docEnum.doc();
  }

  @Override
  public int advance(final int target) throws IOException {
    this.invalidateCache();
    docEnum.skipTo(target);
    return docEnum.doc();
  }

  /**
   * Returns the next emulated position of the current document.
   * <p>
   * The positions are loaded on demand, so calling {@link #freq()} beforehand
   * is not required.
   *
   * @throws IllegalStateException if all the positions of the current
   * document have already been returned
   */
  @Override
  public int nextPosition() throws IOException {
    if (!this.isCacheValid()) {
      this.freq();
    }
    if (positionUpto >= positionCount) {
      throw new IllegalStateException("nextPosition() called more than freq()="
        + cachedFreq + " times for doc " + cachedDoc);
    }
    return positions[positionUpto++];
  }

  @Override
  public int startOffset() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int endOffset() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public BytesRef getPayload() throws IOException {
    return null;
  }

  /**
   * Tells whether the cached frequency and positions belong to the document
   * the wrapped enum is currently positioned on.
   */
  private boolean isCacheValid() {
    return freqLoaded && cachedDoc == docEnum.doc();
  }

  /** Forgets the cached frequency and positions. */
  private void invalidateCache() {
    freqLoaded = false;
    cachedFreq = 0;
    positionCount = 0;
    positionUpto = 0;
  }

}
/**
* Implementation of {@link DocsNodesAndPositionsEnum} for the SIREn 1.0
* postings format.
*
* <p>
*
* Relies on lazy-loading of the {@link BlockIndexInput}s as much as possible.
*/
class Siren10DocsNodesAndPositionsEnum extends DocsNodesAndPositionsEnum {
// Number of documents (term's docFreq) and number of blocks of the current
// posting; both are copied from the term state in init().
int docLimit;
int blockLimit;
// Current document identifier; -1 until the first document has been read.
int doc = -1;
// Number of documents consumed so far in the current posting.
int docCount;
// Node frequency of the current document and term frequency of the current
// node. Both are loaded lazily; they are reset to 0 when moving on.
int nodFreq = 0;
int termFreqInNode = 0;
// Current node and current position.
IntsRef node = new IntsRef(new int[] { -1 }, 0, 1);;
int pos = -1;
// flag to know if nextNode() has been called
boolean termFreqInNodeReadPending = false;
// Counters of the entries that still have to be read from the block readers:
// - node frequencies: +1 per document read, -1 per node frequency read;
// - nodes and term frequencies: + node frequency, -1 per entry read;
// - positions: + term frequency in node, -1 per position read.
private int pendingNodFreqCount;
private int pendingNodCount;
private int pendingTermFreqInNodeCount;
private int pendingPosNodCount;
// Documents for which liveDocs.get(doc) is false are skipped; may be null.
private Bits liveDocs;
private final DocsFreqBlockIndexInput.DocsFreqBlockReader docReader;
private final NodBlockIndexInput.NodBlockReader nodReader;
private final PosBlockIndexInput.PosBlockReader posReader;
// File pointer into the skip file for the current posting.
private long skipFP;
private final BlockIndexInput.Index docIndex;
private final BlockIndexInput.Index nodIndex;
private final BlockIndexInput.Index posIndex;
// The docs input this enum was created over; compared against the docs input
// of the reader in canReuse().
private final DocsFreqBlockIndexInput startDocIn;
// Whether the skip list reader has been initialised for the current posting.
boolean skipped;
// Skip list reader, created on the first call to skipTo that needs it.
Siren10SkipListReader skipper;
/**
 * Creates one block reader and one block index per postings file (docs,
 * nodes, positions) and registers the node and position indexes in the doc
 * block reader.
 */
Siren10DocsNodesAndPositionsEnum() throws IOException {
  // docs file
  this.startDocIn = docIn;
  this.docReader = docIn.getBlockReader();
  this.docIndex = docIn.index();

  // nodes file
  this.nodReader = nodIn.getBlockReader();
  this.nodIndex = nodIn.index();

  // positions file
  this.posReader = posIn.getBlockReader();
  this.posIndex = posIn.index();

  // the doc reader gets the node and pos indexes registered with it
  this.docReader.setNodeBlockIndex(this.nodIndex);
  this.docReader.setPosBlockIndex(this.posIndex);
}
/**
 * Positions this enum at the start of the postings of the given term.
 * <p>
 * Re-initialises the three block readers, seeks the doc reader to the doc
 * index of the term, copies the limits and the skip pointer of the term and
 * resets the whole iteration state.
 *
 * @param fieldInfo unused
 * @param termState the state of the term to iterate over
 * @param liveDocs the live documents, or {@code null}
 * @return this enum
 */
Siren10DocsNodesAndPositionsEnum init(final FieldInfo fieldInfo,
                                      final Siren10TermState termState,
                                      final Bits liveDocs)
throws IOException {
  this.liveDocs = liveDocs;

  // Init readers
  docReader.init();
  nodReader.init();
  posReader.init();

  // TODO: can't we only do this if consumer
  // skipped consuming the previous docs?
  docIndex.set(termState.docIndex);
  docIndex.seek(docReader);

  // limits of the posting
  docLimit = termState.docFreq;
  blockLimit = termState.blockCount;
  // NOTE: unused if blockCount < skipMinimum:
  skipFP = termState.skipFP;

  // iteration state
  docCount = 0;
  doc = -1;
  skipped = false;
  this.resetFreqNodAndPos();
  this.resetPendingCounters();

  return this;
}
/**
 * Sets the four pending counters back to 0 and clears the flag signalling
 * that a term frequency has to be read.
 */
private void resetPendingCounters() {
  pendingNodFreqCount = pendingNodCount = 0;
  pendingTermFreqInNodeCount = pendingPosNodCount = 0;
  termFreqInNodeReadPending = false;
}
/**
 * Value given to the current node by {@code resetFreqNodAndPos()}: a node of
 * length 1 holding the single value -1. The same instance is shared by all
 * the resets of this enum.
 * <p>
 * TODO: is it needed ?
 */
private final IntsRef UNSET_NODE = new IntsRef(new int[] { -1 }, 0, 1);
/**
 * Puts the node frequency and the term frequency back to 0, which marks them
 * as not loaded yet, and the current node and position back to their unset
 * values.
 */
private void resetFreqNodAndPos() {
  // 0 means "to be lazily loaded"
  termFreqInNode = 0;
  nodFreq = 0;
  pos = -1;
  node = UNSET_NODE;
}
/**
 * Moves to the next document of the posting for which {@code liveDocs} is
 * either {@code null} or returns {@code true}.
 * <p>
 * Only the document identifier is decoded here: the frequencies, nodes and
 * positions of the documents passed over are accounted for in the pending
 * counters and decoded later, on demand.
 *
 * @return {@code false} once {@code docLimit} documents have been consumed,
 * in which case the document, node and position are set to their
 * {@code NO_MORE_*} sentinel values; {@code true} otherwise
 */
@Override
public boolean nextDocument() throws IOException {
do {
if (docCount == docLimit) {
doc = NO_MORE_DOC;
node = NO_MORE_NOD;
pos = NO_MORE_POS;
// to stop reading and decoding data in #nextNode and #nextPosition
this.resetPendingCounters();
return false;
}
docCount++;
// If block exhausted, decode next block
if (docReader.isExhausted()) {
docReader.nextBlock();
nodIndex.seek(nodReader); // move node reader to next block
nodReader.nextBlock(); // doc and node blocks are synchronised
posIndex.seek(posReader); // move pos reader to next block
posReader.nextBlock(); // doc and pos blocks are synchronised
this.resetPendingCounters(); // reset counters as we move to next block
}
// decode next doc
doc = docReader.nextDocument();
this.resetFreqNodAndPos(); // reset freqs, node and pos
termFreqInNodeReadPending = false; // reset flag
// increment node freq pending counters
pendingNodFreqCount++;
// loop again if the document just read is filtered out by liveDocs
} while (liveDocs != null && !liveDocs.get(doc));
return true;
}
/**
 * Moves to the next node of the current document.
 * <p>
 * The node frequency of the document is loaded first. The nodes still
 * pending from documents that were passed over are then read and discarded,
 * before the next node of the current document is read.
 *
 * @return {@code true} if a node was read; {@code false} if no node is
 * pending any more, in which case the current node is set to
 * {@code NO_MORE_NOD}
 */
@Override
public boolean nextNode() throws IOException {
termFreqInNode = 0; // lazy load of freq
termFreqInNodeReadPending = true;
pos = -1; // reset position
final int nodeFreqInDoc = this.nodeFreqInDoc(); // load node freq
// scan over any nodes that were ignored during doc iteration
while (pendingNodCount > nodeFreqInDoc) {
// no need to check for exhaustion as doc and node blocks are synchronised
node = nodReader.nextNode();
pendingNodCount--;
}
if (pendingNodCount > 0) {
// all the nodes of the document are still pending: none has been read yet
if (pendingNodCount == nodeFreqInDoc) { // start of the new doc
// reset current node for delta computation
nodReader.resetCurrentNode();
}
// no need to check for exhaustion as doc and node blocks are synchronised
node = nodReader.nextNode();
pendingNodCount--;
assert pendingNodCount >= 0;
return true;
}
assert pendingNodCount == 0;
node = NO_MORE_NOD; // set to sentinel value
return false;
}
/**
 * Moves to the first document, strictly after the current one, whose
 * identifier is greater than or equal to {@code target}.
 * <p>
 * The skip list is consulted only when the target is at least
 * {@code blockSkipInterval * maxBlockSize} documents ahead of the current
 * document and the posting holds at least
 * {@code blockSkipMinimum * maxBlockSize} documents. The remaining distance
 * is covered by calling {@link #nextDocument()} repeatedly; since that scan
 * always performs at least one call, this method always advances.
 *
 * @param target the document identifier to reach
 * @return {@code false} if the posting was exhausted before reaching the
 * target, {@code true} otherwise
 */
@Override
public boolean skipTo(final int target) throws IOException {
if ((target - (blockSkipInterval * maxBlockSize)) >= doc &&
docLimit >= (blockSkipMinimum * maxBlockSize)) {
// There are enough docs in the posting to have
// skip data, and it's not too close
if (skipper == null) {
// This DocsEnum has never done any skipping
skipper = new Siren10SkipListReader(skipIn.clone(),
docIn, maxSkipLevels,
blockSkipInterval, maxBlockSize);
}
if (!skipped) {
// We haven't yet skipped for this posting
skipper.init(skipFP, docIndex, blockLimit);
skipper.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
skipped = true;
}
final int newCount = skipper.skipTo(target);
if (newCount > docCount) {
// Skipper did move
skipper.getDocIndex().seek(docReader);
docCount = newCount;
doc = skipper.getDoc();
// reset so that it is considered exhausted in #nextDocument and moves
// to the next block
docReader.initBlock();
}
}
// Now, linear scan for the rest:
// TODO: Implement linear block skipping based on first and last doc ids
do {
if (!this.nextDocument()) {
return false;
}
} while (target > doc);
return true;
}
/**
 * Returns the identifier of the current document.
 */
@Override
public int doc() {
  return this.doc;
}
/**
 * Returns the current node.
 */
@Override
public IntsRef node() {
  return this.node;
}
/**
 * Moves to the next position of the term in the current node.
 * <p>
 * The term frequency of the node is loaded first. The positions still
 * pending from nodes that were passed over are then read and discarded,
 * before the next position of the current node is read.
 *
 * @return {@code true} if a position was read; {@code false} if no position
 * is pending any more, in which case the current position is set to
 * {@code NO_MORE_POS}
 */
@Override
public boolean nextPosition() throws IOException {
final int termFreqInNode = this.termFreqInNode(); // load term freq
// scan over any positions that were ignored during doc iteration
while (pendingPosNodCount > termFreqInNode) {
// no need to check for exhaustion as doc and pos blocks are synchronised
pos = posReader.nextPosition();
pendingPosNodCount--;
}
assert pendingPosNodCount <= termFreqInNode;
if (pendingPosNodCount > 0) {
// all the positions of the node are still pending: none has been read yet
if (pendingPosNodCount == termFreqInNode) { // start of the new node
// reset current position for delta computation
posReader.resetCurrentPosition();
}
// no need to check for exhaustion as doc and pos blocks are synchronised
pos = posReader.nextPosition();
pendingPosNodCount--;
assert pendingPosNodCount >= 0;
return true;
}
assert pendingPosNodCount == 0;
pos = NO_MORE_POS; // set to sentinel value
return false;
}
/**
 * Returns the current position.
 */
@Override
public int pos() {
  return this.pos;
}
/**
 * Returns the node frequency of the current document, loading it on the
 * first call.
 * <p>
 * Loading reads every pending node frequency, including those of the
 * documents that were passed over, and adds each of them to the number of
 * pending nodes and of pending term frequencies.
 */
@Override
public int nodeFreqInDoc() throws IOException {
  // already loaded for this document
  if (nodFreq != 0) {
    return nodFreq;
  }
  // read the frequencies ignored during doc iteration, ending with the
  // frequency of the current document
  while (pendingNodFreqCount > 0) {
    final int freq = docReader.nextNodeFreq();
    nodFreq = freq;
    pendingNodFreqCount--;
    pendingNodCount += freq;
    pendingTermFreqInNodeCount += freq;
  }
  return nodFreq;
}
/**
 * Returns the term frequency of the current node.
 * <p>
 * The frequency is read only once per node, on the first call following
 * {@link #nextNode()}; the term frequencies still pending from nodes that
 * were passed over are read and discarded beforehand.
 */
@Override
public int termFreqInNode() throws IOException {
  // nothing to read unless nextNode has been called since the last read
  if (!termFreqInNodeReadPending) {
    return termFreqInNode;
  }
  // scan over any freqs that were ignored during doc iteration
  while (pendingTermFreqInNodeCount > nodFreq) {
    this.readNextTermFreqInNode();
  }
  // the freq of the current node
  this.readNextTermFreqInNode();
  termFreqInNodeReadPending = false;
  return termFreqInNode;
}

/**
 * Reads one term frequency from the node reader and updates the pending
 * counters of term frequencies and of positions accordingly.
 */
private void readNextTermFreqInNode() throws IOException {
  termFreqInNode = nodReader.nextTermFreqInNode();
  pendingTermFreqInNodeCount--;
  pendingPosNodCount += termFreqInNode;
}
}
}