package org.apache.lucene.index.codecs.sep;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
u * contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
/** Writes frq to .frq, docs to .doc, pos to .pos, payloads
* to .pyl, skip data to .skp
*
* @lucene.experimental */
public final class SepPostingsWriterImpl extends StandardPostingsWriter {
final static String CODEC = "SepDocFreqSkip";
final static String DOC_EXTENSION = "doc";
final static String SKIP_EXTENSION = "skp";
final static String FREQ_EXTENSION = "frq";
final static String POS_EXTENSION = "pos";
final static String PAYLOAD_EXTENSION = "pyl";
// Increment version to change it:
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
final IntIndexOutput freqOut;
final IntIndexOutput.Index freqIndex;
final IntIndexOutput posOut;
final IntIndexOutput.Index posIndex;
final IntIndexOutput docOut;
final IntIndexOutput.Index docIndex;
final IndexOutput payloadOut;
final IndexOutput skipOut;
IndexOutput termsOut;
final SepSkipListWriter skipListWriter;
final int skipInterval;
final int maxSkipLevels;
final int totalNumDocs;
boolean storePayloads;
boolean omitTF;
// Starts a new term
long lastSkipStart;
FieldInfo fieldInfo;
int lastPayloadLength;
int lastPosition;
long payloadStart;
long lastPayloadStart;
int lastDocID;
int df;
public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException {
super();
final String docFileName = IndexFileNames.segmentFileName(state.segmentName, "", DOC_EXTENSION);
state.flushedFiles.add(docFileName);
docOut = factory.createOutput(state.directory, docFileName);
docIndex = docOut.index();
if (state.fieldInfos.hasProx()) {
final String frqFileName = IndexFileNames.segmentFileName(state.segmentName, "", FREQ_EXTENSION);
state.flushedFiles.add(frqFileName);
freqOut = factory.createOutput(state.directory, frqFileName);
freqIndex = freqOut.index();
final String posFileName = IndexFileNames.segmentFileName(state.segmentName, "", POS_EXTENSION);
posOut = factory.createOutput(state.directory, posFileName);
state.flushedFiles.add(posFileName);
posIndex = posOut.index();
// TODO: -- only if at least one field stores payloads?
final String payloadFileName = IndexFileNames.segmentFileName(state.segmentName, "", PAYLOAD_EXTENSION);
state.flushedFiles.add(payloadFileName);
payloadOut = state.directory.createOutput(payloadFileName);
} else {
freqOut = null;
freqIndex = null;
posOut = null;
posIndex = null;
payloadOut = null;
}
final String skipFileName = IndexFileNames.segmentFileName(state.segmentName, "", SKIP_EXTENSION);
state.flushedFiles.add(skipFileName);
skipOut = state.directory.createOutput(skipFileName);
totalNumDocs = state.numDocs;
// TODO: -- abstraction violation
skipListWriter = new SepSkipListWriter(state.skipInterval,
state.maxSkipLevels,
state.numDocs,
freqOut, docOut,
posOut, payloadOut);
skipInterval = state.skipInterval;
maxSkipLevels = state.maxSkipLevels;
}
@Override
public void start(IndexOutput termsOut) throws IOException {
this.termsOut = termsOut;
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
// TODO: -- just ask skipper to "start" here
termsOut.writeInt(skipInterval); // write skipInterval
termsOut.writeInt(maxSkipLevels); // write maxSkipLevels
}
@Override
public void startTerm() throws IOException {
docIndex.mark();
if (!omitTF) {
freqIndex.mark();
posIndex.mark();
payloadStart = payloadOut.getFilePointer();
lastPayloadLength = -1;
}
skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
}
// TODO: -- should we NOT reuse across fields? would
// be cleaner
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public void setField(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
omitTF = fieldInfo.omitTermFreqAndPositions;
skipListWriter.setOmitTF(omitTF);
storePayloads = !omitTF && fieldInfo.storePayloads;
}
/** Adds a new doc in this term. If this returns null
* then we just skip consuming positions/payloads. */
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
final int delta = docID - lastDocID;
if (docID < 0 || (df > 0 && delta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
}
if ((++df % skipInterval) == 0) {
// TODO: -- awkward we have to make these two
// separate calls to skipper
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.bufferSkip(df);
}
lastDocID = docID;
docOut.write(delta);
if (!omitTF) {
freqOut.write(termDocFreq);
}
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
assert !omitTF;
final int delta = position - lastPosition;
lastPosition = position;
if (storePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
// TODO: explore whether we get better compression
// by not storing payloadLength into prox stream?
posOut.write((delta<<1)|1);
posOut.write(payloadLength);
} else {
posOut.write(delta << 1);
}
if (payloadLength > 0) {
payloadOut.writeBytes(payload.bytes, payload.offset, payloadLength);
}
} else {
posOut.write(delta);
}
lastPosition = position;
}
/** Called when we are done adding positions & payloads */
@Override
public void finishDoc() {
lastPosition = 0;
}
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
long skipPos = skipOut.getFilePointer();
// TODO: -- wasteful we are counting this in two places?
assert docCount > 0;
assert docCount == df;
// TODO: -- only do this if once (consolidate the
// conditional things that are written)
if (!omitTF) {
freqIndex.write(termsOut, isIndexTerm);
}
docIndex.write(termsOut, isIndexTerm);
if (df >= skipInterval) {
skipListWriter.writeSkip(skipOut);
}
if (isIndexTerm) {
termsOut.writeVLong(skipPos);
lastSkipStart = skipPos;
} else if (df >= skipInterval) {
termsOut.writeVLong(skipPos-lastSkipStart);
lastSkipStart = skipPos;
}
if (!omitTF) {
posIndex.write(termsOut, isIndexTerm);
if (isIndexTerm) {
// Write absolute at seek points
termsOut.writeVLong(payloadStart);
} else {
termsOut.writeVLong(payloadStart-lastPayloadStart);
}
lastPayloadStart = payloadStart;
}
lastDocID = 0;
df = 0;
}
@Override
public void close() throws IOException {
try {
docOut.close();
} finally {
try {
skipOut.close();
} finally {
if (freqOut != null) {
try {
freqOut.close();
} finally {
try {
posOut.close();
} finally {
payloadOut.close();
}
}
}
}
}
}
public static void getExtensions(Set<String> extensions) {
extensions.add(DOC_EXTENSION);
extensions.add(FREQ_EXTENSION);
extensions.add(SKIP_EXTENSION);
extensions.add(POS_EXTENSION);
extensions.add(PAYLOAD_EXTENSION);
}
}