package org.apache.lucene.index.codecs.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.packed.PackedInts;
import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
/**
 * Writes a terms index file in which every {@code termIndexInterval}-th term
 * of each field (starting with the first) is recorded as an index term.
 *
 * <p>Layout of the file as produced by this class:
 * <ol>
 *   <li>codec header ({@link #CODEC_NAME}, {@link #VERSION_CURRENT}), followed
 *       by a {@code long} slot that is back-patched with the directory offset
 *       when the writer is closed, followed by the {@code int}
 *       {@code termIndexInterval};</li>
 *   <li>per field: the (prefix-trimmed) index term bytes, then a packed array
 *       of terms-dictionary pointers, then a packed array of offsets into the
 *       index term bytes;</li>
 *   <li>a directory with one entry per field, written by {@link #close()}.</li>
 * </ol>
 *
 * @lucene.experimental
 */
public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {

  /** Output the terms index is written to; closed by {@link #close()}. */
  protected final IndexOutput out;

  final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
  final static int VERSION_START = 0;
  final static int VERSION_CURRENT = VERSION_START;

  /** Every termIndexInterval-th term of a field becomes an index term. */
  final private int termIndexInterval;

  /** One writer per field added, in the order the fields were added. */
  private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();

  private final FieldInfos fieldInfos; // unread

  /** Terms dictionary output; only its file pointer is read by this class. */
  private IndexOutput termsOut;

  /**
   * Creates the terms index file for the segment described by {@code state},
   * registers its name in {@code state.flushedFiles}, and writes the header
   * followed by the term index interval.
   *
   * <p>If writing the header fails, the freshly created output is closed
   * before the exception propagates so the file handle is not leaked.
   *
   * @param state segment write state supplying the directory, segment name,
   *              field infos and term index interval
   * @throws IOException if the output cannot be created or written
   */
  public SimpleStandardTermsIndexWriter(SegmentWriteState state) throws IOException {
    final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, "", StandardCodec.TERMS_INDEX_EXTENSION);
    state.flushedFiles.add(indexFileName);
    termIndexInterval = state.termIndexInterval;
    out = state.directory.createOutput(indexFileName);
    fieldInfos = state.fieldInfos;

    boolean success = false;
    try {
      writeHeader(out);
      out.writeInt(termIndexInterval);
      success = true;
    } finally {
      if (!success) {
        // Nobody else can reach 'out' if construction fails, so release it here.
        closeSuppressingFailure(out);
      }
    }
  }

  /**
   * Writes the codec header and reserves a {@code long} that
   * {@link #writeTrailer(long)} later overwrites with the directory offset.
   *
   * @param out output to write the header to
   * @throws IOException if writing fails
   */
  protected void writeHeader(IndexOutput out) throws IOException {
    CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
    // Placeholder for dir offset
    out.writeLong(0);
  }

  /**
   * Sets the terms dictionary output. Its file pointer is sampled when a field
   * is added, for every index term, and when a field is finished, so it must
   * be set before {@link #addField(FieldInfo)} is called.
   */
  @Override
  public void setTermsOutput(IndexOutput termsOut) {
    this.termsOut = termsOut;
  }

  /**
   * Starts a new field and returns the writer that collects its index terms.
   * The writer is remembered so that {@link #close()} can emit its directory
   * entry.
   */
  @Override
  public FieldWriter addField(FieldInfo field) {
    SimpleFieldWriter writer = new SimpleFieldWriter(field);
    fields.add(writer);
    return writer;
  }

  /** Collects and writes the index terms of a single field. */
  private class SimpleFieldWriter extends FieldWriter {
    final FieldInfo fieldInfo;

    /** Number of index terms written so far for this field. */
    int numIndexTerms;

    /** Position in {@code out} where this field's index term bytes begin. */
    final long indexStart;

    /** Position in {@code termsOut} at the time this field was added. */
    final long termsStart;

    /** Position in {@code out} of the packed terms-dictionary pointers. */
    long packedIndexStart;

    /** Position in {@code out} of the packed offsets into the term bytes. */
    long packedOffsetsStart;

    /** Number of terms passed to {@link #checkIndexTerm}, indexed or not. */
    private long numTerms;

    // TODO: we could conceivably make a PackedInts wrapper
    // that auto-grows... then we wouldn't force 6 bytes RAM
    // per index term:
    private short[] termLengths;
    private int[] termsPointerDeltas;

    /** {@code termsOut} file pointer recorded for the previous index term. */
    private long lastTermsPointer;

    /** Sum of the byte lengths of all index terms written for this field. */
    private long totTermLength;

    /** Term seen immediately before the next index term; used to trim suffixes. */
    private final BytesRef lastTerm = new BytesRef();

    SimpleFieldWriter(FieldInfo fieldInfo) {
      this.fieldInfo = fieldInfo;
      indexStart = out.getFilePointer();
      termsStart = lastTermsPointer = termsOut.getFilePointer();
      termLengths = new short[0];
      termsPointerDeltas = new int[0];
    }

    /**
     * Decides whether {@code text} is an index term and, if so, writes it.
     *
     * <p>Every {@code termIndexInterval}-th term is indexed, beginning with the
     * first. Only the shortest prefix of the term that differs from the
     * preceding term is written.
     *
     * @param text    the term being added to the terms dictionary
     * @param docFreq not used by this implementation
     * @return {@code true} if the term was recorded as an index term
     * @throws IOException if writing the term bytes fails
     */
    @Override
    public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
      // First term is first indexed term:
      if (0 == (numTerms++ % termIndexInterval)) {

        // we can safely strip off the non-distinguishing
        // suffix to save RAM in the loaded terms index.
        final int limit = Math.min(lastTerm.length, text.length);

        // Default covers the case where one term is a prefix of the other:
        // one byte past the shared prefix, capped at the term's own length.
        int minPrefixDiff = Math.min(1+lastTerm.length, text.length);
        for(int byteIdx=0;byteIdx<limit;byteIdx++) {
          if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
            minPrefixDiff = byteIdx+1;
            break;
          }
        }

        // write only the min prefix that shows the diff
        // against prior term
        out.writeBytes(text.bytes, text.offset, minPrefixDiff);

        if (termLengths.length == numIndexTerms) {
          termLengths = ArrayUtil.grow(termLengths);
        }
        if (termsPointerDeltas.length == numIndexTerms) {
          termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
        }

        // save delta terms pointer
        final long fp = termsOut.getFilePointer();
        final long delta = fp - lastTermsPointer;
        // The delta is stored as an int; a larger value would be silently truncated.
        assert delta <= Integer.MAX_VALUE;
        termsPointerDeltas[numIndexTerms] = (int) delta;
        lastTermsPointer = fp;

        // save term length (in bytes)
        assert minPrefixDiff <= Short.MAX_VALUE;
        termLengths[numIndexTerms] = (short) minPrefixDiff;
        totTermLength += minPrefixDiff;

        // Needed when termIndexInterval is 1: then every term takes this
        // branch and the else-branch below never refreshes lastTerm.
        lastTerm.copy(text);
        numIndexTerms++;
        return true;
      } else {
        // numTerms was already incremented above, so this is true exactly for
        // the term that immediately precedes the next index term.
        if (0 == numTerms % termIndexInterval) {
          // save last term just before next index term so we
          // can compute wasted suffix
          lastTerm.copy(text);
        }
        return false;
      }
    }

    /**
     * Writes the two packed arrays for this field and releases the in-memory
     * buffers: first the terms-dictionary pointer of every index term, then
     * {@code numIndexTerms + 1} offsets delimiting each index term inside the
     * term bytes written by {@link #checkIndexTerm}.
     *
     * @throws IOException if writing fails
     */
    @Override
    public void finish() throws IOException {

      // write primary terms dict offsets
      packedIndexStart = out.getFilePointer();

      // Absolute file pointer: an upper bound for the relative values below.
      final long maxValue = termsOut.getFilePointer();
      PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));

      // relative to this field's termsStart (the deltas accumulate from there)
      long upto = 0;
      for(int i=0;i<numIndexTerms;i++) {
        upto += termsPointerDeltas[i];
        w.add(upto);
      }
      w.finish();

      packedOffsetsStart = out.getFilePointer();

      // write offsets into the byte[] terms
      w = PackedInts.getWriter(out, 1+numIndexTerms, PackedInts.bitsRequired(totTermLength));
      upto = 0;
      for(int i=0;i<numIndexTerms;i++) {
        w.add(upto);
        upto += termLengths[i];
      }
      // trailing sentinel: end offset of the last term
      w.add(upto);
      w.finish();

      // our referrer holds onto us, while other fields are
      // being written, so don't tie up this RAM:
      termLengths = null;
      termsPointerDeltas = null;
    }
  }

  /**
   * Writes the per-field directory, back-patches the directory offset into the
   * header, and closes the output.
   *
   * <p>The output is closed even if writing the directory or trailer fails; in
   * that case the original exception is the one that propagates.
   *
   * @throws IOException if writing or closing fails
   */
  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      final long dirStart = out.getFilePointer();
      final int fieldCount = fields.size();
      out.writeInt(fieldCount);
      for(int i=0;i<fieldCount;i++) {
        SimpleFieldWriter field = fields.get(i);
        out.writeInt(field.fieldInfo.number);
        out.writeInt(field.numIndexTerms);
        out.writeLong(field.termsStart);
        out.writeLong(field.indexStart);
        out.writeLong(field.packedIndexStart);
        out.writeLong(field.packedOffsetsStart);
      }
      writeTrailer(dirStart);
      success = true;
    } finally {
      if (success) {
        out.close();
      } else {
        closeSuppressingFailure(out);
      }
    }
  }

  /**
   * Seeks back to the slot reserved by {@link #writeHeader(IndexOutput)} and
   * stores the directory offset there.
   *
   * @param dirStart file pointer at which the field directory begins
   * @throws IOException if seeking or writing fails
   */
  protected void writeTrailer(long dirStart) throws IOException {
    out.seek(CodecUtil.headerLength(CODEC_NAME));
    out.writeLong(dirStart);
  }

  /**
   * Closes {@code output} on a failure path, discarding anything the close
   * itself throws so that the exception already propagating is not replaced.
   */
  private static void closeSuppressingFailure(IndexOutput output) {
    try {
      output.close();
    } catch (Throwable ignored) {
      // Deliberately ignored: the caller is already unwinding with the
      // exception that caused the failure, and that is the one to report.
    }
  }
}