package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.TreeSet;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.MissingOrdRemapper;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosReader.LegacyDocValuesType;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/**
 * Writes doc values in the legacy Lucene 4.0 on-disk formats.
 * <p>
 * All per-field files are created inside a single {@link CompoundFileDirectory}
 * (opened in the constructor, closed in {@link #close()}). For each field this
 * writer also records which legacy encoding was chosen by storing a
 * {@link LegacyDocValuesType} name under {@code legacyKey} in the field's
 * attributes, so the matching reader can pick the right decoder.
 * <p>
 * NOTE(review): this simulates the historical 4.0 writer, including its quirks
 * (e.g. missing values collapse to 0 / empty bytes — 4.0 did not distinguish
 * missing from default).
 */
class Lucene40DocValuesWriter extends DocValuesConsumer {
  private final Directory dir;
  private final SegmentWriteState state;
  /** Attribute key under which the chosen {@link LegacyDocValuesType} is stored per field. */
  private final String legacyKey;
  private final static String segmentSuffix = "dv";

  // note: intentionally ignores seg suffix
  Lucene40DocValuesWriter(SegmentWriteState state, String filename, String legacyKey) throws IOException {
    this.state = state;
    this.legacyKey = legacyKey;
    // all output files for this segment's doc values live in one compound file
    this.dir = new CompoundFileDirectory(state.directory, filename, state.context, true);
  }

  /**
   * Writes a numeric field, choosing among fixed 8/16/32-bit or variable
   * encodings based on the observed min/max range.
   * <p>
   * A first pass over {@code values} computes min/max (so {@code values} must
   * be re-iterable); the chosen sub-writer then makes its own pass. A fixed
   * width is used only when packed ints would need more bits than half the
   * fixed width (e.g. &gt; 4 bits-per-value for bytes), otherwise var-ints win.
   * Null values are treated as 0.
   */
  @Override
  public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
    // examine the values to determine best type to use
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    for (Number n : values) {
      long v = n == null ? 0 : n.longValue();
      minValue = Math.min(minValue, v);
      maxValue = Math.max(maxValue, v);
    }

    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "dat");
    IndexOutput data = dir.createOutput(fileName, state.context);
    boolean success = false;
    try {
      if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE && PackedInts.bitsRequired(maxValue-minValue) > 4) {
        // fits in a byte[], would be more than 4bpv, just write byte[]
        addBytesField(field, data, values);
      } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE && PackedInts.bitsRequired(maxValue-minValue) > 8) {
        // fits in a short[], would be more than 8bpv, just write short[]
        addShortsField(field, data, values);
      } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE && PackedInts.bitsRequired(maxValue-minValue) > 16) {
        // fits in a int[], would be more than 16bpv, just write int[]
        addIntsField(field, data, values);
      } else {
        // note: also the fallback for an empty field (min > max makes delta negative)
        addVarIntsField(field, data, values, minValue, maxValue);
      }
      success = true;
    } finally {
      // close normally on success; suppress secondary exceptions on failure
      if (success) {
        IOUtils.close(data);
      } else {
        IOUtils.closeWhileHandlingException(data);
      }
    }
  }

  /** Writes FIXED_INTS_8: header, record size (1), then one byte per document. */
  private void addBytesField(FieldInfo field, IndexOutput output, Iterable<Number> values) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.FIXED_INTS_8.name());
    CodecUtil.writeHeader(output,
                          Lucene40DocValuesFormat.INTS_CODEC_NAME,
                          Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
    output.writeInt(1); // size
    for (Number n : values) {
      output.writeByte(n == null ? 0 : n.byteValue());
    }
  }

  /** Writes FIXED_INTS_16: header, record size (2), then one short per document. */
  private void addShortsField(FieldInfo field, IndexOutput output, Iterable<Number> values) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.FIXED_INTS_16.name());
    CodecUtil.writeHeader(output,
                          Lucene40DocValuesFormat.INTS_CODEC_NAME,
                          Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
    output.writeInt(2); // size
    for (Number n : values) {
      output.writeShort(n == null ? 0 : n.shortValue());
    }
  }

  /** Writes FIXED_INTS_32: header, record size (4), then one int per document. */
  private void addIntsField(FieldInfo field, IndexOutput output, Iterable<Number> values) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.FIXED_INTS_32.name());
    CodecUtil.writeHeader(output,
                          Lucene40DocValuesFormat.INTS_CODEC_NAME,
                          Lucene40DocValuesFormat.INTS_VERSION_CURRENT);
    output.writeInt(4); // size
    for (Number n : values) {
      output.writeInt(n == null ? 0 : n.intValue());
    }
  }

  /**
   * Writes VAR_INTS. If {@code maxValue - minValue} overflows (delta &lt; 0),
   * falls back to raw fixed 64-bit longs; otherwise writes values packed as
   * deltas from {@code minValue}.
   */
  private void addVarIntsField(FieldInfo field, IndexOutput output, Iterable<Number> values, long minValue, long maxValue) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.VAR_INTS.name());

    CodecUtil.writeHeader(output,
                          Lucene40DocValuesFormat.VAR_INTS_CODEC_NAME,
                          Lucene40DocValuesFormat.VAR_INTS_VERSION_CURRENT);

    final long delta = maxValue - minValue;

    if (delta < 0) {
      // writes longs
      output.writeByte(Lucene40DocValuesFormat.VAR_INTS_FIXED_64);
      for (Number n : values) {
        output.writeLong(n == null ? 0 : n.longValue());
      }
    } else {
      // writes packed ints
      output.writeByte(Lucene40DocValuesFormat.VAR_INTS_PACKED);
      output.writeLong(minValue);
      output.writeLong(0 - minValue); // default value (representation of 0)
      PackedInts.Writer writer = PackedInts.getWriter(output,
                                                      state.segmentInfo.getDocCount(),
                                                      PackedInts.bitsRequired(delta),
                                                      PackedInts.DEFAULT);
      for (Number n : values) {
        long v = n == null ? 0 : n.longValue();
        writer.add(v - minValue);
      }
      writer.finish();
    }
  }

  /**
   * Writes a binary field, choosing among four legacy layouts:
   * fixed/variable length crossed with straight (per-doc) or deref
   * (deduplicated dictionary + per-doc address) storage.
   * <p>
   * Dedup tracking is abandoned once more than 256 distinct values are seen
   * (the set is nulled out); deref is used only when distinct values would
   * each be shared by at least two docs on average
   * ({@code uniqueValues.size() * 2 < maxDoc}). Null values become empty
   * bytes. Requires {@code values} to be re-iterable.
   *
   * @throws IllegalArgumentException if any value exceeds
   *         {@link Lucene40DocValuesFormat#MAX_BINARY_FIELD_LENGTH}
   */
  @Override
  public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
    // examine the values to determine best type to use
    HashSet<BytesRef> uniqueValues = new HashSet<>();
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    for (BytesRef b : values) {
      if (b == null) {
        b = new BytesRef(); // 4.0 doesnt distinguish
      }
      if (b.length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
        throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
      }
      minLength = Math.min(minLength, b.length);
      maxLength = Math.max(maxLength, b.length);
      if (uniqueValues != null) {
        // deep copy: incoming refs may be reused by the producer
        if (uniqueValues.add(BytesRef.deepCopyOf(b))) {
          if (uniqueValues.size() > 256) {
            uniqueValues = null; // too many distinct values: give up on dedup
          }
        }
      }
    }

    int maxDoc = state.segmentInfo.getDocCount();
    final boolean fixed = minLength == maxLength;
    final boolean dedup = uniqueValues != null && uniqueValues.size() * 2 < maxDoc;

    if (dedup) {
      // we will deduplicate and deref values
      boolean success = false;
      IndexOutput data = null;
      IndexOutput index = null;
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "dat");
      String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "idx");
      try {
        data = dir.createOutput(dataName, state.context);
        index = dir.createOutput(indexName, state.context);
        if (fixed) {
          addFixedDerefBytesField(field, data, index, values, minLength);
        } else {
          addVarDerefBytesField(field, data, index, values);
        }
        success = true;
      } finally {
        if (success) {
          IOUtils.close(data, index);
        } else {
          IOUtils.closeWhileHandlingException(data, index);
        }
      }
    } else {
      // we dont deduplicate, just write values straight
      if (fixed) {
        // fixed byte[]: only a .dat file is needed
        String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "dat");
        IndexOutput data = dir.createOutput(fileName, state.context);
        boolean success = false;
        try {
          addFixedStraightBytesField(field, data, values, minLength);
          success = true;
        } finally {
          if (success) {
            IOUtils.close(data);
          } else {
            IOUtils.closeWhileHandlingException(data);
          }
        }
      } else {
        // variable byte[]: needs both .dat (bytes) and .idx (addresses)
        boolean success = false;
        IndexOutput data = null;
        IndexOutput index = null;
        String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "dat");
        String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "idx");
        try {
          data = dir.createOutput(dataName, state.context);
          index = dir.createOutput(indexName, state.context);
          addVarStraightBytesField(field, data, index, values);
          success = true;
        } finally {
          if (success) {
            IOUtils.close(data, index);
          } else {
            IOUtils.closeWhileHandlingException(data, index);
          }
        }
      }
    }
  }

  /**
   * Writes BYTES_FIXED_STRAIGHT: header, fixed length, then each document's
   * bytes back to back (nulls contribute nothing — all values have the same
   * length here, and a null only occurs when that length is 0).
   */
  private void addFixedStraightBytesField(FieldInfo field, IndexOutput output, Iterable<BytesRef> values, int length) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_FIXED_STRAIGHT.name());

    CodecUtil.writeHeader(output,
                          Lucene40DocValuesFormat.BYTES_FIXED_STRAIGHT_CODEC_NAME,
                          Lucene40DocValuesFormat.BYTES_FIXED_STRAIGHT_VERSION_CURRENT);

    output.writeInt(length);
    for (BytesRef v : values) {
      if (v != null) {
        output.writeBytes(v.bytes, v.offset, v.length);
      }
    }
  }

  // NOTE: 4.0 file format docs are crazy/wrong here...
  /**
   * Writes BYTES_VAR_STRAIGHT: concatenated bytes in .dat; in .idx the total
   * byte count followed by maxDoc+1 packed start addresses (the extra sentinel
   * entry is the end address, letting readers compute per-doc lengths).
   */
  private void addVarStraightBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_STRAIGHT.name());

    CodecUtil.writeHeader(data,
                          Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_DAT,
                          Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT);

    CodecUtil.writeHeader(index,
                          Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_IDX,
                          Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT);

    /* values */
    final long startPos = data.getFilePointer();
    for (BytesRef v : values) {
      if (v != null) {
        data.writeBytes(v.bytes, v.offset, v.length);
      }
    }

    /* addresses */
    final long maxAddress = data.getFilePointer() - startPos;
    index.writeVLong(maxAddress);

    final int maxDoc = state.segmentInfo.getDocCount();
    assert maxDoc != Integer.MAX_VALUE; // unsupported by the 4.0 impl

    final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc+1, PackedInts.bitsRequired(maxAddress), PackedInts.DEFAULT);
    long currentPosition = 0;
    for (BytesRef v : values) {
      w.add(currentPosition);
      if (v != null) {
        currentPosition += v.length;
      }
    }
    // write sentinel
    assert currentPosition == maxAddress;
    w.add(currentPosition);
    w.finish();
  }

  /**
   * Writes BYTES_FIXED_DEREF: a sorted dictionary of unique values in .dat;
   * in .idx the value count followed by one packed ord per document.
   * Per-doc ords are derived via {@code headSet(v).size()} — the number of
   * dictionary entries sorting before {@code v}.
   */
  private void addFixedDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values, int length) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_FIXED_DEREF.name());

    CodecUtil.writeHeader(data,
                          Lucene40DocValuesFormat.BYTES_FIXED_DEREF_CODEC_NAME_DAT,
                          Lucene40DocValuesFormat.BYTES_FIXED_DEREF_VERSION_CURRENT);

    CodecUtil.writeHeader(index,
                          Lucene40DocValuesFormat.BYTES_FIXED_DEREF_CODEC_NAME_IDX,
                          Lucene40DocValuesFormat.BYTES_FIXED_DEREF_VERSION_CURRENT);

    // deduplicate
    TreeSet<BytesRef> dictionary = new TreeSet<>();
    for (BytesRef v : values) {
      dictionary.add(v == null ? new BytesRef() : BytesRef.deepCopyOf(v));
    }

    /* values */
    data.writeInt(length);
    for (BytesRef v : dictionary) {
      data.writeBytes(v.bytes, v.offset, v.length);
    }

    /* ordinals */
    int valueCount = dictionary.size();
    assert valueCount > 0;
    index.writeInt(valueCount);
    final int maxDoc = state.segmentInfo.getDocCount();
    final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT);

    for (BytesRef v : values) {
      if (v == null) {
        v = new BytesRef();
      }
      int ord = dictionary.headSet(v).size();
      w.add(ord);
    }
    w.finish();
  }

  /**
   * Writes BYTES_VAR_DEREF: each unique value in .dat prefixed by its length
   * ({@link #writeVShort}); in .idx the total byte count followed by one
   * packed address per document pointing at that doc's value.
   */
  private void addVarDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_DEREF.name());

    CodecUtil.writeHeader(data,
                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT,
                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);

    CodecUtil.writeHeader(index,
                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX,
                          Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);

    // deduplicate
    TreeSet<BytesRef> dictionary = new TreeSet<>();
    for (BytesRef v : values) {
      dictionary.add(v == null ? new BytesRef() : BytesRef.deepCopyOf(v));
    }

    /* values */
    long startPosition = data.getFilePointer();
    long currentAddress = 0;
    HashMap<BytesRef,Long> valueToAddress = new HashMap<>();
    for (BytesRef v : dictionary) {
      currentAddress = data.getFilePointer() - startPosition;
      valueToAddress.put(v, currentAddress);
      writeVShort(data, v.length);
      data.writeBytes(v.bytes, v.offset, v.length);
    }

    /* ordinals */
    long totalBytes = data.getFilePointer() - startPosition;
    index.writeLong(totalBytes);
    final int maxDoc = state.segmentInfo.getDocCount();
    // currentAddress is now the address of the last (largest) entry
    final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(currentAddress), PackedInts.DEFAULT);

    for (BytesRef v : values) {
      w.add(valueToAddress.get(v == null ? new BytesRef() : v));
    }
    w.finish();
  }

  // the little vint encoding used for var-deref
  /**
   * Writes {@code i} (0..{@link Short#MAX_VALUE}) in one byte when &lt; 128,
   * otherwise in two bytes with the high bit of the first byte set.
   */
  private static void writeVShort(IndexOutput o, int i) throws IOException {
    assert i >= 0 && i <= Short.MAX_VALUE;
    if (i < 128) {
      o.writeByte((byte)i);
    } else {
      o.writeByte((byte) (0x80 | (i >> 8)));
      o.writeByte((byte) (i & 0xff));
    }
  }

  /**
   * Writes a sorted field as BYTES_FIXED_SORTED or BYTES_VAR_SORTED,
   * reproducing the 4.0 writer's handling of missing values (ord -1):
   * fixed layout is only used when all values share one length AND nothing is
   * missing; otherwise missing ords are remapped (to ord 0 if the empty
   * string is already a value, else by shifting all ords and inserting it).
   * Requires {@code values}/{@code docToOrd} to be re-iterable.
   */
  @Override
  public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    // examine the values to determine best type to use
    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    for (BytesRef b : values) {
      minLength = Math.min(minLength, b.length);
      maxLength = Math.max(maxLength, b.length);
    }

    // but dont use fixed if there are missing values (we are simulating how lucene40 wrote dv...)
    boolean anyMissing = false;
    for (Number n : docToOrd) {
      if (n.longValue() == -1) {
        anyMissing = true;
        break;
      }
    }

    boolean success = false;
    IndexOutput data = null;
    IndexOutput index = null;
    String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "dat");
    String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name + "_" + Integer.toString(field.number), segmentSuffix, "idx");

    try {
      data = dir.createOutput(dataName, state.context);
      index = dir.createOutput(indexName, state.context);
      if (minLength == maxLength && !anyMissing) {
        // fixed byte[]
        addFixedSortedBytesField(field, data, index, values, docToOrd, minLength);
      } else {
        // var byte[]
        // three cases for simulating the old writer:
        // 1. no missing
        // 2. missing (and empty string in use): remap ord=-1 -> ord=0
        // 3. missing (and empty string not in use): remap all ords +1, insert empty string into values
        if (!anyMissing) {
          addVarSortedBytesField(field, data, index, values, docToOrd);
        } else if (minLength == 0) {
          addVarSortedBytesField(field, data, index, values, MissingOrdRemapper.mapMissingToOrd0(docToOrd));
        } else {
          addVarSortedBytesField(field, data, index, MissingOrdRemapper.insertEmptyValue(values), MissingOrdRemapper.mapAllOrds(docToOrd));
        }
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, index);
      } else {
        IOUtils.closeWhileHandlingException(data, index);
      }
    }
  }

  /**
   * Writes BYTES_FIXED_SORTED: fixed length plus the (already sorted) unique
   * values in .dat; in .idx the value count followed by one packed ord per
   * document.
   */
  private void addFixedSortedBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values, Iterable<Number> docToOrd, int length) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_FIXED_SORTED.name());

    CodecUtil.writeHeader(data,
                          Lucene40DocValuesFormat.BYTES_FIXED_SORTED_CODEC_NAME_DAT,
                          Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_CURRENT);

    CodecUtil.writeHeader(index,
                          Lucene40DocValuesFormat.BYTES_FIXED_SORTED_CODEC_NAME_IDX,
                          Lucene40DocValuesFormat.BYTES_FIXED_SORTED_VERSION_CURRENT);

    /* values */
    data.writeInt(length);
    int valueCount = 0;
    for (BytesRef v : values) {
      data.writeBytes(v.bytes, v.offset, v.length);
      valueCount++;
    }

    /* ordinals */
    index.writeInt(valueCount);
    int maxDoc = state.segmentInfo.getDocCount();
    assert valueCount > 0;
    final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT);
    for (Number n : docToOrd) {
      w.add(n.longValue());
    }
    w.finish();
  }

  /**
   * Writes BYTES_VAR_SORTED: concatenated sorted values in .dat; in .idx the
   * total byte count, valueCount+1 packed start addresses (with end sentinel),
   * then one packed ord per document.
   */
  private void addVarSortedBytesField(FieldInfo field, IndexOutput data, IndexOutput index, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
    field.putAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_SORTED.name());

    CodecUtil.writeHeader(data,
                          Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_DAT,
                          Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);

    CodecUtil.writeHeader(index,
                          Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_IDX,
                          Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);

    /* values */
    final long startPos = data.getFilePointer();

    int valueCount = 0;
    for (BytesRef v : values) {
      data.writeBytes(v.bytes, v.offset, v.length);
      valueCount++;
    }

    /* addresses */
    final long maxAddress = data.getFilePointer() - startPos;
    index.writeLong(maxAddress);

    assert valueCount != Integer.MAX_VALUE; // unsupported by the 4.0 impl

    final PackedInts.Writer w = PackedInts.getWriter(index, valueCount+1, PackedInts.bitsRequired(maxAddress), PackedInts.DEFAULT);
    long currentPosition = 0;
    for (BytesRef v : values) {
      w.add(currentPosition);
      currentPosition += v.length;
    }
    // write sentinel
    assert currentPosition == maxAddress;
    w.add(currentPosition);
    w.finish();

    /* ordinals */
    final int maxDoc = state.segmentInfo.getDocCount();
    assert valueCount > 0;
    final PackedInts.Writer ords = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT);
    for (Number n : docToOrd) {
      ords.add(n.longValue());
    }
    ords.finish();
  }

  /** The 4.0 format predates SortedSet doc values; always throws. */
  @Override
  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
    throw new UnsupportedOperationException("Lucene 4.0 does not support SortedSet docvalues");
  }

  /** Closes the underlying compound file directory (flushing all outputs' container). */
  @Override
  public void close() throws IOException {
    dir.close();
  }
}