package org.apache.lucene.codecs.lucene40.values; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.codecs.lucene40.values.Bytes.BytesReaderBase; import org.apache.lucene.codecs.lucene40.values.Bytes.BytesSourceBase; import org.apache.lucene.codecs.lucene40.values.Bytes.DerefBytesWriterBase; import org.apache.lucene.index.DocValues.Type; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.packed.PackedInts; // Stores variable-length byte[] by deref, ie when two docs // have the same value, they store only 1 byte[] and both // docs reference that single source /** * @lucene.experimental */ class VarDerefBytesImpl { static final String CODEC_NAME_IDX = "VarDerefBytesIdx"; static final String CODEC_NAME_DAT = "VarDerefBytesDat"; static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; /* * TODO: if impls like this are merged we are bound to the amount of memory we * can store into a BytesRefHash and therefore how much memory a ByteBlockPool * can address. This is currently limited to 2GB. While we could extend that * and use 64bit for addressing this still limits us to the existing main * memory as all distinct bytes will be loaded up into main memory. We could * move the byte[] writing to #finish(int) and store the bytes in sorted * order and merge them in a streamed fashion. */ static class Writer extends DerefBytesWriterBase { public Writer(Directory dir, String id, Counter bytesUsed, IOContext context) { super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_CURRENT, bytesUsed, context, Type.BYTES_VAR_DEREF); size = 0; } @Override protected void checkSize(BytesRef bytes) { // allow var bytes sizes } // Important that we get docCount, in case there were // some last docs that we didn't see @Override public void finishInternal(int docCount) throws IOException { fillDefault(docCount); final int size = hash.size(); final long[] addresses = new long[size]; final IndexOutput datOut = getOrCreateDataOut(); int addr = 0; final BytesRef bytesRef = new BytesRef(); for (int i = 0; i < size; i++) { hash.get(i, bytesRef); addresses[i] = addr; addr += writePrefixLength(datOut, bytesRef) + bytesRef.length; datOut.writeBytes(bytesRef.bytes, bytesRef.offset, bytesRef.length); } final IndexOutput idxOut = getOrCreateIndexOut(); // write the max address to read directly on source load idxOut.writeLong(addr); writeIndex(idxOut, docCount, addresses[addresses.length-1], addresses, docToEntry); } } public static class VarDerefReader extends BytesReaderBase { private final long totalBytes; VarDerefReader(Directory dir, String id, int maxDoc, IOContext context) throws IOException { super(dir, id, CODEC_NAME_IDX, CODEC_NAME_DAT, VERSION_START, true, context, Type.BYTES_VAR_DEREF); totalBytes = idxIn.readLong(); } @Override public Source load() throws IOException { return new VarDerefSource(cloneData(), cloneIndex(), totalBytes); } @Override public Source getDirectSource() throws IOException { return new DirectVarDerefSource(cloneData(), cloneIndex(), getType()); } } final static class VarDerefSource extends BytesSourceBase { private final PackedInts.Reader addresses; public VarDerefSource(IndexInput datIn, IndexInput idxIn, long totalBytes) throws IOException { super(datIn, idxIn, new PagedBytes(PAGED_BYTES_BITS), totalBytes, Type.BYTES_VAR_DEREF); addresses = PackedInts.getReader(idxIn); } @Override public BytesRef getBytes(int docID, BytesRef bytesRef) { return data.fillSliceWithPrefix(bytesRef, addresses.get(docID)); } } final static class DirectVarDerefSource extends DirectSource { private final PackedInts.Reader index; DirectVarDerefSource(IndexInput data, IndexInput index, Type type) throws IOException { super(data, type); this.index = PackedInts.getDirectReader(index); } @Override protected int position(int docID) throws IOException { data.seek(baseOffset + index.get(docID)); final byte sizeByte = data.readByte(); if ((sizeByte & 128) == 0) { // length is 1 byte return sizeByte; } else { return ((sizeByte & 0x7f) << 8) | ((data.readByte() & 0xff)); } } } }