package org.apache.lucene.codecs.lucene40.values;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Base class for specific Bytes Reader/Writer implementations */
import java.io.IOException;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.DocValues.SortedSource;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.TrackingDirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Provides concrete Writer/Reader implementations for <tt>byte[]</tt> value per
 * document. There are 6 package-private default implementations of this, for
 * all combinations of {@link Mode#DEREF}/{@link Mode#STRAIGHT} x fixed-length/variable-length.
 *
 * <p>
 * NOTE: Currently the total amount of byte[] data stored (across a single
 * segment) cannot exceed 2GB.
 * </p>
 * <p>
 * NOTE: Each byte[] must be <= 32768 bytes in length
 * </p>
 *
 * @lucene.experimental
 */
public final class Bytes {

  // file-name suffix shared by all doc-values data/index files written here
  static final String DV_SEGMENT_SUFFIX = "dv";

  // TODO - add bulk copy where possible

  // static factory holder only — never instantiated
  private Bytes() { /* don't instantiate! */ }

  /**
   * Defines the {@link Writer}s store mode. The writer will either store the
   * bytes sequentially ({@link #STRAIGHT}, dereferenced ({@link #DEREF}) or
   * sorted ({@link #SORTED})
   *
   * @lucene.experimental
   */
  public static enum Mode {
    /**
     * Mode for sequentially stored bytes
     */
    STRAIGHT,
    /**
     * Mode for dereferenced stored bytes
     */
    DEREF,
    /**
     * Mode for sorted stored bytes
     */
    SORTED
  };

  /**
   * Creates a new <tt>byte[]</tt> {@link Writer} instances for the given
   * directory.
   *
   * @param dir
   *          the directory to write the values to
   * @param id
   *          the id used to create a unique file name. Usually composed out of
   *          the segment name and a unique id per segment.
   * @param mode
   *          the writers store mode
   * @param fixedSize
   *          <code>true</code> if all bytes subsequently passed to the
   *          {@link Writer} will have the same length
   * @param sortComparator {@link BytesRef} comparator used by sorted variants.
   *          If <code>null</code> {@link BytesRef#getUTF8SortedAsUnicodeComparator()}
   *          is used instead
   * @param bytesUsed
   *          an {@link AtomicLong} instance to track the used bytes within the
   *          {@link Writer}.
A call to {@link Writer#finish(int)} will release * all internally used resources and frees the memory tracking * reference. * @param acceptableOverheadRatio * how to trade space for speed. This option is only applicable for * docvalues of type {@link Type#BYTES_FIXED_SORTED} and * {@link Type#BYTES_VAR_SORTED}. * @param context I/O Context * @return a new {@link Writer} instance * @see PackedInts#getReader(org.apache.lucene.store.DataInput) */ public static DocValuesConsumer getWriter(Directory dir, String id, Mode mode, boolean fixedSize, Comparator<BytesRef> sortComparator, Counter bytesUsed, IOContext context, float acceptableOverheadRatio) { // TODO -- i shouldn't have to specify fixed? can // track itself & do the write thing at write time? if (sortComparator == null) { sortComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); } if (fixedSize) { if (mode == Mode.STRAIGHT) { return new FixedStraightBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.DEREF) { return new FixedDerefBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.SORTED) { return new FixedSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); } } else { if (mode == Mode.STRAIGHT) { return new VarStraightBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.DEREF) { return new VarDerefBytesImpl.Writer(dir, id, bytesUsed, context); } else if (mode == Mode.SORTED) { return new VarSortedBytesImpl.Writer(dir, id, sortComparator, bytesUsed, context, acceptableOverheadRatio); } } throw new IllegalArgumentException(""); } /** * Creates a new {@link DocValues} instance that provides either memory * resident or iterative access to a per-document stored <tt>byte[]</tt> * value. The returned {@link DocValues} instance will be initialized without * consuming a significant amount of memory. * * @param dir * the directory to load the {@link DocValues} from. 
* @param id * the file ID in the {@link Directory} to load the values from. * @param mode * the mode used to store the values * @param fixedSize * <code>true</code> iff the values are stored with fixed-size, * otherwise <code>false</code> * @param maxDoc * the number of document values stored for the given ID * @param sortComparator {@link BytesRef} comparator used by sorted variants. * If <code>null</code> {@link BytesRef#getUTF8SortedAsUnicodeComparator()} * is used instead * @return an initialized {@link DocValues} instance. * @throws IOException * if an {@link IOException} occurs */ public static DocValues getValues(Directory dir, String id, Mode mode, boolean fixedSize, int maxDoc, Comparator<BytesRef> sortComparator, IOContext context) throws IOException { if (sortComparator == null) { sortComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); } // TODO -- I can peek @ header to determing fixed/mode? if (fixedSize) { if (mode == Mode.STRAIGHT) { return new FixedStraightBytesImpl.FixedStraightReader(dir, id, maxDoc, context); } else if (mode == Mode.DEREF) { return new FixedDerefBytesImpl.FixedDerefReader(dir, id, maxDoc, context); } else if (mode == Mode.SORTED) { return new FixedSortedBytesImpl.Reader(dir, id, maxDoc, context, Type.BYTES_FIXED_SORTED, sortComparator); } } else { if (mode == Mode.STRAIGHT) { return new VarStraightBytesImpl.VarStraightReader(dir, id, maxDoc, context); } else if (mode == Mode.DEREF) { return new VarDerefBytesImpl.VarDerefReader(dir, id, maxDoc, context); } else if (mode == Mode.SORTED) { return new VarSortedBytesImpl.Reader(dir, id, maxDoc,context, Type.BYTES_VAR_SORTED, sortComparator); } } throw new IllegalArgumentException("Illegal Mode: " + mode); } // TODO open up this API? 
  /**
   * Base class for memory-resident {@link Source} implementations: the
   * constructor copies {@code bytesToRead} bytes from the data input into a
   * {@link PagedBytes} instance and freezes it for random access.
   */
  static abstract class BytesSourceBase extends Source {
    private final PagedBytes pagedBytes;
    protected final IndexInput datIn;
    // may be null — NOTE(review): only some variants appear to use an index file
    protected final IndexInput idxIn;
    protected final static int PAGED_BYTES_BITS = 15;
    // frozen, random-access view over the copied bytes
    protected final PagedBytes.Reader data;
    protected final long totalLengthInBytes;

    protected BytesSourceBase(IndexInput datIn, IndexInput idxIn,
        PagedBytes pagedBytes, long bytesToRead, Type type) throws IOException {
      super(type);
      assert bytesToRead <= datIn.length() : " file size is less than the expected size diff: "
          + (bytesToRead - datIn.length()) + " pos: " + datIn.getFilePointer();
      this.datIn = datIn;
      this.totalLengthInBytes = bytesToRead;
      this.pagedBytes = pagedBytes;
      // read everything up-front; freeze(true) makes the data immutable/readable
      this.pagedBytes.copy(datIn, bytesToRead);
      data = pagedBytes.freeze(true);
      this.idxIn = idxIn;
    }
  }

  // TODO: open up this API?!
  /**
   * Base class for the Writer side: lazily creates a data file and (optionally)
   * an index file, writing a codec header into each on creation.
   */
  static abstract class BytesWriterBase extends Writer {
    private final String id;
    private IndexOutput idxOut;   // created lazily by getOrCreateIndexOut()
    private IndexOutput datOut;   // created lazily by getOrCreateDataOut()
    protected BytesRef bytesRef = new BytesRef();
    private final Directory dir;
    private final String codecNameIdx;
    private final String codecNameDat;
    private final int version;
    private final IOContext context;

    protected BytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int version, Counter bytesUsed, IOContext context,
        Type type) {
      super(bytesUsed, type);
      this.id = id;
      this.dir = dir;
      this.codecNameIdx = codecNameIdx;
      this.codecNameDat = codecNameDat;
      this.version = version;
      this.context = context;
      // at least one of the two codec names must be given, and when both are
      // given they must differ (they name distinct files)
      assert codecNameDat != null || codecNameIdx != null : "both codec names are null";
      assert (codecNameDat != null && !codecNameDat.equals(codecNameIdx))
          || (codecNameIdx != null && !codecNameIdx.equals(codecNameDat)) :
          "index and data codec names must not be equal";
    }

    /**
     * Returns the data output, creating the ".dat"-side file and writing its
     * codec header on first call. On failure the partially-created output is
     * closed while suppressing secondary exceptions.
     */
    protected IndexOutput getOrCreateDataOut() throws IOException {
      if (datOut == null) {
        boolean success = false;
        assert codecNameDat != null;
        try {
          datOut = dir.createOutput(IndexFileNames.segmentFileName(id,
              DV_SEGMENT_SUFFIX, DocValuesWriterBase.DATA_EXTENSION), context);
          CodecUtil.writeHeader(datOut, codecNameDat, version);
          success = true;
        } finally {
          if (!success) {
            IOUtils.closeWhileHandlingException(datOut);
          }
        }
      }
      return datOut;
    }

    /** Returns the index output, or null if not yet created. */
    protected IndexOutput getIndexOut() {
      return idxOut;
    }

    /** Returns the data output, or null if not yet created. */
    protected IndexOutput getDataOut() {
      return datOut;
    }

    /**
     * Returns the index output, creating the index-side file and writing its
     * codec header on first call (lazy counterpart of {@link #getOrCreateDataOut}).
     */
    protected IndexOutput getOrCreateIndexOut() throws IOException {
      boolean success = false;
      try {
        if (idxOut == null) {
          assert codecNameIdx != null;
          idxOut = dir.createOutput(IndexFileNames.segmentFileName(id,
              DV_SEGMENT_SUFFIX, DocValuesWriterBase.INDEX_EXTENSION), context);
          CodecUtil.writeHeader(idxOut, codecNameIdx, version);
        }
        success = true;
      } finally {
        if (!success) {
          IOUtils.closeWhileHandlingException(idxOut);
        }
      }
      return idxOut;
    }

    @Override
    public abstract void finish(int docCount) throws IOException;
  }

  /**
   * Opens all necessary files, but does not read any data in until you call
   * {@link #load}.
   */
  static abstract class BytesReaderBase extends DocValues {
    protected final IndexInput idxIn;  // null unless doIndex was true
    protected final IndexInput datIn;
    protected final int version;       // header version read from the data file
    protected final String id;
    protected final Type type;

    protected BytesReaderBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int maxVersion, boolean doIndex, IOContext context,
        Type type) throws IOException {
      // open into locals first so the finally block can clean up on failure
      IndexInput dataIn = null;
      IndexInput indexIn = null;
      boolean success = false;
      try {
        dataIn = dir.openInput(IndexFileNames.segmentFileName(id,
            DV_SEGMENT_SUFFIX, DocValuesWriterBase.DATA_EXTENSION), context);
        version = CodecUtil.checkHeader(dataIn, codecNameDat, maxVersion, maxVersion);
        if (doIndex) {
          indexIn = dir.openInput(IndexFileNames.segmentFileName(id,
              DV_SEGMENT_SUFFIX, DocValuesWriterBase.INDEX_EXTENSION), context);
          final int version2 = CodecUtil.checkHeader(indexIn, codecNameIdx,
              maxVersion, maxVersion);
          // data and index files must have been written with the same version
          assert version == version2;
        }
        success = true;
      } finally {
        if (!success) {
          IOUtils.closeWhileHandlingException(dataIn, indexIn);
        }
      }
      datIn = dataIn;
      idxIn = indexIn;
      this.type = type;
      this.id = id;
    }

    /**
     * clones and returns the data {@link IndexInput}
     */
    protected final IndexInput cloneData() {
      assert datIn != null;
      return datIn.clone();
    }

    /**
     * clones and returns the indexing {@link IndexInput}
     */
    protected final IndexInput cloneIndex() {
      assert idxIn != null;
      return idxIn.clone();
    }

    @Override
    public void close() throws IOException {
      try {
        super.close();
      } finally {
        // close both inputs even if super.close() throws
        IOUtils.close(datIn, idxIn);
      }
    }

    @Override
    public Type getType() {
      return type;
    }
  }

  /**
   * Writer base for the dereferenced variants: each distinct byte[] value is
   * stored once in a {@link BytesRefHash} and documents map to its ord via
   * {@code docToEntry}.
   */
  static abstract class DerefBytesWriterBase extends BytesWriterBase {
    protected int size = -1;        // fixed value length; -1 until first value seen
    protected int lastDocId = -1;   // highest docID passed to add() so far
    protected int[] docToEntry;     // docID -> ord in hash
    protected final BytesRefHash hash;
    protected final float acceptableOverheadRatio;
    protected long maxBytes = 0;    // total bytes of distinct values added

    protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int codecVersion, Counter bytesUsed,
        IOContext context, Type type) {
      this(dir, id, codecNameIdx, codecNameDat, codecVersion,
          new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed),
          bytesUsed, context, PackedInts.DEFAULT, type);
    }

    protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int codecVersion, Counter bytesUsed,
        IOContext context, float acceptableOverheadRatio, Type type) {
      this(dir, id, codecNameIdx, codecNameDat, codecVersion,
          new DirectTrackingAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, bytesUsed),
          bytesUsed, context, acceptableOverheadRatio, type);
    }

    protected DerefBytesWriterBase(Directory dir, String id, String codecNameIdx,
        String codecNameDat, int codecVersion, Allocator allocator,
        Counter bytesUsed, IOContext context, float acceptableOverheadRatio,
        Type type) {
      super(dir, id, codecNameIdx, codecNameDat, codecVersion, bytesUsed, context, type);
      // hash pool and start array both report memory into bytesUsed
      hash = new BytesRefHash(new ByteBlockPool(allocator),
          BytesRefHash.DEFAULT_CAPACITY, new TrackingDirectBytesStartArray(
              BytesRefHash.DEFAULT_CAPACITY, bytesUsed));
      docToEntry = new int[1];
      // account for the initial 1-slot docToEntry array
      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT);
      this.acceptableOverheadRatio = acceptableOverheadRatio;
    }

    /**
     * Writes the length prefix for {@code bytes}: one byte when length < 128,
     * otherwise two bytes with the high bit of the first byte set. Returns the
     * number of header bytes written (1 or 2).
     */
    protected static int writePrefixLength(DataOutput datOut, BytesRef bytes)
        throws IOException {
      if (bytes.length < 128) {
        datOut.writeByte((byte) bytes.length);
        return 1;
      } else {
        datOut.writeByte((byte) (0x80 | (bytes.length >> 8)));
        datOut.writeByte((byte) (bytes.length & 0xff));
        return 2;
      }
    }

    @Override
    public void add(int docID, IndexableField value) throws IOException {
      BytesRef bytes = value.binaryValue();
      assert bytes != null;
      if (bytes.length == 0) { // default value - skip it
        return;
      }
      checkSize(bytes);
      // assign default ords to any docs skipped between lastDocId and docID
      fillDefault(docID);
      int ord = hash.add(bytes);
      if (ord < 0) {
        // BytesRefHash returns -(ord+1) for values already present
        ord = (-ord) - 1;
      } else {
        // first time we see this value — count its bytes
        maxBytes += bytes.length;
      }
      docToEntry[docID] = ord;
      lastDocId = docID;
    }

    /**
     * Grows {@code docToEntry} to hold {@code docID} and fills the gap
     * (lastDocId+1 .. docID-1) with the ord of the default (all-zero,
     * fixed-size) value.
     */
    protected void fillDefault(int docID) {
      if (docID >= docToEntry.length) {
        // note: this local shadows the field 'size' below intentionally? —
        // NOTE(review): the shadowing looks accidental but is harmless here
        final int size = docToEntry.length;
        docToEntry = ArrayUtil.grow(docToEntry, 1 + docID);
        bytesUsed.addAndGet((docToEntry.length - size)
            * RamUsageEstimator.NUM_BYTES_INT);
      }
      assert size >= 0;
      // default value: 'size' zero bytes (checkSize ran before we get here)
      BytesRef ref = new BytesRef(size);
      ref.length = size;
      int ord = hash.add(ref);
      if (ord < 0) {
        ord = (-ord) - 1;
      }
      for (int i = lastDocId+1; i < docID; i++) {
        docToEntry[i] = ord;
      }
    }

    /**
     * Records the value length on first call and rejects any later value whose
     * length differs (fixed-size enforcement).
     */
    protected void checkSize(BytesRef bytes) {
      if (size == -1) {
        size = bytes.length;
      } else if (bytes.length != size) {
        throw new IllegalArgumentException("expected bytes size=" + size
            + " but got " + bytes.length);
      }
    }

    /** Returns the fixed value length, or -1 if no value has been added yet. */
    public int getValueSize() {
      return size;
    }

    // Important that we get docCount, in case there were
    // some last docs that we didn't see
    @Override
    public void finish(int docCount) throws IOException {
      boolean success = false;
      try {
        finishInternal(docCount);
        success = true;
      } finally {
        // always release in-memory state; close outputs normally on success,
        // suppressing secondary exceptions on failure
        releaseResources();
        if (success) {
          IOUtils.close(getIndexOut(), getDataOut());
        } else {
          IOUtils.closeWhileHandlingException(getIndexOut(), getDataOut());
        }
      }
    }

    // subclass hook: write remaining data/index content for docCount docs
    protected abstract void finishInternal(int docCount) throws IOException;

    /** Frees the hash and docToEntry array, returning their memory to bytesUsed. */
    protected void releaseResources() {
      hash.close();
      bytesUsed.addAndGet((-docToEntry.length)
          * RamUsageEstimator.NUM_BYTES_INT);
      docToEntry = null;
    }

    protected void writeIndex(IndexOutput idxOut, int docCount, long maxValue,
        int[] toEntry) throws IOException {
      writeIndex(idxOut, docCount, maxValue, (int[])null, toEntry);
    }

    /**
     * Writes a packed-ints doc->value index. When {@code addresses} is non-null
     * each doc's entry is mapped through it; docs beyond the last added doc
     * get value 0.
     */
    protected void writeIndex(IndexOutput idxOut, int docCount, long maxValue,
        int[] addresses, int[] toEntry) throws IOException {
      final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount,
          PackedInts.bitsRequired(maxValue), acceptableOverheadRatio);
      final int limit = docCount > docToEntry.length ? docToEntry.length
          : docCount;
      // NOTE(review): 'limit - 1' (not 'limit') — presumably tolerates a
      // trailing unset slot; confirm against callers
      assert toEntry.length >= limit -1;
      if (addresses != null) {
        for (int i = 0; i < limit; i++) {
          assert addresses[toEntry[i]] >= 0;
          w.add(addresses[toEntry[i]]);
        }
      } else {
        for (int i = 0; i < limit; i++) {
          assert toEntry[i] >= 0;
          w.add(toEntry[i]);
        }
      }
      // pad docs we never saw with 0
      for (int i = limit; i < docCount; i++) {
        w.add(0);
      }
      w.finish();
    }

    /**
     * Same as {@link #writeIndex(IndexOutput, int, long, int[], int[])} but
     * with long-valued addresses.
     */
    protected void writeIndex(IndexOutput idxOut, int docCount, long maxValue,
        long[] addresses, int[] toEntry) throws IOException {
      final PackedInts.Writer w = PackedInts.getWriter(idxOut, docCount,
          PackedInts.bitsRequired(maxValue), acceptableOverheadRatio);
      final int limit = docCount > docToEntry.length ? docToEntry.length
          : docCount;
      assert toEntry.length >= limit -1;
      if (addresses != null) {
        for (int i = 0; i < limit; i++) {
          assert addresses[toEntry[i]] >= 0;
          w.add(addresses[toEntry[i]]);
        }
      } else {
        for (int i = 0; i < limit; i++) {
          assert toEntry[i] >= 0;
          w.add(toEntry[i]);
        }
      }
      for (int i = limit; i < docCount; i++) {
        w.add(0);
      }
      w.finish();
    }
  }

  /**
   * Base class for memory-resident {@link SortedSource} implementations: loads
   * the data bytes into a frozen {@link PagedBytes} and reads the packed
   * doc->ord (and optional ord->offset) tables from the index input.
   */
  static abstract class BytesSortedSourceBase extends SortedSource {
    private final PagedBytes pagedBytes;
    protected final PackedInts.Reader docToOrdIndex;
    // null when the variant has no per-ord offsets
    protected final PackedInts.Reader ordToOffsetIndex;
    protected final IndexInput datIn;
    protected final IndexInput idxIn;
    protected final BytesRef defaultValue = new BytesRef();
    protected final static int PAGED_BYTES_BITS = 15;
    protected final PagedBytes.Reader data;

    protected BytesSortedSourceBase(IndexInput datIn, IndexInput idxIn,
        Comparator<BytesRef> comp, long bytesToRead, Type type, boolean hasOffsets)
        throws IOException {
      this(datIn, idxIn, comp, new PagedBytes(PAGED_BYTES_BITS), bytesToRead,
          type, hasOffsets);
    }

    protected BytesSortedSourceBase(IndexInput datIn, IndexInput idxIn,
        Comparator<BytesRef> comp, PagedBytes pagedBytes, long bytesToRead,
        Type type, boolean hasOffsets) throws IOException {
      super(type, comp);
      assert bytesToRead <= datIn.length() : " file size is less than the expected size diff: "
          + (bytesToRead - datIn.length()) + " pos: " + datIn.getFilePointer();
      this.datIn = datIn;
      this.pagedBytes = pagedBytes;
      this.pagedBytes.copy(datIn, bytesToRead);
      data = pagedBytes.freeze(true);
      this.idxIn = idxIn;
      // on-disk order: offsets table (if present) precedes doc->ord table
      ordToOffsetIndex = hasOffsets ? PackedInts.getReader(idxIn) : null;
      docToOrdIndex = PackedInts.getReader(idxIn);
    }

    @Override
    public boolean hasPackedDocToOrd() {
      return true;
    }

    @Override
    public PackedInts.Reader getDocToOrd() {
      return docToOrdIndex;
    }

    @Override
    public int ord(int docID) {
      assert docToOrdIndex.get(docID) < getValueCount();
      return (int) docToOrdIndex.get(docID);
    }

    protected void closeIndexInput() throws IOException {
      IOUtils.close(datIn, idxIn);
    }
  }
}