/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.index.codecs.siren10;

import java.io.IOException;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.codecs.block.BlockCompressor;
import org.sindice.siren.index.codecs.block.BlockIndexOutput;
import org.sindice.siren.util.ArrayUtils;

/**
 * Implementation of the {@link BlockIndexOutput} for the .nod file of the
 * SIREn postings format.
 */
public class NodBlockIndexOutput extends BlockIndexOutput {

  private final int maxBlockSize;
  private final BlockCompressor nodCompressor;

  public NodBlockIndexOutput(final IndexOutput out, final int maxBlockSize,
                             final BlockCompressor nodCompressor)
  throws IOException {
    super(out);
    this.nodCompressor = nodCompressor;
    this.maxBlockSize = maxBlockSize;
  }

  @Override
  public NodBlockWriter getBlockWriter() {
    return new NodBlockWriter();
  }

  /**
   * Implementation of the {@link BlockWriter} for the .nod file.
   *
   * <p>
   *
   * Encodes and writes blocks containing the node labels and term frequencies.
   *
   * <p>
   *
   * TODO: Can we reduce the number of buffer-size checks by using term
   * frequency information? At each new document, the block writer would be
   * informed of the term frequency and could check the buffer sizes once,
   * up front.
   */
  protected class NodBlockWriter extends BlockWriter {

    IntsRef nodLenBuffer;
    IntsRef nodBuffer;
    IntsRef termFreqBuffer;

    BytesRef nodLenCompressedBuffer;
    BytesRef nodCompressedBuffer;
    BytesRef termFreqCompressedBuffer;

    public NodBlockWriter() {
      // ensure that the input buffers have the minimum size required.
      // maxBlockSize is only used as a minimum initial capacity for the buffers.
      nodLenBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, nodCompressor.getWindowSize()));
      nodBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, nodCompressor.getWindowSize()));
      termFreqBuffer = new IntsRef(this.getMinimumBufferSize(maxBlockSize, nodCompressor.getWindowSize()));

      // initialise the compressed buffers
      nodLenCompressedBuffer = new BytesRef();
      nodCompressedBuffer = new BytesRef();
      termFreqCompressedBuffer = new BytesRef();
    }

    /**
     * Add a node label to the buffer.
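     *
     * <p>
     *
     * Illustrative example (the concrete values are an assumption, not part
     * of the original documentation): a label such as [0, 2, 1] would arrive
     * as an {@link IntsRef} whose {@code offset} and {@code length} delimit
     * those three identifiers in {@code ints}; the writer delta-encodes it
     * against the previously written label before appending it to the buffer.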
     */
    public void write(final IntsRef node) {
      final int nodeOffset = node.offset;
      final int nodeLength = node.length;
      final int[] nodeInts = node.ints;
      assert nodeLength > 0;

      /*
       * write node
       */

      int[] nodBufferInts = nodBuffer.ints;
      final int nodBufferOffset = nodBuffer.offset;

      // increase buffers if needed
      if (nodBufferOffset + nodeLength >= nodBufferInts.length) {
        // take the max to ensure that the buffer will be large enough
        int newLength = Math.max(nodBufferOffset + nodeLength, nodBufferInts.length * 3 / 2);
        // ensure that the buffer is large enough to accommodate the window size
        newLength = this.getMinimumBufferSize(newLength, nodCompressor.getWindowSize());
        ArrayUtils.growAndCopy(nodBuffer, newLength);
        // update the reference to the buffer's int array
        nodBufferInts = nodBuffer.ints;
      }

      // compute the delta encoding and copy it to the buffer
      this.deltaEncodingAndCopy(nodeInts, nodeOffset, nodeLength, nodBufferInts, nodBufferOffset);
      // cache the current node for the next delta encoding
      this.cacheNode(nodeInts, nodeOffset, nodeLength);
      // increment the node buffer offset by the node length
      nodBuffer.offset += nodeLength;

      /*
       * write node length
       */

      int[] nodLenBufferInts = nodLenBuffer.ints;

      // increase the node length buffer if needed
      if (nodLenBuffer.offset >= nodLenBufferInts.length) {
        // take the max to ensure that the buffer will be large enough
        int newLength = Math.max(nodLenBuffer.offset + 1, nodLenBufferInts.length * 3 / 2);
        // ensure that the buffer is large enough to accommodate the window size
        newLength = this.getMinimumBufferSize(newLength, nodCompressor.getWindowSize());
        ArrayUtils.growAndCopy(nodLenBuffer, newLength);
        // update the reference to the buffer's int array
        nodLenBufferInts = nodLenBuffer.ints;
      }

      // store the length decremented by one (a label has at least one identifier)
      nodLenBufferInts[nodLenBuffer.offset++] = nodeLength - 1;
    }

    private static final int DEFAULT_NODE_CACHE_SIZE = 16;

    /**
     * Node cache used for computing the delta of a node label.
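     *
     * <p>
     *
     * Worked example (illustrative): with a cached label [1, 2, 3] and an
     * incoming label [1, 2, 5, 1], {@link #deltaEncodingAndCopy} produces
     * [0, 0, 2, 1]: the two equal leading levels are encoded as 0, the first
     * strictly greater level is stored as the difference 5 - 3 = 2, and the
     * remaining identifiers are copied verbatim.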
     */
    private IntsRef nodeCache = new IntsRef(DEFAULT_NODE_CACHE_SIZE);

    /**
     * Cache the given node.
     */
    private final void cacheNode(final int[] nodeInts, final int nodeOffset,
                                 final int nodeLen) {
      int[] nodeCacheInts = nodeCache.ints;

      // ensure that the cache is large enough to accommodate the node array
      if (nodeLen > nodeCacheInts.length) {
        nodeCache = ArrayUtils.grow(nodeCache, nodeLen);
        // update the reference to the cache's int array
        nodeCacheInts = nodeCache.ints;
      }

      System.arraycopy(nodeInts, nodeOffset, nodeCacheInts, 0, nodeLen);
      nodeCache.offset = 0;
      nodeCache.length = nodeLen;
    }

    /**
     * Compute the delta of the new node based on the cached node, and copy
     * the delta encoding to the node buffer.
     */
    private final void deltaEncodingAndCopy(final int[] nodeInts,
                                            final int nodeOffset,
                                            final int nodeLen,
                                            final int[] nodeBufferInts,
                                            final int nodeBufferOffset) {
      final int nodEnd = nodeOffset + nodeLen;
      final int[] nodeCacheInts = nodeCache.ints;
      final int nodeCacheOffset = nodeCache.offset;
      final int nodeCacheEnd = nodeCacheOffset + nodeCache.length;

      int i, j, k;
      // iterate over the node levels
      for (i = nodeCacheOffset, j = nodeOffset, k = 0;
           i < nodeCacheEnd && j < nodEnd && nodeCacheInts[i] <= nodeInts[j];
           i++, j++, k++) {
        // if the previous node id is strictly less than the current node id,
        // then we must stop the delta encoding
        if (nodeCacheInts[i] < nodeInts[j]) {
          // compute the delta
          nodeBufferInts[nodeBufferOffset + k] = nodeInts[j] - nodeCacheInts[i];
          // increment to prepare the copy of the remaining ids
          j += 1;
          k += 1;
          // stop the iteration
          break;
        }
        // otherwise the ids are equal: encode a delta of 0 and move to the next level
        nodeBufferInts[nodeBufferOffset + k] = 0;
      }
      // copy the remaining ids verbatim
      for (; j < nodEnd; j++, k++) {
        nodeBufferInts[nodeBufferOffset + k] = nodeInts[j];
      }
    }

    /**
     * Add the term frequency within the current node to the buffer.
     */
    public void writeTermFreq(final int termFreq) {
      // check the size of the buffer and increase it if needed
      if (termFreqBuffer.offset >= termFreqBuffer.ints.length) {
        // take the max to ensure that the buffer will be large enough
        int newLength = Math.max(termFreqBuffer.offset + 1, termFreqBuffer.ints.length * 3 / 2);
        // ensure that the buffer is large enough to accommodate the window size
        newLength = this.getMinimumBufferSize(newLength, nodCompressor.getWindowSize());
        ArrayUtils.growAndCopy(termFreqBuffer, newLength);
      }
      // store the frequency decremented by one (a term occurs at least once)
      termFreqBuffer.ints[termFreqBuffer.offset++] = termFreq - 1;
    }

    @Override
    public boolean isEmpty() {
      return nodBuffer.offset == 0;
    }

    @Override
    public boolean isFull() {
      // this writer is never full: it is synchronised with the doc block and
      // grows on demand
      return false;
    }

    @Override
    protected void writeHeader() throws IOException {
      // logger.debug("Write Nod header: {}", this.hashCode());
      // logger.debug("Nod header start at {}", out.getFilePointer());

      // write the uncompressed block sizes
      out.writeVInt(nodLenBuffer.length);
      out.writeVInt(nodBuffer.length);
      out.writeVInt(termFreqBuffer.length);
      assert nodLenBuffer.length <= nodBuffer.length;

      // write the sizes of the compressed data blocks
      out.writeVInt(nodLenCompressedBuffer.length);
      out.writeVInt(nodCompressedBuffer.length);
      out.writeVInt(termFreqCompressedBuffer.length);
    }

    @Override
    protected void compress() {
      // flip the buffers before compression
      nodLenBuffer.length = nodLenBuffer.offset;
      nodLenBuffer.offset = 0;
      nodBuffer.length = nodBuffer.offset;
      nodBuffer.offset = 0;
      termFreqBuffer.length = termFreqBuffer.offset;
      termFreqBuffer.offset = 0;
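
      // After the flip, each buffer exposes the range [0, length) to the
      // compressor: 'length' now holds the number of ints written and
      // 'offset' is reset so that compression starts at the beginning of the
      // array (the same convention as java.nio.Buffer#flip()).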
      // determine the maximum size of the compressed buffers to avoid overflow
      int size = nodCompressor.maxCompressedSize(nodLenBuffer.length);
      nodLenCompressedBuffer = ArrayUtils.grow(nodLenCompressedBuffer, size);

      size = nodCompressor.maxCompressedSize(nodBuffer.length);
      nodCompressedBuffer = ArrayUtils.grow(nodCompressedBuffer, size);

      size = nodCompressor.maxCompressedSize(termFreqBuffer.length);
      termFreqCompressedBuffer = ArrayUtils.grow(termFreqCompressedBuffer, size);

      // compress
      nodCompressor.compress(nodLenBuffer, nodLenCompressedBuffer);
      nodCompressor.compress(nodBuffer, nodCompressedBuffer);
      nodCompressor.compress(termFreqBuffer, termFreqCompressedBuffer);
    }

    @Override
    protected void writeData() throws IOException {
      // logger.debug("Write Node data: {}", this.hashCode());
      // logger.debug("Write Node Length at {}", out.getFilePointer());
      out.writeBytes(nodLenCompressedBuffer.bytes, nodLenCompressedBuffer.length);
      // logger.debug("Write Node at {}", out.getFilePointer());
      out.writeBytes(nodCompressedBuffer.bytes, nodCompressedBuffer.length);
      // logger.debug("Write Term Freq in Node at {}", out.getFilePointer());
      out.writeBytes(termFreqCompressedBuffer.bytes, termFreqCompressedBuffer.length);
    }

    @Override
    protected void initBlock() {
      nodLenBuffer.offset = nodLenBuffer.length = 0;
      nodBuffer.offset = nodBuffer.length = 0;
      termFreqBuffer.offset = termFreqBuffer.length = 0;
      this.resetCurrentNode();
    }

    protected void resetCurrentNode() {
      nodeCache.offset = 0;
      nodeCache.length = 0;
    }

  }

}
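
/*
 * Usage sketch (illustrative; the call order below is an assumption based on
 * this file, not a verbatim excerpt of the SIREn codebase). The parent
 * BlockWriter is expected to drive one block at a time, invoking compress(),
 * then writeHeader(), then writeData() on flush so that the header describing
 * the block sizes precedes the compressed data it describes:
 *
 *   final NodBlockIndexOutput nodOut = ...;     // set up elsewhere
 *   final NodBlockWriter writer = nodOut.getBlockWriter();
 *   writer.initBlock();                         // reset buffers and node cache
 *   writer.write(nodeLabel);                    // one call per node label
 *   writer.writeTermFreq(freq);                 // term frequency within that node
 *   writer.flush();                             // assumed to compress and write the block
 */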