package com.yahoo.glimmer.util;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrayBigList;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongIterable;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.Serializable;
public class BlockOffsets implements Serializable {
private static final long serialVersionUID = 6997859749849192991L;
private static final int BZIP2_FOOTER_LENGTH = 6 * 8 + 32; // 6 Stream end
// bytes + 32 bit
// CRC
private final LongBigList firstDocIds;
private final LongBigList blockStartBitOffsets;
private final long docCount;
private final long lastDocId;
private final long fileSizeInBits;
public BlockOffsets(LongIterable firstDocIds, LongIterable blockStartBitOffsets, long docCount, long lastDocId, long fileSizeInBits) {
this.firstDocIds = new EliasFanoMonotoneLongBigList(firstDocIds);
this.blockStartBitOffsets = new EliasFanoMonotoneLongBigList(blockStartBitOffsets);
if (this.firstDocIds.size64() != this.blockStartBitOffsets.size64()) {
throw new IllegalArgumentException("Number of block starts differs from number of first doc ids.");
}
if (docCount - 1 > lastDocId) {
throw new IllegalArgumentException("docCount(" + docCount + ") - 1 is greater than the lastDocId(" + lastDocId + ")");
}
this.docCount = docCount;
this.lastDocId = lastDocId;
this.fileSizeInBits = fileSizeInBits;
}
public long getBlockStartBitOffset(long index) throws IOException {
if (index < blockStartBitOffsets.size()) {
return blockStartBitOffsets.getLong(index);
} else if (index == blockStartBitOffsets.size()) {
return fileSizeInBits - BZIP2_FOOTER_LENGTH;
}
throw new IndexOutOfBoundsException("index (" + index + ") > block count(" + getBlockCount() + ")");
}
public long getBlockIndex(long docId) {
long index = longBigListBinarySearch(firstDocIds, docId);
if (index < 0) {
index = -index - 2;
}
if (index > 0) {
// If a record spans multiple blocks the value at index in
// firstDocIds will be repeated.
// Find the first element equal to the value at firstDocIds[index].
long value = firstDocIds.getLong(index);
index--;
while (index >= 0 && firstDocIds.getLong(index) == value) {
index--;
}
index++;
}
return index;
}
public long getLastDocId() {
return lastDocId;
}
public long getDocCount() {
return docCount;
}
public long getFileSizeInBits() {
return fileSizeInBits;
}
public long getBlockCount() {
return blockStartBitOffsets.size64();
}
// Surprisingly there doesn't seem to be a binary search method on
// LongBigLists in fastutil..
private static long longBigListBinarySearch(final LongBigList list, final long key) {
long midVal;
long from = 0;
long to = list.size64() - 1;
while (from <= to) {
final long mid = (from + to) >>> 1;
midVal = list.getLong(mid);
if (midVal < key) {
from = mid + 1;
} else if (midVal > key) {
to = mid - 1;
} else {
return mid;
}
}
return -(from + 1);
}
public void printTo(PrintStream ps) {
ps.println("Doc count:" + docCount);
ps.println("Last doc ID:" + lastDocId);
ps.println("Block count:" + getBlockCount());
ps.println("Bz2 file size (bits):" + fileSizeInBits);
ps.println("BlockIndex FirstDoc BlockStart");
for (long i = 0; i < firstDocIds.size64(); i++) {
ps.printf("%10d %16d %16d\n", i, firstDocIds.get(i), blockStartBitOffsets.get(i));
}
}
public void save(OutputStream outputStream) throws IOException {
BinIO.storeObject(this, outputStream);
}
public static class Builder {
private final LongBigArrayBigList firstDocIds = new LongBigArrayBigList();
private final LongBigArrayBigList blockStartBitOffsets = new LongBigArrayBigList();
private long totalBits;
public void setBlockStart(long blockStartBitOffset, long docId) {
blockStartBitOffsets.add(blockStartBitOffset);
firstDocIds.add(docId);
}
public void close(long totalBits) {
this.totalBits = totalBits;
}
public BlockOffsets build(long docCount, long lastDocId) {
if (totalBits == -1) {
throw new IllegalStateException("close() wasn't called!");
}
return new BlockOffsets(firstDocIds, blockStartBitOffsets, docCount, lastDocId, totalBits);
}
}
}