/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.rwstore.sector;
import java.util.ArrayList;
import org.apache.log4j.Logger;
import com.bigdata.rwstore.IWriteCacheManager;
/**
* The SectorAllocator is designed as an alternative to the standard RWStore
* FixedAllocators.
*
* The idea of the SectorAllocator is to efficiently contain within a single
* region as dense a usage as possible. Since a SectorAllocator is able to
* allocate a full range of slot sizes, it should be able to service several
* thousand allocations and maximise disk locality on write.
*
* Furthermore, it presents an option to be synced with the backing store -
* similarly to a MappedFile, in which case a single write for the entire
* sector could be made for update.
*
* What we do not want is to run out of bits and to leave significant unused
* space in the sector. This could happen if we primarily allocated small
* slots - say on average 512 bytes. In this case, the maximum 1636 entries
* would map 1636 * 32 * 512 bytes => only just over 26M, so a 64M sector
* is massively wasteful. The solution is to increment the sector reserve
* as required for each tag, say by a minimum of 256K while ensuring always less
* than 64M. Bit waste in the allocator - where the sector memory is allocated
* with far fewer bits than available is less of an issue, although it does
* impact on the total number of allocations available. The absolute maximum
* size of an RWStore is derived from the maximum sector size * the number
* of sectors. An 8K sector allows for 32K bits, which only requires
* 15 unsigned bits, leaving 17 bits (including the sign) for the sector index = 64K of
* sectors. Implying a maximum addressable store file of 64M * 64K,
* = 4TB of full sectors. If the average sector only requires 32M, then the
* total store would be reduced appropriately.
*
* The maximum theoretical storage is yielded by MAX_INT * AVG_SLOT_SIZE, so
* 2GB * 2K (avg) would equate to the optimal maximum addressable allocations
* and file size. An AVG of > 2K yields fewer allocations and an AVG of < 2K
* a reduced file size.
*
* TODO: add parameterisation of META_SIZE for exploitation by MemoryManager.
* TODO: cache block starts in m_addresses to simplify/optimise bit2Offset
*
* When a new SectorAllocator is at the head of the free list, a store
* such as the RWSectorStore can use an in-memory buffer to write the data
* to - sized to the full size of the sector. This can be written in a single
* write to the WriteCacheService.
*
* @author Martyn Cutcher
*
*/
public class SectorAllocator implements Comparable<SectorAllocator> {

    private static final Logger log = Logger.getLogger(SectorAllocator.class);

    /**
     * Returns an {@code int} with the low {@code bits} bits set, e.g.
     * {@code getBitMask(4) == 0xF}. The additive loop is correct for the
     * full range 0..32 (a naive {@code (1 << bits) - 1} would fail for 32).
     */
    static final int getBitMask(final int bits) {
        int ret = 0;
        for (int i = 0; i < bits; i++) {
            ret += 1 << i;
        }
        return ret;
    }

    /** Number of high bits of a latched address used for the sector index. */
    static final int SECTOR_INDEX_BITS = 16;

    /** Remaining low bits address a bit (slot) within the sector. */
    static final int SECTOR_OFFSET_BITS = 32 - SECTOR_INDEX_BITS;

    /** Mask extracting the in-sector bit offset from a negated address. */
    static final int SECTOR_OFFSET_MASK = getBitMask(SECTOR_OFFSET_BITS);

    /** Size of the allocator metadata region: 8K. */
    static final int META_SIZE = 8192;

    /** Maximum size of the region managed by one allocator: 64M. */
    final static int SECTOR_SIZE = 64 * 1024 * 1024;

    /**
     * Number of (32-bit word + tag byte) entries that fit in the metadata
     * region: (8K - index - address (- chksum)) / (4 bytes of bits + 1 tag
     * byte) = 1636.
     */
    final static int NUM_ENTRIES = (META_SIZE - 12) / (4 + 1);

    /**
     * NOTE(review): this array is unused within this class and its
     * progression skips 0x1F/0x3F/... - candidate for removal once it is
     * confirmed that nothing else references it.
     */
    final int[] BIT_MASKS = {0x1, 0x3, 0x7, 0xF, 0xFF, 0xFFFF, 0xFFFFFFFF};

    /** Allocations larger than this must be stored as chained blob blocks. */
    final public static int BLOB_SIZE = 4096;

    /** Offset within a blob block of the chain pointer to the next block. */
    final static int BLOB_CHAIN_OFFSET = BLOB_SIZE - 4;

    /** The supported slot sizes; one "tag" per size. */
    final public static int[] ALLOC_SIZES = {64, 128, 256, 512, 1024, 2048, BLOB_SIZE};

    /** Bits (slots) per block for each tag - always a full 32-bit word. */
    final static int[] ALLOC_BITS = {32, 32, 32, 32, 32, 32, 32};

    /** Index of this allocator within the store (identifies the sector). */
    int m_index;

    /** Store address of the start of the managed sector. */
    long m_sectorAddress;

    /** Maximum number of bytes this sector may occupy. */
    int m_maxSectorSize;

    /** Tag (slot-size index) for each 32-bit block; -1 if unassigned. */
    byte[] m_tags = new byte[NUM_ENTRIES];

    /** Current allocation bits. */
    int[] m_bits = new int[NUM_ENTRIES];

    /** Allocation bits including uncommitted/session state. */
    int[] m_transientbits = new int[NUM_ENTRIES];

    /** Allocation bits as of the last commit. */
    int[] m_commitbits = new int[NUM_ENTRIES];

    /** Cached sector byte-offset of the first slot of each block. */
    int[] m_addresses = new int[NUM_ENTRIES];

    // Maintain a count against each alloc size; this provides ready access
    // to check the minimum number of free bits for all tag sizes. No
    // SectorAllocator should be on the free list unless there are free bits
    // available for all tags.
    //
    // In order to return a SectorAllocator to the free list we can check not
    // only the total number of bits, but the average number of bits for the
    // tag, dividing the number of free bits by the total (number of blocks)
    // for each tag - see hasFree(int).

    /** Free bits per tag. */
    int[] m_free = new int[ALLOC_SIZES.length];

    /** Blocks assigned per tag. */
    int[] m_total = new int[ALLOC_SIZES.length];

    /** Lifetime allocation count per tag (statistics). */
    int[] m_allocations = new int[ALLOC_SIZES.length];

    /** Lifetime recycle (free) count per tag (statistics). */
    int[] m_recycles = new int[ALLOC_SIZES.length];

    /** The owning store; receives free-list and trim callbacks. */
    final ISectorManager m_store;

    /** Whether this allocator is currently on the store's free list. */
    boolean m_onFreeList = false;

    /** Write cache used to cancel pending writes of freed slots; may be null. */
    private final IWriteCacheManager m_writes;

    /** When true, freed slots are not recycled until releaseSession(). */
    private boolean m_preserveSession;

    /**
     * @param store the owning store; receives free-list/trim callbacks
     * @param writes optional write cache used to remove pending writes of
     *            freed addresses; may be {@code null}
     */
    public SectorAllocator(final ISectorManager store, final IWriteCacheManager writes) {
        m_store = store;
        m_writes = writes;
    }

    /**
     * Maps an allocation size to the smallest tag whose slot size fits it.
     *
     * @param size the requested allocation size in bytes
     * @return the tag (index into {@link #ALLOC_SIZES})
     * @throws IllegalArgumentException if size exceeds {@link #BLOB_SIZE}
     *             (previously this ran off the end of ALLOC_SIZES with an
     *             ArrayIndexOutOfBoundsException)
     */
    static byte getTag(final int size) {
        if (size > BLOB_SIZE) {
            throw new IllegalArgumentException("Size exceeds maximum slot size: " + size);
        }
        byte tag = 0;
        while (size > ALLOC_SIZES[tag]) {
            tag++;
        }
        return tag;
    }

    /**
     * Must find the tag type that size fits in (or BLOB) and then find a
     * block of that type into which an allocation can be made.
     *
     * @param size the number of bytes required (must be &lt;= BLOB_SIZE)
     * @return the negative latched address encoding (index, bit), or 0 if no
     *         free bit could be found
     */
    public int alloc(final int size) {
        if (size > BLOB_SIZE) {
            throw new IllegalArgumentException("Cannot directly allocate a BLOB, use PSOutputStream");
        }
        final byte tag = getTag(size);
        assert m_free[tag] > 0;
        // Scan the assigned blocks for one of the right tag with a free bit.
        // sbit tracks the absolute bit offset of the current block.
        int sbit = 0;
        for (int i = 0; i < NUM_ENTRIES; i++) {
            final int ttag = m_tags[i];
            if (ttag == -1) {
                // Ran off the end of the assigned blocks without a free bit.
                throw new IllegalStateException("Allocator should not be on the FreeList for tag: " + ALLOC_SIZES[tag]);
            }
            if (ttag == tag) {
                final int bit = fndBit(m_transientbits[i]);
                if (bit != -1) {
                    sbit += bit;
                    if (log.isTraceEnabled())
                        log.trace("Setting bit: " + sbit);
                    setBit(m_bits, sbit);
                    setBit(m_transientbits, sbit);
                    if (!tstBit(m_bits, sbit)) {
                        throw new IllegalStateException("WTF with bit:" + sbit);
                    }
                    m_free[tag]--;
                    m_allocations[tag]++;
                    // If that was the last free bit for this size, try to
                    // assign a new block; failing that, leave the free list.
                    if (m_free[tag] == 0 && m_onFreeList) {
                        if (!addNewTag(tag)) {
                            if (log.isInfoEnabled()) {
                                log.info("Removing Sector #" + m_index + ": " + toString());
                            }
                            m_store.removeFromFreeList(this);
                            m_onFreeList = false;
                        }
                    }
                    final int raddr = makeAddr(m_index, sbit);
                    if (log.isTraceEnabled())
                        log.trace("Allocating " + m_index + ":" + sbit + " as " + raddr + " for " + size);
                    if (getSectorIndex(raddr) != m_index) {
                        throw new IllegalStateException("Address: " + raddr + " does not yield index: " + m_index);
                    }
                    return raddr;
                }
            }
            sbit += ALLOC_BITS[ttag]; // bump over current block's bits
        }
        return 0;
    }

    /**
     * Encodes (sector index, bit offset) as a negative "latched" address.
     * The index is stored 1-based so a valid address is never zero.
     */
    public static int makeAddr(final int index, final int bit) {
        return -(((index + 1) << SECTOR_OFFSET_BITS) + bit);
    }

    /**
     * Attempts to assign the next unassigned 32-bit block to the given tag,
     * reserving 32 slots of that size from the sector.
     *
     * @param tag the slot-size tag needing a new block
     * @return true iff a block could be assigned without exceeding either
     *         the sector size or the number of available blocks
     */
    private boolean addNewTag(final byte tag) {
        int allocated = 0; // bytes consumed by blocks assigned so far
        for (int i = 0; i < m_tags.length; i++) {
            if (m_tags[i] == -1) {
                final int block = ALLOC_SIZES[tag] * 32;
                if ((allocated + block) <= m_maxSectorSize) {
                    m_tags[i] = tag;
                    m_free[tag] += 32;
                    m_total[tag]++;
                    if (i < (m_tags.length - 1)) {
                        // cache next block offset
                        m_addresses[i + 1] = m_addresses[i] + (32 * ALLOC_SIZES[tag]);
                    }
                    if (log.isTraceEnabled())
                        log.trace("addNewTag block for: " + ALLOC_SIZES[tag]);
                    if ((i + 1) == m_tags.length) {
                        // Last block assigned: give back the unused tail.
                        final int trim = m_maxSectorSize - (allocated + block);
                        m_store.trimSector(trim, this);
                    }
                    return true;
                } else {
                    if (log.isDebugEnabled())
                        log.debug("addNewTag FALSE due to Sector SIZE");
                    return false;
                }
            } else {
                allocated += ALLOC_SIZES[m_tags[i]] * 32;
            }
        }
        if (log.isDebugEnabled())
            log.debug("addNewTag FALSE due to Sector BITS");
        return false;
    }

    /**
     * Frees the slot identified by the given absolute bit offset. A slot
     * allocated as of the last commit stays latched (only the current bit is
     * cleared) until the next commit; an uncommitted slot is recycled
     * immediately unless session data is being preserved.
     *
     * @param bit the absolute bit offset of the slot to free
     * @return always {@code false} (NOTE(review): callers apparently ignore
     *         the return value; consider {@code void})
     * @throws IllegalStateException if the bit is not currently set
     */
    public boolean free(final int bit) {
        if (!tstBit(m_bits, bit)) {
            throw new IllegalStateException("Request to free bit not set: " + bit);
        }
        clrBit(m_bits, bit);
        if (!tstBit(m_commitbits, bit)) {
            if (!tstBit(m_transientbits, bit)) {
                throw new IllegalStateException("Request to free transient bit not set" + bit);
            }
            if (!m_preserveSession) {
                clrBit(m_transientbits, bit);
                final int tag = bit2tag(bit);
                m_free[tag]++;
                m_recycles[tag]++;
            }
            // The hasFree test is too coarse; ideally we should test for a
            // percentage of free bits - say 10% PLUS a minimum of say 10
            // for each tag type.
            if ((!m_onFreeList) && hasFree(2)) { // minimum of 2 bits per 32-bit block
                m_onFreeList = true;
                if (log.isInfoEnabled()) {
                    log.info("Returning Sector #" + m_index + ": " + toString());
                }
                m_store.addToFreeList(this);
            }
            // NOTE(review): when !m_preserveSession the transient bit was
            // cleared above, so getPhysicalAddress(bit) returns 0 here -
            // confirm whether the write-cache removal is intended to apply
            // only while session data is preserved.
            if (m_writes != null && m_writes.removeWriteToAddr(getPhysicalAddress(bit), 0/*latchedAddr*/)) {
                if (log.isTraceEnabled())
                    log.trace("Removed potential DUPLICATE");
            }
        }
        return false;
    }

    /**
     * Scans the assigned blocks to determine the slot size of the given
     * absolute bit offset.
     *
     * @param bit the absolute bit offset
     * @return the block (slot) size in bytes, or 0 if past the last entry
     * @throws IllegalStateException if the offset lies beyond the assigned tags
     */
    int bit2Size(int bit) {
        for (int t = 0; t < NUM_ENTRIES; t++) {
            final int tag = m_tags[t];
            if (tag == -1) {
                throw new IllegalStateException("bit offset too large");
            }
            final int bits = ALLOC_BITS[tag];
            if (bit < bits) {
                return ALLOC_SIZES[tag];
            }
            bit -= bits;
        }
        return 0;
    }

    /**
     * Uses the m_addresses block offset cache to efficiently determine the
     * corresponding resource offset.
     *
     * @param bit the absolute bit offset
     * @return the byte offset within the sector
     */
    int bit2Offset(final int bit) {
        final int entry = bit / 32;
        final int entryBit = bit % 32;
        assert entry < m_addresses.length;
        return m_addresses[entry] + entryBit * ALLOC_SIZES[m_tags[entry]];
    }

    /**
     * Since we know that all blocks hold exactly 32 bits each, there is no
     * need to scan through the tag array.
     *
     * @param bit the absolute bit offset
     * @return the tag of the bit
     */
    public int bit2tag(final int bit) {
        return m_tags[bit / 32];
    }

    /**
     * @param offset the absolute bit offset of the slot
     * @return the physical store address of the slot, or 0L if the slot is
     *         not currently allocated (transient bit clear)
     */
    public long getPhysicalAddress(final int offset) {
        if (!tstBit(m_transientbits, offset)) {
            return 0L;
        }
        return m_sectorAddress + bit2Offset(offset);
    }

    /** @return the slot size in bytes for the given bit offset. */
    public int getPhysicalSize(final int offset) {
        return bit2Size(offset);
    }

    /** @return the store address of the start of the managed sector. */
    public long getStartAddr() {
        return m_sectorAddress;
    }

    /**
     * TODO: not yet implemented; see {@link #toString()} for per-tag stats.
     *
     * @return always {@code null}
     */
    public String getStats() {
        return null;
    }

    /**
     * @param threshold the minimum number of free bits per 32-bit block
     * @return whether there are sufficient free bits for all block sizes
     */
    public boolean hasFree(final int threshold) {
        for (int i = 0; i < m_free.length; i++) {
            if (m_free[i] < (threshold * m_total[i]))
                return false;
        }
        return true;
    }

    /**
     * @return whether there is a positive free count for all tags
     */
    public boolean hasFree() {
        return hasFree(1);
    }

    /** Defers recycling of freed slots until {@link #releaseSession}. */
    public void preserveSessionData() {
        m_preserveSession = true;
    }

    /** @return the slot size in bytes for the given bit offset. */
    public int addressSize(final int offset) {
        return bit2Size(offset);
    }

    /** Assigns the allocator's index; may only be set once. */
    public void setIndex(final int index) {
        assert m_index == 0;
        m_index = index;
    }

    /** Appends the managed sector's start address to the given list. */
    public void addAddresses(final ArrayList<Long> addrs) {
        addrs.add(Long.valueOf(m_sectorAddress));
    }

    /** Clears bit {@code bitnum} in the packed bit array. */
    static void clrBit(final int[] bits, final int bitnum) {
        final int index = bitnum / 32;
        final int bit = bitnum % 32;
        bits[index] &= ~(1 << bit);
    }

    /** Sets bit {@code bitnum} in the packed bit array. */
    static void setBit(final int[] bits, final int bitnum) {
        final int index = bitnum / 32;
        final int bit = bitnum % 32;
        bits[index] |= 1 << bit;
    }

    /** Tests bit {@code bitnum} in the packed bit array. */
    static boolean tstBit(final int[] bits, final int bitnum) {
        final int index = bitnum / 32;
        final int bit = bitnum % 32;
        return (bits[index] & (1 << bit)) != 0;
    }

    /**
     * Finds the lowest clear (free) bit in the given 32-bit word, using
     * divide and conquer (a nibble at a time) rather than shifting through
     * every bit.
     *
     * @return the bit position 0..31, or -1 if all bits are set
     */
    int fndBit(int bits) {
        for (int n = 0; n < 8; n++) { // check nibbles
            if ((bits & 0x0F) != 0xF) {
                for (int b = 0; b < 4; b++) {
                    if ((bits & (1 << b)) == 0) {
                        return b + (n * 4);
                    }
                }
            }
            bits >>>= 4; // right shift a nibble
        }
        return -1;
    }

    /**
     * As well as setting the address, this is the point when the allocator
     * pre-allocates the first block of each slot size and joins the free
     * list.
     *
     * @param sectorAddress the store address managed by this Allocator
     * @param maxsize the maximum number of bytes the sector may occupy
     */
    public void setSectorAddress(final long sectorAddress, final int maxsize) {
        if (log.isInfoEnabled())
            log.info("setting sector #" + m_index + " address: " + sectorAddress);
        m_sectorAddress = sectorAddress;
        m_maxSectorSize = maxsize;
        m_addresses[0] = 0;
        for (int i = 0; i < ALLOC_SIZES.length; i++) {
            m_tags[i] = (byte) i;
            m_free[i] = 32;
            m_total[i] = 1;
            // cache the start offset of the next block
            m_addresses[i + 1] = m_addresses[i] + (32 * ALLOC_SIZES[i]);
        }
        for (int i = ALLOC_SIZES.length; i < NUM_ENTRIES; i++) {
            m_tags[i] = (byte) -1;
        }
        m_onFreeList = true;
        m_store.addToFreeList(this);
    }

    /** @return the 0-based sector index encoded in the latched address. */
    public static int getSectorIndex(final int rwaddr) {
        return ((-rwaddr) >>> SECTOR_OFFSET_BITS) - 1;
    }

    /** @return the in-sector bit offset encoded in the latched address. */
    public static int getSectorOffset(final int rwaddr) {
        return (-rwaddr) & SECTOR_OFFSET_MASK;
    }

    /** @return the number of BLOB_SIZE blocks needed to hold size bytes. */
    public static int getBlobBlockCount(final int size) {
        return (size + BLOB_SIZE - 1) / BLOB_SIZE;
    }

    /**
     * @return the smallest slot size that fits the given size
     * @throws IllegalArgumentException if the size exceeds the largest slot
     */
    public static int getBlockForSize(final int size) {
        for (int i = 0; i < ALLOC_SIZES.length; i++) {
            if (size <= ALLOC_SIZES[i]) {
                return ALLOC_SIZES[i];
            }
        }
        throw new IllegalArgumentException("Size does not fit in a slot");
    }

    /**
     * Orders allocators by index. NOTE: equals/hashCode are identity-based,
     * so this ordering is not strictly "consistent with equals".
     */
    @Override
    public int compareTo(final SectorAllocator other) {
        return Integer.compare(m_index, other.m_index);
    }

    /** @return this allocator's index within the store. */
    public int getIndex() {
        return m_index;
    }

    /**
     * Ends a preserve-session period: recomputes the transient bits as the
     * union of committed and current bits, making freed uncommitted slots
     * re-allocatable.
     *
     * NOTE(review): m_free is not recomputed here, although free() skipped
     * incrementing it while the session was preserved - confirm the counts
     * are reconciled by the caller.
     *
     * @param cache ignored
     */
    public void releaseSession(final IWriteCacheManager cache /* ignored */) {
        for (int i = 0; i < m_bits.length; i++) {
            m_transientbits[i] = m_commitbits[i] | m_bits[i];
        }
        m_preserveSession = false;
    }

    /**
     * Per-tag summary: (avg free per block)[Ttotal slots,Aallocations,Ffree,Rrecycles].
     * Guards against division by zero when no blocks have been assigned yet
     * (i.e. before setSectorAddress), since this is invoked from logging.
     */
    @Override
    public String toString() {
        final StringBuilder str = new StringBuilder();
        for (int t = 0; t < m_free.length; t++) {
            final int avgFree = m_total[t] == 0 ? 0 : m_free[t] / m_total[t];
            str.append("(" + avgFree + ")[T"
                    + (m_total[t] * 32) + ",A" + m_allocations[t] + ",F"
                    + m_free[t] + ",R" + m_recycles[t] + "]");
        }
        return str.toString();
    }

    /**
     * Called from MemoryManager to commit bits: snapshots the current bits
     * as the commit state and, unless session data is preserved, resets the
     * transient bits to match.
     */
    public void commit() {
        m_commitbits = m_bits.clone();
        if (!m_preserveSession) {
            m_transientbits = m_bits.clone();
        }
    }

    /** @return whether the slot was allocated as of the last commit. */
    public boolean isCommitted(final int offset) {
        return tstBit(m_commitbits, offset);
    }

    /** @return whether the slot's data is retrievable (transient bit set). */
    public boolean isGettable(final int offset) {
        return tstBit(m_transientbits, offset);
    }
}