package water.fvec;
import water.*;
/** A compression scheme, over a chunk - a single array of bytes. The *actual*
* vector header info is in the Vec struct - which contains info to find all
* the bytes of the distributed vector. This struct is basically a 1-entry
* chunk cache of the total vector. Subclasses of this abstract class
* implement (possibly empty) compression schemes. */
public abstract class Chunk extends Iced implements Cloneable {
  /** Absolute element number of the first element; stays -1 until filled in
   *  after AutoBuffer.read. */
  public transient long _start = -1;
  /** Number of elements in this chunk. */
  public transient int _len;
  /** @return the number of elements in this chunk. */
  public int len() { return _len; }
  /** Sets the number of elements in this chunk and returns the new value. */
  public int set_len(int l) { _len = l; return _len; }
  /** Normally null; becomes non-null once this chunk has been written to, and
   *  from then on all reads and writes are forwarded to it. */
  protected transient Chunk _chk2;
  /** Owning Vec; filled after AutoBuffer.read. */
  public transient Vec _vec;
  /** Short-cut to the embedded memory; WARNING: holds onto a large array. */
  public byte[] _mem;

  /** Delegates to the owning Vec's readable(). */
  public final boolean readable( ) { return _vec.readable(); }
  /** Delegates to the owning Vec's writable(). */
  public final boolean writable( ) { return _vec.writable(); }
  /** @return the backing byte array itself (not a copy). */
  public final byte[] getBytes() { return _mem; }

  /** Builds the message used by every range-checked accessor when the absolute
   *  index falls outside this chunk. */
  private String rangeMsg( long i ) {
    return getClass().getSimpleName() + " " +_start+" <= "+i+" < "+(_start+_len);
  }

  /** Load a long value.  Floating point values are silently rounded to an
   *  integer.  Throws if the value is missing.
   *  <p>
   *  Loads from the 1-entry chunk cache, or misses-out.  This version uses
   *  absolute element numbers, but must convert them to chunk-relative indices
   *  - requiring a load from an aliasing local var, leading to lower quality
   *  JIT'd code (similar issue to using iterator objects).
   *  <p>
   *  Slightly slower than 'at80' since it range checks within a chunk.
   *  @throws ArrayIndexOutOfBoundsException if {@code i} is not inside this chunk */
  public final long at8( long i ) {
    long x = i - (_start>0 ? _start : 0); // An unset (-1) start is treated as 0
    if( 0 <= x && x < _len ) return at80((int)x);
    throw new ArrayIndexOutOfBoundsException(rangeMsg(i));
  }

  /** Load a double value.  Returns Double.NaN if value is missing.
   *  <p>
   *  Loads from the 1-entry chunk cache, or misses-out.  This version uses
   *  absolute element numbers, but must convert them to chunk-relative indices
   *  - requiring a load from an aliasing local var, leading to lower quality
   *  JIT'd code (similar issue to using iterator objects).
   *  <p>
   *  Slightly slower than 'at0' since it range checks within a chunk.
   *  @throws ArrayIndexOutOfBoundsException if {@code i} is not inside this chunk */
  public final double at( long i ) {
    long x = i - (_start>0 ? _start : 0); // An unset (-1) start is treated as 0
    if( 0 <= x && x < _len ) return at0((int)x);
    throw new ArrayIndexOutOfBoundsException(rangeMsg(i));
  }

  /** Fetch the missing-status the slow way (absolute index, range checked).
   *  @throws ArrayIndexOutOfBoundsException if {@code i} is not inside this chunk */
  public final boolean isNA(long i) {
    long x = i - (_start>0 ? _start : 0); // An unset (-1) start is treated as 0
    if( 0 <= x && x < _len ) return isNA0((int)x);
    throw new ArrayIndexOutOfBoundsException(rangeMsg(i));
  }

  /** Range-checked, absolute-index form of {@link #at16l0(int)}.
   *  NOTE(review): the name and the "Not a UUID" default suggest this is the
   *  low 64 bits of a 128-bit UUID - confirm against the UUID chunk subclass.
   *  @throws ArrayIndexOutOfBoundsException if {@code i} is not inside this chunk */
  public final long at16l( long i ) {
    long x = i - (_start>0 ? _start : 0); // An unset (-1) start is treated as 0
    if( 0 <= x && x < _len ) return at16l0((int)x);
    throw new ArrayIndexOutOfBoundsException(rangeMsg(i));
  }

  /** Range-checked, absolute-index form of {@link #at16h0(int)}.
   *  NOTE(review): presumably the high 64 bits of a 128-bit UUID - confirm.
   *  @throws ArrayIndexOutOfBoundsException if {@code i} is not inside this chunk */
  public final long at16h( long i ) {
    long x = i - (_start>0 ? _start : 0); // An unset (-1) start is treated as 0
    if( 0 <= x && x < _len ) return at16h0((int)x);
    throw new ArrayIndexOutOfBoundsException(rangeMsg(i));
  }

  /** The zero-based API.  Somewhere between 10% to 30% faster in a tight-loop
   *  over the data than the generic at() API.  Probably no gain on larger
   *  loops.  The row reference is zero-based on the chunk, and should
   *  range-check by the JIT as expected.  Each accessor reads from the
   *  written-into chunk {@code _chk2} when one exists, else from this chunk. */
  public final double  at0  ( int i ) { return _chk2 == null ? atd_impl(i) : _chk2. atd_impl(i); }
  public final long    at80 ( int i ) { return _chk2 == null ? at8_impl(i) : _chk2. at8_impl(i); }
  public final boolean isNA0( int i ) { return _chk2 == null ?isNA_impl(i) : _chk2.isNA_impl(i); }
  public final long   at16l0( int i ) { return _chk2 == null ? at16l_impl(i) : _chk2.at16l_impl(i); }
  public final long   at16h0( int i ) { return _chk2 == null ? at16h_impl(i) : _chk2.at16h_impl(i); }

  /** Slightly slower than 'at0' inside a chunk; goes (very) slow outside the
   *  chunk instead of throwing.  First outside-chunk fetches and caches whole
   *  chunk; maybe takes multiple msecs.  2nd and later touches in the same
   *  outside-chunk probably run 100x slower than inside-chunk accesses.
   *  Note these subtract {@code _start} directly, without the unset-start
   *  adjustment used by at8/at/isNA. */
  public final double  at_slow  ( long i ) { long x = i-_start; return (0 <= x && x < _len) ? at0  ((int)x) : _vec. at(i); }
  public final long    at8_slow ( long i ) { long x = i-_start; return (0 <= x && x < _len) ? at80 ((int)x) : _vec.at8(i); }
  public final boolean isNA_slow( long i ) { long x = i-_start; return (0 <= x && x < _len) ? isNA0((int)x) : _vec.isNA(i); }

  /** Write element the slow way, as a long.  There is no way to write a
   *  missing value with this call.  Under rare circumstances this can throw:
   *  if the long does not fit in a double (value is larger magnitude than
   *  2^52), AND float values are stored in Vector.  In this case, there is no
   *  common compatible data representation.  Indices outside this chunk are
   *  forwarded to the owning Vec. */
  public final long   set( long i, long   l) { long x = i-_start; return (0 <= x && x < _len) ? set0((int)x,l) : _vec.set(i,l); }
  /** Write element the slow way, as a double.  Double.NaN will be treated as
   *  a set of a missing element.  Indices outside this chunk are forwarded to
   *  the owning Vec. */
  public final double set( long i, double d) { long x = i-_start; return (0 <= x && x < _len) ? set0((int)x,d) : _vec.set(i,d); }
  /** Write element the slow way, as a float.  Float.NaN will be treated as
   *  a set of a missing element.  Indices outside this chunk are forwarded to
   *  the owning Vec. */
  public final float  set( long i, float  f) { long x = i-_start; return (0 <= x && x < _len) ? set0((int)x,f) : _vec.set(i,f); }
  /** Set the element as missing the slow way.  Indices outside this chunk are
   *  forwarded to the owning Vec. */
  public final boolean setNA( long i ) { long x = i-_start; return (0 <= x && x < _len) ? setNA0((int)x) : _vec.setNA(i); }

  /** Replace the written-into chunk with a compressed NewChunk built from
   *  {@code vals}.  setWrite() is called first so the one-shot write
   *  initialisation on the Vec still happens. */
  public void setAll(double [] vals) {
    setWrite();
    _chk2 = new NewChunk(_vec,cidx(),vals).compress();
  }

  /** @return the written-into chunk, or null if this chunk has not been written to. */
  public Chunk modifiedChunk(){return _chk2;}

  /** Flag this chunk as written-into, the first time only: notifies the Vec
   *  and installs a clone of this chunk as {@code _chk2}. */
  private void setWrite() {
    if( _chk2 != null ) return; // Already setWrite
    assert !(this instanceof NewChunk) : "Cannot direct-write into a NewChunk, only append";
    _vec.preWriting();          // One-shot writing-init
    _chk2 = clone();            // Flag this chunk as having been written into
    assert _chk2._chk2 == null; // Clone has NOT been written into
  }

  /**
   * Set a long element in a chunk given a 0-based chunk local index.
   *
   * Write into a chunk.
   * May rewrite/replace chunks if the chunk needs to be
   * "inflated" to hold larger values.  Returns the input value.
   *
   * Note that the idx is an int (instead of a long), which tells you
   * that index 0 is the first row in the chunk, not the whole Vec.
   */
  public final long set0(int idx, long l) {
    setWrite();
    if( _chk2.set_impl(idx,l) ) return l; // Fit in the current compression scheme
    (_chk2 = inflate_impl(new NewChunk(this))).set_impl(idx,l);
    return l;
  }

  /** Set a double element in a chunk given a 0-based chunk local index.
   *  Inflates to a NewChunk if the value does not fit in place.  Returns the
   *  input value. */
  public final double set0(int idx, double d) {
    setWrite();
    if( _chk2.set_impl(idx,d) ) return d; // Fit in the current compression scheme
    (_chk2 = inflate_impl(new NewChunk(this))).set_impl(idx,d);
    return d;
  }

  /** Set a floating element in a chunk given a 0-based chunk local index.
   *  Inflates to a NewChunk if the value does not fit in place.  Returns the
   *  input value. */
  public final float set0(int idx, float f) {
    setWrite();
    if( _chk2.set_impl(idx,f) ) return f; // Fit in the current compression scheme
    (_chk2 = inflate_impl(new NewChunk(this))).set_impl(idx,f);
    return f;
  }

  /** Set the element in a chunk as missing given a 0-based chunk local index.
   *  Inflates to a NewChunk if a missing value cannot be stored in place.
   *  Always returns true. */
  public final boolean setNA0(int idx) {
    setWrite();
    if( _chk2.setNA_impl(idx) ) return true;
    (_chk2 = inflate_impl(new NewChunk(this))).setNA_impl(idx);
    return true;
  }

  /** After writing we must call close() to register the bulk changes.
   *  Does nothing if the chunk was never written to; otherwise the (possibly
   *  re-compressed) written chunk is put back into the K/V store under the
   *  Vec's chunk key for {@code cidx}, and the Vec's cache is dropped if it
   *  was pointing at this chunk. */
  public void close( int cidx, Futures fs ) {
    int len = _len;
    if( this instanceof NewChunk ) _chk2 = this;
    if( _chk2 == null ) return;          // No change?
    if( _chk2 instanceof NewChunk ) _chk2 = ((NewChunk)_chk2).new_close();
    assert _chk2._len == len : "incompatible length after compression, " + len + " != " + _chk2._len + ", chunk = " + _chk2.getClass().getSimpleName();
    DKV.put(_vec.chunkKey(cidx),_chk2,fs,true); // Write updated chunk back into K/V
    if( _vec._cache == this ) _vec._cache = null;
  }

  /** @return the chunk index the owning Vec reports for this chunk's start element. */
  public int cidx() { return _vec.elem2ChunkIdx(_start); }

  /** Chunk-specific readers. */
  abstract protected double   atd_impl(int idx);
  abstract protected long     at8_impl(int idx);
  abstract protected boolean isNA_impl(int idx);
  /** Default 16-byte readers; throw unless a subclass overrides them. */
  protected long at16l_impl(int idx) { throw new IllegalArgumentException("Not a UUID"); }
  protected long at16h_impl(int idx) { throw new IllegalArgumentException("Not a UUID"); }

  /** Chunk-specific writer.  Returns false if the value does not fit in the
   *  current compression scheme. */
  abstract boolean set_impl  (int idx, long l );
  abstract boolean set_impl  (int idx, double d );
  abstract boolean set_impl  (int idx, float f );
  abstract boolean setNA_impl(int idx);

  /** Index of the next stored value after {@code rid}; the default (dense)
   *  implementation is simply the next row. */
  public int nextNZ(int rid){return rid+1;}
  /** @return false by default; the sparse path of nonzeros() is only taken when this is true. */
  public boolean isSparse() {return false;}
  /** @return the number of stored values; defaults to the full chunk length. */
  public int sparseLen(){return _len;}

  /**
   * Get chunk-relative indices of values (nonzeros for sparse, all for dense) stored in this chunk.
   * For dense chunks, this will contain indices of all the rows in this chunk.
   *
   * @param res output array receiving the indices; for sparse chunks its length
   *            is asserted to equal the number of indices written
   * @return the number of indices written into {@code res}
   */
  public int nonzeros(int [] res){
    if( !isSparse() ) {
      for( int i = 0; i < _len; ++i )
        res[i] = i;
      return _len;
    }
    int j = 0;
    for( int i = nextNZ(-1); i < _len; i = nextNZ(i) )
      res[j++] = i;
    assert res.length == j;
    return j;                   // Count of stored values, not the chunk length
  }

  /**
   * Get chunk-relative indices of values (nonzeros for sparse, all for dense) stored in this chunk.
   * For dense chunks, this will contain indices of all the rows in this chunk.
   *
   * @return freshly allocated array, of length sparseLen(), holding the
   *         chunk-relative indices of values stored in this chunk.
   */
  public final int [] nonzeros () {
    int [] res = MemoryManager.malloc4(sparseLen());
    nonzeros(res);
    return res;
  }

  /** Inflate this chunk into a fresh NewChunk via {@link #inflate_impl(NewChunk)}. */
  public NewChunk inflate(){
    return inflate_impl(new NewChunk(this));
  }

  /** Chunk-specific bulk inflator back to NewChunk.  Used when writing into a
   *  chunk and written value is out-of-range for an update-in-place operation.
   *  Bulk copy from the compressed form into the nc._ls array. */
  abstract NewChunk inflate_impl(NewChunk nc);
  /** Whether this chunk holds floating-point data; the integer formatting
   *  helpers below assert that it does not. */
  abstract boolean hasFloat();

  /** Chunk-specific implementations of read and write */
  public abstract AutoBuffer write(AutoBuffer bb);
  public abstract Chunk read (AutoBuffer bb);

  // Support for fixed-width format printing
  public String pformat () { return pformat0(); }
  public int pformat_len() { return pformat_len0(); }

  /** Integer format string of width pformat_len0(); uses the space flag
   *  ("% Nd") when the Vec's minimum is negative. */
  protected String pformat0() {
    assert !hasFloat() : "need impl:"+getClass(); // Floats handled in subclasses
    long min = (long)_vec.min();
    if( min < 0 ) return "% "+pformat_len0()+"d";
    return "%"+pformat_len0()+"d";
  }

  /** Absolute value that saturates at Long.MAX_VALUE instead of overflowing
   *  back to a negative number for Long.MIN_VALUE. */
  private static long saturatingAbs( long v ) {
    return v == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(v);
  }

  /** Printed width for integer data: derived from the larger magnitude of the
   *  Vec's min and max, plus one column if the min is negative; 20 if the
   *  magnitude exceeds every entry of PrettyPrint.powers10i. */
  protected int pformat_len0() {
    assert !hasFloat();         // Floats handled in subclasses
    int len=0;
    long min = (long)_vec.min();
    if( min < 0 ) len++;
    // Math.abs(Long.MIN_VALUE) is negative, so use a saturating abs here
    long max = Math.max(saturatingAbs(min),saturatingAbs((long)_vec.max()));
    for( int i=1; i<PrettyPrint.powers10i.length; i++ )
      if( max < PrettyPrint.powers10i[i] )
        return i+len;
    return 20;
  }

  /** Printed width for scaled data.  Throws H2O.unimpl() when the scale has a
   *  non-negative truncated log10 but is not exactly that power of ten. */
  protected int pformat_len0( double scale, int lg ) {
    int x = (int)Math.log10(scale);
    if( x >= 0 && PrettyPrint.pow10i(x) != scale ) throw H2O.unimpl();
    return 1/*blank/sign*/+lg/*compression limits digits*/+1/*dot*/+1/*e*/+1/*neg exp*/+2/*digits of exp*/;
  }

  @Override public Chunk clone() { return (Chunk)super.clone(); }

  @Override public String toString() {
    return getClass().getSimpleName() + "(start = " + _start + ", len = " + _len + ")";
  }

  /** Estimated size in bytes: the backing array plus fixed overhead, plus the
   *  written-into chunk's size when one exists. */
  public long byteSize() {
    long s= _mem == null ? 0 : _mem.length;
    s += (2+5)*8 + 12; // 2 hdr words, 5 other words, @8bytes each, plus mem array hdr
    if( _chk2 != null ) s += _chk2.byteSize();
    return s;
  }
}