package water.fvec;
import water.*;
import water.util.SetOfBytes;
import water.util.UnsafeUtils;
import water.parser.BufferedString;
import java.util.Arrays;
public class CStrChunk extends Chunk {
/** Offset value stored in the per-row offset table for a row whose string is missing. */
static final int NA = -1;
/** Header size in bytes: a 4-byte int holding the start of the string region, plus a 1-byte all-ASCII flag. */
static protected final int _OFF=4+1;
/** Byte position inside {@code _mem} where the string bytes begin (immediately after the offset table). */
private int _valstart;
/** True when no byte of the string region has its high bit set; the same flag is stored in header byte 4. */
public boolean _isAllASCII = false;
/**
 * Creates an instance without initializing any state.
 * NOTE(review): presumably used by the framework before {@code initFromBytes()} is called - confirm.
 */
public CStrChunk() {}
public CStrChunk(int sslen, byte[] ss, int sparseLen, int idxLen, int[] id, int[] is) {
_start = -1;
_valstart = idx(idxLen);
_len = idxLen;
_mem = MemoryManager.malloc1(_valstart + sslen, false);
UnsafeUtils.set4(_mem, 0, _valstart); // location of start of strings
Arrays.fill(_mem,_OFF,_valstart,(byte)-1); // Indicate All Is NA's
for( int i = 0; i < sparseLen; ++i ) // Copy the sparse indices
UnsafeUtils.set4(_mem, idx(id==null ? i : id[i]), is[i]);
UnsafeUtils.copyMemory(ss,0,_mem,_valstart,sslen);
_isAllASCII = true;
for(int i = _valstart; i < _mem.length; ++i) {
byte c = _mem[i];
if ((c & 0x80) == 128) { //value beyond std ASCII
_isAllASCII = false;
break;
}
}
UnsafeUtils.set1(_mem, 4, (byte) (_isAllASCII ? 1 : 0)); // isAllASCII flag
}
/** Returns the byte position in {@code _mem} of the 4-byte offset entry for row {@code i}. */
private int idx(int i) {
  final int entryBytes = i << 2; // each table entry is one 4-byte int
  return _OFF + entryBytes;
}
/** Always returns {@code false}; this chunk never stores an NA in place. */
@Override public boolean setNA_impl(int idx) {
  return false;
}
/**
 * Returns {@code false} for NaN and rejects every other float.
 * @throws IllegalArgumentException when {@code f} is not NaN
 */
@Override public boolean set_impl(int idx, float f) {
  if (!Float.isNaN(f))
    throw new IllegalArgumentException("Operation not allowed on string vector.");
  return false;
}
/**
 * Returns {@code false} for NaN and rejects every other double.
 * @throws IllegalArgumentException when {@code d} is not NaN
 */
@Override public boolean set_impl(int idx, double d) {
  if (!Double.isNaN(d))
    throw new IllegalArgumentException("Operation not allowed on string vector.");
  return false;
}
/**
 * Integer values can never be stored in a string chunk.
 * @throws IllegalArgumentException always
 */
@Override public boolean set_impl(int idx, long l) {
  throw new IllegalArgumentException("Operation not allowed on string vector.");
}
/** Always returns {@code false}; this chunk never stores a string in place. */
@Override public boolean set_impl(int idx, String str) {
  return false;
}
/** A row is missing when its stored offset equals the {@code NA} sentinel. */
@Override public boolean isNA_impl(int idx) {
  return intAt(idx) == NA;
}
/** Reads the 4-byte offset-table entry for row {@code i}; the value is {@code NA} for a missing row. */
public int intAt(int i) {
  final int entryPos = idx(i);
  return UnsafeUtils.get4(_mem, entryPos);
}
/** Returns the byte at position {@code i} of the string region, i.e. relative to {@code _valstart}. */
public byte byteAt(int i) {
  final int memPos = _valstart + i;
  return _mem[memPos];
}
/**
 * Counts the bytes from {@code off} up to, but not including, the next 0 byte.
 *
 * @param off start position, relative to the string region
 * @return number of bytes before the terminating 0
 */
public int lengthAtOffset(int off) {
  int end = off;
  while (byteAt(end) != 0)
    end++;
  return end - off;
}
/**
 * Integer reads are not supported on a string chunk.
 * @throws IllegalArgumentException always
 */
@Override public long at8_impl(int idx) {
  throw new IllegalArgumentException("Operation not allowed on string vector.");
}
/**
 * Floating-point reads are not supported on a string chunk.
 * @throws IllegalArgumentException always
 */
@Override public double atd_impl(int idx) {
  throw new IllegalArgumentException("Operation not allowed on string vector.");
}
/**
 * Points {@code bStr} at the bytes of row {@code idx} inside {@code _mem}.
 *
 * @param bStr holder that is set to this chunk's buffer, the string's start and its length
 * @param idx  row within the chunk
 * @return the value returned by {@code bStr.set(...)}, or {@code null} when the row is NA
 */
@Override public BufferedString atStr_impl(BufferedString bStr, int idx) {
  final int strOff = intAt(idx);
  if (strOff != NA) {
    final int strLen = lengthAtOffset(strOff);
    assert strLen >= 0 : getClass().getSimpleName() + ".atStr_impl: len=" + strLen + ", idx=" + idx + ", off=" + strOff;
    return bStr.set(_mem, _valstart + strOff, strLen);
  }
  return null;
}
/**
 * Restores the derived fields from the header stored in {@code _mem}:
 * the start of the string region (bytes 0-3), the all-ASCII flag (byte 4)
 * and the row count implied by the size of the offset table.
 */
@Override protected final void initFromBytes () {
  _start = -1;
  _cidx = -1;
  _valstart = UnsafeUtils.get4(_mem, 0);
  _isAllASCII = UnsafeUtils.get1(_mem, 4) != 0;
  // The offset table spans [_OFF, _valstart) with 4 bytes per row.
  final int tableBytes = _valstart - _OFF;
  set_len(tableBytes >> 2);
}
/**
 * Feeds the strings of rows {@code from} (inclusive) to {@code to} (exclusive) to the visitor.
 * A single BufferedString is reused for every row.
 *
 * @return the visitor that was passed in
 */
@Override public ChunkVisitor processRows(ChunkVisitor nc, int from, int to){
  final BufferedString scratch = new BufferedString();
  int row = from;
  while (row < to) {
    nc.addValue(atStr(scratch, row));
    row++;
  }
  return nc;
}
/**
 * Feeds the strings of the listed rows, in the given order, to the visitor.
 * A single BufferedString is reused for every row.
 *
 * @return the visitor that was passed in
 */
@Override public ChunkVisitor processRows(ChunkVisitor nc, int... rows){
  final BufferedString scratch = new BufferedString();
  for (int k = 0; k < rows.length; k++)
    nc.addValue(atStr(scratch, rows[k]));
  return nc;
}
/**
 * Lower-cases the ASCII letters of every string in this chunk in a single pass
 * over the copied string buffer. Only bytes 'A'..'Z' are changed; all other
 * bytes, including those of multi-byte UTF-8 characters, are left untouched.
 *
 * The rows are first copied with {@code extractRows(nc, 0, _len)} and the copy
 * is then modified in place.
 *
 * @param nc NewChunk to be filled with the lower-cased strings of this chunk
 * @return the filled NewChunk
 */
public NewChunk asciiToLower(NewChunk nc) {
  // Start from a copy of all rows.
  nc = this.extractRows(nc, 0, _len);
  // Walk the copied string bytes; the offsets stay as they are.
  for (int pos = 0; pos < nc._sslen; pos++) {
    final byte b = nc._ss[pos];
    if ('A' <= b && b <= 'Z')
      nc._ss[pos] = (byte) (b + 0x20); // 'A' + 0x20 == 'a'
  }
  return nc;
}
/**
 * Upper-cases the ASCII letters of every string in this chunk in a single pass
 * over the copied string buffer. Only bytes 'a'..'z' are changed; all other
 * bytes, including those of multi-byte UTF-8 characters, are left untouched.
 *
 * The rows are first copied with {@code extractRows(nc, 0, _len)} and the copy
 * is then modified in place.
 *
 * @param nc NewChunk to be filled with the upper-cased strings of this chunk
 * @return the filled NewChunk
 */
public NewChunk asciiToUpper(NewChunk nc) {
  // Start from a copy of all rows.
  nc = this.extractRows(nc, 0, _len);
  // Walk the copied string bytes; the offsets stay as they are.
  for (int pos = 0; pos < nc._sslen; pos++) {
    final byte b = nc._ss[pos];
    if ('a' <= b && b <= 'z')
      nc._ss[pos] = (byte) (b - 0x20); // 'a' - 0x20 == 'A'
  }
  return nc;
}
/**
 * Optimized trim() that works across the entire CStrChunk buffer in one pass.
 * Like Java's String.trim(), only bytes with a value of 0x20 or less count as
 * whitespace; bytes of multi-byte UTF-8 characters are negative and are never
 * trimmed.
 *
 * The rows are first copied with {@code extractRows(nc, 0, _len)}. Leading
 * whitespace is dropped by moving the row's offset forward, and trailing
 * whitespace by overwriting it with 0 bytes, so the copied buffer keeps its
 * size. The code relies on the copied string buffer using the same offsets as
 * this chunk's string region.
 *
 * Empty and all-whitespace strings are handled without ever looking at bytes
 * in front of the string: the backward scan stops at the first kept character.
 *
 * @param nc NewChunk to be filled with trimmed version of strings in this chunk
 * @return Filled NewChunk
 */
public NewChunk asciiTrim(NewChunk nc) {
  // copy existing data
  nc = this.extractRows(nc, 0, _len);
  for (int i = 0; i < _len; i++) {
    final int off = UnsafeUtils.get4(_mem, idx(i));
    if (off == NA) continue;
    final int base = _valstart + off;

    // Skip leading whitespace; the 0 terminator is not whitespace, so this stops at the end at the latest.
    int lead = 0;
    while (isTrimmable(_mem[base + lead])) lead++;
    if (lead > 0) nc.set_is(i, off + lead);

    // Find the terminator.
    int end = lead;
    while (_mem[base + end] != 0) end++;

    // March back over trailing whitespace, but never past the first kept byte.
    // Without the lower bound an empty or all-whitespace string would make the
    // scan read bytes in front of the string and could write to nc._ss[-1].
    for (int j = end - 1; j >= lead && isTrimmable(_mem[base + j]); j--)
      nc._ss[off + j] = 0; // set new end
  }
  return nc;
}
/** True for bytes 0x01..0x20, the range treated as whitespace by asciiTrim. */
private static boolean isTrimmable(byte b) {
  return b > 0 && b < 0x21;
}
/**
 * Optimized substring() method for a buffer of only ASCII characters.
 * The presence of UTF-8 multi-byte characters would give incorrect results
 * for the string length, which is required here.
 *
 * The rows are first copied with {@code extractRows(nc, 0, _len)}. The start
 * is applied by moving the row's offset forward (at most to the terminator),
 * the end by overwriting everything from {@code endIndex} onwards with 0
 * bytes. The code relies on the copied string buffer using the same offsets
 * as this chunk's string region.
 *
 * Negative indices are treated as 0, so a negative {@code endIndex} yields
 * empty strings and can no longer write in front of the row's own string.
 *
 * @param nc NewChunk to be filled with substrings in this chunk
 * @param startIndex The beginning index of the substring, inclusive
 * @param endIndex The ending index of the substring, exclusive
 * @return Filled NewChunk
 */
public NewChunk asciiSubstring(NewChunk nc, int startIndex, int endIndex) {
  // copy existing data
  nc = this.extractRows(nc, 0, _len);
  // Clamp once: negative positions would address bytes of the preceding string.
  final int start = Math.max(startIndex, 0);
  final int end = Math.max(endIndex, 0);
  for (int i = 0; i < _len; i++) {
    final int off = UnsafeUtils.get4(_mem, idx(i));
    if (off == NA) continue;
    int len = 0;
    while (_mem[_valstart + off + len] != 0) len++; // find length
    // A start at or beyond the end points at the terminator, i.e. the empty string.
    nc.set_is(i, off + Math.min(start, len));
    // Zero everything from the terminator back to the new end (inclusive).
    for (int j = len; j >= end; j--)
      nc._ss[off + j] = 0;
  }
  return nc;
}
/**
 * Computes the byte length of every string in this chunk. For pure ASCII data
 * this equals the character count; multi-byte UTF-8 characters would be
 * counted once per byte.
 *
 * @param nc NewChunk to be filled with lengths of strings in this chunk
 * @return the filled NewChunk; NA rows stay NA
 */
public NewChunk asciiLength(NewChunk nc) {
  // The row count is known up front, so allocate both arrays once.
  nc.alloc_mantissa(_len);
  nc.alloc_exponent(_len);
  for (int row = 0; row < _len; row++) {
    final int off = UnsafeUtils.get4(_mem, idx(row));
    if (off == NA) {
      nc.addNA();
      continue;
    }
    // Scan forward to the 0 terminator.
    final int begin = _valstart + off;
    int end = begin;
    while (_mem[end] != 0) end++;
    nc.addNum(end - begin, 0);
  }
  return nc;
}
/**
 * Appends the value of {@code entropyAt(row)} for every row of this chunk.
 *
 * @param nc NewChunk to be filled with one entropy value per row
 * @return the filled NewChunk; rows for which entropyAt returns NaN become NA
 */
public NewChunk asciiEntropy(NewChunk nc) {
  nc.alloc_doubles(_len);
  for (int row = 0; row < _len; row++) {
    final double h = entropyAt(row);
    if (!Double.isNaN(h))
      nc.addNum(h);
    else
      nc.addNA();
  }
  return nc;
}
/**
 * Shannon entropy, in bits, of the byte distribution of the string in row {@code i}.
 *
 * Each distinct byte value contributes {@code -p * log2(p)}, where {@code p}
 * is its relative frequency within the string. The terms are accumulated
 * already negated, so a string without any uncertainty (empty, or a single
 * repeated byte) yields {@code 0.0} rather than {@code -0.0}.
 *
 * @param i row within the chunk
 * @return the entropy, or {@code Double.NaN} when the row is NA
 */
double entropyAt(int i) {
  final int off = intAt(i);
  if (off == NA) return Double.NaN;
  final int len = lengthAtOffset(off);
  // Histogram over all 256 possible byte values; the mask maps signed bytes to 0..255.
  final int[] frq = new int[256];
  for (int j = 0; j < len; j++) {
    frq[0xff & byteAt(off + j)]++;
  }
  double entropy = 0;
  for (int f : frq) {
    if (f > 0) {
      final double p = (double) f / len;
      entropy -= p * Math.log(p); // natural log here, converted to bits below
    }
  }
  return entropy / Math.log(2);
}
/**
 * Strips leading bytes contained in {@code chars} from every string of this chunk.
 * Each result is appended to {@code nc} as a BufferedString that points into
 * this chunk's buffer.
 *
 * @param nc NewChunk to be filled with the left-stripped strings of this chunk
 * @param chars chars to strip, treated as ASCII
 * @return the filled NewChunk; NA rows stay NA
 */
public NewChunk asciiLStrip(NewChunk nc, String chars) {
  final SetOfBytes strip = new SetOfBytes(chars);
  for (int row = 0; row < _len; row++) {
    final int off = intAt(row);
    if (off == NA) {
      nc.addNA();
      continue;
    }
    // Advance past every leading byte that belongs to the strip set.
    int first = off;
    while (strip.contains(byteAt(first))) first++;
    final int remaining = lengthAtOffset(first);
    nc.addStr(new BufferedString(_mem, _valstart + first, remaining));
  }
  return nc;
}
/**
 * Strips trailing bytes contained in {@code chars} from every string of this chunk.
 * Each result is appended to {@code nc} as a BufferedString that points into
 * this chunk's buffer with a shortened length.
 *
 * @param nc NewChunk to be filled with the right-stripped strings of this chunk
 * @param chars chars to strip, treated as ASCII
 * @return the filled NewChunk; NA rows stay NA
 */
public NewChunk asciiRStrip(NewChunk nc, String chars) {
  final SetOfBytes strip = new SetOfBytes(chars);
  for (int row = 0; row < _len; row++) {
    final int off = intAt(row);
    if (off == NA) {
      nc.addNA();
      continue;
    }
    // 'end' is exclusive: it starts at the terminator and moves left over strippable bytes.
    int end = off + lengthAtOffset(off);
    while (end > off && strip.contains(byteAt(end - 1))) end--;
    nc.addStr(new BufferedString(_mem, _valstart + off, end - off));
  }
  return nc;
}
}