package water.parser;
import com.google.common.base.Charsets;
import water.AutoBuffer;
import water.Iced;
import water.util.StringUtils;
import java.util.Arrays;
import java.util.Formatter;
/**
* A mutable wrapper to hold String as a byte array.
*
* It can be modified by a set of methods; the hash code is computed
* on the fly. There is no speed-up benefit in caching the hash in
* a dedicated private field. See the speed test in {@code ParseTest2#testSpeedOfCategoricalUpdate}.
*
* Warning: This data structure is not designed for parallel access!
*/
public class BufferedString extends Iced implements Comparable<BufferedString> {
  // Backing bytes; the string is the window [_off, _off+_len) of this array.
  // The array may be shared with the caller (constructors/setters do not copy).
  private byte [] _buf;
  private int _off;
  private int _len;

  /**
   * Wraps a window of an existing byte array. The array is NOT copied.
   * @param buf backing bytes
   * @param off index of the first byte of the string within {@code buf}
   * @param len number of bytes in the string, must be non-negative
   */
  public BufferedString(byte[] buf, int off, int len) {
    _buf = buf;
    _off = off;
    _len = len;
    assert len >= 0 : "Bad length in constructor " + len;
  }

  // Wraps the whole array (no copy).
  private BufferedString(byte[] buf) { this(buf,0,buf.length); }

  // Cloning constructor used during collecting unique categoricals.
  // Copies only the window [_off, _off+_len) of the source into a fresh array.
  BufferedString(BufferedString from) {
    this(Arrays.copyOfRange(from._buf,from._off,from._off+from._len));
  }

  /** Builds a BufferedString over the bytes produced by {@code StringUtils.bytesOf(from)}. */
  public BufferedString(String from) { this(StringUtils.bytesOf(from)); }

  // Used to make a temp recycling BufferedString in hot loops
  public BufferedString() { }

  /**
   * Serializes this string: writes the int -1 when there is no buffer,
   * otherwise writes the length followed by the bytes of the current window.
   * NOTE(review): assumes {@code putA1(buf, from, to)} emits the raw bytes without
   * its own length prefix, so that the pair matches {@code getA1()} in
   * {@link #read_impl} - confirm against AutoBuffer.
   */
  public final AutoBuffer write_impl(AutoBuffer ab) {
    if( _buf == null ) return ab.putInt(-1);
    ab.putInt(_len);
    return ab.putA1(_buf,_off,_off+_len);
  }

  /**
   * Deserializes into this instance. The array read from the stream holds exactly
   * the string bytes, so the window is reset to cover the whole array. Resetting
   * {@code _off} (and {@code _len} for a null buffer) matters when the instance is
   * a recycled one that previously pointed somewhere inside another buffer.
   * @return this
   */
  public final BufferedString read_impl(AutoBuffer ab){
    _buf = ab.getA1();
    _off = 0;
    _len = _buf == null ? 0 : _buf.length;
    return this;
  }

  /**
   * Lexicographic comparison of the two byte windows, treating bytes as unsigned.
   * When one string is a prefix of the other, the shorter one sorts first.
   * @param o other string to compare
   * @return a negative value, zero, or a positive value (not necessarily -1/0/1),
   *         as permitted by the Comparable contract
   */
  @Override public int compareTo( BufferedString o ) {
    int len = Math.min(_len,o._len);
    for( int i=0; i<len; i++ ) {
      // Operands are in 0..255, so the subtraction cannot overflow
      int x = (0xFF&_buf[_off+i]) - (0xFF&o._buf[o._off+i]);
      if( x != 0 ) return x;
    }
    return _len - o._len;
  }

  /**
   * Hash over the bytes of the window, computed on every call.
   * Uses the same 31-multiplier recurrence as String.hashCode, but is NOT equal to it
   * in general: the (char) cast sign-extends bytes with the high bit set
   * (e.g. 0x80 becomes 0xFF80), and bytes are hashed individually rather than as
   * decoded characters. For pure 7-bit content the two agree.
   */
  @Override public int hashCode(){
    int hash = 0;
    int n = _off + _len;
    for (int i = _off; i < n; ++i)
      hash = 31 * hash + (char)_buf[i];
    return hash;
  }

  // TODO(vlad): make sure that this method is not as destructive as it now is (see tests)
  // Grows the window by one byte. No bounds check is done here, so the window may
  // temporarily extend past the end of the backing array (see addBuff).
  void addChar() {
    _len++;
  }

  // Re-homes a window that runs past the end of the current buffer: copies the
  // tail of the current buffer (from _off to its end) followed by the leading bytes
  // of 'bits' into a fresh array of exactly _len bytes, and resets the offset to 0.
  // Expects the window to actually overrun the current buffer (_len >= bytes left
  // after _off); otherwise System.arraycopy throws.
  void addBuff(byte [] bits){
    byte [] buf = new byte[_len];
    int l1 = _buf.length- _off;
    System.arraycopy(_buf, _off, buf, 0, l1);
    System.arraycopy(bits, 0, buf, l1, _len-l1);
    _off = 0;
    _buf = buf;
  }

  // WARNING: LOSSY CONVERSION!!!
  // Converting to a String will truncate all bytes with high-order bits set,
  // even if they are otherwise a valid member of the field/BufferedString.
  // Converting back to a BufferedString will then make something with fewer
  // characters than what you started with, and will fail all equals() tests.
  // TODO(Vlad): figure out what to do about the buffer being not UTF-8 (who guarantees?)
  /**
   * Converts the window to a String via {@code StringUtils.toString}.
   * The window is clamped to the backing array first: the offset is forced into
   * [0, _buf.length] and the length is limited to the bytes remaining after that offset,
   * so a window that overruns the buffer is truncated instead of throwing.
   * @return the string, or null when there is no backing buffer
   */
  @Override
  public String toString() {
    if (_buf == null) return null;
    int off = Math.min(Math.max(0, _off), _buf.length);
    int len = Math.max(0, Math.min(_buf.length - off, _len));
    return StringUtils.toString(_buf, off, len);
  }

  /**
   * Debug rendering of the raw bytes: bytes without the high bit are printed as
   * characters, runs of bytes with the high bit set are printed as upper-case hex
   * wrapped in {@code <0x...>}.
   */
  public String bytesToString() {
    StringBuilder sb = new StringBuilder(_len * 2);
    Formatter formatter = new Formatter(sb);
    boolean inHex = false;
    for (int i = 0; i < _len; i++) {
      byte b = _buf[_off + i];
      if ((b & 0x80) == 128) { // high bit set: emit as hex
        if (!inHex) sb.append("<0x");
        formatter.format("%02X", b);
        inHex = true;
      } else { // ASCII
        if (inHex) {
          sb.append(">");
          inHex = false;
        }
        formatter.format("%c", b);
      }
    }
    if (inHex) sb.append(">"); // close hex values as trailing char
    return sb.toString();
  }

  /**
   * Converts each element with {@link #toString()}.
   * A null element yields a null entry (as does an element without a buffer).
   * @return array of the same length, or null when the input is null
   */
  public static String[] toString(BufferedString bStr[]) {
    if( bStr==null ) return null;
    String[] ss = new String[bStr.length];
    for( int i=0; i<bStr.length; i++ )
      ss[i] = bStr[i] == null ? null : bStr[i].toString();
    return ss;
  }

  /**
   * Wraps each String in a new BufferedString.
   * @return array of the same length, or null when the input is null
   */
  public static BufferedString[] toBufferedString(String[] strings) {
    if (strings == null) return null;
    BufferedString[] res = new BufferedString[strings.length];
    for (int i = 0; i < strings.length; i++) {
      res[i] = new BufferedString(strings[i]);
    }
    return res;
  }

  /** Points this instance at the whole of {@code buf} (no copy). @return this */
  public final BufferedString set(byte[] buf) {
    return set(buf, 0, buf.length);
  }

  /**
   * Points this instance at a window of {@code buf} (no copy).
   * @param len must be non-negative
   * @return this
   */
  public final BufferedString set(byte[] buf, int off, int len) {
    _buf = buf;
    _off = off;
    _len = len;
    assert len >= 0 : "Bad length in setter " + len;
    return this;
  }

  /** Points this instance at the bytes of {@code s} as produced by {@code StringUtils.bytesOf}. @return this */
  public final BufferedString set(String s) {
    return set(StringUtils.bytesOf(s));
  }

  /** Moves the start of the window; buffer and length are left untouched. */
  public void setOff(int off) {
    _off=off;
  }

  /** Byte-wise equality of the two windows; offsets and backing arrays may differ. */
  @Override public boolean equals(Object o){
    if(o instanceof BufferedString) {
      BufferedString str = (BufferedString) o;
      if (str._len != _len) return false;
      for (int i = 0; i < _len; ++i)
        if (_buf[_off + i] != str._buf[str._off + i]) return false;
      return true;
    }
    return false;
  }

  /**
   * Compares the window against a String, matching each unsigned byte with the char
   * at the same index. No decoding is done, so this only matches strings whose chars
   * are all in the range 0..255 and correspond one-to-one to the bytes.
   * @return false for a null argument or a length mismatch
   */
  public boolean sameString(String str) {
    if (str == null || str.length() != _len) return false;
    for (int i = 0; i < _len; ++i)
      if ((0xFF&_buf[_off + i]) != str.charAt(i)) return false;
    return true;
  }

  /** @return true when {@link #sameString} holds for at least one sample; false for a null array */
  public boolean isOneOf(String[] samples) {
    if (samples != null) {
      for (String sample : samples) if (sameString(sample)) return true;
    }
    return false;
  }

  // Thou Shalt Not use accessors in performance critical code - because it
  // obfuscates the code's cost model.  All file-local uses of the accessors
  // has been stripped, please do not re-insert them.  In particular, the
  // hashcode and equals calls are made millions (billions?) of times a second
  // when parsing categoricals.
  public final byte [] getBuffer() {return _buf;}
  public final int getOffset() {return _off;}
  public final int length() {return _len;}

  // Result codes of getNumericType()
  public static final byte NA  = 0;
  public static final byte INT = 1;
  public static final byte REAL= 2;

  /**
   * Classifies the content as a plain decimal number: an optional leading sign,
   * then digits with at most one '.' anywhere among them. Exponents, whitespace and
   * any other characters are not accepted.
   * At least one digit is required, so a bare sign or a bare '.' is {@link #NA}.
   * @return {@link #INT} when there is no decimal point, {@link #REAL} when there is
   *         exactly one, {@link #NA} otherwise (including the empty string)
   */
  public final byte getNumericType() {
    if (_len == 0) return NA;
    int i = 0;
    if (_buf[_off] == '+' || _buf[_off] == '-') i++; // skip the sign
    int decimalCnt = 0;
    int digitCnt = 0;
    for (; i < _len; i++) {
      byte b = _buf[_off + i];
      if (b == '.') decimalCnt++;
      else if (b >= '0' && b <= '9') digitCnt++;
      else return NA;
    }
    // No digits at all ("+", "-", ".", "-.") or more than one decimal point: not a number
    if (digitCnt == 0 || decimalCnt > 1) return NA;
    return decimalCnt == 1 ? REAL : INT;
  }
}