// BufferedString.java example
//
// Explorer
// h2o-3-master
package water.parser;

import com.google.common.base.Charsets;
import water.AutoBuffer;
import water.Iced;
import water.util.StringUtils;

import java.util.Arrays;
import java.util.Formatter;

/**
 * A mutable wrapper to hold a String as a byte array.
 *
 * <p>An instance is a view {@code (_buf, _off, _len)} over a byte array; the array
 * handed to the constructor or to {@code set} is stored as-is, not copied. The hash
 * code is computed on the fly. There is no speed-up benefit of caching the hash in
 * a dedicated private field. See the speed test in
 * {@code ParseTest2#testSpeedOfCategoricalUpdate}.
 *
 * <p>Warning: This data structure is not designed for parallel access!
 */
public class BufferedString extends Iced implements Comparable<BufferedString> {
  private byte[] _buf;  // backing bytes (may be shared with the caller)
  private int _off;     // index of the first byte of this string inside _buf
  private int _len;     // number of bytes belonging to this string

  /**
   * Creates a view over {@code buf} starting at {@code off} and spanning {@code len} bytes.
   * The array is not copied.
   */
  public BufferedString(byte[] buf, int off, int len) {
    _buf = buf;
    _off = off;
    _len = len;
    assert len >= 0 : "Bad length in constructor " + len;
  }

  private BufferedString(byte[] buf) { this(buf, 0, buf.length); }

  // Cloning constructor used during collecting unique categoricals.
  // Copies only the viewed bytes, so the clone does not pin the source buffer.
  BufferedString(BufferedString from) {
    this(Arrays.copyOfRange(from._buf, from._off, from._off + from._len));
  }

  /** Creates a string backed by the bytes that {@code StringUtils.bytesOf} yields for {@code from}. */
  public BufferedString(String from) { this(StringUtils.bytesOf(from)); }

  // Used to make a temp recycling BufferedString in hot loops
  public BufferedString() { }

  /**
   * Serializes this string: -1 when there is no buffer, otherwise the length
   * followed by the viewed byte range.
   */
  public final AutoBuffer write_impl(AutoBuffer ab) {
    if (_buf == null) return ab.putInt(-1);
    ab.putInt(_len);
    return ab.putA1(_buf, _off, _off + _len);
  }

  /**
   * Deserializes into this instance.
   *
   * <p>The array obtained from the buffer is used whole, so the view is reset to
   * start at 0 and span the entire array. Without the reset a recycled instance
   * would keep its stale offset (and, for a null array, its stale length).
   */
  public final BufferedString read_impl(AutoBuffer ab) {
    _buf = ab.getA1();
    _off = 0;
    _len = _buf == null ? 0 : _buf.length;
    return this;
  }

  /**
   * Comparison, according to Comparable interface. Bytes are compared as unsigned
   * values; if one string is a prefix of the other, the shorter one sorts first.
   *
   * @param o other string to compare
   * @return a negative value, zero, or a positive value (not necessarily -1/0/1),
   *         as specified in Comparable
   */
  @Override public int compareTo(BufferedString o) {
    int len = Math.min(_len, o._len);
    for (int i = 0; i < len; i++) {
      // Operands are in 0..255, so the subtraction cannot overflow.
      int x = (0xFF & _buf[_off + i]) - (0xFF & o._buf[o._off + i]);
      if (x != 0) return x;
    }
    return _len - o._len;
  }

  /**
   * Polynomial hash (multiplier 31) over the viewed bytes.
   *
   * <p>Not equal to {@code String.hashCode} in general: each byte is cast to
   * {@code char}, which sign-extends, so a byte with the high bit set contributes
   * 0xFF80..0xFFFF rather than 0x80..0xFF. The value is nevertheless consistent
   * with {@link #equals(Object)}, which compares the same bytes.
   */
  @Override public int hashCode() {
    int hash = 0;
    int n = _off + _len;
    for (int i = _off; i < n; ++i)
      hash = 31 * hash + (char) _buf[i];
    return hash;
  }

  // TODO(vlad): make sure that this method is not as destructive as it now is (see tests)
  // Grows the view by one byte; no check is made against the buffer bounds.
  void addChar() {
    _len++;
  }

  /**
   * Re-materializes this string into a fresh array of {@code _len} bytes: the part
   * from {@code _off} to the end of the current buffer comes first, the remainder is
   * taken from the start of {@code bits}. Afterwards the view starts at offset 0.
   *
   * <p>Expects {@code _len >= _buf.length - _off}; otherwise the copy throws.
   */
  void addBuff(byte[] bits) {
    byte[] buf = new byte[_len];
    int l1 = _buf.length - _off;
    System.arraycopy(_buf, _off, buf, 0, l1);
    System.arraycopy(bits, 0, buf, l1, _len - l1);
    _off = 0;
    _buf = buf;
  }


  // WARNING: LOSSY CONVERSION!!!
  // Converting to a String will truncate all bytes with high-order bits set,
  // even if they are otherwise a valid member of the field/BufferedString.
  // Converting back to a BufferedString will then make something with fewer
  // characters than what you started with, and will fail all equals() tests.
  // TODO(Vlad): figure out what to do about the buffer being not UTF-8 (who guarantees?)
  /**
   * Converts the viewed bytes via {@code StringUtils.toString}.
   *
   * <p>The range is clamped to the buffer: the offset is forced into
   * {@code [0, _buf.length]} and the length to the bytes remaining after it, so a
   * view that runs past the end of the buffer is truncated rather than failing.
   *
   * @return the string, or {@code null} when there is no buffer
   */
  @Override
  public String toString() {
    if (_buf == null) return null;
    int off = Math.min(Math.max(0, _off), _buf.length);
    int len = Math.max(0, Math.min(_buf.length - off, _len));
    return StringUtils.toString(_buf, off, len);
  }

  /**
   * Renders the bytes for diagnostics: bytes below 0x80 are emitted as characters,
   * each run of bytes with the high bit set is emitted as {@code <0x..>} with two
   * upper-case hex digits per byte.
   */
  public String bytesToString() {
    StringBuilder sb = new StringBuilder(_len * 2);
    Formatter formatter = new Formatter(sb);
    boolean inHex = false;
    for (int i = 0; i < _len; i++) {
      byte b = _buf[_off + i];
      if ((b & 0x80) == 128) {
        if (!inHex) sb.append("<0x");
        formatter.format("%02X", b);
        inHex = true;
      } else { // ASCII
        if (inHex) {
          sb.append(">");
          inHex = false;
        }
        formatter.format("%c", b);
      }
    }
    if (inHex) sb.append(">"); // close hex values as trailing char
    return sb.toString();
  }

  /**
   * Converts every element with {@link #toString()}.
   *
   * @return {@code null} for a {@code null} array; {@code null} elements stay {@code null}
   */
  public static String[] toString(BufferedString bStr[]) {
    if (bStr == null) return null;
    String[] ss = new String[bStr.length];
    for (int i = 0; i < bStr.length; i++)
      ss[i] = bStr[i] == null ? null : bStr[i].toString();
    return ss;
  }

  /**
   * Wraps every element in a new BufferedString.
   *
   * @return {@code null} for a {@code null} array; {@code null} elements stay {@code null}
   */
  public static BufferedString[] toBufferedString(String[] strings) {
    if (strings == null) return null;
    BufferedString[] res = new BufferedString[strings.length];
    for (int i = 0; i < strings.length; i++) {
      res[i] = strings[i] == null ? null : new BufferedString(strings[i]);
    }
    return res;
  }

  /** Re-points this instance at the whole of {@code buf}. */
  public final BufferedString set(byte[] buf) {
    return set(buf, 0, buf.length);
  }

  /** Re-points this instance at {@code len} bytes of {@code buf} starting at {@code off}; no copy is made. */
  public final BufferedString set(byte[] buf, int off, int len) {
    _buf = buf;
    _off = off;
    _len = len;
    assert len >= 0 : "Bad length in setter " + len;
    return this;
  }

  /** Re-points this instance at the bytes that {@code StringUtils.bytesOf} yields for {@code s}. */
  public final BufferedString set(String s) {
    return set(StringUtils.bytesOf(s));
  }

  /** Moves the start of the view; buffer and length are left untouched. */
  public void setOff(int off) {
    _off = off;
  }

  /** Two BufferedStrings are equal when they view the same number of identical bytes. */
  @Override public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof BufferedString)) return false;
    BufferedString str = (BufferedString) o;
    if (str._len != _len) return false;
    for (int i = 0; i < _len; ++i)
      if (_buf[_off + i] != str._buf[str._off + i]) return false;
    return true;
  }

  /**
   * Byte-by-char comparison with a String: true when {@code str} has exactly
   * {@code _len} chars and every char equals the unsigned value of the byte at the
   * same position. Chars above 0xFF can therefore never match.
   */
  public boolean sameString(String str) {
    if (str == null || str.length() != _len) return false;
    for (int i = 0; i < _len; ++i)
      if ((0xFF & _buf[_off + i]) != str.charAt(i)) return false;
    return true;
  }

  /** @return true when {@link #sameString(String)} holds for at least one sample; false for a null array */
  public boolean isOneOf(String[] samples) {
    if (samples != null) {
      for (String sample : samples) if (sameString(sample)) return true;
    }
    return false;
  }

  // Thou Shalt Not use accessors in performance critical code - because it
  // obfuscates the code's cost model.  All file-local uses of the accessors
  // has been stripped, please do not re-insert them.  In particular, the
  // hashcode and equals calls are made millions (billions?) of times a second
  // when parsing categoricals.
  public final byte[] getBuffer() { return _buf; }
  public final int getOffset() { return _off; }
  public final int length() { return _len; }

  public static final byte NA   = 0;
  public static final byte INT  = 1;
  public static final byte REAL = 2;

  /**
   * Classifies the content as a plain decimal number.
   *
   * <p>Accepted shape: an optional leading {@code +} or {@code -}, then ASCII digits
   * with at most one {@code '.'}; at least one digit is required. A lone sign or a
   * lone dot is therefore not a number. Exponents and other notations are not
   * recognized.
   *
   * @return {@link #INT} when there is no decimal point, {@link #REAL} when there is
   *         exactly one, {@link #NA} otherwise (including the empty string)
   */
  public final byte getNumericType() {
    if (_len == 0) return NA;
    int i = 0;
    if (_buf[_off] == '+' || _buf[_off] == '-') i++;
    int digitCnt = 0;
    int decimalCnt = 0;
    for (; i < _len; i++) {
      byte b = _buf[_off + i];
      if (b == '.') {
        if (++decimalCnt > 1) return NA; // more than one decimal, NaN
      } else if (b >= '0' && b <= '9') {
        digitCnt++;
      } else {
        return NA;
      }
    }
    if (digitCnt == 0) return NA; // only a sign and/or a dot: not a number
    return decimalCnt == 0 ? INT : REAL;
  }
}