CStrChunk.java example

Explorer
h2o-3-master
package water.fvec;

import water.*;
import water.util.SetOfBytes;
import water.util.UnsafeUtils;
import water.parser.BufferedString;

import java.util.Arrays;

/**
 * Chunk that stores strings packed into a single byte array.
 *
 * Layout of {@code _mem}, as written by the constructor and read back by
 * {@link #initFromBytes()}:
 * <ul>
 *   <li>bytes 0..3: index of the first string byte ({@code _valstart})</li>
 *   <li>byte 4: all-ASCII flag (1 if no string byte has its high bit set, else 0)</li>
 *   <li>from {@code _OFF}: one 4-byte offset per row, relative to {@code _valstart};
 *       {@link #NA} (-1) marks a missing value</li>
 *   <li>from {@code _valstart}: the string bytes; every string is read up to its
 *       terminating 0 byte</li>
 * </ul>
 *
 * The {@code ascii*} methods that call {@code extractRows} and then edit {@code nc._ss}
 * index that array with this chunk's own offsets, i.e. they assume the extracted
 * NewChunk keeps the same string-byte layout as this chunk.
 * NOTE(review): that layout guarantee lives in extractRows/NewChunk, not in this class - confirm.
 */
public class CStrChunk extends Chunk {
  /** Offset-table value marking a row that holds no string. */
  static final int NA = -1;
  /** Header size in front of the offset table: 4 bytes of {@code _valstart} plus the 1-byte ASCII flag. */
  static protected final int _OFF=4+1;
  /** Index in {@code _mem} where the string bytes begin. */
  private int _valstart;
  /** True when none of the string bytes has its high bit set. */
  public boolean _isAllASCII = false;

  /** No-arg constructor; fields are filled in later by {@link #initFromBytes()}. */
  public CStrChunk() {}

  /**
   * Builds the packed representation from string bytes and per-row offsets.
   *
   * @param sslen     number of bytes of {@code ss} to copy
   * @param ss        string bytes; offsets in {@code is} point into this array
   * @param sparseLen number of entries of {@code is} (and {@code id}) to store
   * @param idxLen    number of rows in the chunk
   * @param id        row index of each stored offset, or {@code null} when entry
   *                  {@code i} belongs to row {@code i}
   * @param is        offset of each stored string within {@code ss}
   */
  public CStrChunk(int sslen, byte[] ss, int sparseLen, int idxLen, int[] id, int[] is) {
    _start = -1;
    _valstart = idx(idxLen);
    _len = idxLen;
    _mem = MemoryManager.malloc1(_valstart + sslen, false);
    UnsafeUtils.set4(_mem, 0, _valstart); // location of start of strings

    // Filling with 0xFF bytes makes every 4-byte offset read back as -1, i.e. NA.
    Arrays.fill(_mem,_OFF,_valstart,(byte)-1);
    for( int i = 0; i < sparseLen; ++i ) // Copy the sparse indices
      UnsafeUtils.set4(_mem, idx(id==null ? i : id[i]), is[i]);
    UnsafeUtils.copyMemory(ss,0,_mem,_valstart,sslen);
    _isAllASCII = true;
    for(int i = _valstart; i < _mem.length; ++i) {
      if (_mem[i] < 0) { // high bit set: value beyond std ASCII
        _isAllASCII = false;
        break;
      }
    }
    UnsafeUtils.set1(_mem, 4, (byte) (_isAllASCII ? 1 : 0)); // isAllASCII flag
  }

  /** Position in {@code _mem} of the 4-byte offset entry for row {@code i}. */
  private int idx(int i) { return _OFF+(i<<2); }

  /** True for the bytes trimmed by {@link #asciiTrim}: values 0x01 through 0x20. */
  private static boolean isTrimmable(byte b) { return b > 0 && b < 0x21; }

  // In-place updates are not performed by this chunk: the setters either return false
  // or reject the value outright.
  @Override public boolean setNA_impl(int idx) { return false; }
  @Override public boolean set_impl(int idx, float f) { if (Float.isNaN(f)) return false; else throw new IllegalArgumentException("Operation not allowed on string vector.");}
  @Override public boolean set_impl(int idx, double d) { if (Double.isNaN(d)) return false; else throw new IllegalArgumentException("Operation not allowed on string vector.");}
  @Override public boolean set_impl(int idx, long l) { throw new IllegalArgumentException("Operation not allowed on string vector.");}
  @Override public boolean set_impl(int idx, String str) { return false; }

  /** A row is NA when its offset entry equals {@link #NA}. */
  @Override public boolean isNA_impl(int idx) {
    return intAt(idx) == NA;
  }

  /** Returns the stored string offset of row {@code i} ({@link #NA} for a missing value). */
  public int intAt(int i) { return UnsafeUtils.get4(_mem, idx(i)); }

  /** Returns the string byte at position {@code i}, counted from the start of the string area. */
  public byte byteAt(int i) { return _mem[_valstart+i]; }

  /**
   * Counts the bytes from {@code off} up to, but not including, the next 0 byte.
   *
   * @param off offset within the string area
   * @return length in bytes of the string starting at {@code off}
   */
  public int lengthAtOffset(int off) {
    int len = 0;
    while (byteAt(off + len) != 0) len++;
    return len;
  }

  @Override public long at8_impl(int idx) { throw new IllegalArgumentException("Operation not allowed on string vector.");}
  @Override public double atd_impl(int idx) { throw new IllegalArgumentException("Operation not allowed on string vector.");}

  /**
   * Points {@code bStr} at the bytes of row {@code idx} inside {@code _mem}.
   *
   * @param bStr holder to be set to the row's bytes
   * @param idx  row index
   * @return the result of {@code bStr.set(...)}, or {@code null} when the row is NA
   */
  @Override public BufferedString atStr_impl(BufferedString bStr, int idx) {
    int off = intAt(idx);
    if( off == NA ) return null;
    int len = lengthAtOffset(off);
    assert len >= 0 : getClass().getSimpleName() + ".atStr_impl: len=" + len + ", idx=" + idx + ", off=" + off;
    return bStr.set(_mem,_valstart+off,len);
  }

  /** Restores the fields derived from the header bytes of {@code _mem}. */
  @Override protected final void initFromBytes () {
    _start = -1;  _cidx = -1;
    _valstart = UnsafeUtils.get4(_mem, 0);
    byte b = UnsafeUtils.get1(_mem,4);
    _isAllASCII = b != 0;
    // Offset table spans [_OFF, _valstart) with 4 bytes per row.
    set_len((_valstart-_OFF)>>2);
  }

  /** Feeds the strings of rows {@code [from, to)} to the visitor, reusing one BufferedString. */
  @Override public ChunkVisitor processRows(ChunkVisitor nc, int from, int to){
    BufferedString bs = new BufferedString();
    for(int i = from; i < to; i++)
      nc.addValue(atStr(bs,i));
    return nc;
  }

  /** Feeds the strings of the listed rows to the visitor, reusing one BufferedString. */
  @Override public ChunkVisitor processRows(ChunkVisitor nc, int... rows){
    BufferedString bs = new BufferedString();
    for(int i:rows)
      nc.addValue(atStr(bs,i));
    return nc;
  }

  /**
   * Optimized toLower() method to operate across the entire CStrChunk buffer in one pass.
   * This method only changes the values of ASCII uppercase letters in the text.
   *
   * NewChunk is the same size as the original.
   *
   * @param nc NewChunk to be filled with the toLower version of ASCII strings in this chunk
   * @return Filled NewChunk
   */
  public NewChunk asciiToLower(NewChunk nc) {
    // copy existing data
    nc = this.extractRows(nc, 0,_len);
    // rewrite the copied bytes in place
    for(int i= 0; i < nc._sslen; i++) {
      if (nc._ss[i] > 0x40 && nc._ss[i] < 0x5B) // check for capital letter ('A'..'Z')
        nc._ss[i] += 0x20; // lower it
    }

    return nc;
  }

  /**
   * Optimized toUpper() method to operate across the entire CStrChunk buffer in one pass.
   * This method only changes the values of ASCII lowercase letters in the text.
   *
   * NewChunk is the same size as the original.
   *
   * @param nc NewChunk to be filled with the toUpper version of ASCII strings in this chunk
   * @return Filled NewChunk
   */
  public NewChunk asciiToUpper(NewChunk nc) {
    // copy existing data
    nc = this.extractRows(nc, 0,_len);
    // rewrite the copied bytes in place
    for(int i= 0; i < nc._sslen; i++) {
      if (nc._ss[i] > 0x60 && nc._ss[i] < 0x7B) // check for lowercase letter ('a'..'z')
        nc._ss[i] -= 0x20; // upper it
    }

    return nc;
  }

  /**
   * Optimized trim() method to operate across the entire CStrChunk buffer in one pass.
   * This mimics Java String.trim() by only considering characters of value
   * {@code 0x20} or less as whitespace to be trimmed. This means that like
   * Java's String.trim() it ignores 16 of the 17 characters regarded as a space in UTF.
   *
   * NewChunk is the same size as the original, despite trimming: leading whitespace
   * is skipped by moving the row's offset forward, trailing whitespace is overwritten
   * with 0 bytes.
   *
   * @param nc NewChunk to be filled with trimmed version of strings in this chunk
   * @return Filled NewChunk
   */
  public NewChunk asciiTrim(NewChunk nc) {
    // copy existing data
    nc = this.extractRows(nc, 0,_len);
    //update offsets and byte array
    for(int i=0; i < _len; i++) {
      int off = intAt(i);
      if (off == NA) continue;
      int j = 0;
      //UTF chars will appear as negative values. In Java spec, space is any char 0x20 and lower
      while (isTrimmable(byteAt(off + j))) j++;
      if (j > 0) nc.set_is(i,off + j);
      while (byteAt(off + j) != 0) j++; //Find end
      j--;
      // March back to the last non-space byte. The j >= 0 bound keeps an empty or
      // all-whitespace string from walking past its own first byte into whatever
      // precedes it (another string, or the offset table when off == 0).
      while (j >= 0 && isTrimmable(byteAt(off + j))) {
        nc._ss[off+j] = 0; //Set new end
        j--;
      }
    }
    return nc;
  }

  /**
   * Optimized substring() method for a buffer of only ASCII characters.
   * The presence of UTF-8 multi-byte characters would give incorrect results
   * for the string length, which is required here.
   *
   * Negative indices are treated as 0. A start at or beyond the end of a string,
   * or an end at or before the start, yields an empty string for that row.
   *
   * @param nc NewChunk to be filled with substrings in this chunk
   * @param startIndex The beginning index of the substring, inclusive
   * @param endIndex The ending index of the substring, exclusive
   * @return Filled NewChunk
   */
  public NewChunk asciiSubstring(NewChunk nc, int startIndex, int endIndex) {
    // copy existing data
    nc = this.extractRows(nc, 0,_len);
    // Clamp so that neither the new offset nor the zeroing loop can reach bytes
    // in front of the string being cut.
    final int start = Math.max(startIndex, 0);
    final int end = Math.max(endIndex, 0);
    //update offsets and byte array
    for (int i = 0; i < _len; i++) {
      int off = intAt(i);
      if (off == NA) continue;
      int len = lengthAtOffset(off);
      nc.set_is(i,start < len ? off + start : off + len);
      // Zero everything from the old terminator back down to the new end.
      for (int k = len; k >= end; k--) {
        nc._ss[off + k] = 0; //Set new end
      }
    }
    return nc;
  }

  /**
   * Optimized length() method for a buffer of only ASCII characters.
   * This is a straight byte count for each word in the chunk. The presence
   * of UTF-8 multi-byte characters would give incorrect results.
   *
   * @param nc NewChunk to be filled with lengths of strings in this chunk
   * @return Filled NewChunk
   */
  public NewChunk asciiLength(NewChunk nc) {
    //pre-allocate since size is known
    nc.alloc_mantissa(_len);
    nc.alloc_exponent(_len); // sadly, a waste
    // fill in lengths
    for(int i=0; i < _len; i++) {
      int off = intAt(i);
      if (off != NA) {
        nc.addNum(lengthAtOffset(off), 0);
      } else nc.addNA();
    }
    return nc;
  }

  /**
   * Computes the byte-level entropy of every string in this chunk.
   *
   * @param nc NewChunk to be filled with one entropy value per row; NA rows stay NA
   * @return Filled NewChunk
   */
  public NewChunk asciiEntropy(NewChunk nc) {
    nc.alloc_doubles(_len);
    for (int i = 0; i < _len; i++) {
      double entropy = entropyAt(i);
      if (Double.isNaN(entropy)) nc.addNA();
      else                       nc.addNum(entropy);
    }
    return nc;
  }

  /**
   * Shannon entropy, in bits, of the byte-value distribution of the string in row {@code i}.
   *
   * @param i row index
   * @return {@code -sum(p * log2(p))} over the byte values that occur in the string,
   *         or {@code NaN} when the row is NA
   */
  double entropyAt(int i) {
    int off = intAt(i);
    if (off == NA) return Double.NaN;
    int[] frq = new int[256];
    int len = lengthAtOffset(off);
    for (int j = 0; j < len; j++) {
      frq[0xff & byteAt(off + j)]++; // mask so bytes >= 0x80 index 128..255
    }
    double sum = 0;
    for (int b = 0; b < 256; b++) {
      int f = frq[b];
      if (f > 0) {
        double x = (double)f / len;
        sum += x * Math.log(x);
      }
    }
    return - sum / Math.log(2); // natural log converted to base 2
  }

  /**
   * Optimized lstrip() method to operate across the entire CStrChunk buffer in one pass.
   * Leading bytes contained in {@code chars} are skipped; the remainder of each
   * string is appended to {@code nc}.
   *
   * @param nc NewChunk to be filled with strip version of strings in this chunk
   * @param chars chars to strip, treated as ASCII
   * @return Filled NewChunk
   */
  public NewChunk asciiLStrip(NewChunk nc, String chars) {
    SetOfBytes set = new SetOfBytes(chars);
    for(int i=0; i < _len; i++) {
      int off = intAt(i);
      if (off != NA) {
        // Stop at the terminator even if the strip set happens to contain byte 0,
        // otherwise the scan would continue into the following string.
        byte b;
        while ((b = byteAt(off)) != 0 && set.contains(b)) off++;
        int len = lengthAtOffset(off);
        nc.addStr(new BufferedString(_mem, _valstart+off, len));
      } else nc.addNA();
    }
    return nc;
  }

  /**
   * Optimized rstrip() method to operate across the entire CStrChunk buffer in one pass.
   * Trailing bytes contained in {@code chars} are dropped; the remainder of each
   * string is appended to {@code nc}.
   *
   * @param nc NewChunk to be filled with strip version of strings in this chunk
   * @param chars chars to strip, treated as ASCII
   * @return Filled NewChunk
   */
  public NewChunk asciiRStrip(NewChunk nc, String chars) {
    SetOfBytes set = new SetOfBytes(chars);
    for(int i=0; i < _len; i++) {
      int off = intAt(i);
      if (off != NA) {
        // end is exclusive: walk it back over every trailing byte found in the set
        int end = off + lengthAtOffset(off);
        while (end > off && set.contains(byteAt(end - 1))) end--;
        nc.addStr(new BufferedString(_mem, _valstart+off, end - off));
      } else nc.addNA();
    }
    return nc;
  }
}