package water.fvec;

import java.io.File;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
import org.junit.*;
import water.*;
import water.nbhm.NonBlockingHashMap;

public class WordCountTest extends TestUtil {
  @BeforeClass public static void stall() { stall_till_cloudsize(1); }

  // ==========================================================================
  /*@Test*/ public void testWordCount() {
    File file = TestUtil.find_test_file("./smalldata/cars.csv");
    //File file = TestUtil.find_test_file("../wiki/enwiki-latest-pages-articles.xml");
    //File file = TestUtil.find_test_file("/home/0xdiag/datasets/wiki.xml");
    //File file = TestUtil.find_test_file("../Dropbox/Sris and Cliff/H20_Rush_New_Dataset_100k.csv");
    Key key = NFSFileVec.make(file);
    NFSFileVec nfs = DKV.get(key).get();
    final long start = System.currentTimeMillis();
    NonBlockingHashMap<VStr,VStr> words = new WordCount().doAll(nfs)._words;
    final long time_wc = System.currentTimeMillis();
    VStr[] vss = new VStr[words.size()];
    System.out.println("WC takes "+(time_wc-start)+"msec for "+vss.length+" words");

    // Faster version of toArray - because calling toArray on a 16M-entry map
    // is slow.  Walk the raw backing array instead: slots 0,1 hold meta-data,
    // so start at slot 2; after that, Keys and Values alternate in slots.
    int cnt=0;
    Object[] kvs = WordCount.WORDS.raw_array();
    for( int i=2; i<kvs.length; i += 2 ) {
      Object ok = kvs[i+0], ov = kvs[i+1];
      if( ok != null && ok instanceof VStr && ok == ov )
        vss[cnt++] = (VStr)ov;
    }
    final long time_ary = System.currentTimeMillis();
    System.out.println("WC toArray "+(time_ary-time_wc)+"msec for "+cnt+" words");

    Arrays.sort(vss,0,cnt,null); // null comparator: use VStr's natural ordering
    final long time_sort = System.currentTimeMillis();
    System.out.println("WC sort "+(time_sort-time_ary)+"msec for "+cnt+" words");
    System.out.println("Found "+cnt+" unique words.");
    System.out.println(Arrays.toString(vss));
    UKV.remove(key);
  }

  private static class WordCount extends MRTask2<WordCount> {
    public static NonBlockingHashMap<VStr,VStr> WORDS; // Shared per-node word map
    public NonBlockingHashMap<VStr,VStr> _words;
    @Override public void setupLocal() { WORDS = new NonBlockingHashMap<VStr,VStr>(); }

    // Classify a byte: return the lowercased letter for 'A'-'Z' and 'a'-'z',
    // or -1 for any non-letter.
    private static int isChar( int b ) {
      if( 'A'<=b && b<='Z' ) return b-'A'+'a';
      if( 'a'<=b && b<='z' ) return b;
      return -1;
    }

    @Override public void map( Chunk bv ) {
      _words = WORDS;
      final long start = bv._start;
      final int len = bv._len;
      long i = start;           // Parse point
      // Skip partial words at the start of chunks, assuming they belong to the
      // trailing end of the prior chunk.
      if( start > 0 )           // Not on the 1st chunk...
        while( isChar((int)bv.at(i)) >= 0 ) i++; // skip any partial word from prior
      VStr vs = new VStr(new byte[512],(short)0);
      // Loop over the chunk, picking out words.  May read past the chunk end
      // to finish a trailing word; the next chunk skips that partial word.
      while( i<start+len || vs._len > 0 ) { // Till we run dry & not in middle of word
        int c = isChar((int)bv.at(i));      // Load a char, lowercase it
        if( c >= 0 && vs._len < 32700/*break silly long words*/ ) { // In a word?
          vs.append(c);                     // Append char
        } else if( vs._len > 0 ) {          // Have a word?
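          // Lock-free insert-or-count: putIfAbsent either installs this VStr
          // as both key and value (returning null), or returns the VStr
          // already mapped for an equal word, whose count is then bumped via
          // the CAS loop in inc().  No locks are taken on this hot path.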
          VStr vs2 = WORDS.putIfAbsent(vs,vs);
          if( vs2 == null ) {   // If actually inserted, need a fresh VStr
            if( vs._len>256 ) System.out.println("Too long: "+vs+" at char "+i);
            vs = new VStr(vs._cs,(short)(vs._off+vs._len)); // share the byte array
          } else {
            vs2.inc(1);         // Inc count on the existing word,
            vs._len = 0;        // and re-use this VStr
          }
        }
        i++;
      }
    }

    @Override public void reduce( WordCount wc ) {
      // All maps on a node share the same WORDS; anything else is unimplemented
      if( _words != wc._words ) throw H2O.unimpl();
    }

    // Custom serialization: stream the per-node map over the wire as a
    // sequence of (length, bytes, count) triples, ended by a 0xFFFF marker.
    @Override public AutoBuffer write(AutoBuffer ab) {
      super.write(ab);
      if( WORDS != null )
        for( VStr key : WORDS.keySet() )
          ab.put2((char)key._len).putA1(key._cs,key._off,key._off+key._len).put4(key._cnt);
      return ab.put2((char)65535); // End of map marker
    }
    @Override public WordCount read(AutoBuffer ab) {
      super.read(ab);
      final long start = System.currentTimeMillis();
      int cnt=0;
      _words = WORDS;
      int len = 0;
      while( (len = ab.get2()) != 65535 ) { // Read until end-of-map marker
        VStr vs = new VStr(ab.getA1(len),(short)0);
        vs._len = (short)len;
        vs._cnt = ab.get4();
        VStr vs2 = WORDS.putIfAbsent(vs,vs);
        if( vs2 != null ) vs2.inc(vs._cnt); // Merge counts for an existing word
        cnt++;
      }
      final long t = System.currentTimeMillis() - start;
      System.out.println("WC Read takes "+t+"msec for "+cnt+" words");
      return this;
    }
    @Override public void copyOver(Freezable wc) { _words = ((WordCount)wc)._words; }
  }

  // A word, and a count of occurrences
  private static class VStr implements Comparable<VStr> {
    byte[] _cs;                 // shared array of bytes holding many words
    short _off,_len;            // offset & len of this word
    VStr(byte[]cs, short off) { assert off>=0:off; _cs=cs; _off=off; _len=0; _cnt=1; }
    // Append a char, growing the shared array as needed
    public void append( int c ) {
      if( _off+_len >= _cs.length ) { // no room for the word?
        int newlen = Math.min(32767,_cs.length<<1);
        if( _off > 0 && _len < 512 ) newlen = Math.max(1024,newlen);
        byte[] cs = new byte[newlen];
        System.arraycopy(_cs,_off,cs,0,_len);
        _off=0;
        _cs = cs;
      }
      _cs[_off+_len++] = (byte)c;
    }
    volatile int _cnt;          // Occurrence count, updated atomically
    private static final AtomicIntegerFieldUpdater<VStr> _cntUpdater =
      AtomicIntegerFieldUpdater.newUpdater(VStr.class, "_cnt");
    void inc(int d) {           // Classic CAS retry loop
      int r = _cnt;
      while( !_cntUpdater.compareAndSet(this,r,r+d) )
        r = _cnt;
    }
    @Override public String toString() { return new String(_cs,_off,_len)+"="+_cnt; }
    @Override public int compareTo(VStr vs) {
      int f = vs._cnt - _cnt;   // sort by frequency, highest first
      if( f != 0 ) return f;
      int len = Math.min(_len,vs._len); // alpha-sort when tied on frequency
      for( int i = 0; i < len; ++i )
        if( _cs[_off+i] != vs._cs[vs._off+i] )
          return _cs[_off+i]-vs._cs[vs._off+i];
      return _len - vs._len;
    }
    @Override public boolean equals(Object o){
      if( !(o instanceof VStr) ) return false;
      VStr vs = (VStr)o;
      if( vs._len != _len ) return false;
      for( int i = 0; i < _len; ++i )
        if( _cs[_off+i] != vs._cs[vs._off+i] )
          return false;
      return true;
    }
    @Override public int hashCode() {
      int hash = 0;
      for( int i = 0; i < _len; ++i )
        hash = 31 * hash + _cs[_off+i];
      return hash;
    }
  }

  @Test @Ignore public void dummy_test() {
    /* This is just a dummy test to keep JUnit from complaining about a missing test. */
  }
}