package water.fvec;

import org.junit.*;

import java.io.*;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
import java.util.concurrent.atomic.AtomicLong;

import water.*;
import water.nbhm.NonBlockingHashMap;
import water.util.FileUtils;

public class WordCountTest extends TestUtil {
  @BeforeClass static public void setup() { stall_till_cloudsize(1); }

  // ==========================================================================
  @Test public void testWordCount() throws IOException {
    File file = FileUtils.getFile("./smalldata/junit/cars.csv");
    doWordCount(file);
  }

  protected void doWordCount(File file) throws IOException {
    NFSFileVec nfs = NFSFileVec.make(file);
    System.out.printf("\nProgress: 00 percent");
    final long start = System.currentTimeMillis();
    NonBlockingHashMap<VStr,VStr> words = new WordCount().doAll(nfs)._words;
    final long time_wc = System.currentTimeMillis();
    VStr[] vss = new VStr[words.size()];
    System.out.println("\nWC takes "+(time_wc-start)+"msec for "+vss.length+" words");

    // Faster version of toArray - because calling toArray on a 16M entry
    // map is slow.
    // Start the walk at slot 2, because slots 0,1 hold meta-data.
    // In the raw backing array, Keys and Values alternate in slots.
    int cnt=0;
    Object[] kvs = WordCount.WORDS.raw_array();
    for( int i=2; i<kvs.length; i += 2 ) {
      Object ok = kvs[i], ov = kvs[i+1];
      if( ok != null && ok instanceof VStr && ok == ov )
        vss[cnt++] = (VStr)ov;
    }
    final long time_ary = System.currentTimeMillis();
    System.out.println("WC toArray "+(time_ary-time_wc)+"msec for "+cnt+" words");

    Arrays.sort(vss,0,cnt,null);
    final long time_sort = System.currentTimeMillis();
    System.out.println("WC sort "+(time_sort-time_ary)+"msec for "+cnt+" words");
    System.out.println("Found "+cnt+" unique words.");
    System.out.println(Arrays.toString(vss));
    nfs.remove(new Futures()).blockForPending();
  }

  private static class WordCount extends MRTask<WordCount> {
    static NonBlockingHashMap<VStr,VStr> WORDS;
    static AtomicLong PROGRESS;
    transient NonBlockingHashMap<VStr,VStr> _words;

    @Override public void setupLocal() {
      WORDS = new NonBlockingHashMap<>();
      PROGRESS = new AtomicLong(0);
    }

    // Map 'A'-'Z' and 'a'-'z' to the lowercase letter; everything else to -1
    private static int isChar( int b ) {
      if( 'A'<=b && b<='Z' ) return b-'A'+'a';
      if( 'a'<=b && b<='z' ) return b;
      return -1;
    }

    @Override public void map( Chunk bv ) {
      _words = WORDS;
      final int len = bv._len;
      int i=0;                  // Parse point
      // Skip partial words at the start of chunks, assuming they belong to
      // the trailing end of the prior chunk.
      if( bv._start > 0 )       // Not on the 1st chunk...
        while( i < len && isChar((int)bv.atd(i)) >= 0 )
          i++;                  // skip any partial word from the prior chunk
      VStr vs = new VStr(new byte[512],(short)0);
      // Loop over the chunk, picking out words
      while( i<len )                        // Till we run dry
        vs = doChar(vs,(int)bv.atd(i++));   // Load a char & make words
      // Finish up a partial word at the Chunk end by flowing into the next Chunk
      i = 0;
      Chunk nv = bv.nextChunk();
      if( nv == null ) vs = doChar(vs,' '); // No next Chunk; end the partial word
      while( vs._len > 0 )                  // Till the word breaks
        vs = doChar(vs,(int)nv.atd(i++));   // Load a char & make words
      // Show some progress
      long progress = PROGRESS.addAndGet(len);
      long pre = progress - len;
      final long total = bv._vec.length();
      int perc0 = (int)(100*pre     /total);
      int perc1 = (int)(100*progress/total);
      if( perc0 != perc1 )      // Backspace over the prior "NN percent" (10 chars) and rewrite it
        System.out.printf("\b\b\b\b\b\b\b\b\b\b%2d percent",perc1);
    }
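    // Accumulate one raw character into the current word 'vs': a letter
    // extends the word, while any non-letter ends it, at which point the
    // completed word is installed/counted in the shared WORDS map.
    // Returns the VStr to accumulate the next characters into.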
    private VStr doChar( VStr vs, int raw ) {
      int c = isChar(raw);          // Check for a letter & lowercase it
      if( c >= 0 && vs._len < 32700/*break silly long words*/ ) // In a word?
        return vs.append(c);        // Append char
      if( vs._len == 0 ) return vs; // Not a letter and not in a word?
      // Non-letter ends the word; count the word
      VStr vs2 = WORDS.putIfAbsent(vs,vs);
      if( vs2 == null ) {           // If actually inserted, need a new VStr
        //if( vs._len>256 ) System.out.println("Too long: "+vs);
        return new VStr(vs._cs,(short)(vs._off+vs._len)); // New VStr reuses extra space from old
      }
      vs2.inc(1);                   // Inc count on the existing word, and
      vs._len = 0;                  // re-use this VStr (since not added to the NBHM)
      return vs;
    }

    @Override public void reduce( WordCount wc ) {
      if( _words != wc._words ) throw H2O.unimpl();
    }

    public final AutoBuffer write_impl(AutoBuffer ab) {
      if( _words != null )
        for( VStr key : WORDS.keySet() )
          ab.put2((char)key._len).putA1(key._cs,key._off,key._off+key._len).put4(key._cnt);
      return ab.put2((char)65535);  // End-of-map marker
    }

    public final WordCount read_impl(AutoBuffer ab) {
      final long start = System.currentTimeMillis();
      int cnt=0;
      _words = WORDS;
      int len;
      while( (len = ab.get2()) != 65535 ) { // Read until the end-of-map marker
        VStr vs = new VStr(ab.getA1(len),(short)0);
        vs._len = (short)len;
        vs._cnt = ab.get4();
        VStr vs2 = WORDS.putIfAbsent(vs,vs);
        if( vs2 != null ) vs2.inc(vs._cnt); // Inc count on the existing word
        cnt++;
      }
      final long t = System.currentTimeMillis() - start;
      System.out.println("WC Read takes "+t+"msec for "+cnt+" words");
      return this;
    }

    @Override protected void copyOver(WordCount wc) { _words = wc._words; }
  }

  // A word, and a count of occurrences.  Typically the '_cs' buf is shared
  // amongst many VStrs, all using different off/len pairs.
  private static class VStr implements Comparable<VStr> {
    byte[] _cs;                 // Shared array of chars holding words
    short _off,_len;            // Offset & len of this word
    VStr(byte[]cs, short off) { assert off>=0:off; _cs=cs; _off=off; _len=0; _cnt=1; }

    // Append a char, growing the shared backing array if needed;
    // returns this VStr (now holding the longer word)
    public VStr append( int c ) {
      if( _off+_len >= _cs.length ) { // No room for the word?
        int newlen = Math.min(32767,_cs.length<<1);
        if( _off > 0 && _len < 512 ) newlen = Math.max(1024,newlen);
        byte[] cs = new byte[newlen];
        System.arraycopy(_cs,_off,cs,0,_len);
        _off=0;
        _cs = cs;
      }
      _cs[_off+_len++] = (byte)c;
      return this;
    }

    volatile int _cnt;          // Atomically updated word count
    private static final AtomicIntegerFieldUpdater<VStr> _cntUpdater =
      AtomicIntegerFieldUpdater.newUpdater(VStr.class, "_cnt");
    void inc(int d) {
      int r = _cnt;
      while( !_cntUpdater.compareAndSet(this,r,r+d) ) r = _cnt;
    }

    public String toString() { return new String(_cs,_off,_len)+"="+_cnt; }

    @Override public int compareTo(VStr vs) {
      int f = vs._cnt - _cnt;   // Sort by descending frequency
      if( f != 0 ) return f;
      // Alpha-sort when tied on frequency
      int len = Math.min(_len,vs._len);
      for( int i = 0; i < len; ++i )
        if( _cs[_off+i] != vs._cs[vs._off+i] )
          return _cs[_off+i] - vs._cs[vs._off+i];
      return _len - vs._len;
    }

    @Override public boolean equals(Object o) {
      if( !(o instanceof VStr) ) return false;
      VStr vs = (VStr)o;
      if( vs._len != _len ) return false;
      for( int i = 0; i < _len; ++i )
        if( _cs[_off+i] != vs._cs[vs._off+i] ) return false;
      return true;
    }

    @Override public int hashCode() {
      int hash = 0;
      for( int i = 0; i < _len; ++i )
        hash = 31 * hash + _cs[_off+i];
      return hash;
    }
  }

  @Test public void dummy_test() {
    /* This is just a dummy test to keep JUnit from complaining about a missing test. */
  }
}