package water.fvec;
import java.io.File;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
import org.junit.*;
import water.*;
import water.nbhm.NonBlockingHashMap;
public class WordCountTest extends TestUtil {
@BeforeClass public static void stall() { stall_till_cloudsize(1); }
// ==========================================================================
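// End-to-end word count: read a file as an NFSFileVec, tally word
// frequencies across all chunks with an MRTask2, then sort the unique
// words by descending count, printing timings for each phase.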
/*@Test*/ public void testWordCount() {
File file = TestUtil.find_test_file("./smalldata/cars.csv");
//File file = TestUtil.find_test_file("../wiki/enwiki-latest-pages-articles.xml");
//File file = TestUtil.find_test_file("/home/0xdiag/datasets/wiki.xml");
//File file = TestUtil.find_test_file("../Dropbox/Sris and Cliff/H20_Rush_New_Dataset_100k.csv");
Key key = NFSFileVec.make(file);
NFSFileVec nfs = DKV.get(key).get();
final long start = System.currentTimeMillis();
NonBlockingHashMap<VStr,VStr> words = new WordCount().doAll(nfs)._words;
final long time_wc = System.currentTimeMillis();
VStr[] vss = new VStr[words.size()];
System.out.println("WC takes "+(time_wc-start)+"msec for "+vss.length+" words");
// Faster version of toArray - because calling toArray on a 16M entry array
// is slow.
// Start the walk at slot 2, because slots 0,1 hold meta-data
// In the raw backing array, Keys and Values alternate in slots
int cnt=0;
Object[] kvs = WordCount.WORDS.raw_array();
for( int i=2; i<kvs.length; i += 2 ) {
Object ok = kvs[i+0], ov = kvs[i+1];
if( ok instanceof VStr && ok == ov ) // instanceof already rejects null keys
vss[cnt++] = (VStr)ov;
}
final long time_ary = System.currentTimeMillis();
System.out.println("WC toArray "+(time_ary-time_wc)+"msec for "+cnt+" words");
Arrays.sort(vss,0,cnt); // natural ordering: descending count, then alpha
final long time_sort = System.currentTimeMillis();
System.out.println("WC sort "+(time_sort-time_ary)+"msec for "+cnt+" words");
System.out.println("Found "+cnt+" unique words.");
System.out.println(Arrays.toString(vss));
UKV.remove(key);
}
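// Distributed word counter. Each node tallies the words of its local
// chunks into one shared NonBlockingHashMap (WORDS); cross-node results
// are merged by the custom write()/read() serialization below.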
private static class WordCount extends MRTask2<WordCount> {
public static NonBlockingHashMap<VStr,VStr> WORDS;
public NonBlockingHashMap<VStr,VStr> _words;
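// Runs once per node before any local map() call: install the shared
// per-node map that all chunks on this node tally into.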
@Override public void setupLocal() { WORDS = new NonBlockingHashMap<VStr,VStr>(); }
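// Classify a byte: letters are folded to lowercase, anything else is -1.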
private static int isChar( int b ) {
if( 'A'<=b && b<='Z' ) return b-'A'+'a';
if( 'a'<=b && b<='z' ) return b;
return -1;
}
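// Tally the words of one chunk. Chunk.at() here takes a global row index
// (the code offsets by _start), so a word straddling the chunk boundary
// is finished by reading past the end of this chunk into the next one.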
@Override public void map( Chunk bv ) {
_words = WORDS;
final long start = bv._start;
final int len = bv._len;
long i = start; // Parse point
// Skip a partial word at the start of this chunk; the prior chunk's
// map() call reads past its own end and counts it instead.
if( start > 0 ) // Not on the 1st chunk...
while( isChar((int)bv.at(i)) >= 0 ) i++; // skip any partial word from prior
VStr vs = new VStr(new byte[512],(short)0);
// Loop over the chunk, picking out words
while( i<start+len || vs._len > 0 ) { // Till we run dry & not in middle of word
int c = isChar((int)bv.at(i)); // Load a char, lowercase it
if( c >= 0 && vs._len < 32700/*break silly long words*/ ) { // In a word?
vs.append(c); // Append char
} else if( vs._len > 0 ) { // Have a word?
VStr vs2 = WORDS.putIfAbsent(vs,vs);
if( vs2 == null ) { // If actually inserted, need new VStr
if( vs._len>256 ) System.out.println("Too long: "+vs+" at char "+i);
vs = new VStr(vs._cs,(short)(vs._off+vs._len));
} else {
vs2.inc(1); // Inc count on added word,
vs._len = 0; // and re-use VStr
}
}
i++;
}
}
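// Same-node tasks share the one WORDS map, so there is nothing to merge;
// cross-node merging happens during deserialization in read().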
@Override public void reduce( WordCount wc ) {
if( _words != wc._words )
throw H2O.unimpl();
}
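// Serialize the node-local map as a stream of (2-byte length, word bytes,
// 4-byte count) records, terminated by a length of 65535.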
@Override public AutoBuffer write(AutoBuffer ab) {
super.write(ab);
if( /*_res != null &&*/ WORDS != null )
for( VStr key : WORDS.keySet() )
ab.put2((char)key._len).putA1(key._cs,key._off,key._off+key._len).put4(key._cnt);
return ab.put2((char)65535); // End of map marker
}
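// Deserialize a remote node's records, folding each word into this
// node's WORDS map: insert if new, otherwise bump the existing count.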
@Override public WordCount read(AutoBuffer ab) {
super.read(ab);
final long start = System.currentTimeMillis();
int cnt=0;
_words = WORDS;
int len = 0;
while( (len = ab.get2()) != 65535 ) { // Read until end-of-map marker
VStr vs = new VStr(ab.getA1(len),(short)0);
vs._len = (short)len;
vs._cnt = ab.get4();
VStr vs2 = WORDS.putIfAbsent(vs,vs);
if( vs2 != null ) vs2.inc(vs._cnt); // Inc count on added word
cnt++;
}
final long t = System.currentTimeMillis() - start;
System.out.println("WC Read takes "+t+"msec for "+cnt+" words");
return this;
}
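// On a same-node copy, just share the map reference.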
@Override public void copyOver(Freezable wc) { _words = ((WordCount)wc)._words; }
}
// A word, and a count of occurrences
private static class VStr implements Comparable<VStr> {
byte[] _cs; // shared array of chars holding words
short _off,_len; // offset & len of this word
VStr(byte[] cs, short off) { assert off>=0:off; _cs=cs; _off=off; _len=0; _cnt=1; }
// Append a char, growing the shared backing array when it runs out of room.
public void append( int c ) {
if( _off+_len >= _cs.length ) { // no room for word?
int newlen = Math.min(32767,_cs.length<<1);
if( _off > 0 && _len < 512 ) newlen = Math.max(1024,newlen);
byte[] cs = new byte[newlen];
System.arraycopy(_cs,_off,cs,0,_len);
_off=0;
_cs = cs;
}
_cs[_off+_len++] = (byte)c;
}
volatile int _cnt; // Atomically update
private static final AtomicIntegerFieldUpdater<VStr> _cntUpdater =
AtomicIntegerFieldUpdater.newUpdater(VStr.class, "_cnt");
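// Lock-free add: retry the CAS until no other thread races the update.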
void inc(int d) {
int r = _cnt;
while( !_cntUpdater.compareAndSet(this,r,r+d) )
r = _cnt;
}
@Override public String toString() { return new String(_cs,_off,_len)+"="+_cnt; }
@Override public int compareTo(VStr vs) {
int f = vs._cnt - _cnt; // sort by descending frequency
if( f != 0 ) return f;
// alpha-sort, after tied on freq
int len = Math.min(_len,vs._len);
for(int i = 0; i < len; ++i)
if(_cs[_off+i] != vs._cs[vs._off+i])
return _cs[_off+i]-vs._cs[vs._off+i];
return _len - vs._len;
}
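// equals() and hashCode() look only at the word bytes, never at _cnt,
// so keys remain stable in the map while counts are bumped in place.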
@Override public boolean equals(Object o){
if(!(o instanceof VStr)) return false;
VStr vs = (VStr)o;
if( vs._len != _len ) return false;
for(int i = 0; i < _len; ++i)
if(_cs[_off+i] != vs._cs[vs._off+i]) return false;
return true;
}
@Override public int hashCode() {
int hash = 0;
for(int i = 0; i < _len; ++i)
hash = 31 * hash + _cs[_off+i];
return hash;
}
}
@Test @Ignore public void dummy_test() {
/* A dummy test to keep JUnit from complaining about a missing test. */
}
}