package water.fvec;
import org.junit.*;
import java.io.*;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
import java.util.concurrent.atomic.AtomicLong;
import water.*;
import water.nbhm.NonBlockingHashMap;
import water.util.FileUtils;
public class WordCountTest extends TestUtil {
// Block until the H2O cloud has formed with at least one node, so every
// test in this class runs against a ready cluster.
@BeforeClass static public void setup() { stall_till_cloudsize(1); }
// ==========================================================================
// Word-count a small known CSV; correctness is eyeballed via the printed
// word list, the test mostly exercises the MRTask plumbing end-to-end.
@Test public void testWordCount() throws IOException {
  doWordCount(FileUtils.getFile("./smalldata/junit/cars.csv"));
}
// Runs the distributed WordCount task over the file's bytes, then gathers
// the unique words out of the node-local shared map, sorts them by VStr's
// natural order (frequency, then alphabetically) and prints them along
// with per-phase timings.  The backing NFS vec is removed at the end.
protected void doWordCount(File file) throws IOException {
  NFSFileVec nfs=NFSFileVec.make(file);
  System.out.printf("\nProgress: 00 percent");
  final long start = System.currentTimeMillis();
  NonBlockingHashMap<VStr,VStr> words = new WordCount().doAll(nfs)._words;
  final long time_wc = System.currentTimeMillis();
  VStr[] vss = new VStr[words.size()];
  System.out.println("\nWC takes "+(time_wc-start)+"msec for "+vss.length+" words");
  // Faster version of toArray - because calling toArray on a 16M entry array
  // is slow.
  // Start the walk at slot 2, because slots 0,1 hold meta-data
  // In the raw backing array, Keys and Values alternate in slots
  int cnt=0;
  Object[] kvs = WordCount.WORDS.raw_array();
  for( int i=2; i<kvs.length; i += 2 ) {
    Object ok = kvs[i], ov = kvs[i+1];
    // instanceof already rejects null, so no separate null check is needed;
    // ok==ov keeps only fully-inserted entries (each word maps to itself).
    if( ok instanceof VStr && ok == ov )
      vss[cnt++] = (VStr)ov;
  }
  final long time_ary = System.currentTimeMillis();
  System.out.println("WC toArray "+(time_ary-time_wc)+"msec for "+cnt+" words");
  // Natural ordering (VStr.compareTo); identical to the old explicit null
  // comparator but uses the clearer 3-arg overload.
  Arrays.sort(vss,0,cnt);
  final long time_sort = System.currentTimeMillis();
  System.out.println("WC sort "+(time_sort-time_ary)+"msec for "+cnt+" words");
  System.out.println("Found "+cnt+" unique words.");
  System.out.println(Arrays.toString(vss));
  nfs.remove(new Futures()).blockForPending();
}
// Distributed word-count MRTask.  Every task on a node shares one static
// WORDS map, so the transient _words field is just a handle to it: same-node
// reduces are no-ops, and cross-node merges happen through the custom
// write_impl/read_impl serialization hooks below.
private static class WordCount extends MRTask<WordCount> {
// Node-local shared map of word -> word (each VStr is both key and value).
static NonBlockingHashMap<VStr,VStr> WORDS;
// Node-local count of bytes processed, used only for the progress printout.
static AtomicLong PROGRESS;
// Handle to the node-local WORDS map; transient because the map travels via
// write_impl/read_impl, not default task serialization.
transient NonBlockingHashMap<VStr,VStr> _words;
@Override public void setupLocal() { WORDS = new NonBlockingHashMap<>(); PROGRESS = new AtomicLong(0); }
// Letter test with case-folding: returns the lowercased letter for A-Z/a-z,
// or -1 for any non-letter byte.
private static int isChar( int b ) {
if( 'A'<=b && b<='Z' ) return b-'A'+'a';
if( 'a'<=b && b<='z' ) return b;
return -1;
}
// Scans one chunk of raw bytes, folding complete words into WORDS.  Words
// split across a chunk boundary are owned by the chunk where they start.
@Override public void map( Chunk bv ) {
_words = WORDS;
final int len = bv._len;
int i=0; // Parse point
// Skip partial words at the start of chunks, assuming they belong to the
// trailing end of the prior chunk.
if( bv._start > 0 ) // Not on the 1st chunk...
while( i < len && isChar((int)bv.atd(i)) >= 0 ) i++; // skip any partial word from prior
VStr vs = new VStr(new byte[512],(short)0);
// Loop over the chunk, picking out words
while( i<len ) // Till we run dry
vs = doChar(vs,(int)bv.atd(i++)); // Load a char & make words
// Finish up partial word at Chunk end by flowing into the next Chunk
i = 0;
Chunk nv = bv.nextChunk();
if( nv == null ) vs = doChar(vs,' '); // No next Chunk, end partial word
// NOTE(review): i is not bounds-checked against the next chunk's length
// here — this assumes no word spans an entire chunk; confirm chunk sizing.
while( vs._len > 0 ) // Till word breaks
vs = doChar(vs,(int)nv.atd(i++)); // Load a char & make words
// Show some progress
long progress = PROGRESS.addAndGet(len);
long pre = progress - len;
final long total = bv._vec.length();
int perc0 = (int)(100*pre /total);
int perc1 = (int)(100*progress/total);
// Only print when the integer percentage actually changed, to limit spam.
if( perc0 != perc1 ) System.out.printf("\b\b\b%2d percent",perc1);
}
// Feed one raw byte into the word accumulator 'vs'.  Letters extend the
// current word; any non-letter terminates it and bumps its count in WORDS.
// Returns the VStr to use for the next character (may be a fresh one).
private VStr doChar( VStr vs, int raw ) {
int c = isChar(raw); // Check for letter & lowercase it
if( c >= 0 && vs._len < 32700/*break silly long words*/ ) // In a word?
return vs.append(c); // Append char
if( vs._len == 0 ) return vs; // Not a letter and not in a word?
// None-letter ends word; count word
VStr vs2 = WORDS.putIfAbsent(vs,vs);
if( vs2 == null ) { // If actually inserted, need new VStr
//if( vs._len>256 ) System.out.println("Too long: "+vs);
return new VStr(vs._cs,(short)(vs._off+vs._len)); // New VStr reuses extra space from old
}
vs2.inc(1); // Inc count on added word, and
vs._len = 0; // re-use VStr (since not added to NBHM)
return vs;
}
// Same-node tasks share the single static WORDS map, so there is nothing to
// merge; a differing map would mean a cross-node in-memory reduce, which
// this task only supports via the serialization path below.
@Override public void reduce( WordCount wc ) {
if( _words != wc._words )
throw H2O.unimpl();
}
// Serializes this node's word map as (len,bytes,count) triples, terminated
// by a length of 65535 (safe as a sentinel: doChar caps words at 32700).
public final AutoBuffer write_impl(AutoBuffer ab) {
if( _words != null )
for( VStr key : WORDS.keySet() )
ab.put2((char)key._len).putA1(key._cs,key._off,key._off+key._len).put4(key._cnt);
return ab.put2((char)65535); // End of map marker
}
// Deserializes a remote node's word map, folding its counts into this
// node's WORDS map (insert new words, or bump counts of existing ones).
public final WordCount read_impl(AutoBuffer ab) {
final long start = System.currentTimeMillis();
int cnt=0;
_words = WORDS;
int len;
while( (len = ab.get2()) != 65535 ) { // Read until end-of-map marker
VStr vs = new VStr(ab.getA1(len),(short)0);
vs._len = (short)len;
vs._cnt = ab.get4();
VStr vs2 = WORDS.putIfAbsent(vs,vs);
if( vs2 != null ) vs2.inc(vs._cnt); // Inc count on added word
cnt++;
}
final long t = System.currentTimeMillis() - start;
System.out.println("WC Read takes "+t+"msec for "+cnt+" words");
return this;
}
// Local clone just shares the map handle; no deep copy needed.
@Override protected void copyOver(WordCount wc) { _words = wc._words; }
}
// A word, and a count of occurences. Typically the '_cs' buf is shared
// amongst many VStr's, all using different off/len pairs.
private static class VStr implements Comparable<VStr> {
  byte[] _cs;      // shared array of chars holding many words
  short _off,_len; // offset & len of this word within _cs
  VStr(byte[]cs, short off) { assert off>=0:off; _cs=cs; _off=off; _len=0; _cnt=1; }
  // Append one (already lowercased) char to this word, growing the shared
  // buffer if full; always returns this.
  public VStr append( int c ) {
    if( _off+_len >= _cs.length ) { // no room for word?
      // Double the buffer, capped at 32767 (max value a short offset spans).
      int newlen = Math.min(32767,_cs.length<<1);
      // A short word stuck at a high offset gets a modest fresh 1K buffer
      // instead of dragging the old buffer's dead prefix along.
      if( _off > 0 && _len < 512 ) newlen = Math.max(1024,newlen);
      byte[] cs = new byte[newlen];
      System.arraycopy(_cs,_off,cs,0,_len);
      _off=0;
      _cs = cs;
    }
    _cs[_off+_len++] = (byte)c;
    return this;
  }
  volatile int _cnt; // Occurrence count; updated atomically via _cntUpdater
  private static final AtomicIntegerFieldUpdater<VStr> _cntUpdater =
    AtomicIntegerFieldUpdater.newUpdater(VStr.class, "_cnt");
  // Atomically bump the count by d.  addAndGet replaces the original
  // hand-rolled compareAndSet retry loop with identical semantics.
  void inc(int d) { _cntUpdater.addAndGet(this,d); }
  @Override public String toString() { return new String(_cs,_off,_len)+"="+_cnt; }
  // Sort by descending frequency, ties broken alphabetically, then by length.
  // NOTE: deliberately inconsistent with equals(), which ignores _cnt.
  @Override public int compareTo(VStr vs) {
    int f = Integer.compare(vs._cnt,_cnt); // sort by freq, overflow-safe idiom
    if( f != 0 ) return f;
    // alpha-sort, after tied on freq
    int len = Math.min(_len,vs._len);
    for(int i = 0; i < len; ++i)
      if(_cs[_off+i] != vs._cs[vs._off+i])
        return _cs[_off+i]-vs._cs[vs._off+i]; // byte difference cannot overflow an int
    return _len - vs._len;
  }
  // Equality on the characters only; _cnt is ignored so a probe VStr can
  // find the canonical entry in the hash map regardless of current count.
  @Override public boolean equals(Object o){
    if(!(o instanceof VStr)) return false;
    VStr vs = (VStr)o;
    if( vs._len != _len) return false;
    for(int i = 0; i < _len; ++i)
      if(_cs[_off+i] != vs._cs[vs._off+i]) return false;
    return true;
  }
  @Override public int hashCode() {
    int hash = 0;
    for(int i = 0; i < _len; ++i)
      hash = 31 * hash + _cs[_off+i];
    return hash;
  }
}
@Test public void dummy_test() {
  /* Intentionally empty: guarantees this class always has at least one
   * runnable test so JUnit does not complain about a missing test. */
}
}