package hex.word2vec; import org.junit.BeforeClass; import org.junit.Test; import water.TestUtil; import water.fvec.Frame; import water.fvec.TestFrameBuilder; import water.fvec.Vec; import water.parser.BufferedString; import static water.util.FileUtils.*; import water.util.IcedLong; import java.util.Map; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.notNullValue; import static org.junit.Assert.*; import static org.junit.Assume.assumeThat; public class WordCountTaskTest extends TestUtil { @BeforeClass() public static void setup() { stall_till_cloudsize(1); } @Test public void testWordCount() { String[] strData = new String[10000]; for (int i = 0; i < strData.length; i++) { int b = i % 10; if (b < 3) strData[i] = "A"; else if (b < 5) strData[i] = "B"; else strData[i] = "C"; } Frame fr = new TestFrameBuilder() .withName("data") .withColNames("Str") .withVecTypes(Vec.T_STR) .withDataForCol(0, strData) .withChunkLayout(100, 900, 5000, 4000) .build(); try { Map<BufferedString, IcedLong> counts = new WordCountTask().doAll(fr.vec(0))._counts; assertEquals(3, counts.size()); assertEquals(3000L, counts.get(new BufferedString("A"))._val); assertEquals(2000L, counts.get(new BufferedString("B"))._val); assertEquals(5000L, counts.get(new BufferedString("C"))._val); System.out.println(counts); } finally { fr.remove(); } } @Test public void testWordCountText8() { String fName = "bigdata/laptop/text8.gz"; assumeThat("text8 data available", locateFile(fName), is(notNullValue())); // only run if text8 is present Frame fr = parse_test_file(fName, "NA", 0, new byte[]{Vec.T_STR}); try { Map<BufferedString, IcedLong> counts = new WordCountTask().doAll(fr.vec(0))._counts; assertEquals(253854, counts.size()); assertEquals(303L, counts.get(new BufferedString("anarchism"))._val); assertEquals(316376L, counts.get(new BufferedString("to"))._val); assertNotNull(counts); } finally { fr.remove(); } } }