package water.parser;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import water.Key;
import water.TestUtil;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.PrettyPrint;
import water.util.StringUtils;
import java.util.Random;
import java.util.UUID;
import static water.parser.DefaultParserProviders.CSV_INFO;
public class ParserTest2 extends TestUtil {
/**
 * One-time class fixture: calls the inherited {@code stall_till_cloudsize} helper with a
 * required size of 1 before any test in this class runs.
 * NOTE(review): presumably this blocks until the test cloud has at least one node; the
 * helper is defined outside this file, so confirm against {@code TestUtil}.
 */
@BeforeClass
public static void setup() {
  final int requiredCloudSize = 1;
  stall_till_cloudsize(requiredCloudSize);
}
/** Column separators exercised by the truncated-final-line scenario in {@code testSparse}. */
private static final char[] SEPARATORS = {',', ' '};
/**
 * Checks that {@code fr} holds exactly the categorical values in {@code expected}, then
 * deletes the frame.
 *
 * <p>Each column is read through its domain: the stored value at a row is used as an index
 * into {@code vec.domain()} and the resulting label is compared with the expected string.
 * A {@code null} entry in {@code expected} means the cell must be NA.
 *
 * @param fr       parsed frame to verify; it is always deleted, even when an assertion fails
 * @param expected expected labels indexed as {@code expected[row][column]}; must contain at
 *                 least one row
 */
private static void testParsed(Frame fr, String[][] expected) {
  try {
    Assert.assertEquals(expected.length, fr.numRows());
    Assert.assertEquals(expected[0].length, fr.numCols());
    Vec[] vecs = fr.vecs();
    for (int j = 0; j < fr.numCols(); ++j) {
      Vec vec = vecs[j];
      for (int i = 0; i < expected.length; ++i) {
        if (expected[i][j] == null) {
          Assert.assertTrue(i + " -- " + j, vec.isNA(i));
        } else {
          String pval = vec.domain()[(int) vec.at8(i)];
          // assertEquals reports both expected and actual values on mismatch.
          Assert.assertEquals("row " + i + ", col " + j, expected[i][j], pval);
        }
      }
    }
  } finally {
    // Delete the frame even when an assertion above fails, so it is not left behind.
    fr.delete();
  }
}
/**
 * Parses a nine-column CSV (eight numeric columns and one enum column) whose last two rows
 * contain only blanks, {@code ?} and {@code NA} tokens, and checks that those two rows are
 * NA in every column while all earlier rows hold real values.
 */
@Test public void testNAs() {
  String[] data = new String[]{
      "'C1Chunk',C1SChunk, 'C2Chunk', 'C2SChunk', 'C4Chunk', 'C4FChunk', 'C8Chunk', 'C8DChunk', 'Categorical'\n" +
      "0, 0.0, 0, 0, 0, 0 , 0, 8.878979, A \n" ,
      "1, 0.1, 1, 0.1, 1, 1 , 1, 1.985934, B \n" ,
      "2, 0.2, 2, 0.2, 2, 2 , 2, 3.398018, C \n" ,
      "3, 0.3, 3, 0.3, 3, 3 , 3, 9.329589, D \n" ,
      "4, 0.4, 4, 4, 4, 4 , 2147483649, 0.290184, A \n" ,
      "0, 0.5, 0, 0, -100000, 1.234e2 ,-2147483650, 1e-30, B \n" ,
      "254, 0.25, 2550, 6553.4, 100000, 2.345e-2, 0, 1e30, C \n" ,
      " , , , , , , , , \n" ,
      "?, NA, ?, ?, ?, ?, ?, ?, \n" ,
  };
  Key rkey = ParserTest.makeByteVec(data);
  ParseSetup ps = new ParseSetup(CSV_INFO, (byte)',', false, ParseSetup.HAS_HEADER, 9,
      new String[]{"'C1Chunk'","C1SChunk", "'C2Chunk'", "'C2SChunk'", "'C4Chunk'", "'C4FChunk'", "'C8Chunk'", "'C8DChunk'", "'Categorical'"},
      ParseSetup.strToColumnTypes(new String[]{"Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Numeric", "Enum"}), null, null, null);
  Frame fr = ParseDataset.parse(Key.make("na_test.hex"), new Key[]{rkey}, true, ps);
  try {
    int nlines = (int) fr.numRows();
    Assert.assertEquals(9, nlines);
    Assert.assertEquals(9, fr.numCols());
    // Every row except the last two must hold a real value in every column.
    for (int i = 0; i < nlines - 2; ++i) {
      for (Vec v : fr.vecs()) {
        Assert.assertTrue("error at line "+i+", vec " + v.chunkForChunkIdx(0).getClass().getSimpleName(),
            !Double.isNaN(v.at(i)) && !v.isNA(i) );
      }
    }
    // The last two rows must be NA in every column.
    for (int j = 0; j < fr.vecs().length; j++) {
      Vec v = fr.vecs()[j];
      for (int i = nlines - 2; i < nlines; ++i) {
        Assert.assertTrue(i + ", " + j + ":" + v.at(i) + ", " + v.isNA(i), Double.isNaN(v.at(i)) && v.isNA(i) );
      }
    }
  } finally {
    // Delete the frame even when an assertion above fails, so it is not left behind.
    fr.delete();
  }
}
/**
 * Parses the same two-piece input twice: first with the single-quote flag off (quotes are
 * kept as ordinary characters and the result is verified), then with the flag on (the
 * result is only parsed and deleted, because its verification is currently disabled).
 */
@Test public void testSingleQuotes(){
  final String[] pieces = {
      "'Tomass,test,first,line'\n'Tomas''s,test2',test2\nlast,'line''",
      "s, trailing, piece'"
  };

  // Expected cells when single quotes are NOT treated as quote characters.
  final String[][] plainExpected = {
      ar("'Tomass" ,"test" ,"first","line'"),
      ar("'Tomas''s","test2'","test2",null),
      ar("last","'line''s","trailing","piece'")
  };
  final Key rawKey = ParserTest.makeByteVec(pieces);

  ParseSetup plainSetup = ParseSetup.guessSetup(null, StringUtils.bytesOf(pieces[0]), CSV_INFO, (byte)',', 4, false/*single quote*/, ParseSetup.NO_HEADER, null, null, null, null);
  plainSetup._column_types = ParseSetup.strToColumnTypes(new String[]{"Enum", "Enum", "Enum", "Enum"});
  Frame plainFrame = ParseDataset.parse(Key.make(), new Key[]{rawKey}, false, plainSetup);
  testParsed(plainFrame, plainExpected);

  // Expected cells when single quotes ARE treated as quote characters; kept for the
  // disabled check below.
  final String[][] quotedExpected = {
      ar("Tomass,test,first,line", null),
      ar("Tomas''stest2","test2"),
      ar("last", "lines trailing piece")
  };
  ParseSetup quotedSetup = ParseSetup.guessSetup(null, StringUtils.bytesOf(pieces[0]), CSV_INFO, (byte)',', 2, true/*single quote*/, ParseSetup.NO_HEADER, null, null, null, null);
  quotedSetup._column_types = ParseSetup.strToColumnTypes(new String[]{"Enum", "Enum", "Enum", "Enum"});
  Frame quotedFrame = ParseDataset.parse(Key.make(), new Key[]{rawKey}, true, quotedSetup);
  //testParsed(quotedFrame,quotedExpected); // not currently passing
  quotedFrame.delete();
}
/**
 * Parses {@code smalldata/junit/test_quote.csv} and checks that the resulting frame has
 * 11 columns and 7 rows.
 */
@Test public void testSingleQuotes2() {
  Frame fr = parse_test_file("smalldata/junit/test_quote.csv");
  try {
    // JUnit's contract is assertEquals(expected, actual); the expected literal goes first
    // so that failure messages read correctly.
    Assert.assertEquals(11, fr.numCols());
    Assert.assertEquals(7, fr.numRows());
  } finally {
    // Delete the frame even when an assertion above fails, so it is not left behind.
    fr.delete();
  }
}
// Test very sparse data
/**
 * Parses several mostly-zero inputs (single column, two columns with a truncated final
 * line, a long run with periodic ones, and zeros followed by alternating -1001/1001) and
 * checks each parsed frame with {@code ParserTest.testParsed}.
 */
@Test public void testSparse() {
  // Case 1: 101 rows of "0.0" except row 50, which is "1.0".
  double[][] expected = new double[101][1];
  expected[50][0] = 1;
  StringBuilder csv = new StringBuilder();
  for (int row = 0; row < 101; row++) {
    csv.append(row == 50 ? "1.0\n" : "0.0\n");
  }
  Key key = ParserTest.makeByteVec(csv.toString());
  ParserTest.testParsed(ParseDataset.parse(Key.make(), key), expected, 101);

  // Case 2: 101 integer rows of "0" except row 50, which is "2".
  expected = new double[101][1];
  expected[50][0] = 2;
  csv = new StringBuilder();
  for (int row = 0; row < 101; row++) {
    csv.append(row == 50 ? "2\n" : "0\n");
  }
  key = ParserTest.makeByteVec(csv.toString());
  ParserTest.testParsed(ParseDataset.parse(Key.make(), key), expected, 101);

  // Case 3: two columns, one non-zero row, and a final line truncated after its first
  // value; repeated for every separator.
  for (char sep : SEPARATORS) {
    expected = new double[101][2];
    expected[50][0] = 2;
    expected[50][1] = 3;
    expected[100][0] = 0;          // Truncated final line: first value present...
    expected[100][1] = Double.NaN; // ...second value missing.
    csv = new StringBuilder();
    for (int row = 0; row < 100; row++) {
      boolean nonZeroRow = row == 50;
      csv.append(nonZeroRow ? "2" : "0").append(sep).append(nonZeroRow ? "3\n" : "0\n");
    }
    csv.append("0"); // Truncated final line
    key = ParserTest.makeByteVec(csv.toString());
    ParserTest.testParsed(ParseDataset.parse(Key.make(), key), expected, 101);
  }

  // Case 4: 100 groups of 1000 zeros, each followed by a single one (100100 rows in total).
  csv = new StringBuilder();
  expected = new double[100100][1];
  for (int row = 0; row < 100100; row++) {
    if (row % 1001 == 1000) {
      csv.append("1\n");
      expected[row][0] = 1;
    } else {
      csv.append("0\n");
    }
  }
  key = ParserTest.makeByteVec(csv.toString());
  ParserTest.testParsed(ParseDataset.parse(Key.make(), key), expected, 100100);

  // Case 5: 100 zeros, then 100 alternating -1001 / 1001 values (to force a
  // sparse-short that finally inflates to a full dense-short).
  csv = new StringBuilder();
  expected = new double[200][1];
  for (int row = 0; row < 200; row++) {
    if (row < 100) {
      csv.append("0\n");
    } else if (row % 2 == 0) {
      csv.append("-1001\n");
      expected[row][0] = -1001;
    } else {
      csv.append("1001\n");
      expected[row][0] = 1001;
    }
  }
  key = ParserTest.makeByteVec(csv.toString());
  ParserTest.testParsed(ParseDataset.parse(Key.make(), key), expected, 200);
}
// Test correctness of sparse chunks.
// Added after the following data failed to encode properly: the 0s were not
// considered when computing the compression strategy, so lemin was 6108 and
// encoding the zeros overflowed a short.
// As a result, the first column was compressed into a C2SChunk, with the 0s causing short overflow.
/**
 * Parses a 33-row, 6-column input that is all zeros except for four values in the first
 * column (rows 9, 15, 24 and 30) and checks the parsed frame with
 * {@code ParserTest.testParsed}.
 */
@Test public void testSparse2(){
  final int rowCount = 33;
  final int colCount = 6;

  // Expected values: everything is zero except four cells in the first column.
  double[][] expected = new double[rowCount][colCount];
  expected[9][0]  = 35351;
  expected[15][0] = 6108;
  expected[24][0] = 35351;
  expected[30][0] = 6334;

  // Each input line is "<first column>, 0,0,0,0,0\n".
  StringBuilder csv = new StringBuilder();
  for (int row = 0; row < rowCount; row++) {
    csv.append((int) expected[row][0]).append(", 0,0,0,0,0\n");
  }

  Key key = ParserTest.makeByteVec(csv.toString());
  ParserTest.testParsed(ParseDataset.parse(Key.make(), key), expected, 33);
}
/**
 * Manual timing run: adds one billion randomly chosen keys (drawn from 363 distinct UUID
 * strings) to a {@link Categorical} and prints the elapsed time.
 *
 * <p>Marked {@code @Test} as well as {@code @Ignore}: without {@code @Test}, JUnit never
 * discovers the method and the {@code @Ignore} annotation has no effect. With both, the
 * method is reported as ignored and can be run by removing {@code @Ignore}.
 */
@Test
@Ignore("Manual benchmark: one billion iterations, too slow for regular test runs")
public void testSpeedOfCategoricalUpdate() {
  Categorical cat = new Categorical();
  int numOfUniqueCats = 363;
  String[] values = new String[numOfUniqueCats];
  for (int i = 0; i < numOfUniqueCats; i++) {
    values[i] = UUID.randomUUID().toString();
  }
  int numOfIterations = 1000000000;
  // Fixed seed so the sequence of chosen indices is the same on every run.
  Random random = new Random(0xf267deadbabecafeL);
  BufferedString bs = new BufferedString();
  long startTime = System.currentTimeMillis();
  for (int i = 0; i < numOfIterations; i++) {
    int idx = random.nextInt(numOfUniqueCats);
    bs.set(StringUtils.bytesOf(values[idx]));
    cat.addKey(bs);
    if (i % 10000000 == 0) {
      System.out.println("Iterations: " + i);
    }
  }
  System.out.println("Time: " + PrettyPrint.msecs(System.currentTimeMillis() - startTime, false));
}
}