package water.parser;
import com.google.common.io.Files;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import static org.junit.Assert.*;
import water.TestUtil;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.StringUtils;
/**
* Test suite for Avro parser.
*/
public class ParseTestAvro extends TestUtil {

  /** Absolute tolerance used when comparing parsed floating-point values. */
  private static final double EPSILON = 1e-9;

  /** Runs once before the tests and delegates to {@code TestUtil.stall_till_cloudsize(5)}. */
  @BeforeClass
  static public void setup() { TestUtil.stall_till_cloudsize(5); }

  /**
   * Parses the pre-built Avro files referenced below and checks the resulting frames.
   */
  @Test
  public void testParseSimple() {
    // Tests for basic files which are in smalldata.
    // NOTE(review): the int pair passed to each assertion is presumably the expected
    // {ncols, nrows} of the parsed frame - confirm against FrameAssertion.
    FrameAssertion[] assertions = new FrameAssertion[] {
      // sequence100k.avro: the first column is expected to hold the row index
      new FrameAssertion("smalldata/parser/avro/sequence100k.avro", TestUtil.ari(1, 100000)) {
        @Override public void check(Frame f) {
          Vec values = f.vec(0);
          for (int i = 0; i < f.numRows(); i++) {
            assertEquals(i, values.at8(i));
          }
        }
      },
      // episodes.avro: no content check beyond what the assertion itself performs
      new FrameAssertion("smalldata/parser/avro/episodes.avro", TestUtil.ari(3, 8)) {}
    };
    for (FrameAssertion assertion : assertions) {
      assertFrameAssertion(assertion);
    }
  }

  /**
   * Generates a file with one column per Avro primitive type and verifies column names,
   * column types and every cell of the parsed frame.
   */
  @Test
  public void testParsePrimitiveTypes() {
    FrameAssertion[] assertions = new FrameAssertion[] {
      new GenFrameAssertion("supportedPrimTypes.avro", TestUtil.ari(8, 100)) {
        @Override protected File prepareFile() throws IOException {
          return AvroFileGenerator.generatePrimitiveTypes(file, nrows());
        }

        @Override
        public void check(Frame f) {
          assertArrayEquals("Column names need to match!", ar("CString", "CBytes", "CInt", "CLong", "CFloat", "CDouble", "CBoolean", "CNull"), f.names());
          assertArrayEquals("Column types need to match!", ar(Vec.T_STR, Vec.T_STR, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_BAD), f.types());
          int nrows = nrows();
          BufferedString bs = new BufferedString();
          // The generator writes the row index into every column (as text, number or parity bit).
          for (int row = 0; row < nrows; row++) {
            assertEquals("Value in column CString", String.valueOf(row), f.vec(0).atStr(bs, row).bytesToString());
            assertEquals("Value in column CBytes", String.valueOf(row), f.vec(1).atStr(bs, row).bytesToString());
            assertEquals("Value in column CInt", row, f.vec(2).at8(row));
            assertEquals("Value in column CLong", row, f.vec(3).at8(row));
            assertEquals("Value in column CFloat", row, f.vec(4).at(row), EPSILON);
            assertEquals("Value in column CDouble", row, f.vec(5).at(row), EPSILON);
            // CBoolean is column 6; the generator stores (row & 1) == 1 there.
            assertEquals("Value in column CBoolean", (row & 1) == 1, (((int) f.vec(6).at(row)) & 1) == 1);
            assertTrue("Value in column CNull", f.vec(7).isNA(row));
          }
        }
      }
    };
    for (FrameAssertion assertion : assertions) {
      assertFrameAssertion(assertion);
    }
  }

  /**
   * Generates a file whose columns are optional (null-or-primitive unions) and verifies
   * that the first row is NA everywhere and the remaining rows carry the row index.
   */
  @Test
  public void testParseUnionTypes() {
    FrameAssertion[] assertions = new FrameAssertion[] {
      new GenFrameAssertion("unionTypes.avro", TestUtil.ari(7, 101)) {
        @Override protected File prepareFile() throws IOException {
          return AvroFileGenerator.generateUnionTypes(file, nrows());
        }

        @Override
        public void check(Frame f) {
          assertArrayEquals("Column names need to match!", ar("CUString", "CUBytes", "CUInt", "CULong", "CUFloat", "CUDouble", "CUBoolean"), f.names());
          assertArrayEquals("Column types need to match!", ar(Vec.T_STR, Vec.T_STR, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM, Vec.T_NUM), f.types());
          int nrows = nrows();
          BufferedString bs = new BufferedString();
          // NA in the first row: the generator writes null into every column for row 0
          for (int col = 0; col < ncols(); col++) {
            assertTrue("NA should be in first row and col " + col, f.vec(col).isNA(0));
          }
          for (int row = 1; row < nrows; row++) {
            assertEquals("Value in column CUString", String.valueOf(row), f.vec(0).atStr(bs, row).bytesToString());
            assertEquals("Value in column CUBytes", String.valueOf(row), f.vec(1).atStr(bs, row).bytesToString());
            assertEquals("Value in column CUInt", row, f.vec(2).at8(row));
            assertEquals("Value in column CULong", row, f.vec(3).at8(row));
            assertEquals("Value in column CUFloat", row, f.vec(4).at(row), EPSILON);
            assertEquals("Value in column CUDouble", row, f.vec(5).at(row), EPSILON);
            // CUBoolean is column 6; the generator stores (row & 1) == 1 there.
            assertEquals("Value in column CUBoolean", (row & 1) == 1, (((int) f.vec(6).at(row)) & 1) == 1);
          }
        }
      }
    };
    for (FrameAssertion assertion : assertions) {
      assertFrameAssertion(assertion);
    }
  }

  /**
   * Generates a file with a plain enum column and an optional enum column and verifies
   * the categorical domains and the per-row category indices (including NAs).
   */
  @Test
  public void testParseEnumTypes() {
    FrameAssertion[] assertions = new FrameAssertion[] {
      new GenFrameAssertion("enumTypes.avro", TestUtil.ari(2, 100)) {
        // Generated categories: 7 symbols prefixed CAT_A_ and 13 symbols prefixed CAT_B_
        String[][] categories = AvroFileGenerator.generateSymbols(ar("CAT_A_", "CAT_B_"), ari(7, 13));

        @Override protected File prepareFile() throws IOException {
          return AvroFileGenerator.generateEnumTypes(file, nrows(), categories);
        }

        @Override
        public void check(Frame f) {
          assertArrayEquals("Column names need to match!", ar("CEnum", "CUEnum"), f.names());
          assertArrayEquals("Column types need to match!", ar(Vec.T_CAT, Vec.T_CAT), f.types());
          assertArrayEquals("Category names need to match in CEnum!", categories[0], f.vec("CEnum").domain());
          assertArrayEquals("Category names need to match in CUEnum!", categories[1], f.vec("CUEnum").domain());

          int numOfCategories1 = categories[0].length;
          int numOfCategories2 = categories[1].length;
          int nrows = nrows();
          for (int row = 0; row < nrows; row++) {
            assertEquals("Value in column CEnum", row % numOfCategories1, (int) f.vec("CEnum").at(row));
            // The generator writes null into CUEnum on every (numOfCategories2 + 1)-th row.
            if (row % (numOfCategories2 + 1) == 0) {
              assertTrue("NA should be in row " + row + " and col CUEnum", f.vec("CUEnum").isNA(row));
            } else {
              assertEquals("Value in column CUEnum", row % numOfCategories2, (int) f.vec("CUEnum").at(row));
            }
          }
        }
      }
    };
    for (FrameAssertion assertion : assertions) {
      assertFrameAssertion(assertion);
    }
  }
}
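/* A test file generator.
It can be used offline to produce a file for upload into the smalldata S3 bucket;
the ParseTestAvro tests also call it directly to generate their input files.
*/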
class AvroFileGenerator {

  /**
   * Generates a 100-row file of primitive types and prints where it was written.
   *
   * @param args ignored
   * @throws IOException if the file cannot be written
   */
  public static void main(String[] args) throws IOException {
    File f = generatePrimitiveTypes("/tmp/h2o-avro-tests/primitiveTypes.avro", 100);
    // The file lives under a freshly created temp directory, so report its actual location.
    System.out.println("Generated " + f.getAbsolutePath());
  }

  /**
   * Writes an Avro file with one column per primitive type (string, bytes, int, long,
   * float, double, boolean, null). Row {@code i} stores {@code i} in every column: as
   * text for string/bytes, as a number for the numeric types, and as {@code (i & 1) == 1}
   * for the boolean; the null column is always null.
   *
   * @param filename name of the file, resolved against a new temporary directory
   * @param nrows number of records to write
   * @return the written file
   * @throws IOException if the file cannot be created or written
   */
  public static File generatePrimitiveTypes(String filename, int nrows) throws IOException {
    File f = newOutputFile(filename);
    Schema schema = SchemaBuilder.builder()
        .record("test_primitive_types").fields()
        .name("CString").type("string").noDefault()
        .name("CBytes").type("bytes").noDefault()
        .name("CInt").type("int").noDefault()
        .name("CLong").type("long").noDefault()
        .name("CFloat").type("float").noDefault()
        .name("CDouble").type("double").noDefault()
        .name("CBoolean").type("boolean").noDefault()
        .name("CNull").type("null").noDefault()
        .endRecord();
    // Write output records
    DataFileWriter<GenericRecord> dw = newWriter();
    try {
      dw.create(schema, f);
      for (int i = 0; i < nrows; i++) {
        GenericRecord gr = new GenericData.Record(schema);
        gr.put("CString", String.valueOf(i));
        gr.put("CBytes", ByteBuffer.wrap(StringUtils.toBytes(i)));
        gr.put("CInt", i);
        gr.put("CLong", Long.valueOf(i));
        gr.put("CFloat", Float.valueOf(i));
        gr.put("CDouble", Double.valueOf(i));
        gr.put("CBoolean", (i & 1) == 1);
        gr.put("CNull", null);
        dw.append(gr);
      }
      return f;
    } finally {
      dw.close();
    }
  }

  /**
   * Writes an Avro file whose columns are optional primitives (null-or-value unions).
   * Row 0 is null in every column; row {@code i > 0} stores {@code i} the same way as
   * {@link #generatePrimitiveTypes(String, int)}.
   *
   * @param filename name of the file, resolved against a new temporary directory
   * @param nrows number of records to write
   * @return the written file
   * @throws IOException if the file cannot be created or written
   */
  public static File generateUnionTypes(String filename, int nrows) throws IOException {
    File f = newOutputFile(filename);
    // Based on SchemaBuilder javadoc:
    // * The below two field declarations are equivalent:
    // * <pre>
    // *  .name("f").type().unionOf().nullType().and().longType().endUnion().nullDefault()
    // *  .name("f").type().optional().longType()
    // * </pre>
    Schema schema = SchemaBuilder.builder()
        .record("test_union_types").fields()
        .name("CUString").type().optional().stringType()
        .name("CUBytes").type().optional().bytesType()
        .name("CUInt").type().optional().intType()
        .name("CULong").type().optional().longType()
        .name("CUFloat").type().optional().floatType()
        .name("CUDouble").type().optional().doubleType()
        .name("CUBoolean").type().optional().booleanType()
        .endRecord();
    DataFileWriter<GenericRecord> dw = newWriter();
    try {
      dw.create(schema, f);
      for (int i = 0; i < nrows; i++) {
        GenericRecord gr = new GenericData.Record(schema);
        gr.put("CUString", i == 0 ? null : String.valueOf(i));
        gr.put("CUBytes", i == 0 ? null : ByteBuffer.wrap(StringUtils.toBytes(i)));
        gr.put("CUInt", i == 0 ? null : i);
        gr.put("CULong", i == 0 ? null : Long.valueOf(i));
        gr.put("CUFloat", i == 0 ? null : Float.valueOf(i));
        gr.put("CUDouble", i == 0 ? null : Double.valueOf(i));
        gr.put("CUBoolean", i == 0 ? null : (i & 1) == 1);
        dw.append(gr);
      }
      return f;
    } finally {
      dw.close();
    }
  }

  /**
   * Writes an Avro file with a mandatory enum column {@code CEnum} and an optional enum
   * column {@code CUEnum}. Row {@code i} cycles through the symbols of each column;
   * {@code CUEnum} is null whenever {@code i % (categories[1].length + 1) == 0}.
   *
   * @param filename name of the file, resolved against a new temporary directory
   * @param nrows number of records to write
   * @param categories symbols for the two columns; must contain exactly two arrays
   * @return the written file
   * @throws IOException if the file cannot be created or written
   */
  public static File generateEnumTypes(String filename, int nrows, String[][] categories) throws IOException {
    assert categories.length == 2 : "Needs only 2 columns";
    File f = newOutputFile(filename);
    Schema enumSchema1 = SchemaBuilder.enumeration("CEnum1").symbols(categories[0]);
    Schema enumSchema2 = SchemaBuilder.enumeration("CEnum2").symbols(categories[1]);
    Schema schema = SchemaBuilder.builder()
        .record("test_enum_types").fields()
        .name("CEnum").type(enumSchema1).noDefault()
        .name("CUEnum").type().optional().type(enumSchema2)
        .endRecord();

    int numOfCategories1 = categories[0].length;
    int numOfCategories2 = categories[1].length;
    DataFileWriter<GenericRecord> dw = newWriter();
    try {
      dw.create(schema, f);
      for (int i = 0; i < nrows; i++) {
        GenericRecord gr = new GenericData.Record(schema);
        gr.put("CEnum", new GenericData.EnumSymbol(enumSchema1, categories[0][i % numOfCategories1]));
        gr.put("CUEnum", i % (numOfCategories2 + 1) == 0 ? null : new GenericData.EnumSymbol(enumSchema2, categories[1][i % numOfCategories2]));
        dw.append(gr);
      }
      return f;
    } finally {
      dw.close();
    }
  }

  /**
   * Generates one symbol array per prefix.
   *
   * @param prefix symbol prefixes, one per output array
   * @param num number of symbols to generate for the prefix at the same index
   * @return array whose i-th element is {@code generateSymbols(prefix[i], num[i])}
   */
  public static String[][] generateSymbols(String[] prefix, int[] num) {
    assert prefix.length == num.length;
    String[][] symbols = new String[prefix.length][];
    for (int i = 0; i < prefix.length; i++) {
      symbols[i] = generateSymbols(prefix[i], num[i]);
    }
    return symbols;
  }

  /**
   * Generates the symbols {@code prefix + 0} through {@code prefix + (num - 1)}.
   *
   * @param prefix text prepended to each index
   * @param num number of symbols to generate
   * @return the generated symbols in index order
   */
  public static String[] generateSymbols(String prefix, int num) {
    String[] symbols = new String[num];
    for (int i = 0; i < num; i++) {
      symbols[i] = prefix + i;
    }
    return symbols;
  }

  /**
   * Resolves {@code filename} against a newly created temporary directory and makes sure
   * the directory that will contain the file exists. The extra step matters when
   * {@code filename} contains path separators (as in {@link #main(String[])}): the
   * intermediate directories do not exist in a fresh temp directory.
   */
  private static File newOutputFile(String filename) throws IOException {
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    File dir = f.getParentFile();
    if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
      throw new IOException("Cannot create directory " + dir);
    }
    return f;
  }

  /** Creates an unopened data-file writer for generic records. */
  private static DataFileWriter<GenericRecord> newWriter() {
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    return new DataFileWriter<GenericRecord>(w);
  }
}