/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.avro;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.drill.exec.util.JsonStringArrayList;
import org.apache.drill.exec.util.JsonStringHashMap;
import org.apache.drill.exec.util.Text;
/**
* Utilities for generating Avro test data.
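* <p>
* A typical use from a Drill unit test might look like the sketch below
* (illustrative only; {@code testBuilder()} is assumed to come from the
* Drill test framework and the query path is hypothetical):
* <pre>{@code
* AvroTestRecordWriter testSetup = AvroTestUtil.generateSimplePrimitiveSchema_NoNullValues();
* testBuilder()
*     .sqlQuery("select * from dfs.`" + testSetup.getFilePath() + "`")
*     .unOrdered()
*     .baselineRecords(testSetup.getExpectedRecords())
*     .go();
* }</pre>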
*/
public class AvroTestUtil {
public static final int RECORD_COUNT = 10_000;
public static final int ARRAY_SIZE = 20;
/**
* Writes records to an Avro file while simultaneously building a
* corresponding list of expected records in the format consumed by the
* Drill test builder to describe expected results.
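* <p>
* A minimal usage sketch, as the generator methods in this class do
* (the field name below is purely illustrative):
* <pre>{@code
* AvroTestRecordWriter writer = new AvroTestRecordWriter(schema, file);
* try {
*   writer.startRecord();
*   writer.put("a_string", "some value");
*   writer.endRecord();
* } finally {
*   writer.close();
* }
* List<Map<String, Object>> expected = writer.getExpectedRecords();
* }</pre>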
*/
public static class AvroTestRecordWriter implements Closeable {
private final List<Map<String, Object>> expectedRecords;
private GenericData.Record currentAvroRecord;
private TreeMap<String, Object> currentExpectedRecord;
private final Schema schema;
private final DataFileWriter<GenericData.Record> writer;
private final String filePath;
private AvroTestRecordWriter(Schema schema, File file) {
writer = new DataFileWriter<GenericData.Record>(new GenericDatumWriter<GenericData.Record>(schema));
try {
writer.create(schema, file);
} catch (IOException e) {
throw new RuntimeException("Error creating file in Avro test setup.", e);
}
this.schema = schema;
currentExpectedRecord = new TreeMap<>();
expectedRecords = new ArrayList<>();
filePath = file.getAbsolutePath();
}
public void startRecord() {
currentAvroRecord = new GenericData.Record(schema);
currentExpectedRecord = new TreeMap<>();
}
public void put(String key, Object value) {
currentAvroRecord.put(key, value);
// convert binary values into byte[], the form in which they appear
// in the Drill result set produced by the test framework
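// column names are back-quoted to match how they are referenced in the baseline records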
currentExpectedRecord.put("`" + key + "`", convertAvroValToDrill(value, true));
}
// TODO - fix the test wrapper to remove the need for this hack, which makes
// the root level behave differently from nested fields for String vs. Text
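/**
* Recursively converts an Avro value into the representation expected in a
* Drill test baseline record: {@link ByteBuffer}s become byte arrays, nested
* {@link CharSequence}s become {@link Text}, Avro arrays become
* {@link JsonStringArrayList}, enum symbols become Strings, and nested
* records become {@link JsonStringHashMap}.
*
* @param value the Avro value to convert
* @param root whether this value is a top-level column; top-level strings
* are left as String rather than wrapped in Text
* @return the converted value
*/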
private Object convertAvroValToDrill(Object value, boolean root) {
if (value instanceof ByteBuffer) {
ByteBuffer bb = (ByteBuffer) value;
byte[] drillVal = new byte[bb.remaining()];
bb.get(drillVal);
bb.position(0);
value = drillVal;
} else if (!root && value instanceof CharSequence) {
value = new Text(value.toString());
} else if (value instanceof GenericData.Array) {
GenericData.Array<?> array = (GenericData.Array<?>) value;
final JsonStringArrayList<Object> drillList = new JsonStringArrayList<>();
for (Object o : array) {
drillList.add(convertAvroValToDrill(o, false));
}
value = drillList;
} else if (value instanceof GenericData.EnumSymbol) {
value = value.toString();
} else if (value instanceof GenericData.Record) {
GenericData.Record rec = (GenericData.Record) value;
final JsonStringHashMap<String, Object> newRecord = new JsonStringHashMap<>();
for (Schema.Field field : rec.getSchema().getFields()) {
Object val = rec.get(field.name());
newRecord.put(field.name(), convertAvroValToDrill(val, false));
}
value = newRecord;
}
return value;
}
public void endRecord() throws IOException {
writer.append(currentAvroRecord);
expectedRecords.add(currentExpectedRecord);
}
@Override
public void close() throws IOException {
writer.close();
}
public String getFilePath() {
return filePath;
}
public List<Map<String, Object>> getExpectedRecords() {
return expectedRecords;
}
}
public static AvroTestRecordWriter generateSimplePrimitiveSchema_NoNullValues() throws Exception {
return generateSimplePrimitiveSchema_NoNullValues(RECORD_COUNT);
}
public static AvroTestRecordWriter generateSimplePrimitiveSchema_NoNullValues(int numRecords) throws Exception {
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_long").type().longType().noDefault()
.name("d_float").type().floatType().noDefault()
.name("e_double").type().doubleType().noDefault()
.name("f_bytes").type().bytesType().noDefault()
.name("g_null").type().nullType().noDefault()
.name("h_boolean").type().booleanType().noDefault()
.endRecord();
final File file = File.createTempFile("avro-primitive-test", ".avro");
file.deleteOnExit();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
ByteBuffer bb = ByteBuffer.allocate(2);
bb.put(0, (byte) 'a');
for (int i = 0; i < numRecords; i++) {
bb.put(1, (byte) ('0' + (i % 10)));
bb.position(0);
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
record.put("c_long", (long) i);
record.put("d_float", (float) i);
record.put("e_double", (double) i);
record.put("f_bytes", bb);
record.put("g_null", null);
record.put("h_boolean", (i % 2 == 0));
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateUnionSchema_WithNullValues() throws Exception {
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_long").type().longType().noDefault()
.name("d_float").type().floatType().noDefault()
.name("e_double").type().doubleType().noDefault()
.name("f_bytes").type().bytesType().noDefault()
.name("g_null").type().nullType().noDefault()
.name("h_boolean").type().booleanType().noDefault()
.name("i_union").type().optional().doubleType()
.endRecord();
final File file = File.createTempFile("avro-primitive-test", ".avro");
file.deleteOnExit();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
ByteBuffer bb = ByteBuffer.allocate(1);
bb.put(0, (byte) 1);
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
record.put("c_long", (long) i);
record.put("d_float", (float) i);
record.put("e_double", (double) i);
record.put("f_bytes", bb);
record.put("g_null", null);
record.put("h_boolean", (i % 2 == 0));
record.put("i_union", (i % 2 == 0 ? (double) i : null));
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateUnionSchema_WithNonNullValues() throws Exception {
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_long").type().longType().noDefault()
.name("d_float").type().floatType().noDefault()
.name("e_double").type().doubleType().noDefault()
.name("f_bytes").type().bytesType().noDefault()
.name("g_null").type().nullType().noDefault()
.name("h_boolean").type().booleanType().noDefault()
.name("i_union").type().unionOf().doubleType().and().longType().endUnion().noDefault()
.endRecord();
final File file = File.createTempFile("avro-primitive-test", ".avro");
file.deleteOnExit();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
ByteBuffer bb = ByteBuffer.allocate(1);
bb.put(0, (byte) 1);
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
record.put("c_long", (long) i);
record.put("d_float", (float) i);
record.put("e_double", (double) i);
record.put("f_bytes", bb);
record.put("g_null", null);
record.put("h_boolean", (i % 2 == 0));
record.put("i_union", (i % 2 == 0 ? (double) i : (long) i));
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateSimpleEnumSchema_NoNullValues() throws Exception {
final String[] symbols = { "E_SYM_A", "E_SYM_B", "E_SYM_C", "E_SYM_D" };
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_enum").type().enumeration("my_enum").symbols(symbols).noDefault()
.endRecord();
final File file = File.createTempFile("avro-primitive-test", ".avro");
file.deleteOnExit();
final Schema enumSchema = schema.getField("b_enum").schema();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
final GenericData.EnumSymbol symbol =
new GenericData.EnumSymbol(enumSchema, symbols[i % symbols.length]);
record.put("b_enum", symbol);
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateSimpleArraySchema_NoNullValues() throws Exception {
final File file = File.createTempFile("avro-array-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_string_array").type().array().items().stringType().noDefault()
.name("d_int_array").type().array().items().intType().noDefault()
.name("e_float_array").type().array().items().floatType().noDefault()
.endRecord();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
{
GenericArray<String> array = new GenericData.Array<>(ARRAY_SIZE, schema.getField("c_string_array").schema());
for (int j = 0; j < ARRAY_SIZE; j++) {
array.add(j, "c_string_array_" + i + "_" + j);
}
record.put("c_string_array", array);
}
{
GenericArray<Integer> array = new GenericData.Array<>(ARRAY_SIZE, schema.getField("d_int_array").schema());
for (int j = 0; j < ARRAY_SIZE; j++) {
array.add(j, i * j);
}
record.put("d_int_array", array);
}
{
GenericArray<Float> array = new GenericData.Array<>(ARRAY_SIZE, schema.getField("e_float_array").schema());
for (int j = 0; j < ARRAY_SIZE; j++) {
array.add(j, (float) (i * j));
}
record.put("e_float_array", array);
}
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateSimpleNestedSchema_NoNullValues() throws Exception {
final File file = File.createTempFile("avro-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_record").type().record("my_record_1")
.namespace("foo.blah.org")
.fields()
.name("nested_1_string").type().stringType().noDefault()
.name("nested_1_int").type().intType().noDefault()
.endRecord()
.noDefault()
.endRecord();
final Schema nestedSchema = schema.getField("c_record").schema();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
final GenericRecord nestedRecord = new GenericData.Record(nestedSchema);
nestedRecord.put("nested_1_string", "nested_1_string_" + i);
nestedRecord.put("nested_1_int", i * i);
record.put("c_record", nestedRecord);
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateUnionNestedArraySchema_withNullValues() throws Exception {
final File file = File.createTempFile("avro-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_array").type().optional().array().items().record("my_record_1")
.namespace("foo.blah.org").fields()
.name("nested_1_string").type().optional().stringType()
.name("nested_1_int").type().optional().intType()
.endRecord()
.endRecord();
final Schema nestedSchema = schema.getField("c_array").schema();
final Schema arraySchema = nestedSchema.getTypes().get(1);
final Schema itemSchema = arraySchema.getElementType();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
if (i % 2 == 0) {
GenericArray<GenericRecord> array = new GenericData.Array<>(1, arraySchema);
final GenericRecord nestedRecord = new GenericData.Record(itemSchema);
nestedRecord.put("nested_1_string", "nested_1_string_" + i);
nestedRecord.put("nested_1_int", i * i);
array.add(nestedRecord);
record.put("c_array", array);
}
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateNestedArraySchema() throws IOException {
return generateNestedArraySchema(RECORD_COUNT, ARRAY_SIZE);
}
public static AvroTestRecordWriter generateNestedArraySchema(int numRecords, int numArrayItems) throws IOException {
final File file = File.createTempFile("avro-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_int").type().intType().noDefault()
.name("b_array").type().array().items().record("my_record_1")
.namespace("foo.blah.org").fields()
.name("nested_1_int").type().optional().intType()
.endRecord()
.arrayDefault(Collections.emptyList())
.endRecord();
final Schema arraySchema = schema.getField("b_array").schema();
final Schema itemSchema = arraySchema.getElementType();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < numRecords; i++) {
record.startRecord();
record.put("a_int", i);
GenericArray<GenericRecord> array = new GenericData.Array<>(numArrayItems, arraySchema);
for (int j = 0; j < numArrayItems; j++) {
final GenericRecord nestedRecord = new GenericData.Record(itemSchema);
nestedRecord.put("nested_1_int", j);
array.add(nestedRecord);
}
record.put("b_array", array);
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateMapSchema_withNullValues() throws Exception {
final File file = File.createTempFile("avro-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_map").type().optional().map().values(Schema.create(Type.STRING))
.endRecord();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
if (i % 2 == 0) {
Map<String, String> strMap = new HashMap<>();
strMap.put("key1", "nested_1_string_" + i);
strMap.put("key2", "nested_1_string_" + (i + 1 ));
record.put("c_map", strMap);
}
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateMapSchemaComplex_withNullValues() throws Exception {
final File file = File.createTempFile("avro-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_map").type().optional().map().values(Schema.create(Type.STRING))
.name("d_map").type().optional().map().values(Schema.createArray(Schema.create(Type.DOUBLE)))
.endRecord();
final Schema arrayMapSchema = schema.getField("d_map").schema();
final Schema doubleArraySchema = arrayMapSchema.getTypes().get(1).getValueType();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
if (i % 2 == 0) {
Map<String, String> c_map = new HashMap<>();
c_map.put("key1", "nested_1_string_" + i);
c_map.put("key2", "nested_1_string_" + (i + 1 ));
record.put("c_map", c_map);
} else {
Map<String, GenericArray<Double>> d_map = new HashMap<>();
GenericArray<Double> array = new GenericData.Array<>(ARRAY_SIZE, doubleArraySchema);
for (int j = 0; j < ARRAY_SIZE; j++) {
array.add((double)j);
}
d_map.put("key1", array);
d_map.put("key2", array);
record.put("d_map", d_map);
}
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateUnionNestedSchema_withNullValues() throws Exception {
final File file = File.createTempFile("avro-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_record").type().optional().record("my_record_1")
.namespace("foo.blah.org").fields()
.name("nested_1_string").type().optional().stringType()
.name("nested_1_int").type().optional().intType()
.endRecord()
.endRecord();
final Schema nestedSchema = schema.getField("c_record").schema();
final Schema optionalSchema = nestedSchema.getTypes().get(1);
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
if (i % 2 == 0) {
final GenericRecord nestedRecord = new GenericData.Record(optionalSchema);
nestedRecord.put("nested_1_string", "nested_1_string_" + i);
nestedRecord.put("nested_1_int", i * i);
record.put("c_record", nestedRecord);
}
record.endRecord();
}
} finally {
record.close();
}
return record;
}
public static AvroTestRecordWriter generateDoubleNestedSchema_NoNullValues() throws Exception {
final File file = File.createTempFile("avro-double-nested-test", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringType().noDefault()
.name("b_int").type().intType().noDefault()
.name("c_record").type().record("my_record_1")
.namespace("foo.blah.org")
.fields()
.name("nested_1_string").type().stringType().noDefault()
.name("nested_1_int").type().intType().noDefault()
.name("nested_1_record").type().record("my_double_nested_record_1")
.namespace("foo.blah.org.rot")
.fields()
.name("double_nested_1_string").type().stringType().noDefault()
.name("double_nested_1_int").type().intType().noDefault()
.endRecord()
.noDefault()
.endRecord()
.noDefault()
.endRecord();
final Schema nestedSchema = schema.getField("c_record").schema();
final Schema doubleNestedSchema = nestedSchema.getField("nested_1_record").schema();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_int", i);
final GenericRecord nestedRecord = new GenericData.Record(nestedSchema);
nestedRecord.put("nested_1_string", "nested_1_string_" + i);
nestedRecord.put("nested_1_int", i * i);
final GenericRecord doubleNestedRecord = new GenericData.Record(doubleNestedSchema);
doubleNestedRecord.put("double_nested_1_string", "double_nested_1_string_" + i + "_" + i);
doubleNestedRecord.put("double_nested_1_int", i * i * i);
nestedRecord.put("nested_1_record", doubleNestedRecord);
record.put("c_record", nestedRecord);
record.endRecord();
}
} finally {
record.close();
}
return record;
}
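/**
* Generates an Avro file containing a recursive "LongList" linked-list
* structure, in which each record's optional "next" field holds another
* LongList record. Unlike the other generators, this method returns only the
* path of the generated file rather than an {@link AvroTestRecordWriter}.
*/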
public static String generateLinkedList() throws Exception {
final File file = File.createTempFile("avro-linkedlist", ".avro");
file.deleteOnExit();
final Schema schema = SchemaBuilder.record("LongList")
.namespace("org.apache.drill.exec.store.avro")
.aliases("LinkedLongs")
.fields()
.name("value").type().optional().longType()
.name("next").type().optional().type("LongList")
.endRecord();
final DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
writer.create(schema, file);
GenericRecord previousRecord = null;
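// Records are appended one iteration late: by the time a record is written,
// the value of its "next" node has already been filled in, so every
// serialized entry carries a populated one-level link.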
try {
for (int i = 0; i < RECORD_COUNT; i++) {
GenericRecord record = (GenericRecord) (previousRecord == null ? new GenericData.Record(schema) : previousRecord.get("next"));
record.put("value", (long) i);
if (previousRecord != null) {
writer.append(previousRecord);
}
GenericRecord nextRecord = new GenericData.Record(record.getSchema());
record.put("next", nextRecord);
previousRecord = record;
}
writer.append(previousRecord);
} finally {
writer.close();
}
return file.getAbsolutePath();
}
public static AvroTestRecordWriter generateStringAndUtf8Data() throws Exception {
final Schema schema = SchemaBuilder.record("AvroRecordReaderTest")
.namespace("org.apache.drill.exec.store.avro")
.fields()
.name("a_string").type().stringBuilder().prop("avro.java.string", "String").endString().noDefault()
.name("b_utf8").type().stringType().noDefault()
.endRecord();
final File file = File.createTempFile("avro-primitive-test", ".avro");
file.deleteOnExit();
final AvroTestRecordWriter record = new AvroTestRecordWriter(schema, file);
try {
for (int i = 0; i < RECORD_COUNT; i++) {
record.startRecord();
record.put("a_string", "a_" + i);
record.put("b_utf8", "b_" + i);
record.endRecord();
}
} finally {
record.close();
}
return record;
}
}