/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.util; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.FileReader; import org.apache.avro.file.SeekableInput; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.mapred.FsInput; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.testng.Assert; import org.testng.annotations.Test; public class AvroUtilsTest { private static final String AVRO_DIR = "gobblin-utility/src/test/resources/avroDirParent/"; @Test public void testGetDirectorySchema() throws IOException { Configuration conf = new Configuration(); conf.set("fs.default.name", "file:///"); conf.set("mapred.job.tracker", "local"); Path mockAvroFilePath = new Path(AVRO_DIR); Assert.assertNotNull(AvroUtils.getDirectorySchema(mockAvroFilePath, conf, true)); } /** * Test nullifying fields for non-union types, including array. */ @Test public void testNullifyFieldForNonUnionSchemaMerge() { Schema oldSchema1 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": \"int\"}" + "]}"); Schema newSchema1 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}" + "]}"); Schema expectedOutputSchema1 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [\"null\", \"int\"]}" + "]}"); Assert.assertEquals(expectedOutputSchema1, AvroUtils.nullifyFieldsForSchemaMerge(oldSchema1, newSchema1)); Schema oldSchema2 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": {\"type\": \"array\", \"items\": \"string\"}}" + "]}"); Schema newSchema2 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}" + "]}"); Schema expectedOutputSchema2 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [\"null\", {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Assert.assertEquals(expectedOutputSchema2, AvroUtils.nullifyFieldsForSchemaMerge(oldSchema2, newSchema2)); } /** * Test nullifying fields for union type. One case does not have "null", and the other case already has a default "null". */ @Test public void testNullifyFieldForUnionSchemaMerge() { Schema oldSchema1 = new Schema.Parser() .parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [{\"type\": \"string\"}, {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Schema newSchema1 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}" + "]}"); Schema expectedOutputSchema1 = new Schema.Parser() .parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [\"null\", {\"type\": \"string\"}, {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Assert.assertEquals(expectedOutputSchema1, AvroUtils.nullifyFieldsForSchemaMerge(oldSchema1, newSchema1)); Schema oldSchema2 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [\"null\", {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Schema newSchema2 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}" + "]}"); Schema expectedOutputSchema2 = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [\"null\", {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Assert.assertEquals(expectedOutputSchema2, AvroUtils.nullifyFieldsForSchemaMerge(oldSchema2, newSchema2)); } /** * Test nullifying fields when more than one field is removed in the new schema. */ @Test public void testNullifyFieldForMultipleFieldsRemoved() { Schema oldSchema = new Schema.Parser() .parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"color\", \"type\": \"string\"}, " + "{\"name\": \"number\", \"type\": [{\"type\": \"string\"}, {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Schema newSchema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}" + "]}"); Schema expectedOutputSchema = new Schema.Parser() .parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"color\", \"type\": [\"null\", \"string\"]}, " + "{\"name\": \"number\", \"type\": [\"null\", {\"type\": \"string\"}, {\"type\": \"array\", \"items\": \"string\"}]}" + "]}"); Assert.assertEquals(expectedOutputSchema, AvroUtils.nullifyFieldsForSchemaMerge(oldSchema, newSchema)); } /** * Test nullifying fields when one schema is not record type. */ @Test public void testNullifyFieldWhenOldSchemaNotRecord() { Schema oldSchema = new Schema.Parser().parse("{\"type\": \"array\", \"items\": \"string\"}"); Schema newSchema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}" + "]}"); Schema expectedOutputSchema = newSchema; Assert.assertEquals(expectedOutputSchema, AvroUtils.nullifyFieldsForSchemaMerge(oldSchema, newSchema)); } @Test public void testSwitchName() { String originalName = "originalName"; String newName = "newName"; Schema schema = SchemaBuilder.record(originalName).fields(). requiredDouble("double").optionalFloat("float").endRecord(); Schema newSchema = AvroUtils.switchName(schema, newName); Assert.assertEquals(newSchema.getName(), newName); for(Schema.Field field : newSchema.getFields()) { Assert.assertEquals(field, schema.getField(field.name())); } Assert.assertEquals(newName, AvroUtils.switchName(schema, newName).getName()); Assert.assertEquals(schema, AvroUtils.switchName(AvroUtils.switchName(schema, newName), schema.getName())); } @Test public void testSerializeAsPath() throws Exception { Schema schema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"test\", " + "\"fields\":[" + "{\"name\": \"name\", \"type\": \"string\"}, " + "{\"name\": \"title\", \"type\": \"string\"}" + "]}"); GenericRecord partition = new GenericData.Record(schema); partition.put("name", "a/b:c\\d e"); partition.put("title", "title"); Assert.assertEquals(AvroUtils.serializeAsPath(partition, true, true), new Path("name=a_b_c_d_e/title=title")); Assert.assertEquals(AvroUtils.serializeAsPath(partition, false, true), new Path("a_b_c_d_e/title")); Assert.assertEquals(AvroUtils.serializeAsPath(partition, false, false), new Path("a/b_c_d_e/title")); } public static List<GenericRecord> getRecordFromFile(String path) throws IOException { Configuration config = new Configuration(); SeekableInput input = new FsInput(new Path(path), config); DatumReader<GenericRecord> reader1 = new GenericDatumReader<>(); FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader1); List<GenericRecord> records = new ArrayList<>(); for (GenericRecord datum : fileReader) { records.add(datum); } fileReader.close(); return records; } /** * This is a test to validate support of maps in {@link gobblin.util.AvroUtils#getFieldValue(GenericRecord, String)} * and {@link gobblin.util.AvroUtils#getFieldSchema(Schema, String)} * @throws IOException */ @Test public void testGetObjectFromMap() throws IOException { final String TEST_FIELD_LOCATION = "Map.stringKey.Field"; String avroFilePath = this.AVRO_DIR + "avroDir/avroUtilsTestFile.avro"; GenericRecord record = getRecordFromFile(avroFilePath).get(0); Assert.assertEquals(AvroUtils.getFieldValue(record, TEST_FIELD_LOCATION).get().toString(), "stringValue2"); Assert.assertEquals(AvroUtils.getFieldSchema(record.getSchema(), TEST_FIELD_LOCATION).get().getType(), Schema.Type.STRING); } /** * In case of complex data types in union {@link AvroUtils#getFieldSchema(Schema, String)} should throw {@link AvroRuntimeException} * @throws IOException */ @Test(expectedExceptions = AvroRuntimeException.class) public void testComplexTypesInUnionNotSupported() throws IOException { final String TEST_LOCATION = "TestUnionObject.RecordInUnion"; String avroFilePath = this.AVRO_DIR + "avroDir/avroUtilsTestFile.avro"; GenericRecord record = getRecordFromFile(avroFilePath).get(0); AvroUtils.getFieldSchema(record.getSchema(), TEST_LOCATION); } }