/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.apache.pig.piggybank.test.storage.avro; import static org.apache.pig.builtin.mock.Storage.resetData; import static org.apache.pig.builtin.mock.Storage.schema; import static org.apache.pig.builtin.mock.Storage.tuple; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import org.apache.avro.file.DataFileStream; import org.apache.avro.generic.GenericDatumReader; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.pig.ExecType; import org.apache.pig.LoadFunc; import org.apache.pig.PigConfiguration; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.executionengine.ExecJob; import 
org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS; import org.apache.pig.backend.hadoop.executionengine.JobCreationException; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration; import org.apache.pig.builtin.mock.Storage.Data; import org.apache.pig.data.Tuple; import org.apache.pig.impl.io.FileLocalizer; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.piggybank.storage.avro.PigSchema2Avro; import org.apache.pig.test.Util; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; public class TestAvroStorage { protected static final Log LOG = LogFactory.getLog(TestAvroStorage.class); private static PigServer pigServerLocal = null; final private static String basedir = "src/test/java/org/apache/pig/piggybank/test/storage/avro/avro_test_files/"; private static String outbasedir; public static final PathFilter hiddenPathFilter = new PathFilter() { @Override public boolean accept(Path p) { String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); } }; private static String getInputFile(String file) { String locations[] = LoadFunc.getPathStrings(file); if (locations.length == 1) return System.getProperty("user.dir") + "/" + basedir + file; else { ArrayList<String> pathStrings = new ArrayList<String>(); for (int index = 0; index < locations.length; index++) { String f = System.getProperty("user.dir") + "/" + basedir + locations[index].trim(); pathStrings.add(f); } return LoadFunc.join(pathStrings, ","); } } final private String testDir1 = getInputFile("test_dir1"); final private String testDir1AllFiles = getInputFile("test_dir1/*"); final private String testDir1Files123 = getInputFile("test_dir1/test_glob{1,2,3}.avro"); final private String testDir1Files321 = getInputFile("test_dir1/test_glob{3,2,1}.avro"); final private String testDir12AllFiles = getInputFile("{test_dir1,test_dir2}/test_glob*.avro"); final private String 
testDir21AllFiles = getInputFile("{test_dir2,test_dir1}/test_glob*.avro"); final private String testCommaSeparated1 = getInputFile("test_dir1/test_glob1.avro,test_dir1/test_glob2.avro,test_dir1/test_glob3.avro"); final private String testCommaSeparated2 = getInputFile("test_dir1/test_glob*,test_dir2/test_glob4.avro,test_dir2/test_glob5.avro"); final private String testNoMatchedFiles = getInputFile("test_dir{1,2}/file_that_does_not_exist*.avro"); final private String testArrayFile = getInputFile("test_array.avro"); final private String testArraySchema = getInputFile("test_array.avsc"); final private String testRecordFile = getInputFile("test_record.avro"); final private String testRecordSchema = getInputFile("test_record.avsc"); final private String testGenericUnionFile = getInputFile("test_generic_union.avro"); final private String testRecursiveRecordInMap = getInputFile("test_recursive_record_in_map.avro"); final private String testRecursiveRecordInArray = getInputFile("test_recursive_record_in_array.avro"); final private String testRecursiveRecordInUnion = getInputFile("test_recursive_record_in_union.avro"); final private String testRecursiveRecordInRecord = getInputFile("test_recursive_record_in_record.avro"); final private String testRecursiveRecordInUnionSchema = getInputFile("test_recursive_record_in_union.avsc"); final private String testTextFile = getInputFile("test_record.txt"); final private String testSingleTupleBagFile = getInputFile("messages.avro"); final private String testNoExtensionFile = getInputFile("test_no_extension"); final private String recursiveRecordInMap = " {" + " \"type\" : \"record\"," + " \"name\" : \"recursive_record\"," + " \"fields\" : [ {" + " \"name\" : \"id\"," + " \"type\" : \"int\"" + " }, {" + " \"name\" : \"nested\"," + " \"type\" : [ \"null\", {" + " \"type\" : \"map\"," + " \"values\" : \"recursive_record\"" + " } ]" + " } ]" + " }"; final private String recursiveRecordInArray = " {" + " \"type\" : \"record\"," + " 
\"name\" : \"recursive_record\"," + " \"fields\" : [ {" + " \"name\" : \"id\"," + " \"type\" : \"int\"" + " }, {" + " \"name\" : \"nested\"," + " \"type\" : [ \"null\", {" + " \"type\" : \"array\"," + " \"items\" : \"recursive_record\"" + " } ]" + " } ]" + " }"; final private String recursiveRecordInUnion = " {" + " \"type\" : \"record\"," + " \"name\" : \"recursive_record\"," + " \"fields\" : [ {" + " \"name\" : \"value\"," + " \"type\" : \"int\"" + " }, {" + " \"name\" : \"next\"," + " \"type\" : [ \"null\", \"recursive_record\" ]" + " } ]" + " }"; final private String recursiveRecordInRecord = " {" + " \"type\" : \"record\"," + " \"name\" : \"recursive_record\"," + " \"fields\" : [ {" + " \"name\" : \"id\"," + " \"type\" : \"int\"" + " }, {" + " \"name\" : \"nested\"," + " \"type\" : [ \"null\", {" + " \"type\" : \"record\"," + " \"name\" : \"nested_record\"," + " \"fields\" : [ {" + " \"name\" : \"value1\"," + " \"type\" : \"string\"" + " }, {" + " \"name\" : \"next\"," + " \"type\" : \"recursive_record\"" + " }, {" + " \"name\" : \"value2\"," + " \"type\" : \"string\"" + " } ]" + " } ]" + " } ]" + " }"; final private String testCorruptedFile = getInputFile("test_corrupted_file.avro"); final private String testMultipleSchemas1File = getInputFile("test_primitive_types/*"); final private String testMultipleSchemas2File = getInputFile("test_complex_types/*"); final private String testMultipleSchemasWithDefaultValue = getInputFile("test_merge_schemas_default/{Employee{3,4,6}.avro}"); final private String testUserDefinedLoadSchemaFile = getInputFile("test_user_defined_load_schema/*"); final private String testLoadwithNullValues = getInputFile("test_loadavrowithnulls.avro"); @BeforeClass public static void setup() throws ExecException, IOException { pigServerLocal = new PigServer(ExecType.LOCAL); String TMP_DIR = System.getProperty("user.dir") + "/build/test/tmp/"; pigServerLocal.getPigContext().getProperties().setProperty(PigConfiguration.PIG_TEMP_DIR, TMP_DIR); 
outbasedir = FileLocalizer.getTemporaryPath(pigServerLocal.getPigContext()).toString() + "/TestAvroStorage/";
        deleteDirectory(new File(outbasedir));
    }

    // Shuts down the shared local PigServer, provided one was created.
    @AfterClass
    public static void teardown() {
        if (pigServerLocal == null) {
            return;
        }
        pigServerLocal.shutdown();
    }

    @Test
    public void testRecursiveRecordInMap() throws IOException {
        // Round trip: a recursive record nested in a map is loaded and stored
        // back with the schema given explicitly; output must equal the input.
        final String outputPath = outbasedir + "testRecursiveRecordInMap";
        final String expectedPath = testRecursiveRecordInMap;
        deleteDirectory(new File(outputPath));
        final String loadStmt =
            " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInMap)
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();";
        final String storeStmt =
            " STORE in INTO '" + outputPath
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
            + " 'no_schema_check',"
            + " 'schema', '" + recursiveRecordInMap + "' );";
        testAvroStorage(new String[] { loadStmt, storeStmt });
        verifyResults(outputPath, expectedPath);
    }

    @Test
    public void testRecursiveRecordInArray() throws IOException {
        // Round trip: a recursive record nested in an array is loaded and
        // stored back; output must equal the input.
        final String outputPath = outbasedir + "testRecursiveRecordInArray";
        final String expectedPath = testRecursiveRecordInArray;
        deleteDirectory(new File(outputPath));
        final String loadStmt =
            " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInArray)
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();";
        final String storeStmt =
            " STORE in INTO '" + outputPath
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
            + " 'no_schema_check',"
            + " 'schema', '" + recursiveRecordInArray + "' );";
        testAvroStorage(new String[] { loadStmt, storeStmt });
        verifyResults(outputPath, expectedPath);
    }

    @Test
    public void testRecursiveRecordInUnion() throws IOException {
        // Round trip: a recursive record nested in a union is loaded and
        // stored back; output must equal the input.
        final String outputPath = outbasedir + "testRecursiveRecordInUnion";
        final String expectedPath = testRecursiveRecordInUnion;
        deleteDirectory(new File(outputPath));
        final String loadStmt =
            " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();";
        final String storeStmt =
            " STORE in INTO '" + outputPath
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
            + " 'no_schema_check',"
            + " 'schema', '" + recursiveRecordInUnion + "' );";
        testAvroStorage(new String[] { loadStmt, storeStmt });
        verifyResults(outputPath, expectedPath);
    }

    @Test
    public void testRecursiveRecordInRecord() throws IOException {
        // Round trip: a recursive record nested in another record is loaded
        // and stored back; here the schema string is passed through
        // Util.encodeEscape before being embedded in the query.
        final String outputPath = outbasedir + "testRecursiveRecordInRecord";
        final String expectedPath = testRecursiveRecordInRecord;
        deleteDirectory(new File(outputPath));
        final String loadStmt =
            " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInRecord)
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();";
        final String storeStmt =
            " STORE in INTO '" + outputPath
            + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
            + " 'no_schema_check',"
            + " 'schema', '" + Util.encodeEscape(recursiveRecordInRecord) + "' );";
        testAvroStorage(new String[] { loadStmt, storeStmt });
        verifyResults(outputPath, expectedPath);
    }

    @Test
    public void testRecursiveRecordWithSame() throws IOException {
        // Verify that avro schema can be specified via an external avro file
        // instead of a json string.
String output= outbasedir + "testRecursiveRecordWithSame"; String expected = testRecursiveRecordInUnion; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check'," + " 'same', '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' );" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testRecursiveRecordReference1() throws IOException { // The relation 'in' looks like this: // (1,(2,(3,))) // (2,(3,)) // (3,) // $0 looks like this: // (1) // (2) // (3) // Avro file stored after filtering out nulls looks like this: // 1 // 2 // 3 String output= outbasedir + "testRecursiveRecordReference1"; String expected = basedir + "expected_testRecursiveRecordReference1.avro"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " first = FOREACH in GENERATE $0 AS value;", " filtered = FILTER first BY value is not null;", " STORE filtered INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check'," + " 'schema', '\"int\"' );" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testRecursiveRecordReference2() throws IOException { // The relation 'in' looks like this: // (1,(2,(3,))) // (2,(3,)) // (3,) // $1.$0 looks like this: // (2) // (3) // () // Avro file stored after filtering out nulls looks like this: // 2 // 3 String output= outbasedir + "testRecursiveRecordReference2"; String expected = basedir + "expected_testRecursiveRecordReference2.avro"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING 
org.apache.pig.piggybank.storage.avro.AvroStorage ();", " second = FOREACH in GENERATE $1.$0 AS value;", " filtered = FILTER second BY value is not null;", " STORE filtered INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check'," + " 'schema', '\"int\"' );" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testRecursiveRecordReference3() throws IOException { // The relation 'in' looks like this: // (1,(2,(3,))) // (2,(3,)) // (3,) // $1.$1.$0 looks like this: // (3) // () // () // Avro file stored after filtering out nulls looks like this: // 3 String output= outbasedir + "testRecursiveRecordReference3"; String expected = basedir + "expected_testRecursiveRecordReference3.avro"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " third = FOREACH in GENERATE $1.$1.$0 AS value;", " filtered = FILTER third BY value is not null;", " STORE filtered INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check'," + " 'schema', '\"int\"' );" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testRecursiveRecordWithNoAvroSchema() throws IOException { // Verify that recursive records cannot be stored, // if no avro schema is specified either via 'schema' or 'same'. String output= outbasedir + "testRecursiveRecordWithNoAvroSchema"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check' );" }; // Since Avro schema is not specified via the 'schema' parameter, it is // derived from Pig schema. 
Job is expected to fail because this derived // Avro schema (bytes) is not compatible with data (tuples). testAvroStorage(true, queries); } @Test public void testRecursiveRecordWithSchemaCheck() throws IOException { // Verify that recursive records cannot be stored if schema check is enbled. String output= outbasedir + "testRecursiveWithSchemaCheck"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'schema', '" + recursiveRecordInUnion + "' );" }; try { testAvroStorage(queries); Assert.fail("Negative test to test an exception. Should not be succeeding!"); } catch (IOException e) { // An IOException is thrown by AvroStorage during schema check due to incompatible // data types. assertTrue(e.getMessage().contains("bytearray is not compatible with avro")); } } @Test public void testRecursiveRecordWithSchemaFile() throws IOException { // Verify that recursive records cannot be stored if avro schema is specified by 'schema_file'. String output= outbasedir + "testRecursiveWithSchemaFile"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check'," + " 'schema_file', '" + Util.encodeEscape(testRecursiveRecordInUnionSchema) + "' );" }; try { testAvroStorage(queries); Assert.fail("Negative test to test an exception. Should not be succeeding!"); } catch (FrontendException e) { // The IOException thrown by AvroSchemaManager for recursive record is caught // by the Pig frontend, and FrontendException is re-thrown. 
assertTrue(e.getMessage().contains("could not instantiate 'org.apache.pig.piggybank.storage.avro.AvroStorage'")); } } @Test public void testRecursiveRecordWithData() throws IOException { // Verify that recursive records cannot be stored if avro schema is specified by 'data'. String output= outbasedir + "testRecursiveWithData"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" + " 'no_schema_check'," + " 'data', '" + Util.encodeEscape(testRecursiveRecordInUnion) + "' );" }; try { testAvroStorage(queries); Assert.fail("Negative test to test an exception. Should not be succeeding!"); } catch (FrontendException e) { // The IOException thrown by AvroSchemaManager for recursive record is caught // by the Pig frontend, and FrontendException is re-thrown. assertTrue(e.getMessage().contains("could not instantiate 'org.apache.pig.piggybank.storage.avro.AvroStorage'")); } } @Test public void testGenericUnion() throws IOException { // Verify that a FrontendException is thrown if schema has generic union. String output= outbasedir + "testGenericUnion"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testGenericUnionFile) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();", " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" }; try { testAvroStorage(queries); Assert.fail("Negative test to test an exception. Should not be succeeding!"); } catch (FrontendException e) { // The IOException thrown by AvroStorage for generic union is caught // by the Pig frontend, and FrontendException is re-thrown. 
assertTrue(e.getMessage().contains("Cannot get schema")); } } @Test public void testMultipleSchemas1() throws IOException { // Verify that multiple primitive types can be loaded. // Input Avro files have the following schemas: // "int" // "long" // "float" // "double" // "string" // { "type" : "enum", "name" : "foo", "symbols" : [ "6" ] } // Merged Avro schema looks like this: // "string" // The relation 'in' looks like this: (order of rows can be different.) // (6) // (4.0) // (3.0) // (5) // (2) // (1) // Avro file stored after processing looks like this: // "1" // "2" // "3.0" // "4.0" // "5" // "6" String output= outbasedir + "testMultipleSchemas1"; String expected = basedir + "expected_testMultipleSchemas1.avro"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testMultipleSchemas1File) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('multiple_schemas');", " s = FOREACH in GENERATE StringConcat($0);", " o = ORDER s BY $0;", " STORE o INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('schema', '\"string\"');" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testMultipleSchemas2() throws IOException { // Verify that multiple complex types (records) can be loaded. 
// Input Avro files have the following schemas: // { "type" : "record", "name" : "r", "fields" : [ { "name" : "i", "type" : "int" } ] } // { "type" : "record", "name" : "r", "fields" : [ { "name" : "l", "type" : "long" } ] } // { "type" : "record", "name" : "r", "fields" : [ { "name" : "f", "type" : "float" } ] } // { "type" : "record", "name" : "r", "fields" : [ { "name" : "d", "type" : "double" } ] } // { "type" : "record", "name" : "r", "fields" : [ { "name" : "s", "type" : "string" } ] } // { "type" : "record", "name" : "r", "fields" : [ { "name" : "e", "type" : { // "type" : "enum", "name" : "foo", "symbols" : [ "6" ] } } ] } // Merged Avro schema looks like this: // { "type" : "record", // "name" : "merged", // "fields" : [ { "name" : "i", "type" : "int" }, // { "name" : "l", "type" : "long" }, // { "name" : "f", "type" : "float" }, // { "name" : "d", "type" : "double" }, // { "name" : "s", "type" : "string" }, // { "name" : "e", "type" : { // "type" : "enum", "name" : "foo", "symbols" : [ "6" ] } } // ] // } // The relation 'in' looks like this: (order of rows can be different.) // (,,6,,,) // (,,,,4.0,) // (,,,,,3.0) // (,5,,,,) // (,,,2,,) // (1,,,,,) // Avro file stored after processing looks like this: // "1" // "2" // "3.0" // "4.0" // "5" // "6" String output= outbasedir + "testMultipleSchemas2"; String expected = basedir + "expected_testMultipleSchemas2.avro"; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + Util.encodeEscape(testMultipleSchemas2File) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('multiple_schemas');", " f = FOREACH in GENERATE ($0 is not null ? (chararray)$0 : '')," + " ($1 is not null ? (chararray)$1 : '')," + " ($2 is not null ? (chararray)$2 : '')," + " ($3 is not null ? (chararray)$3 : '')," + " ($4 is not null ? (chararray)$4 : '')," + " ($5 is not null ? 
(chararray)$5 : '');", " c = FOREACH f GENERATE StringConcat( $0, $1, $2, $3, $4, $5 );", " o = ORDER c BY $0;", " STORE o INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('schema', '\"string\"');" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testUserDefinedLoadSchema() throws IOException { PigSchema2Avro.setTupleIndex(2); // Verify that user specified schema correctly maps to input schemas // Input Avro files have the following schemas: // name:"string", address:[customField1:"int", addressLine:"string"] // address:[addressLine:"string", customField2:"int"], name:"string" // User Avro schema looks like this: // name:"string", address:[customField1:"int", customField2:"int", customField3:"int"] // This test will confirm that AvroStorage correctly maps fields from writer to reader schema, // dropping, adding, and reordering fields where needed. String output= outbasedir + "testUserDefinedLoadSchema"; String expected = basedir + "expected_testUserDefinedLoadSchema.avro"; String customSchema = "{\"type\": \"record\", \"name\": \"employee\", \"fields\": [ " +"{ \"default\": \"***\", \"type\": \"string\", \"name\": \"name\" }, " +"{ \"name\": \"address\", \"type\": { " +"\"type\": \"record\", \"name\": \"addressDetails\", \"fields\": [ " +"{ \"default\": 0, \"type\": \"int\", \"name\": \"customField1\" }, " +"{ \"default\": 0, \"type\": \"int\", \"name\": \"customField2\" }, " +"{ \"default\": 0, \"type\": \"int\", \"name\": \"customField3\" } " +"] " +"} } " +"] } "; deleteDirectory(new File(output)); String [] queries = { " in = LOAD '" + testUserDefinedLoadSchemaFile + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('schema', '" + customSchema + "');", " o = ORDER in BY name;", " STORE o INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testMultipleSchemasWithDefaultValue() 
throws IOException { // ==> Employee3.avro <== // { // "type" : "record", // "name" : "employee", // "fields":[ // {"name" : "name", "type" : "string", "default" : "NU"}, // {"name" : "age", "type" : "int", "default" : 0 }, // {"name" : "dept", "type": "string", "default" : "DU"} ] } // // ==> Employee4.avro <== // { // "type" : "record", // "name" : "employee", // "fields":[ // {"name" : "name", "type" : "string", "default" : "NU"}, // {"name" : "age", "type" : "int", "default" : 0}, // {"name" : "dept", "type": "string", "default" : "DU"}, // {"name" : "office", "type": "string", "default" : "OU"} ] } // // ==> Employee6.avro <== // { // "type" : "record", // "name" : "employee", // "fields":[ // {"name" : "name", "type" : "string", "default" : "NU"}, // {"name" : "lastname", "type": "string", "default" : "LNU"}, // {"name" : "age", "type" : "int","default" : 0}, // {"name" : "salary", "type": "int", "default" : 0}, // {"name" : "dept", "type": "string","default" : "DU"}, // {"name" : "office", "type": "string","default" : "OU"} ] } // The relation 'in' looks like this: (order of rows can be different.) // Avro file stored after processing looks like this: // The relation 'in' looks like this: (order of rows can be different.) 
// Employee3.avro // (Milo,30,DH) // (Asmya,34,PQ) // (Baljit,23,RS) // // Employee4.avro // (Praj,54,RMX,Champaign) // (Buba,767,HD,Sunnyvale) // (Manku,375,MS,New York) // // Employee6.avro // (Pune,Warriors,60,5466,Astrophysics,UTA) // (Rajsathan,Royals,20,1378,Biochemistry,Stanford) // (Chennai,Superkings,50,7338,Microbiology,Hopkins) // (Mumbai,Indians,20,4468,Applied Math,UAH) // Data file stored after without looks like this with the // following schema and data // {name: chararray,age: int,dept: chararray,office: chararray, // lastname: chararray,salary: int} //(Asmya,34,PQ,OU,LNU,0) //(Baljit,23,RS,OU,LNU,0) //(Buba,767,HD,Sunnyvale,LNU,0) //(Chennai,50,Microbiology,Hopkins,Superkings,7338) //(Manku,375,MS,New York,LNU,0) //(Milo,30,DH,OU,LNU,0) //(Mumbai,20,Applied Math,UAH,Indians,4468) //(Praj,54,RMX,Champaign,LNU,0) //(Pune,60,Astrophysics,UTA,Warriors,5466) //(Rajsathan,20,Biochemistry,Stanford,Royals,1378) Data data = resetData(pigServerLocal); String output= outbasedir + "testMultipleSchemasWithDefaultValue"; deleteDirectory(new File(output)); String expected = basedir + "expected_testMultipleSchemasWithDefaultValue.avro"; String [] queries = { " a = LOAD '" + testMultipleSchemasWithDefaultValue + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('multiple_schemas');", " b = foreach a generate name,age,dept,office,lastname,salary;", " c = filter b by age < 40 ;", " d = order c by name;", " STORE d INTO '" + output+ "' using mock.Storage();" }; testAvroStorage(queries); List<Tuple> out = data.get(output); assertEquals(out + " size", 5, out.size()); assertEquals( schema("name: chararray,age: int,dept: chararray,office: chararray,lastname: chararray,salary: int"), data.getSchema(output)); assertEquals(tuple("Asmya", 34, "PQ", "OU", "LNU", 0), out.get(0)); assertEquals(tuple("Baljit", 23, "RS", "OU", "LNU", 0), out.get(1)); assertEquals(tuple("Milo", 30, "DH", "OU", "LNU", 0), out.get(2)); assertEquals(tuple("Mumbai", 20, "Applied Math", 
"UAH", "Indians", 4468), out.get(3)); assertEquals(tuple("Rajsathan", 20, "Biochemistry", "Stanford", "Royals", 1378), out.get(4)); } @Test // Verify the default values specified in the schema in AvroStorage // are actually written to the schema in the output avro file public void testDefaultValueSchemaWrite() throws IOException { String output = outbasedir + "testDefaultValueSchemaWrite"; String expected = basedir + "expected_testDefaultSchemaWrite.avro"; Data data = resetData(pigServerLocal); data.set("testDefaultValueSchemaWrite", tuple(0,115,115000,115000.1), tuple(1,116,116000,116000.1), tuple(2,117,117000,117000.1), tuple(3,118,118000,118000.1), tuple(4,119,119000,119000.1) ); deleteDirectory(new File(output)); String [] queries = { " a = LOAD 'testDefaultValueSchemaWrite' USING mock.Storage as " + " (id: int, intval:int, longval:long, floatval:float);", " b = foreach a generate id, longval, floatval;", " c = order b by id;", " STORE c INTO '" + output + "' USING "+ " org.apache.pig.piggybank.storage.avro.AvroStorage (' { \"debug\" : 5, \"schema\" : "+ " { \"name\" : \"rmyrecord\", \"type\" : \"record\", \"fields\" : [ { \"name\" : \"id\", "+ " \"type\" : \"int\" , \"default\" : 0 }, { \"name\" : \"longval\", \"type\" : \"long\","+ " \"default\" : 0 }, { \"name\" : \"floatval\", \"type\" : \"float\", \"default\" : 1.0 } ] } } " + " ');" }; testAvroStorage(queries); verifyResults(output, expected); } @Test public void testDir() throws IOException { // Verify that all files in a directory including its sub-directories are loaded. 
String output= outbasedir + "testDir";
        String expected = basedir + "expected_testDir.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testDir1) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Loads {@code test_dir1/*} and compares the stored output with {@code expected_testDir.avro}. */
    @Test
    public void testGlob1() throws IOException {
        // Verify that a glob pattern matches files properly.
        String output = outbasedir + "testGlob1";
        String expected = basedir + "expected_testDir.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testDir1AllFiles) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /** Loads {@code test_dir1/test_glob{1,2,3}.avro} and compares with {@code expected_test_dir_1.avro}. */
    @Test
    public void testGlob2() throws IOException {
        // Verify that comma-separated filenames are escaped properly.
        String output = outbasedir + "testGlob2";
        String expected = basedir + "expected_test_dir_1.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testDir1Files123) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /** Same as {@link #testGlob2()} but with the brace alternatives listed in the order 3,2,1. */
    @Test
    public void testGlob3() throws IOException {
        // Verify that comma-separated filenames are escaped properly.
        String output = outbasedir + "testGlob3";
        String expected = basedir + "expected_test_dir_1.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testDir1Files321) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /** Loads {@code {test_dir1,test_dir2}/test_glob*.avro} and compares with {@code expected_test_dir_1_2.avro}. */
    @Test
    public void testGlob4() throws IOException {
        // Verify that comma-separated directory names are escaped properly.
        String output = outbasedir + "testGlob4";
        String expected = basedir + "expected_test_dir_1_2.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testDir12AllFiles) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /** Same check as {@link #testGlob4()}, using the {@code testDir21AllFiles} input location. */
    @Test
    public void testGlob5() throws IOException {
        // Verify that comma-separated directory names are escaped properly.
        String output = outbasedir + "testGlob5";
        String expected = basedir + "expected_test_dir_1_2.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testDir21AllFiles) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /**
     * Negative test: running the queries against {@code testNoMatchedFiles} must raise a
     * {@link JobCreationException} carrying the expected message.
     */
    @Test
    public void testGlob6() throws IOException {
        // Verify that an IOException is thrown if no files are matched by the glob pattern.
        String output = outbasedir + "testGlob6";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testNoMatchedFiles) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        try {
            testAvroStorage(queries);
            Assert.fail("Negative test to test an exception. Should not be succeeding!");
        } catch (JobCreationException e) {
            // The IOException thrown by AvroStorage for input file not found is caught
            // by the Pig backend, and JobCreationException (a subclass of IOException)
            // is re-thrown while creating a job configuration.
            // JUnit convention: expected value first, actual value second.
            assertEquals("Internal error creating job configuration.", e.getMessage());
        }
    }

    /** Loads the {@code testCommaSeparated1} location and compares with {@code expected_test_dir_1.avro}. */
    @Test
    public void testComma1() throws IOException {
        // Verify that comma-separated file can be processed
        String output = outbasedir + "testComma1";
        String expected = basedir + "expected_test_dir_1.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testCommaSeparated1) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /** Loads the {@code testCommaSeparated2} location and compares with {@code expected_test_dir_1_2.avro}. */
    @Test
    public void testComma2() throws IOException {
        // Verify that comma-separated file can be processed
        String output = outbasedir + "testComma2";
        String expected = basedir + "expected_test_dir_1_2.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testCommaSeparated2) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /** Round-trips {@code testArrayFile} through AvroStorage with no store arguments. */
    @Test
    public void testArrayDefault() throws IOException {
        String output= outbasedir + "testArrayDefault";
        String expected = basedir + "expected_testArrayDefault.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testArrayFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Stores {@code testArrayFile} with an inline {@code 'schema'} argument (array of float). */
    @Test
    public void testArrayWithSchema() throws IOException {
        String output= outbasedir + "testArrayWithSchema";
        String expected = basedir + "expected_testArrayWithSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testArrayFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( " +
               " 'schema', '{\"type\":\"array\",\"items\":\"float\"}' );"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Stores {@code testArrayFile} with a {@code 'schema_uri'} argument pointing at {@code testArraySchema}. */
    @Test
    public void testArrayWithSchemaURI() throws IOException {
        String output= outbasedir + "testArrayWithSchemaURI";
        String expected = basedir + "expected_testArrayWithSchemaURI.avro"; // doubles (not floats) stored
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testArrayFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( " +
               " 'schema_uri', '" + Util.encodeEscape(testArraySchema) + "' );"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Stores {@code testArrayFile} with the JSON argument {@code {"nullable": false }}. */
    @Test
    public void testArrayWithNotNull() throws IOException {
        String output= outbasedir + "testArrayWithNotNull";
        String expected = basedir + "expected_testArrayWithSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testArrayFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( " +
               " '{\"nullable\": false }' );"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Stores {@code testArrayFile} with a {@code 'same'} argument referring back to the input file. */
    @Test
    public void testArrayWithSame() throws IOException {
        String output= outbasedir + "testArrayWithSame";
        String expected = basedir + "expected_testArrayWithSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testArrayFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( " +
               " 'same', '" + Util.encodeEscape(testArrayFile) + "' );"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    /**
     * Runs the default array round-trip on a dedicated local {@link PigServer} configured for
     * snappy output compression, then checks that the written files report the "snappy" codec.
     */
    @Test
    public void testArrayWithSnappyCompression() throws IOException {
        String output= outbasedir + "testArrayWithSnappyCompression";
        String expected = basedir + "expected_testArrayDefault.avro";

        deleteDirectory(new File(output));

        Properties properties = new Properties();
        properties.setProperty(MRConfiguration.OUTPUT_COMPRESS, "true");
        properties.setProperty(MRConfiguration.OUTPUT_COMPRESSION_CODEC, "org.apache.hadoop.io.compress.SnappyCodec");
        properties.setProperty("avro.output.codec", "snappy");
        // A separate server is used here so the compression properties apply only to this test.
        PigServer pigServer = new PigServer(ExecType.LOCAL, properties);
        pigServer.setBatchOn();
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testArrayFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();"
        };
        for (String query: queries){
            pigServer.registerQuery(query);
        }
        pigServer.executeBatch();
        verifyResults(output, expected, "snappy");
    }

    /**
     * Groups the records of {@code testRecordFile} by member_id and stores the counts twice
     * (AvroStorage 'index' 1 with an explicit schema, and 'index' 2), verifying both outputs.
     */
    @Test
    public void testRecordWithSplit() throws IOException {
        PigSchema2Avro.setTupleIndex(0);
        String output1= outbasedir + "testRecordSplit1";
        String output2= outbasedir + "testRecordSplit2";
        String expected1 = basedir + "expected_testRecordSplit1.avro";
        String expected2 = basedir + "expected_testRecordSplit2.avro";
        deleteDirectory(new File(output1));
        deleteDirectory(new File(output2));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testRecordFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " groups = GROUP avro BY member_id;",
           " sc = FOREACH groups GENERATE group AS key, COUNT(avro) AS cnt;",
           " STORE sc INTO '" + output1 + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'{\"index\": 1, " +
                 " \"schema\": {\"type\":\"record\", " +
                 " \"name\":\"result\", " +
                 " \"fields\":[ {\"name\":\"member_id\",\"type\":\"int\"}, " +
                 "{\"name\":\"count\", \"type\":\"long\"} " +
                 "]" +
                 "}" +
                 " }');",
           " STORE sc INTO '" + output2 +
                 " 'USING org.apache.pig.piggybank.storage.avro.AvroStorage ('index', '2');"
        };
        testAvroStorage( queries);
        verifyResults(output1, expected1);
        verifyResults(output2, expected2);
    }

    /** Same as {@link #testRecordWithSplit()} but loading the input from {@code testTextFile} with a declared Pig schema. */
    @Test
    public void testRecordWithSplitFromText() throws IOException {
        PigSchema2Avro.setTupleIndex(0);
        String output1= outbasedir + "testRecordSplitFromText1";
        String output2= outbasedir + "testRecordSplitFromText2";
        String expected1 = basedir + "expected_testRecordSplitFromText1.avro";
        String expected2 = basedir + "expected_testRecordSplitFromText2.avro";
        deleteDirectory(new File(output1));
        deleteDirectory(new File(output2));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testTextFile) + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
           " groups = GROUP avro BY member_id;",
           " sc = FOREACH groups GENERATE group AS key, COUNT(avro) AS cnt;",
           " STORE sc INTO '" + output1 + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'{\"index\": 1, " +
                 " \"schema\": {\"type\":\"record\", " +
                 " \"name\":\"result\", " +
                 " \"fields\":[ {\"name\":\"member_id\",\"type\":\"int\"}, " +
                 "{\"name\":\"count\", \"type\":\"long\"} " +
                 "]" +
                 "}" +
                 " }');",
           " STORE sc INTO '" + output2 +
                 " 'USING org.apache.pig.piggybank.storage.avro.AvroStorage ('index', '2');"
        };
        testAvroStorage( queries);
        verifyResults(output1, expected1);
        verifyResults(output2, expected2);
    }

    /** Filters and projects {@code testRecordFile}, storing with per-field schema arguments taken from a "data" file. */
    @Test
    public void testRecordWithFieldSchema() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output= outbasedir + "testRecordWithFieldSchema";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testRecordFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " avro1 = FILTER avro BY member_id > 1211;",
           " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
           " STORE avro2 INTO '" + output + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'{\"data\": \"" + Util.encodeEscape(testRecordFile) + "\" ," +
                 " \"field0\": \"int\", " +
                 " \"field1\": \"def:browser_id\", " +
                 " \"field3\": \"def:act_content\" " +
                 " }');"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Same as {@link #testRecordWithFieldSchema()} but loading the input from {@code testTextFile}. */
    @Test
    public void testRecordWithFieldSchemaFromText() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output= outbasedir + "testRecordWithFieldSchemaFromText";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testTextFile) + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
           " avro1 = FILTER avro BY member_id > 1211;",
           " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
           " STORE avro2 INTO '" + output + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'{\"data\": \"" + Util.encodeEscape(testRecordFile) + "\" ," +
                 " \"field0\": \"int\", " +
                 " \"field1\": \"def:browser_id\", " +
                 " \"field3\": \"def:act_content\" " +
                 " }');"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Same as {@link #testRecordWithFieldSchemaFromText()} but passing a "schema_file" instead of a "data" file. */
    @Test
    public void testRecordWithFieldSchemaFromTextWithSchemaFile() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output= outbasedir + "testRecordWithFieldSchemaFromTextWithSchemaFile";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testTextFile) + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
           " avro1 = FILTER avro BY member_id > 1211;",
           " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
           " STORE avro2 INTO '" + output + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'{\"schema_file\": \"" + Util.encodeEscape(testRecordSchema) + "\" ," +
                 " \"field0\": \"int\", " +
                 " \"field1\": \"def:browser_id\", " +
                 " \"field3\": \"def:act_content\" " +
                 " }');"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /**
     * Groups {@code testSingleTupleBagFile} by user_id with a nested sort and stores the result;
     * only checks that no job fails.
     */
    @Test
    public void testSingleFieldTuples() throws IOException {
        String output= outbasedir + "testSingleFieldTuples";
        // NOTE(review): 'expected' is computed but the output is never compared against it —
        // confirm whether result verification was intentionally omitted for this test.
        String expected = basedir + "expected_testSingleFieldTuples.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " messages = LOAD '" + Util.encodeEscape(testSingleTupleBagFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " a = foreach (group messages by user_id) { sorted = order messages by message_id DESC; GENERATE group AS user_id, sorted AS messages; };",
           " STORE a INTO '" + output + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage ();"
        };
        testAvroStorage( queries);
    }

    /** Same pipeline as {@link #testRecordWithFieldSchema()} but using {@code testNoExtensionFile} as input and "data" file. */
    @Test
    public void testFileWithNoExtension() throws IOException {
        PigSchema2Avro.setTupleIndex(4);
        String output= outbasedir + "testFileWithNoExtension";
        String expected = basedir + "expected_testFileWithNoExtension.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testNoExtensionFile) + " ' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " avro1 = FILTER avro BY member_id > 1211;",
           " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
           " STORE avro2 INTO '" + output + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'{\"data\": \"" + Util.encodeEscape(testNoExtensionFile) + "\" ," +
                 " \"field0\": \"int\", " +
                 " \"field1\": \"def:browser_id\", " +
                 " \"field3\": \"def:act_content\" " +
                 " }');"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    // Same as testRecordWithFieldSchemaFromTextWithSchemaFile, just without using JSON in the constructor
    @Test
    public void testRecordWithFieldSchemaFromTextWithSchemaFile2() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output= outbasedir + "testRecordWithFieldSchemaFromTextWithSchemaFile2";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " avro = LOAD '" + Util.encodeEscape(testTextFile) + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
           " avro1 = FILTER avro BY member_id > 1211;",
           " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
           " STORE avro2 INTO '" + output + "' " +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage (" +
                 "'schema_file', '" + Util.encodeEscape(testRecordSchema) + "'," +
                 "'field0','int'," +
                 "'field1','def:browser_id'," +
                 "'field3','def:act_content'" +
                 ");"
        };
        testAvroStorage( queries);
        verifyResults(output, expected);
    }

    /** Expects at least one failed job when loading {@code testCorruptedFile} without 'ignore_bad_files'. */
    @Test
    public void testCorruptedFile1() throws IOException {
        // Verify that load fails when bad files are found if ignore_bad_files is disabled.
        String output = outbasedir + "testCorruptedFile1";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testCorruptedFile) + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();"
        };
        // Job is expected to fail for bad files.
        testAvroStorage(true, queries);
    }

    /** Loads {@code testCorruptedFile} with 'ignore_bad_files' and compares with {@code expected_testCorruptedFile.avro}. */
    @Test
    public void testCorruptedFile2() throws IOException {
        // Verify that corrupted files are skipped if ignore_bad_files is enabled.
        // Output is expected to be empty.
        String output = outbasedir + "testCorruptedFile2";
        String expected = basedir + "expected_testCorruptedFile.avro";
        deleteDirectory(new File(output));
        String [] queries = {
           " in = LOAD '" + Util.encodeEscape(testCorruptedFile) + "'" +
                 " USING org.apache.pig.piggybank.storage.avro.AvroStorage ('ignore_bad_files');",
           " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();"
        };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    // Schema for the generated avro file test_loadavrowithnulls.avro
    // ["null",{"type":"record","name":"TUPLE_0",
    // "fields":[
    // {"name":"name","type":["null","string"],"doc":"autogenerated from Pig Field Schema"},
    // {"name":"age","type":["null","int"],"doc":"autogenerated from Pig Field Schema"},
    // {"name":"gpa","type":["null","double"],"doc":"autogenerated from Pig Field Schema"}]}]
    public void testLoadwithNullValues() throws IOException {
        //Input is supposed to have empty tuples
        PigSchema2Avro.setTupleIndex(0);
        Data data = resetData(pigServerLocal);
        String output = outbasedir + "testLoadwithNulls";
        deleteDirectory(new File(output));
        String [] queries = {
           " A = load '" + testLoadwithNullValues + "' USING " +
                 " org.apache.pig.piggybank.storage.avro.AvroStorage(); ",
           " B = order A by name;",
           " store B into '" + output +"' USING mock.Storage();"
        };
        testAvroStorage(queries);
        List<Tuple> out = data.get(output);
        assertEquals(out + " size", 4, out.size());

        assertEquals(schema("name:chararray,age:int,gpa:double"), data.getSchema(output));

        // sorted data ordered by name
        assertEquals(tuple((String)null),out.get(0));
        assertEquals(tuple((String)null),out.get(1));
        assertEquals(tuple("calvin ellison", 24, 0.71), out.get(2));
        assertEquals(tuple("wendy johnson", 60, 0.07), out.get(3));
    }

    /**
     * Stores two mock relations through AvroStorage in one batch, loads both back in a second
     * batch, and checks that tuples and schemas match the original mock data.
     */
    @Test
    public void testMultipleLoadStore() throws Exception {
        PigSchema2Avro.setTupleIndex(0);
        Data data = resetData(pigServerLocal);
        data.set("foo",
                tuple(1, 2, 3),
                tuple(4, 5, 6),
                tuple(7, 8, 9));
        data.set("bar",
                tuple("a", "b", "c"),
                tuple("d", "e", "f"),
                tuple("g", "h", "i"));
        String output = outbasedir + "testMultipleLoadStore";
        deleteDirectory(new File(output));
        String[] storeQuery = {
            "A = LOAD 'foo' USING " + "mock.Storage() as (a1:int, a2:int, a3:int);",
            "B = LOAD 'bar' USING " + "mock.Storage() as (b1:chararray, b2:chararray, b3:chararray);",
            "STORE A into '"+ output +"/A' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();",
            "STORE B into '"+ output +"/B' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();"
        };
        testAvroStorage(storeQuery);
        String[] loadQuery = {
            "C = LOAD '"+ output +"/A' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();",
            "D = LOAD '"+ output +"/B' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();",
            "STORE C into 'foo-actual' USING mock.Storage();",
            "STORE D into 'bar-actual' USING mock.Storage();"
        };
        testAvroStorage(loadQuery);

        assertEquals(data.get("foo"), data.get("foo-actual"));
        assertEquals(data.get("bar"), data.get("bar-actual"));
        assertEquals("{a1: int,a2: int,a3: int}", data.getSchema("foo-actual").toString());
        assertEquals("{b1: chararray,b2: chararray,b3: chararray}", data.getSchema("bar-actual").toString());
    }

    /**
     * Recursively deletes everything below {@code path}. The entry {@code path} itself is not
     * deleted by this call. Deletion is best-effort: results of {@link File#delete()} are ignored.
     *
     * @param path directory whose contents should be removed; a missing path is a no-op
     */
    private static void deleteDirectory (File path) {
        if (!path.exists()) {
            return;
        }
        File [] files = path.listFiles();
        if (files == null) {
            // listFiles() returns null when path is not a directory or an I/O error occurs;
            // there is nothing to iterate over in that case.
            return;
        }
        for (File file: files) {
            if (file.isDirectory()) {
                deleteDirectory(file);
            }
            file.delete();
        }
    }

    /**
     * Runs the given queries as one batch and asserts that no job failed.
     *
     * @param queries Pig Latin statements to register, in order
     */
    private void testAvroStorage(String ...queries) throws IOException {
        testAvroStorage(false, queries);
    }

    /**
     * Registers the non-empty queries on the shared local Pig server in batch mode, executes the
     * batch, and asserts on the number of jobs that ended in {@code JOB_STATUS.FAILED}.
     *
     * @param expectedToFail if true, at least one job must fail; otherwise none may fail
     * @param queries Pig Latin statements to register, in order; null or empty entries are skipped
     */
    private void testAvroStorage(boolean expectedToFail, String ...queries) throws IOException {
        pigServerLocal.setBatchOn();
        for (String query: queries){
            if (query != null && query.length() > 0) {
                pigServerLocal.registerQuery(query);
            }
        }
        int numOfFailedJobs = 0;
        for (ExecJob job : pigServerLocal.executeBatch()) {
            if (job.getStatus().equals(JOB_STATUS.FAILED)) {
                numOfFailedJobs++;
            }
        }
        if (expectedToFail) {
            assertTrue("There was no failed job!", numOfFailedJobs > 0);
        } else {
            // assertEquals reports the actual number of failed jobs on failure.
            assertEquals("There was a failed job!", 0, numOfFailedJobs);
        }
    }

    /**
     * Verifies the output against the expected records, expecting no "avro.codec" metadata.
     *
     * @param outPath directory written by the test
     * @param expectedOutpath location of the expected Avro data
     */
    private void verifyResults(String outPath, String expectedOutpath) throws IOException {
        verifyResults(outPath, expectedOutpath, null);
    }

    /**
     * Reads every non-hidden Avro file under {@code outPath} and asserts that its "avro.codec"
     * metadata equals {@code expectedCodec}, that each record is contained in the expected set,
     * and that the number of records in the file equals the size of the expected set.
     *
     * @param outPath directory written by the test
     * @param expectedOutpath location of the expected Avro data
     * @param expectedCodec expected value of the "avro.codec" metadata entry; may be null
     */
    private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
        FileSystem fs = FileSystem.getLocal(new Configuration()) ;

        /* read in expected results */
        Set<Object> expected = getExpected (expectedOutpath);

        /* read in output results and compare */
        Path output = new Path(outPath);
        assertTrue("Output dir does not exists!", fs.exists(output)
                && fs.getFileStatus(output).isDir());

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
                DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);
                int count = 0;
                try {
                    assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
                    while (in.hasNext()) {
                        Object obj = in.next();
                        assertTrue("Avro result object found that's not expected: " + obj, expected.contains(obj));
                        count++;
                    }
                } finally {
                    // Close the stream even when an assertion above fails.
                    in.close();
                }
                assertEquals(expected.size(), count);
            }
        }
    }

    /**
     * Collects all records from the non-hidden Avro files found under {@code pathstr}.
     *
     * @param pathstr location of the expected Avro data
     * @return the set of records read; duplicates collapse because a set is used
     */
    private Set<Object> getExpected (String pathstr ) throws IOException {
        Set<Object> ret = new HashSet<Object>();
        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in the expected records */
        Path output = new Path(pathstr);
        assertTrue("Expected output does not exists!", fs.exists(output));

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
                DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);
                try {
                    while (in.hasNext()) {
                        Object obj = in.next();
                        ret.add(obj);
                    }
                } finally {
                    // Close the stream even when reading a record throws.
                    in.close();
                }
            }
        }
        return ret;
    }

}