/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.spi.filesystem;

import java.io.IOException;
import java.net.URI;
import java.util.Date;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.reflect.ReflectData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.Test;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Formats;
import org.kitesdk.data.IncompatibleSchemaException;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.TestDatasetRepositories;
import org.kitesdk.data.TestHelpers;
import org.kitesdk.data.ValidationException;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.MetadataProvider;

import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.checkTestUsers;
import static org.kitesdk.data.spi.filesystem.DatasetTestUtilities.writeTestUsers;

/**
 * Tests for {@link FileSystemDatasetRepository}: dataset creation, loading,
 * descriptor update validation (format, partition strategy, location, schema
 * compatibility), deletion, and reflect-schema handling.
 *
 * <p>Runs in both local and distributed modes via the
 * {@link TestDatasetRepositories} parameterized base class.
 */
public class TestFileSystemDatasetRepository extends TestDatasetRepositories {

  public TestFileSystemDatasetRepository(boolean distributed) {
    super(distributed);
  }

  @Override
  @SuppressWarnings("deprecation")
  public DatasetRepository newRepo(MetadataProvider provider) {
    // this purposely does not set the Configuration to test that the code
    // relies on filesystem URIs set in the DatasetDescriptor.
    return new FileSystemDatasetRepository(conf, testDirectory, provider);
  }

  /** Creating a dataset should expose a location that exists on the FS. */
  @Test
  public void testCreatePath() throws IOException {
    Dataset<Record> created = repo.create(NAMESPACE, NAME, testDescriptor);

    URI location = created.getDescriptor().getLocation();
    Assert.assertNotNull(
        "FileSystemDatasetRepository should return descriptor locations",
        location);
    Assert.assertTrue("Dataset data directory:" + location + " should exist",
        fileSystem.exists(new Path(location)));
  }

  /** A freshly-created dataset should contain no records when read back. */
  @Test
  public void testLoadNewHasZeroSize() {
    ensureCreated();

    Dataset<Record> dataset = repo.load(NAMESPACE, NAME);

    /*
     * We perform a read test to make sure only data files are encountered
     * during a read.
     */
    Assert.assertEquals(0, DatasetTestUtilities.datasetSize(dataset));
  }

  /** Changing the storage format on update must be rejected. */
  @Test
  public void testUpdateFailsWithFormatChange() {
    Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
        new DatasetDescriptor.Builder(testDescriptor)
            .format(Formats.AVRO)
            .build());

    DatasetDescriptor changed =
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .format(Formats.PARQUET)
            .build();

    try {
      repo.update(NAMESPACE, NAME, changed);
      Assert.fail("Should fail due to format change");
    } catch (ValidationException e) {
      // expected
    }

    // the stored descriptor must be unchanged after the failed update
    Assert.assertEquals(
        Formats.AVRO,
        repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
  }

  /** Changing the partition strategy on update must be rejected. */
  @Test
  public void testUpdateFailsWithPartitionStrategyChange() {
    PartitionStrategy ps1 = new PartitionStrategy.Builder()
        .hash("username", 2)
        .build();
    PartitionStrategy ps2 = new PartitionStrategy.Builder()
        .hash("username", 2)
        .hash("email", 3)
        .build();

    Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
        new DatasetDescriptor.Builder(testDescriptor)
            .partitionStrategy(ps1)
            .build());

    DatasetDescriptor changed =
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .partitionStrategy(ps2)
            .build();

    try {
      repo.update(NAMESPACE, NAME, changed);
      Assert.fail("Should fail due to partition strategy change");
    } catch (ValidationException e) {
      // expected
    }

    // the stored descriptor must be unchanged after the failed update
    Assert.assertEquals(
        ps1,
        repo.load(NAMESPACE, NAME).getDescriptor().getPartitionStrategy());
  }

  /** Changing the data location on update must be rejected. */
  @Test
  public void testUpdateFailsWithLocationChange() {
    ensureCreated();
    Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
    URI location = dataset.getDescriptor().getLocation();

    DatasetDescriptor changed =
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .location(new Path(testDirectory, "newDataLocation").toUri())
            .build();

    try {
      repo.update(NAMESPACE, NAME, changed);
      Assert.fail("Should fail due to data location change");
    } catch (ValidationException ex) {
      // expected
    }

    // the stored descriptor must be unchanged after the failed update
    Assert.assertEquals(
        location,
        repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
  }

  /** A schema update that is not read-compatible must be rejected. */
  @Test
  public void testUpdateFailsWithIncompatibleSchemaChange() {
    Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
        new DatasetDescriptor.Builder()
            .schema(testSchema).build());

    Assert.assertEquals("Dataset name is propagated", NAME,
        dataset.getName());
    Assert.assertEquals("Dataset schema is propagated", testSchema, dataset
        .getDescriptor().getSchema());

    Schema testSchemaV2 = SchemaBuilder.record("user").fields()
        .requiredString("username")
        .requiredString("email")
        .requiredString("favoriteColor") // incompatible - no default
        .endRecord();

    try {
      repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
          dataset.getDescriptor()).schema(testSchemaV2).build());
      Assert.fail("Should fail due to incompatible update");
    } catch (ValidationException e) {
      // expected
    }

    dataset = repo.load(NAMESPACE, NAME);
    Assert.assertEquals("Dataset schema is unchanged", testSchema, dataset
        .getDescriptor().getSchema());
  }

  /**
   * Adding a nullable field with a default is a compatible change: old
   * records must remain readable and mixed old/new data must read cleanly.
   */
  @Test
  public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldAdded() {
    Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
        new DatasetDescriptor.Builder()
            .schema(testSchema).build());

    writeTestUsers(dataset, 5, 0, "email");
    checkTestUsers(dataset, 5, "email");

    Schema testSchemaV2 = SchemaBuilder.record("user").fields()
        .requiredString("username")
        .requiredString("email")
        .nullableString("favoriteColor", "orange")
        .endRecord();

    Dataset<Record> datasetV2 = repo.update(NAMESPACE, NAME,
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .schema(testSchemaV2)
            .build());

    Assert.assertEquals("Dataset schema is updated", testSchemaV2, datasetV2
        .getDescriptor().getSchema());

    // test that the old records can be read back with the new schema
    checkTestUsers(datasetV2, 5, "email", "favoriteColor");

    // write more users and test that the mixed set can be read back with the new schema
    writeTestUsers(datasetV2, 5, 5, "email", "favoriteColor");
    checkTestUsers(datasetV2, 10, "email", "favoriteColor");
  }

  /**
   * Removing fields is a compatible change: old records must remain
   * readable through the narrower schema.
   */
  @Test
  public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldRemoved() {
    Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
        new DatasetDescriptor.Builder()
            .schema(testSchema).build());

    writeTestUsers(dataset, 5, 0, "email");
    checkTestUsers(dataset, 5, "email");

    Schema testSchemaV2 = SchemaBuilder.record("user").fields()
        .requiredString("username")
        .endRecord();

    Dataset<Record> datasetV2 = repo.update(NAMESPACE, NAME,
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .schema(testSchemaV2)
            .build());

    Assert.assertEquals("Dataset schema is updated", testSchemaV2, datasetV2
        .getDescriptor().getSchema());

    // test that the old records can be read back with the new schema
    checkTestUsers(datasetV2, 5, new String[0]);

    // write more users and test that the mixed set can be read back with the new schema
    writeTestUsers(datasetV2, 5, 5, new String[0]);
    checkTestUsers(datasetV2, 10, new String[0]);
  }

  /** Deleting a dataset must remove its data directory from the FS. */
  @Test
  public void testDeleteRemovesDatasetPath() throws IOException {
    ensureCreated();

    Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
    Path dataPath = new Path(dataset.getDescriptor().getLocation());
    Assert.assertTrue(fileSystem.exists(dataPath));

    repo.delete(NAMESPACE, NAME);

    Assert.assertFalse(fileSystem.exists(dataPath));
  }

  // Fields are read reflectively by Avro's ReflectData; they are not dead code.
  private static class ObjectPoJo {
    private Long id;
    private String name;
    private Date birthDate;

    public ObjectPoJo(Long id, String name, Date birthDate) {
      this.id = id;
      this.name = name;
      this.birthDate = birthDate;
    }
  }

  // Uses a primitive id (not nullable) to conflict with the AllowNull schema.
  private static class PrimitivePoJo {
    private long id;
    private String name;
    private Date birthDate;
  }

  /** An AllowNull reflect schema should be accepted at creation time. */
  @Test
  public void testCreateWithAllowNullSchema() {
    String name = "allowNull";
    // try/finally guarantees cleanup; a catch(RuntimeException){throw e;}
    // here would be a no-op, so exceptions are simply allowed to propagate.
    try {
      repo.create(NAMESPACE, name, new DatasetDescriptor.Builder()
          .schema(ReflectData.AllowNull.get().getSchema(ObjectPoJo.class))
          .build());
    } finally {
      repo.delete(NAMESPACE, name);
    }
  }

  /**
   * A reader for a class with primitive fields must fail against an
   * AllowNull (nullable) writer schema at reader-creation time.
   */
  @Test
  public void testReadNullsWithPrimitivesAllowNullSchema() {
    final String name = "allowNullPrimitives";
    try {
      repo.create(NAMESPACE, name, new DatasetDescriptor.Builder()
          .schema(ReflectData.AllowNull.get().getSchema(ObjectPoJo.class))
          .build(), ObjectPoJo.class);

      // should load the dataset because PrimitivePoJo can be used to write
      final Dataset<PrimitivePoJo> dataset = repo.load(
          NAMESPACE, name, PrimitivePoJo.class);

      TestHelpers.assertThrows("AllowNull primitives cannot read nullable type",
          IncompatibleSchemaException.class, new Runnable() {
            @Override
            public void run() {
              dataset.newReader();
            }
          });
    } finally {
      repo.delete(NAMESPACE, name);
    }
  }

  /** A provided partitioner may be replaced by a compatible concrete one. */
  @Test
  public void testReplaceProvidedPartition() {
    Schema event = SchemaBuilder.record("Event").fields()
        .requiredLong("created_at")
        .requiredLong("version")
        .name("properties").type().optional().map().values().stringType()
        .endRecord();
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(event)
        .partitionStrategy(new PartitionStrategy.Builder()
            .provided("v", "int")
            .year("created_at")
            .month("created_at")
            .day("created_at")
            .build())
        .build();

    Dataset<?> ds = repo.create("ns", "test", descriptor);

    DatasetDescriptor update =
        new DatasetDescriptor.Builder(ds.getDescriptor())
            .partitionStrategy(new PartitionStrategy.Builder()
                .identity("version", "v")
                .year("created_at")
                .month("created_at")
                .day("created_at")
                .build())
            .build();

    repo.update("ns", "test", update);
  }
}