/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.data.spi.hive;

import java.io.IOException;
import java.net.URI;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.api.Table;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetNotFoundException;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.TestHelpers;
import org.kitesdk.data.ValidationException;

/**
 * Tests external URI backward-compatibility with existing datasets.
 *
 * Before namespaces, all tables were stored in the metastore using the
 * default namespace. External tables now use the containing folder name as
 * the namespace, so the old metadata will not be found because the namespace
 * no longer matches. The fix is to check the default namespace for the table
 * when the table isn't found.
 *
 * This problem doesn't affect other FileSystemDatasets because the metadata
 * for non-Hive datasets is stored along with the data. The change to using
 * namespaces was a change in internal book-keeping.
 *
 * This problem doesn't affect managed datasets because the managed URI
 * doesn't require a namespace. The namespace defaults to "default" if no
 * namespace is included, which matches the old URI syntax. Failure to find
 * the dataset would only happen if the dataset was previously created with a
 * useless namespace query option.
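 *
 * For illustration only (the URIs below are the ones this test sets up, and
 * the variable names are illustrative): a pre-namespace external URI is still
 * resolved by falling back to the default database, and the equivalent
 * namespaced URI reaches the same table and data location.
 * <pre>{@code
 * // old-style URI with no namespace segment: falls back to default.test
 * Dataset<GenericRecord> byOldUri =
 *     Datasets.load("dataset:hive:/tmp/datasets/test");
 *
 * // new-style URI with an explicit namespace segment: same table, same data
 * Dataset<GenericRecord> byNewUri =
 *     Datasets.load("dataset:hive:/tmp/datasets/default/test");
 * }</pre>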
 */
public class TestExternalBackwardCompatibility {

  private Configuration conf;
  private DatasetDescriptor descriptor = null;
  private MetaStoreUtil metastore = null;

  @Before
  public void addTableToDefault() {
    // this test uses the local FS because the test table's location is a
    // file: URI under /tmp
    this.conf = new Configuration();
    this.metastore = MetaStoreUtil.get(conf);

    cleanHive();
    metastore.dropTable("default", "test");

    this.descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("\"string\"")
        .location(URI.create("file:/tmp/datasets/test"))
        .build();

    Table table = HiveUtils.tableForDescriptor(
        "default", "test", descriptor, true);

    metastore.createTable(table);
  }

  @After
  public void cleanHive() {
    // ensures all tables are removed
    MetaStoreUtil metastore = MetaStoreUtil.get(conf);
    for (String database : metastore.getAllDatabases()) {
      for (String table : metastore.getAllTables(database)) {
        metastore.dropTable(database, table);
      }
      if (!"default".equals(database)) {
        metastore.dropDatabase(database, true);
      }
    }
  }

  @Test
  public void testLoadChecksDefaultNamespace() {
    Assert.assertNotNull("Should find dataset by checking default db",
        Datasets.load("dataset:hive:/tmp/datasets/test"));

    TestHelpers.assertThrows("Should not load dataset (there isn't one)",
        DatasetNotFoundException.class, new Runnable() {
          @Override
          public void run() {
            Datasets.load("dataset:hive:/tmp/datasets/test2");
          }
        });
  }

  @Test
  public void testLoadWithUpdatedURI() {
    // Adding the namespace to the URI works because the data location is kept
    Dataset<GenericRecord> ds = Datasets.load(
        "dataset:hive:/tmp/datasets/default/test");

    Assert.assertNotNull("Should find dataset with new URI", ds);
    Assert.assertEquals("Storage location should be unchanged",
        URI.create("file:/tmp/datasets/test"),
        ds.getDescriptor().getLocation());
  }

  @Test
  public void testExistsChecksDefaultNamespace() {
    Assert.assertTrue("Should find dataset by checking default db",
        Datasets.exists("dataset:hive:/tmp/datasets/test"));

    Assert.assertFalse("Should not find dataset (there isn't one)",
        Datasets.exists("dataset:hive:/tmp/datasets/test2"));
  }

  @Test
  public void testExistsWithUpdatedURI() {
    Assert.assertTrue("Should find dataset by checking default db",
        Datasets.exists("dataset:hive:/tmp/datasets/default/test"));
  }

  @Test
  public void testCreateSucceedsIfCompatible() {
    // this will create a Dataset in the right namespace/name that is backed
    // by the same data location. this can be used to migrate.
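    // (Hypothetical migration sketch, not exercised here: this create
    // registers the table under its new namespace/name, after which the old
    // default.test entry could be dropped once readers switch URIs.)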
Assert.assertNotNull("Create should succeed even if there is a duplicate", Datasets.create("dataset:hive:/tmp/datasets/test", descriptor)); } @Test public void testCreateFailsIfNotCompatible() { // this will fail because the new descriptor uses a different format // the old descriptor is found and used to validate the change TestHelpers.assertThrows("Create should fail because of a format change", ValidationException.class, new Runnable() { @Override public void run() { Datasets.create("dataset:hive:/tmp/datasets/test", new DatasetDescriptor.Builder(descriptor) .format(Formats.PARQUET) .build()); } }); } @Test public void testCreateIncompatibleSucceedsWithLocation() { // if there is a requested location then the default table isn't checked // because only the default location would have been used Assert.assertNotNull("Create should succeed if location doesn't match", Datasets.create("dataset:hive:/tmp/datasets/test", new DatasetDescriptor.Builder(descriptor) .location(URI.create("file:/tmp/test-data/test")) .format(Formats.PARQUET) .build())); } @Test public void testUpdateChangesDefaultNamespace() { Dataset<GenericRecord> updated = Datasets.update( "dataset:hive:/tmp/datasets/test", new DatasetDescriptor.Builder(descriptor) .property("added.property", "true") .build()); Assert.assertNotNull("Update should succeed", updated); DatasetDescriptor stored = HiveUtils.descriptorForTable(conf, metastore.getTable("default", "test")); Assert.assertEquals("Should update default.test descriptor", stored, updated.getDescriptor()); Assert.assertEquals("Added property should be present", stored.getProperty("added.property"), "true"); } @Test public void testUpdateWithUpdatedURI() { Dataset<GenericRecord> updated = Datasets.update( "dataset:hive:/tmp/datasets/default/test", new DatasetDescriptor.Builder(descriptor) .property("added.property", "true") .build()); Assert.assertNotNull("Update should succeed", updated); DatasetDescriptor stored = HiveUtils.descriptorForTable(conf, metastore.getTable("default", "test")); Assert.assertEquals("Should update default.test descriptor", stored, updated.getDescriptor()); Assert.assertEquals("Added property should be present", stored.getProperty("added.property"), "true"); } @Test public void testUpdateValidatesAgainstDefaultNamespace() { TestHelpers.assertThrows("Update should fail because of a format change", ValidationException.class, new Runnable() { @Override public void run() { Datasets.update("dataset:hive:/tmp/datasets/test", new DatasetDescriptor.Builder(descriptor) .format(Formats.PARQUET) .build()); } }); } @Test public void testDeleteWithDefaultNamespace() throws IOException { Assert.assertTrue("Delete should succeed if the location matches", Datasets.delete("dataset:hive:/tmp/datasets/test")); Assert.assertFalse("Delete should return false if there is no dataset", Datasets.delete("dataset:hive:/tmp/datasets/test")); // recreate the default.test dataset, but with a different storage location DatasetDescriptor doNotDelete = new DatasetDescriptor.Builder(descriptor) .location(URI.create("file:/tmp/datasets/default/test")) .build(); metastore.createTable(HiveUtils.tableForDescriptor( "default", "test", doNotDelete, true)); Assert.assertFalse("Delete should not find a dataset to delete", Datasets.delete("dataset:hive:/tmp/datasets/test")); Assert.assertTrue("Delete should not change the dataset", metastore.exists("default", "test")); } @Test public void testDeleteWithUpdatedURI() { Assert.assertTrue("Delete should succeed with explicit namespace", 
Datasets.delete("dataset:hive:/tmp/datasets/default/test")); Assert.assertFalse("Delete should return false if there is no dataset", Datasets.delete("dataset:hive:/tmp/datasets/default/test")); Assert.assertFalse("Delete should remove the dataset", metastore.exists("default", "test")); } }