/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.cli.commands;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.ByteStreams;
import com.google.common.io.Closeables;
import com.google.common.io.Resources;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.kitesdk.cli.example.User;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.MiniDFSTest;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.DefaultConfiguration;
import org.kitesdk.data.spi.OptionBuilder;
import org.kitesdk.data.spi.Registration;
import org.kitesdk.data.spi.URIPattern;
import org.kitesdk.data.spi.hive.MetaStoreUtil;
import org.slf4j.Logger;

import static org.mockito.Mockito.contains;
import static org.mockito.Mockito.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;

public class TestCreateDatasetCommandCluster extends MiniDFSTest {

  private static final AtomicInteger ids = new AtomicInteger(0);
  private static final Map<String, DatasetRepository> repos = Maps.newHashMap();

  private String id = null;
  private CreateDatasetCommand command = null;
  private Logger console;

  @BeforeClass
  public static void addMockRepoBuilder() throws Exception {
    Registration.register(
        new URIPattern("mock::id"), new URIPattern("mock::id"),
        new OptionBuilder<DatasetRepository>() {
          @Override
          public DatasetRepository getFromOptions(Map<String, String> options) {
            DatasetRepository repo = mock(DatasetRepository.class);
            repos.put(options.get("id"), repo);
            return repo;
          }
        }
    );
  }

  @Before
  public void setUp() {
    this.id = Integer.toString(ids.addAndGet(1));
    this.console = mock(Logger.class);
    this.command = new CreateDatasetCommand(console);
    this.command.setConf(getConfiguration());
    this.command.repoURI = "repo:mock:" + id;
  }

  public DatasetRepository getMockRepo() {
    return repos.get(id);
  }

  @Test
  public void testBasicUse() throws Exception {
    command.avroSchemaFile = "resource:test-schemas/user.avsc";
    command.datasets = Lists.newArrayList("users");
    command.run();

    DatasetDescriptor expectedDescriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:test-schemas/user.avsc")
        .build();

    verify(getMockRepo()).create("default", "users", expectedDescriptor);
    verify(console).debug(contains("Created"), eq("users"));
  }

  @Test
  public void testBasicUseLocalSchema() throws Exception {
    String avsc = "target/localUser.avsc";
    FSDataOutputStream out = getFS()
        .create(new Path(avsc), true /* overwrite */);
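    // copy the bundled test schema from the classpath into the local file
    // so the command reads it from a filesystem path rather than a resource URI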
    ByteStreams.copy(Resources.getResource("test-schemas/user.avsc").openStream(), out);
    out.close();

    command.avroSchemaFile = avsc;
    command.datasets = Lists.newArrayList("users");
    command.run();

    DatasetDescriptor expectedDescriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:test-schemas/user.avsc")
        .build();

    verify(getMockRepo()).create("default", "users", expectedDescriptor);
    verify(console).debug(contains("Created"), eq("users"));
  }

  @Test
  public void testBasicUseHDFSSchema() throws Exception {
    // This test needs to set the default configuration so that the
    // DatasetDescriptor can resolve HDFS to qualify the path. Otherwise,
    // the default FS is file:/ and the avsc path is not qualified, causing
    // an IOException when it tries to read the file. Setting up HDFS
    // correctly in the environment fixes the problem.
    Configuration existing = DefaultConfiguration.get();
    DefaultConfiguration.set(getConfiguration());

    String avsc = "hdfs:/tmp/schemas/hdfsUser.avsc";
    FSDataOutputStream out = getDFS()
        .create(new Path(avsc), true /* overwrite */);
    ByteStreams.copy(Resources.getResource("test-schemas/user.avsc").openStream(), out);
    out.close();

    command.avroSchemaFile = avsc;
    command.datasets = Lists.newArrayList("users");
    command.run();

    DatasetDescriptor expectedDescriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:test-schemas/user.avsc")
        .build();

    verify(getMockRepo()).create("default", "users", expectedDescriptor);
    verify(console).debug(contains("Created"), eq("users"));

    // restore the previous Configuration
    DefaultConfiguration.set(existing);
  }

  @Test
  public void testCreateWithExistingDataPartitionsExternalHive() throws IOException {
    createWithExistingDataPartitions("dataset:hive:/tmp/datasets/users", "datasets");
  }

  @Test
  public void testCreateWithExistingDataPartitionsManagedHive() throws IOException {
    createWithExistingDataPartitions("dataset:hive:users", "default");
  }

  private void createWithExistingDataPartitions(String datasetUri, String database)
      throws IOException {
    Configuration existing = DefaultConfiguration.get();
    try {
      DefaultConfiguration.set(getConfiguration());

      // create a partitioned dataset and add a record
      DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
          .schema(User.class)
          .partitionStrategy(new PartitionStrategy.Builder()
              .hash("username", 4)
              .build())
          .build();
      Dataset<User> users = Datasets.create(
          "dataset:hdfs:/tmp/datasets/users", descriptor, User.class);
      DatasetWriter<User> writer = null;
      try {
        writer = users.newWriter();
        writer.write(new User("test", "test@example.com"));
      } finally {
        Closeables.closeQuietly(writer);
      }

      // remove the dataset's metadata
      getDFS().delete(new Path("/tmp/datasets/users/.metadata"), true);

      // use the create command to create an external table in Hive
      Logger console = mock(Logger.class);
      CreateDatasetCommand create = new CreateDatasetCommand(console);
      create.setConf(getConfiguration());
      create.datasets = Lists.newArrayList(datasetUri);
      create.location = "hdfs:/tmp/datasets/users";
      create.run();

      // validate the dataset
      Dataset<GenericRecord> loaded = Datasets.load(datasetUri);
      Assert.assertNotNull("Should successfully create Hive dataset", loaded);
      Assert.assertTrue("Should be partitioned",
          loaded.getDescriptor().isPartitioned());
      PartitionStrategy expectedStrategy = new PartitionStrategy.Builder()
          .provided("username_hash", "int")
          .build();
      Assert.assertEquals("Should have a provided partition strategy",
          expectedStrategy, loaded.getDescriptor().getPartitionStrategy());

      MetaStoreUtil meta = new MetaStoreUtil(getConfiguration());
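      // the create command should have registered the existing hash-partition
      // directory with the Hive metastore for this table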
      List<String> partitions = meta.listPartitions(database, "users", (short) 10);
      Assert.assertEquals("Table should have a partition", 1, partitions.size());
      Assert.assertTrue("Partition should exist",
          getDFS().exists(new Path(partitions.get(0))));
      Assert.assertTrue("Partition should be a partition directory",
          partitions.get(0).contains("/tmp/datasets/users/username_hash="));

    } finally {
      Datasets.delete(datasetUri);
      DefaultConfiguration.set(existing);
    }
  }
}