/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.spi.filesystem;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.UUID;
import java.util.concurrent.Callable;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.LocalFileSystem;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.TestHelpers;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.DescriptorUtil;
import static org.kitesdk.data.CompressionType.Uncompressed;
public class TestFileSystemUtil {
// Minimal "User" schema: a required numeric id and a required username.
private static final Schema USER_SCHEMA = SchemaBuilder.record("User").fields()
.requiredLong("id")
.requiredString("username")
.endRecord();
// Minimal "Event" schema: a required timestamp plus log level and message.
private static final Schema EVENT_SCHEMA = SchemaBuilder.record("Event").fields()
.requiredLong("timestamp")
.requiredString("level")
.requiredString("message")
.endRecord();
// Shared records written by the create*File/writeUserToView helpers;
// populated once in initRecords() before any test runs.
private static final Record USER = new Record(USER_SCHEMA);
private static final Record EVENT = new Record(EVENT_SCHEMA);
/** Populates the shared USER and EVENT fixture records once for all tests. */
@BeforeClass
public static void initRecords() {
  // event fixture
  EVENT.put("timestamp", System.currentTimeMillis());
  EVENT.put("level", "DEBUG");
  EVENT.put("message", "Useless information!");
  // user fixture
  USER.put("id", 1L);
  USER.put("username", "test");
}
// Per-test scratch directory; JUnit deletes it automatically after each test.
@Rule
public TemporaryFolder temp = new TemporaryFolder();
/** A dataset with metadata but no data files should not be discovered. */
@Test
public void testEmptyDataset() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e/dataset_name");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // create the dataset's metadata, but write no records into it
  Datasets.create(
      URI.create("dataset:file:" + dir.getAbsolutePath()),
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .build());
  Collection<DatasetDescriptor> none = Lists.newArrayList();
  Assert.assertEquals("Should succeed and find no datasets",
      none, FileSystemUtil.findPotentialDatasets(local, searchRoot));
}
/** Discovery of a real, unpartitioned dataset should reproduce its descriptor. */
@Test
public void testUnpartitionedDataset() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e/dataset_name");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  DatasetDescriptor created = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();
  Dataset<GenericRecord> dataset = Datasets.create(
      URI.create("dataset:file:" + dir.getAbsolutePath()), created);
  // two records force the discovered descriptor to use the directory
  // rather than a single data file
  writeUserToView(dataset);
  writeUserToView(dataset);
  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      dataset.getDescriptor(), found);
}
/**
 * A single populated partition of a hash-partitioned dataset should be
 * discovered as an unpartitioned dataset located at the partition directory.
 */
@Test
public void testPartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);
  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);
  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "id_hash=1");
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));
  // BUG FIX: this previously checked the creation-time descriptor, which can
  // never carry the discovery flag; the discovered descriptor is what matters
  Assert.assertFalse("Should not flag at mixed depth",
      actual.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      partitionPath.toUri(), actual.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
/**
 * Partition values containing '/' are percent-escaped in directory names;
 * discovery should report the escaped partition directory as the location.
 */
@Test
public void testPartitionedDatasetWithEscapedChars() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .provided("s")
          .build())
      .build();
  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);
  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset.with("s", "test/-0"));
  writeUserToView(dataset.with("s", "test/-0"));
  Path datasetPath = new Path(folder.toURI());
  // '/' in the partition value is escaped as %2F in the directory name
  Path partitionPath = new Path(datasetPath, "s=test%2F-0");
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));
  // BUG FIX: this previously checked the creation-time descriptor, which can
  // never carry the discovery flag; the discovered descriptor is what matters
  Assert.assertFalse("Should not flag at mixed depth",
      actual.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      URI.create(partitionPath.toString()), actual.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
/** A lone .avro file should yield a descriptor pointing at the file itself. */
@Test
public void testSingleAvroFile() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // exactly one Avro file, directly under the base directory
  Path base = new Path(dir.toURI());
  createAvroUserFile(local, base);
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      base.toUri().getPath(),
      parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .avro file",
      descriptor.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
/** Two .avro files in one folder should merge into a single directory dataset. */
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // two Avro files in the same directory
  Path base = new Path(dir.toURI());
  createAvroUserFile(local, base);
  createAvroUserFile(local, base);
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      base.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
/**
 * Avro files in sibling subfolders should merge into one dataset whose
 * partition strategy is inferred from the folder names.
 */
@Test
public void testMultipleAvroFilesInSeparateFolders() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // one Avro file in each of two sibling folders
  Path base = new Path(dir.toURI());
  createAvroUserFile(local, new Path(base, "part=1"));
  createAvroUserFile(local, new Path(base, "2"));
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  // "part=1" names the field; the bare "2" still matches as an int value
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      base.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
/**
 * Data both inside a partition folder and directly under the base directory
 * should still merge, but must be flagged as mixed-depth.
 */
@Test
public void testMultipleAvroFilesAtDifferentDepths() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // one file nested under part=1, one directly under the base directory
  Path base = new Path(dir.toURI());
  createAvroUserFile(local, new Path(base, "part=1"));
  createAvroUserFile(local, base);
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();
  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      base.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
/**
 * Multiple files per partition plus a file at the base level: the merge still
 * succeeds and the mixed-depth flag is set.
 */
@Test
public void testMultipleMergeTablesAtDifferentDepths() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // two files inside part=1 and one directly under the base directory
  Path base = new Path(dir.toURI());
  createAvroUserFile(local, new Path(base, "part=1"));
  createAvroUserFile(local, new Path(base, "part=1"));
  createAvroUserFile(local, base);
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();
  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      base.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
/**
 * Folders containing unreadable files are skipped: the .unknown folder yields
 * no descriptor, and the two Avro folders each yield their own.
 */
@Test
public void testMultipleAvroFilesInSeparateFoldersWithUnknown() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  // two Avro files under separate folders, plus a folder with an unknown file
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, new Path(parent, "part=2"));
  createUnknownFile(fs, new Path(parent, "part=3"));
  Collection<DatasetDescriptor> descriptors = FileSystemUtil
      .findPotentialDatasets(fs, root);
  Assert.assertEquals("Should have 2 descriptors", 2, descriptors.size());
  // discovery order is not guaranteed, so identify each descriptor by its
  // location; with exactly two descriptors this single check is sufficient
  // (a second, unreachable swap block was removed here)
  DatasetDescriptor users1;
  DatasetDescriptor users2;
  DatasetDescriptor first = Iterables.getFirst(descriptors, null);
  if (first.getLocation().toString().contains("part=1")) {
    users1 = first;
    users2 = Iterables.getLast(descriptors, null);
  } else {
    users2 = first;
    users1 = Iterables.getLast(descriptors, null);
  }
  Assert.assertFalse("Should not flag at mixed depth",
      users1.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      new Path(parent, "part=1").toUri(),
      parent(users1.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      users1.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, users1.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, users1.getFormat());
  Assert.assertFalse("Should not be partitioned",
      users1.isPartitioned());
  Assert.assertFalse("Should not flag at mixed depth",
      users2.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      new Path(parent, "part=2").toUri(),
      parent(users2.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      users2.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, users2.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, users2.getFormat());
  Assert.assertFalse("Should not be partitioned",
      users2.isPartitioned());
}
/** A lone .parquet file should yield a descriptor pointing at the file itself. */
@Test
public void testSingleParquetFile() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // exactly one Parquet file, directly under the base directory
  Path base = new Path(dir.toURI());
  createParquetEventFile(local, base);
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      base.toUri().getPath(),
      parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .parquet file",
      descriptor.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
/** Two .parquet files in one folder should merge into one directory dataset. */
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // two Parquet files in the same directory
  Path base = new Path(dir.toURI());
  createParquetEventFile(local, base);
  createParquetEventFile(local, base);
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(local, searchRoot));
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      base.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
/**
 * Parquet files in sibling folders without field=value names should merge
 * with a generic provided-string partition strategy.
 */
@Test
public void testMultipleParquetFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  // create two Parquet files under separate folders
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, new Path(parent, "part"));
  createParquetEventFile(fs, new Path(parent, "2"));
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));
  // folder names without '=' produce a generic, string-typed partition field
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("partition_1", "string")
      .build();
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  // message fixed: this test writes events, not users
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  // message fixed: the expected strategy is partition_1=string, not part=int
  Assert.assertEquals("Should be partitioned by partition_1=string",
      strategy, descriptor.getPartitionStrategy());
}
/**
 * Avro folders with incompatible schemas must not merge: each folder should
 * be reported as its own single-file dataset.
 */
@Test
public void testIncompatibleSchemaFilesInSeparateFolders() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // one user file and one event file, in sibling folders
  Path base = new Path(dir.toURI());
  createAvroUserFile(local, new Path(base, "part=1"));
  createAvroEventFile(local, new Path(base, "part=2"));
  Collection<DatasetDescriptor> descriptors = FileSystemUtil
      .findPotentialDatasets(local, searchRoot);
  Assert.assertEquals("Should have 2 descriptors", 2, descriptors.size());
  // discovery order is not guaranteed; identify descriptors by location
  DatasetDescriptor first = Iterables.getFirst(descriptors, null);
  DatasetDescriptor last = Iterables.getLast(descriptors, null);
  boolean firstIsUsers = first.getLocation().toString().contains("part=1");
  DatasetDescriptor users = firstIsUsers ? first : last;
  DatasetDescriptor events = firstIsUsers ? last : first;
  Assert.assertFalse("Should not flag at mixed depth",
      users.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      new Path(base, "part=1").toUri(),
      parent(users.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      users.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, users.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, users.getFormat());
  Assert.assertFalse("Should not be partitioned",
      users.isPartitioned());
  Assert.assertFalse("Should not flag at mixed depth",
      events.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      new Path(base, "part=2").toUri(),
      parent(events.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      events.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, events.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, events.getFormat());
  Assert.assertFalse("Should not be partitioned",
      events.isPartitioned());
}
/**
 * Parquet folders with incompatible schemas must not merge: each folder
 * should be reported as its own single-file dataset.
 */
@Test
public void testIncompatibleSchemaParquetFilesInSeparateFolders() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // one user file and one event file, in sibling folders
  Path base = new Path(dir.toURI());
  createParquetUserFile(local, new Path(base, "part=1"));
  createParquetEventFile(local, new Path(base, "part=2"));
  Collection<DatasetDescriptor> descriptors = FileSystemUtil
      .findPotentialDatasets(local, searchRoot);
  Assert.assertEquals("Should have 2 descriptors", 2, descriptors.size());
  // discovery order is not guaranteed; identify descriptors by location
  DatasetDescriptor first = Iterables.getFirst(descriptors, null);
  DatasetDescriptor last = Iterables.getLast(descriptors, null);
  boolean firstIsUsers = first.getLocation().toString().contains("part=1");
  DatasetDescriptor users = firstIsUsers ? first : last;
  DatasetDescriptor events = firstIsUsers ? last : first;
  Assert.assertFalse("Should not flag at mixed depth",
      users.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      new Path(base, "part=1").toUri(),
      parent(users.getLocation()));
  Assert.assertTrue("Should be a .parquet file",
      users.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, users.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, users.getFormat());
  Assert.assertFalse("Should not be partitioned",
      users.isPartitioned());
  Assert.assertFalse("Should not flag at mixed depth",
      events.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      new Path(base, "part=2").toUri(),
      parent(events.getLocation()));
  Assert.assertTrue("Should be a .parquet file",
      events.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, events.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, events.getFormat());
  Assert.assertFalse("Should not be partitioned",
      events.isPartitioned());
}
/**
 * Avro and Parquet files in the same folder must not merge: one descriptor
 * per format, each pointing at its own file.
 */
@Test
public void testIncompatibleFormatFilesInSameFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  // create Avro and Parquet files in the same folder, with the same schema
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createParquetUserFile(fs, parent);
  Collection<DatasetDescriptor> descriptors = FileSystemUtil
      .findPotentialDatasets(fs, root);
  Assert.assertEquals("Should have 2 descriptors", 2, descriptors.size());
  // discovery order is not guaranteed; identify descriptors by format
  DatasetDescriptor avro;
  DatasetDescriptor parquet;
  DatasetDescriptor first = Iterables.getFirst(descriptors, null);
  if (first.getFormat() == Formats.AVRO) {
    avro = first;
    parquet = Iterables.getLast(descriptors, null);
  } else {
    parquet = first;
    avro = Iterables.getLast(descriptors, null);
  }
  Assert.assertFalse("Should not flag at mixed depth",
      avro.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(),
      parent(avro.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      avro.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, avro.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, avro.getFormat());
  Assert.assertFalse("Should not be partitioned",
      avro.isPartitioned());
  Assert.assertFalse("Should not flag at mixed depth",
      parquet.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(),
      parent(parquet.getLocation()));
  Assert.assertTrue("Should be a .parquet file",
      parquet.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, parquet.getSchema());
  // message fixed: this assertion checks the Parquet descriptor's format
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, parquet.getFormat());
  Assert.assertFalse("Should not be partitioned",
      parquet.isPartitioned());
}
/** A single file with an unrecognized extension should be ignored entirely. */
@Test
public void testSingleUnknownFile() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // one .unknown file, directly under the base directory
  createUnknownFile(local, new Path(dir.toURI()));
  Collection<DatasetDescriptor> none = Lists.newArrayList();
  Assert.assertEquals("Should succeed and find no datasets",
      none, FileSystemUtil.findPotentialDatasets(local, searchRoot));
}
/** Multiple files with unrecognized extensions should all be ignored. */
@Test
public void testMultipleUnknownFiles() throws Exception {
  File dir = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  // two .unknown files in the same directory
  Path base = new Path(dir.toURI());
  createUnknownFile(local, base);
  createUnknownFile(local, base);
  Collection<DatasetDescriptor> none = Lists.newArrayList();
  Assert.assertEquals("Should succeed and find no datasets",
      none, FileSystemUtil.findPotentialDatasets(local, searchRoot));
}
/** Scanning an existing but empty directory should find nothing. */
@Test
public void testEmptyDirectory() throws IOException {
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem local = LocalFileSystem.getInstance();
  Collection<DatasetDescriptor> none = Lists.newArrayList();
  Assert.assertEquals("Should succeed and find no datasets",
      none, FileSystemUtil.findPotentialDatasets(local, searchRoot));
}
/**
 * Scanning a nonexistent path should propagate the FileNotFoundException
 * rather than silently returning an empty result.
 */
@Test
public void testMissingDirectory() throws IOException {
  final FileSystem local = LocalFileSystem.getInstance();
  final Path missing = new Path(new Path(temp.getRoot().toURI()), "not_there");
  TestHelpers.assertThrows("Should propagate missing file IOException",
      FileNotFoundException.class, new Callable<Void>() {
        @Override
        public Void call() throws IOException {
          FileSystemUtil.findPotentialDatasets(local, missing);
          return null;
        }
      });
}
/** Without explicit config, rename support is off for S3 schemes and on otherwise. */
@Test
public void testSupportsRenameConfigNotSet() {
  URI s3a = URI.create("s3a://bucket/path");
  URI s3n = URI.create("s3n://bucket/path");
  URI hdfs = URI.create("hdfs://cluster/path");
  URI file = URI.create("file:///path");
  Assert.assertFalse("Should default to false for S3A",
      FileSystemUtil.supportsRename(s3a, new Configuration()));
  Assert.assertFalse("Should default to false for S3N",
      FileSystemUtil.supportsRename(s3n, new Configuration()));
  Assert.assertTrue("Should default to true for HDFS",
      FileSystemUtil.supportsRename(hdfs, new Configuration()));
  Assert.assertTrue("Should default to true for FILE",
      FileSystemUtil.supportsRename(file, new Configuration()));
}
/** An explicit false setting disables rename support even on HDFS and local. */
@Test
public void testSupportsRenameConfigFalse() {
  Configuration noRename = new Configuration();
  noRename.setBoolean(FileSystemProperties.SUPPORTS_RENAME_PROP, false);
  Assert.assertFalse("Should override via config to false for HDFS",
      FileSystemUtil.supportsRename(URI.create("hdfs://cluster/path"), noRename));
  Assert.assertFalse("Should override via config to false for FILE",
      FileSystemUtil.supportsRename(URI.create("file:///path"), noRename));
}
/** An explicit true setting enables rename support even on S3 schemes. */
@Test
public void testSupportsRenameConfigTrue() {
  Configuration renameOk = new Configuration();
  renameOk.setBoolean(FileSystemProperties.SUPPORTS_RENAME_PROP, true);
  Assert.assertTrue("Should override via config to true for S3A",
      FileSystemUtil.supportsRename(URI.create("s3a://bucket/path"), renameOk));
  Assert.assertTrue("Should override via config true for S3N",
      FileSystemUtil.supportsRename(URI.create("s3n://bucket/path"), renameOk));
}
/** Returns the URI of the directory that contains {@code file}. */
private URI parent(URI file) {
  Path asPath = new Path(file);
  return asPath.getParent().toUri();
}
/**
 * Writes the shared USER record once through a fresh writer on the view,
 * closing the writer even if the write fails.
 */
public void writeUserToView(View<GenericRecord> dataset) {
  DatasetWriter<GenericRecord> writer = null;
  try {
    writer = dataset.newWriter();
    writer.write(USER);
  } finally {
    // newWriter may have thrown, in which case there is nothing to close
    if (writer != null) {
      writer.close();
    }
  }
}
/** Writes one USER record to a randomly named .avro file under {@code parent}. */
public void createAvroUserFile(FileSystem fs, Path parent) throws IOException {
  AvroAppender<Record> writer = new AvroAppender<Record>(
      fs, new Path(parent, UUID.randomUUID().toString() + ".avro"),
      USER_SCHEMA, Uncompressed);
  writer.open();
  writer.append(USER);
  writer.close();
}
/** Writes one EVENT record to a randomly named .avro file under {@code parent}. */
public void createAvroEventFile(FileSystem fs, Path parent) throws IOException {
  AvroAppender<Record> writer = new AvroAppender<Record>(
      fs, new Path(parent, UUID.randomUUID().toString() + ".avro"),
      EVENT_SCHEMA, Uncompressed);
  writer.open();
  writer.append(EVENT);
  writer.close();
}
/** Writes one USER record to a randomly named .parquet file under {@code parent}. */
public void createParquetUserFile(FileSystem fs, Path parent) throws IOException {
  ParquetAppender<Record> writer = new ParquetAppender<Record>(
      fs, new Path(parent, UUID.randomUUID().toString() + ".parquet"),
      USER_SCHEMA, new Configuration(), Uncompressed);
  writer.open();
  writer.append(USER);
  writer.close();
}
/** Writes one EVENT record to a randomly named .parquet file under {@code parent}. */
public void createParquetEventFile(FileSystem fs, Path parent) throws IOException {
  ParquetAppender<Record> writer = new ParquetAppender<Record>(
      fs, new Path(parent, UUID.randomUUID().toString() + ".parquet"),
      EVENT_SCHEMA, new Configuration(), Uncompressed);
  writer.open();
  writer.append(EVENT);
  writer.close();
}
/**
 * Creates an empty file with an unrecognized extension under {@code parent}
 * so discovery has something it must skip.
 */
public void createUnknownFile(FileSystem fs, Path parent) throws IOException {
  Path file = new Path(parent, UUID.randomUUID().toString() + ".unknown");
  // close the stream: the original leaked the handle returned by create(),
  // leaving the (empty) file unflushed until finalization
  fs.create(file).close();
}
}