package com.twitter.elephantbird.util;

import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link HdfsUtils}: recursive path collection (with and without
 * directories), directory size computation, and glob expansion, all run
 * against the fixture tree under {@code src/test/resources}.
 *
 * @author Alex Levenson
 */
public class TestHdfsUtils {
  private static final String SAMPLE_DIR_LOCATION =
      "src/test/resources/com/twitter/elephantbird/util/";

  // Group 1 captures the portion of a path relative to SAMPLE_DIR_LOCATION,
  // so expectations can be written as short relative strings.
  private static final Pattern SAMPLE_DIR_PATTERN =
      Pattern.compile(".*/" + SAMPLE_DIR_LOCATION + "(.*)");

  /**
   * Maps an absolute {@link Path} to its suffix relative to the sample
   * directory. Fails fast with a descriptive message if a path lies outside
   * the fixture tree; previously the {@code matches()} result was ignored,
   * so such a path surfaced as an opaque {@link IllegalStateException}
   * from {@code group(1)}.
   */
  private static final Function<Path, String> PATH_TO_RELATIVE_STRING =
      new Function<Path, String>() {
        @Override
        public String apply(Path path) {
          Matcher m = SAMPLE_DIR_PATTERN.matcher(path.toString());
          if (!m.matches()) {
            throw new IllegalArgumentException(
                "Path is not under " + SAMPLE_DIR_LOCATION + ": " + path);
          }
          return m.group(1);
        }
      };

  // Rejects any file or directory named "a.txt"; used to exercise filtering.
  private static final PathFilter SKIP_A_PATH_FILTER = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return !path.getName().equals("a.txt");
    }
  };

  /**
   * collectPaths with only the skip-a.txt filter should include directories
   * and every file except a.txt.
   */
  @Test
  public void testCollectPathsWithDirs() throws Exception {
    List<Path> accumulator = Lists.newLinkedList();
    Path p = new Path(SAMPLE_DIR_LOCATION + "sample_dir");

    HdfsUtils.collectPaths(
        p,
        p.getFileSystem(new Configuration()),
        SKIP_A_PATH_FILTER,
        accumulator);

    Set<String> expected = Sets.newHashSet(
        "sample_dir",
        "sample_dir/b.txt",
        "sample_dir/nested",
        "sample_dir/nested/c.txt",
        "sample_dir/nested/d.txt",
        "sample_dir/nested/double_nested",
        "sample_dir/nested/double_nested/e.txt");

    // Compare as sets: collection order is not part of the contract here.
    Set<String> found =
        Sets.newHashSet(Iterables.transform(accumulator, PATH_TO_RELATIVE_STRING));
    assertEquals(expected, found);
  }

  /**
   * Composing the exclude-directories filter with the skip-a.txt filter
   * should yield only regular files, minus a.txt.
   */
  @Test
  public void testCollectPathsWithoutDirs() throws Exception {
    List<Path> accumulator = Lists.newLinkedList();
    Configuration conf = new Configuration();
    Path p = new Path(SAMPLE_DIR_LOCATION + "sample_dir");

    HdfsUtils.collectPaths(
        p,
        p.getFileSystem(conf),
        new PathFilters.CompositePathFilter(
            PathFilters.newExcludeDirectoriesFilter(conf),
            SKIP_A_PATH_FILTER),
        accumulator);

    Set<String> expected = Sets.newHashSet(
        "sample_dir/b.txt",
        "sample_dir/nested/c.txt",
        "sample_dir/nested/d.txt",
        "sample_dir/nested/double_nested/e.txt");

    Set<String> found =
        Sets.newHashSet(Iterables.transform(accumulator, PATH_TO_RELATIVE_STRING));
    assertEquals(expected, found);
  }

  /** Total byte size of the fixture tree (all five files) is 18. */
  @Test
  public void testGetDirectorySize() throws Exception {
    Path p = new Path(SAMPLE_DIR_LOCATION + "sample_dir");
    long size = HdfsUtils.getDirectorySize(p, p.getFileSystem(new Configuration()));
    assertEquals(18, size);
  }

  /**
   * With ACCEPT_ALL the filtered size equals the full size (18); skipping
   * a.txt removes its 2 bytes (16).
   */
  @Test
  public void testGetDirectorySizeWithFilter() throws Exception {
    Path p = new Path(SAMPLE_DIR_LOCATION + "sample_dir");
    long size = HdfsUtils.getDirectorySize(
        p,
        p.getFileSystem(new Configuration()),
        PathFilters.ACCEPT_ALL_PATHS_FILTER);
    assertEquals(18, size);

    size = HdfsUtils.getDirectorySize(
        p,
        p.getFileSystem(new Configuration()),
        SKIP_A_PATH_FILTER);
    assertEquals(16, size);
  }

  /**
   * Glob expansion: a single wildcard pattern, a literal (non-glob) path,
   * and multiple patterns whose results are concatenated in order.
   */
  @Test
  public void testExpandGlobs() throws Exception {
    List<Path> paths = HdfsUtils.expandGlobs(
        Lists.newArrayList(SAMPLE_DIR_LOCATION + "sample_dir/*.txt"),
        new Configuration());
    assertEquals(Lists.newArrayList("sample_dir/a.txt", "sample_dir/b.txt"),
        Lists.transform(paths, PATH_TO_RELATIVE_STRING));

    paths = HdfsUtils.expandGlobs(
        Lists.newArrayList(SAMPLE_DIR_LOCATION + "sample_dir/a.txt"),
        new Configuration());
    assertEquals(Lists.newArrayList("sample_dir/a.txt"),
        Lists.transform(paths, PATH_TO_RELATIVE_STRING));

    paths = HdfsUtils.expandGlobs(
        Lists.newArrayList(
            SAMPLE_DIR_LOCATION + "sample_dir/*.txt",
            SAMPLE_DIR_LOCATION + "sample_dir/*nes*/*.txt"),
        new Configuration());
    assertEquals(Lists.newArrayList(
            "sample_dir/a.txt",
            "sample_dir/b.txt",
            "sample_dir/nested/c.txt",
            "sample_dir/nested/d.txt"),
        Lists.transform(paths, PATH_TO_RELATIVE_STRING));
  }
}