package com.scaleunlimited.cascading.hadoop;
import java.io.File;
import java.util.Map;
import java.util.UUID;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.mapred.JobConf;
import org.junit.Before;
import org.junit.Test;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.pipe.Pipe;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.partition.DelimitedPartition;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import com.scaleunlimited.cascading.AbstractPlatformTest;
import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;
import com.scaleunlimited.cascading.hadoop.test.MiniClusterPlatform;
public class HadoopPlatformTest extends AbstractPlatformTest {
private static final String WORKING_DIR = "build/test/HadoopPlatformTest";
/**
 * Gives every test a fresh, empty working directory at {@code WORKING_DIR}.
 * <p>
 * Removing a directory left over from a previous run stays best-effort, but
 * the result of {@link File#mkdirs()} is no longer ignored: if no working
 * directory exists afterwards the test fails immediately.
 *
 * @throws IllegalStateException if the working directory could not be created
 */
@Before
public void setup() {
    File workingDir = new File(WORKING_DIR);

    // deleteQuietly() copes with a missing directory and never throws, so no
    // exists() guard is needed.
    FileUtils.deleteQuietly(workingDir);

    // mkdirs() also returns false when the directory is already there (e.g. the
    // delete above failed), so only give up when there is no directory at all.
    if (!workingDir.mkdirs() && !workingDir.isDirectory()) {
        throw new IllegalStateException(
                "Unable to create working directory: " + workingDir.getAbsolutePath());
    }
}
/**
 * Verifies that a tuple written beneath the platform's temp directory can be
 * read back, field for field, through a tap on the same path.
 * <p>
 * The collector and the iterator are closed in {@code finally} blocks so that
 * a failing assertion or write does not leave them open.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Test
public void testTempPath() throws Exception {
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);
    BasePath tempDir = platform.getTempDir();

    // A random child directory keeps repeated runs from colliding.
    BasePath testDir = platform.makePath(tempDir, UUID.randomUUID().toString());
    Scheme scheme = platform.makeBinaryScheme(new Fields("name", "age"));
    Tap tap = platform.makeTap(scheme, testDir);

    // Write a single tuple.
    TupleEntryCollector writer = tap.openForWrite(platform.makeFlowProcess());
    try {
        writer.add(new Tuple("ken", 37));
    } finally {
        writer.close();
    }

    // Read it back and make sure it is the only entry, with both fields intact.
    TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
    try {
        assertTrue(iter.hasNext());
        TupleEntry te = iter.next();
        assertEquals("ken", te.getString("name"));
        assertEquals(37, te.getInteger("age"));
        assertFalse(iter.hasNext());
    } finally {
        iter.close();
    }
}
/**
 * Checks that paths made by the platform report the expected relative and
 * absolute locations, and that creating them through {@code mkdirs()} produces
 * real directories on the local file system, both for a top-level path and
 * for a child path built from a parent.
 */
@Test
public void testPathCreation() throws Exception {
    final String targetDirname = "build/test/HadoopPlatformTest/testPathCreation";

    // Start from a clean slate: the target directory must not exist yet.
    File localDir = new File(targetDirname);
    FileUtils.deleteDirectory(localDir);
    assertFalse(localDir.exists());

    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);
    BasePath parent = platform.makePath(targetDirname);

    // The expected URI is taken while the directory is still absent.
    String expectedParentUri = localDir.toURI().toString();
    assertEquals(targetDirname, parent.getPath());
    assertEquals(expectedParentUri, parent.getAbsolutePath());
    assertEquals(expectedParentUri, parent.toString());

    // Creating the path must yield a directory, visible via java.io.File too.
    assertFalse(parent.exists());
    assertTrue(parent.mkdirs());
    assertTrue(parent.isDirectory());
    assertFalse(parent.isFile());
    assertTrue(localDir.exists());
    assertTrue(localDir.isDirectory());

    // Same checks for a child path derived from the parent.
    File localSubDir = new File(localDir, "subdir");
    BasePath child = platform.makePath(parent, "subdir");
    assertEquals(targetDirname + "/subdir", child.getPath());
    assertEquals(localSubDir.toURI().toString(), child.getAbsolutePath());

    assertFalse(child.exists());
    assertTrue(child.mkdirs());
    assertTrue(child.isDirectory());
    assertFalse(child.isFile());
    assertTrue(localSubDir.exists());
    assertTrue(localSubDir.isDirectory());
}
/**
 * Exercises {@code platform.rename()} for directories and files: a plain
 * directory rename, the reverse rename, a rename into a nested target whose
 * parent does not exist at that point, and finally a file rename between two
 * directories.
 */
@Test
public void testRename() throws Exception {
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);
    final String targetDirname = "build/test/HadoopPlatformTest/testRename";

    // Recreate the test root so earlier runs cannot influence the outcome.
    BasePath root = platform.makePath(targetDirname);
    if (root.exists()) {
        root.delete(true);
    }
    root.mkdirs();

    BasePath srcDir = platform.makePath(root, "src");
    srcDir.mkdirs();
    assertTrue(srcDir.exists());

    BasePath dstDir = platform.makePath(root, "dst");
    assertFalse(dstDir.exists());

    // Directory rename: src -> dst.
    platform.rename(srcDir, dstDir);
    assertFalse(srcDir.exists());
    assertTrue(dstDir.exists());
    assertEquals("dst", dstDir.getName());

    // NOTE(review): from here on the renames go through a MiniClusterPlatform
    // while the paths above were made by a HadoopPlatform - presumably
    // intentional to cover both implementations; confirm.
    platform = new MiniClusterPlatform(HadoopPlatformTest.class);

    // And back again: dst -> src.
    platform.rename(dstDir, srcDir);
    assertFalse(dstDir.exists());
    assertTrue(srcDir.exists());
    assertEquals("src", srcDir.getName());

    // Rename into dst/subDir; "dst" is absent at this point and is expected
    // to exist once the rename has completed.
    BasePath nestedDir = platform.makePath(root, "dst/subDir");
    platform.rename(srcDir, nestedDir);
    assertFalse(srcDir.exists());
    assertTrue(dstDir.exists());
    assertTrue(nestedDir.exists());
    assertEquals("subDir", nestedDir.getName());

    // Create a file inside the nested directory...
    BasePath fileInNested = platform.makePath(nestedDir, "aFile");
    assertFalse(fileInNested.exists());
    fileInNested.createNewFile();
    assertTrue(fileInNested.exists());

    // ...and move it into a freshly created "src" directory under a new name.
    srcDir.mkdirs();
    BasePath fileInSrc = platform.makePath(srcDir, "bFile");
    assertFalse(fileInSrc.exists());
    platform.rename(fileInNested, fileInSrc);
    assertFalse(fileInNested.exists());
    assertTrue(fileInSrc.exists());
}
/**
 * Configures a {@code HadoopPlatform} with a non-default speculative-execution
 * flag and a custom property, then hands it to the
 * {@code testSerialization(platform)} overload (not defined in this file;
 * presumably inherited from {@code AbstractPlatformTest}).
 */
@Test
public void testSerialization() throws Exception {
    HadoopPlatform configuredPlatform = new HadoopPlatform(HadoopPlatformTest.class);

    // Give the platform some state before it is passed to the check.
    configuredPlatform.setMapSpeculativeExecution(true);
    configuredPlatform.setProperty("my.bogus.property", 999);

    testSerialization(configuredPlatform);
}
/**
 * Confirms that a {@code HadoopPlatform} reports
 * {@code HadoopPlatform.PLATFORM_TYPE} as its platform type.
 */
@Test
public void testPlatformType() throws Exception {
    BasePlatform hadoopPlatform = new HadoopPlatform(HadoopPlatformTest.class);

    assertEquals(HadoopPlatform.PLATFORM_TYPE, hadoopPlatform.getPlatformType());
}
/**
 * Runs a flow that reads from a partition tap over "month-year" directories
 * and writes through a partition tap using "year-month" directories, then
 * checks that the input directories are still present and the expected output
 * directories were created.
 */
@SuppressWarnings("rawtypes")
@Test
public void testPartitionTap() throws Exception {
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);
    BasePath workingRoot = platform.makePath(WORKING_DIR);
    BasePath caseDir = platform.makePath(workingRoot, "testPartitionTap");

    BasePath inputDir = platform.makePath(caseDir, "input");
    inputDir.mkdirs();

    // Seed two month-year partitions with one line of text each.
    createDataDir(platform, inputDir, "07-2014", "input1 test");
    createDataDir(platform, inputDir, "08-2014", "input2 test");

    // Source side: text files partitioned as <month>-<year>.
    Pipe pipe = new Pipe("input");
    Tap sourceParent = platform.makeTap(platform.makeTextScheme(), inputDir);
    Fields monthYearFields = new Fields("month", "year");
    DelimitedPartition sourcePartition = new DelimitedPartition(monthYearFields, "-");
    Tap sourceTap = platform.makePartitionTap(sourceParent, sourcePartition);

    // Sink side: same data, but partitioned as <year>-<month>.
    BasePath outputDir = platform.makePath(caseDir, "output");
    Tap sinkParent = platform.makeTap(platform.makeTextScheme(), outputDir);
    Fields yearMonthFields = new Fields("year", "month");
    DelimitedPartition sinkPartition = new DelimitedPartition(yearMonthFields, "-");
    Tap sinkTap = platform.makePartitionTap(sinkParent, sinkPartition, SinkMode.REPLACE);

    FlowDef flowDef = new FlowDef();
    flowDef.setName("Hadoop PartitionTap Test");
    flowDef.addSource(pipe, sourceTap);
    flowDef.addTailSink(pipe, sinkTap);

    FlowConnector connector = platform.makeFlowConnector();
    Flow flow = connector.connect(flowDef);
    flow.complete();

    // The input partitions must still be there...
    BasePath julyInput = platform.makePath(inputDir, "07-2014");
    BasePath augustInput = platform.makePath(inputDir, "08-2014");
    assertTrue(julyInput.exists());
    assertTrue(augustInput.exists());

    // ...and the output must have been written under the swapped names.
    BasePath julyOutput = platform.makePath(outputDir, "2014-07");
    BasePath augustOutput = platform.makePath(outputDir, "2014-08");
    assertTrue(julyOutput.exists());
    assertTrue(augustOutput.exists());
}
/**
 * Checks how properties reach the flow connector: values from a
 * {@code JobConf} passed to the platform, custom properties set on the
 * platform, and platform properties that override a setting already present
 * in the {@code JobConf}.
 */
@Test
public void testFlowConnectorProperties() throws Exception {
    // A platform built without a JobConf has no value for our custom property.
    BasePlatform platform = new HadoopPlatform(HadoopPlatformTest.class);
    Map<Object, Object> props = flowConnectorProperties(platform);
    assertNull(props.get("my.special.property"));

    // A value set in the JobConf must come back out of the connector properties.
    JobConf conf = new JobConf();
    conf.set("my.special.property", "value1");
    platform = new HadoopPlatform(HadoopPlatformTest.class, conf);
    props = flowConnectorProperties(platform);
    assertEquals("value1", props.get("my.special.property"));

    // Custom Cascading properties set on the platform must show up as well.
    assertNull(props.get("cascading.bogus.property"));
    platform.setProperty("cascading.bogus.property", "value2");
    props = flowConnectorProperties(platform);
    assertEquals("value2", props.get("cascading.bogus.property"));

    // Hadoop settings must be overridable; map output compression serves as
    // the example setting, first enabled through the JobConf.
    conf.setCompressMapOutput(true);
    platform = new HadoopPlatform(HadoopPlatformTest.class, conf);
    props = flowConnectorProperties(platform);

    // Different versions of Hadoop use different names for this setting, so
    // fall back to the second name when the first one is not present.
    String propName = "mapred.compress.map.output";
    if (props.get(propName) == null) {
        propName = "mapreduce.map.output.compress";
    }
    assertEquals(Boolean.TRUE.toString(), (String) props.get(propName));

    // Now flip it through the platform's own properties.
    platform.setProperty(propName, Boolean.FALSE.toString());
    props = flowConnectorProperties(platform);
    assertEquals(Boolean.FALSE.toString(), (String) props.get(propName));
}

/** Builds a new flow connector from {@code platform} and returns its properties. */
private Map<Object, Object> flowConnectorProperties(BasePlatform platform) throws Exception {
    FlowConnector connector = platform.makeFlowConnector();
    return connector.getProperties();
}
/**
 * Writes a single tuple holding {@code data} through a text-scheme tap
 * located at {@code dirName} beneath {@code input}.
 * <p>
 * The collector is closed in a {@code finally} block so it is not left open
 * when adding the tuple fails.
 *
 * @param platform platform used to build the path, tap and flow process
 * @param input    parent path under which the data directory is placed
 * @param dirName  name of the data directory relative to {@code input}
 * @param data     value of the one tuple that is written
 * @throws Exception if the tap cannot be opened or written
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void createDataDir(BasePlatform platform, BasePath input, String dirName, String data) throws Exception {
    BasePath dataDir = platform.makePath(input, dirName);
    Tap tap = platform.makeTap(platform.makeTextScheme(), dataDir);

    TupleEntryCollector collector = tap.openForWrite(platform.makeFlowProcess());
    try {
        collector.add(new Tuple(data));
    } finally {
        collector.close();
    }
}
}