package com.scaleunlimited.cascading;
import java.util.ArrayList;
import junit.framework.Assert;
import org.apache.commons.lang.ArrayUtils;
import org.junit.Test;
import cascading.flow.Flow;
import cascading.flow.planner.PlannerException;
import cascading.pipe.Pipe;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import com.scaleunlimited.cascading.hadoop.test.MiniClusterPlatform;
import com.scaleunlimited.cascading.local.LocalPlatform;
@SuppressWarnings("rawtypes")
public class UniqueCountTest extends Assert {
private static final String OUTPUT_DIR = "build/test/UniqueCountTest";
private static final Fields IN_FIELDS = new Fields("user", "id", "value");
private static final Fields COUNT_FIELD = new Fields("count");
private static final Fields OUT_FIELDS = new Fields("user", "count");
// Using a very small threshold, to verify results when map-side uniqueness has to flush.
@Test
public void testSingleFields() throws Exception {
final Fields groupFields = new Fields("user");
LocalPlatform platform = new LocalPlatform(UniqueCountTest.class);
Flow flow = makeFlow("testSingleFields", 10, groupFields, new Fields("id"), false, platform);
flow.complete();
// validate
int[] counts = getUniqueCounts(platform, "testSingleFields", groupFields, "user-0", 4);
assertEquals(1, counts.length);
assertEquals(2, counts[0]);
// Also check that we don't get nulls for the id field
BasePath outputDir = platform.makePath(OUTPUT_DIR);
BasePath testDir = platform.makePath(outputDir, "testSingleFields");
BasePath dataPath = platform.makePath(testDir, "out");
Tap tap = platform.makeTap(platform.makeBinaryScheme(OUT_FIELDS), dataPath);
TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
while (iter.hasNext()) {
TupleEntry next = iter.next();
assertFalse(next.getFields().contains(new Fields("id")));
}
iter.close();
}
@Test
public void testMultipleGroupFields() throws Exception {
final Fields groupFields = new Fields("user", "id");
LocalPlatform platform = new LocalPlatform(UniqueCountTest.class);
Flow flow = makeFlow("testMultipleGroupFields", 10, groupFields, new Fields("value"), false, platform);
flow.complete();
// validate
int[] counts = getUniqueCounts(platform, "testMultipleGroupFields", groupFields, "user-6", 7);
assertEquals(2, counts.length);
assertEquals(2, counts[0]);
assertEquals(1, counts[1]);
}
@Test
public void testMultipleUniqueFields() throws Exception {
final Fields groupFields = new Fields("user");
LocalPlatform platform = new LocalPlatform(UniqueCountTest.class);
Flow flow = makeFlow("testMultipleUniqueFields", 10, groupFields, new Fields("id", "value"), false, platform);
flow.complete();
// I should get a total of four records, one for each user we wind up creating (0, 3, 6, 9)
int[] counts = getUniqueCounts(platform, "testMultipleUniqueFields", groupFields, "user-6", 4);
// I should get one entry for "user-6", with the three unique combinations of "id" and "value".
assertEquals(1, counts.length);
assertEquals(3, counts[0]);
}
@Test
public void testNullUniqueFieldValue() throws Exception {
final Fields groupFields = new Fields("user");
LocalPlatform platform = new LocalPlatform(UniqueCountTest.class);
Flow flow = makeFlow("testNullUniqueFieldValue", 10, groupFields, new Fields("id"), true, platform);
flow.complete();
int[] counts = getUniqueCounts(platform, "testNullUniqueFieldValue", groupFields, "user-6", 4);
assertEquals(1, counts.length);
assertEquals(1, counts[0]);
}
@Test
public void testHadoopCluster() throws Exception {
final Fields groupFields = new Fields("user");
final int numContainers = 2;
MiniClusterPlatform platform = new MiniClusterPlatform( UniqueCountTest.class,
numContainers);
Flow flow = makeFlow("testHadoopCluster", 10, groupFields, new Fields("id"), false, platform);
flow.complete();
// validate
int[] counts = getUniqueCounts(platform, "testHadoopCluster", groupFields, "user-0", 4);
assertEquals(1, counts.length);
assertEquals(2, counts[0]);
}
@SuppressWarnings({"unchecked" })
private Flow makeFlow(String testName, int numDatums,
Fields groupFields, Fields uniqueFields,
boolean insertNullIdField,
BasePlatform platform) throws Exception {
BasePath outputDir = platform.makePath(OUTPUT_DIR);
BasePath testDir = platform.makePath(outputDir, testName);
BasePath in = platform.makePath(testDir, "in");
Tap sourceTap = platform.makeTap(platform.makeBinaryScheme(IN_FIELDS), in, SinkMode.REPLACE);
TupleEntryCollector write = sourceTap.openForWrite(platform.makeFlowProcess());
int i = 0;
while (i < numDatums) {
// have user x 3, id x 2 and value x 1
String username = "user-" + i;
int j = 0;
while (j < 3) {
if (i >= numDatums) {
break;
}
if (insertNullIdField) {
write.add(new Tuple(username, null, i));
} else {
Tuple t = new Tuple(username, i % 2, i);
write.add(t);
}
i++;
j++;
}
}
write.close();
Pipe pipe = new Pipe("test");
UniqueCount assembly = new UniqueCount(pipe, groupFields, uniqueFields, COUNT_FIELD, 2);
Pipe uniqueCountsPipe = assembly.getTailPipe();
BasePath out = platform.makePath(testDir, "out");
Tap sinkTap = platform.makeTap(platform.makeBinaryScheme(groupFields.append(COUNT_FIELD)), out, SinkMode.REPLACE);
Flow flow = platform.makeFlowConnector().connect(testName, sourceTap, sinkTap, uniqueCountsPipe);
FlowUtils.nameFlowSteps(flow);
return flow;
}
@SuppressWarnings("unchecked")
private int[] getUniqueCounts(BasePlatform platform, String testName, Fields groupFields,
String user, int total) throws Exception {
ArrayList<Integer> uniqueCountsList = new ArrayList<Integer>();
BasePath outputDir = platform.makePath(OUTPUT_DIR);
BasePath testDir = platform.makePath(outputDir, testName);
BasePath dataPath = platform.makePath(testDir, "out");
Tap tap = platform.makeTap(platform.makeBinaryScheme(groupFields.append(COUNT_FIELD)), dataPath);
TupleEntryIterator iter = tap.openForRead(platform.makeFlowProcess());
int num = 0;
while (iter.hasNext()) {
TupleEntry next = iter.next();
if (next.getString("user").equals(user)) {
uniqueCountsList.add(next.getInteger("count"));
}
num++;
}
iter.close();
assertEquals("Total number of records", total, num);
return ArrayUtils.toPrimitive(uniqueCountsList.toArray(new Integer[uniqueCountsList.size()]));
}
}