package com.twitter.elephantbird.pig.load;
import com.google.common.collect.Lists;
import com.twitter.elephantbird.pig.store.RCFilePigStorage;
import com.twitter.elephantbird.pig.util.PigTestUtil;
import com.twitter.elephantbird.util.CoreTestUtil;
import org.apache.hadoop.fs.FileUtil;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.StorageUtil;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.*;
import java.util.Iterator;
/**
* Test to make sure PigStorage and RCFilePigStorage return the same tuples.
*/
public class TestRCFilePigStorage {
private PigServer pigServer;
private final String testDir =
CoreTestUtil.getTestDataDir(TestRCFilePigStorage.class);
private final File pigDir = new File(testDir, "pig_in");
private final File rcfileDir = new File(testDir, "rcfile_in");
private final int numRecords = 5;
private final String schema = "name : chararray, "
+ "age: int, "
+ "phone:(number: chararray, type: chararray),"
+ "occupation: chararray";
@Before
public void setUp() throws Exception {
FileUtil.fullyDelete(new File(testDir));
pigServer = PigTestUtil.makePigServer();
pigDir.mkdirs();
// write same data using PigStorage() and RCFileStorage() and
OutputStream out = new FileOutputStream(new File(pigDir, "part-1.txt"));
for(int i=0; i<numRecords; i++) {
writePersonTuple(out, i);
}
out.close();
// rewrite the tuples using RCFilePigStorage()
for(String line : String.format(
"A = load '%s' as (%s);\n" +
"STORE A into '%s' using %s();\n"
, pigDir.toURI().toString()
, schema
, rcfileDir.toURI().toString()
, RCFilePigStorage.class.getName()
).split("\n")) {
pigServer.registerQuery(line + "\n");
}
}
@Test
public void testRCFilePigStorage() throws IOException {
// make sure both PigStorage & RCFilePigStorage read the same data
for(String line : String.format(
"A = load '%s' as (%s);\n" +
"B = load '%s' using %s() as (%s);\n" +
"-- projection \n" +
"C = foreach A generate name, phone.number;\n" +
"D = foreach B generate name, phone.number;\n"
, pigDir.toURI().toString()
, schema
, rcfileDir.toURI().toString()
, RCFilePigStorage.class.getName()
, schema
).split("\n")) {
pigServer.registerQuery(line + "\n");
}
Iterator<Tuple> rowsA = pigServer.openIterator("A");
Iterator<Tuple> rowsB = pigServer.openIterator("B");
// compare.
for (int i=0; i<numRecords; i++) {
Assert.assertEquals(rowsA.next().toString(), rowsB.next().toString());
}
Iterator<Tuple> rowsC = pigServer.openIterator("C");
Iterator<Tuple> rowsD = pigServer.openIterator("D");
for (int i=0; i<numRecords; i++) {
Assert.assertEquals(rowsC.next().toString(), rowsD.next().toString());
}
FileUtil.fullyDelete(new File(testDir));
}
// write a person tuple using StorageUtil.putField()
private static void writePersonTuple(OutputStream out, int index) throws IOException {
final TupleFactory tf = TupleFactory.getInstance();
// should use Pig's mock loader when we move to Pig 11
// see schema above
StorageUtil.putField(out, "bob " + index + " jenkins");
StorageUtil.putField(out, "\t");
StorageUtil.putField(out, 20 + index);
StorageUtil.putField(out, "\t");
StorageUtil.putField(out, tf.newTuple(Lists.newArrayList(
"415-555-" + (1234 + index),
"HOME")));
StorageUtil.putField(out, "\t");
StorageUtil.putField(out, "engineer " + index);
out.write('\n');
}
}