/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.test.storage;
import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.pig.ExecType;
import org.apache.pig.FuncSpec;
import org.apache.pig.PigServer;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.test.Util;
import org.junit.Assert;
import org.junit.Test;
/**
*
* Tests that the HiveColumnLoader can:
* <ul>
* <li>Load files without partitioning</li>
* <li>Load files with partitioning and dates defined in constructor, or as a
* filter</li>
* <li>Load files using pig's push down loader capabilities.</li>
* </ul>
*
*/
public class TestHiveColumnarLoader extends TestCase {
static Configuration conf = null;
// for single non partitioned file testing
static File simpleDataFile = null;
// for multiple non partitioned file testing
static File simpleDataDir = null;
static File datePartitionedDir = null;
static File yearMonthDayHourPartitionedDir = null;
// used for cleanup
static List<String> datePartitionedRCFiles;
static List<String> datePartitionedDirs;
static private FileSystem fs;
static int columnMaxSize = 30;
static int columnCount = 3;
static int simpleDirFileCount = 3;
static int simpleRowCount = 10;
static String endingDate = null;
static String startingDate = null;
static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
static Calendar calendar = null;
static int datePartitionedRowCount;
private static Calendar yearMonthDayHourcalendar;
@Override
public synchronized void setUp() throws Exception {
conf = new Configuration();
fs = LocalFileSystem.getLocal(conf);
produceSimpleData();
produceDatePartitionedData();
produceYearMonthDayHourPartitionedData();
}
@Override
public void tearDown() {
Util.deleteDirectory(datePartitionedDir);
Util.deleteDirectory(yearMonthDayHourPartitionedDir);
Util.deleteDirectory(simpleDataDir);
simpleDataFile.delete();
}
@Test
public void testReadingSingleFileNoProjections() throws IOException {
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
String singlePartitionedFile = simpleDataFile.getAbsolutePath();
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedFile) + "' using " + funcSpecString
+ ";");
Iterator<Tuple> result = server.openIterator("a");
int count = 0;
Tuple t = null;
while ((t = result.next()) != null) {
assertEquals(3, t.size());
assertEquals(DataType.CHARARRAY, t.getType(0));
count++;
}
Assert.assertEquals(simpleRowCount, count);
}
@Test
public void testReadingMultipleNonPartitionedFiles() throws IOException {
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
String singlePartitionedDir = simpleDataDir.getAbsolutePath();
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedDir) + "' using " + funcSpecString
+ ";");
server.registerQuery("b = foreach a generate f1;");
Iterator<Tuple> result = server.openIterator("b");
int count = 0;
Tuple t = null;
while ((t = result.next()) != null) {
assertEquals(1, t.size());
assertEquals(DataType.CHARARRAY, t.getType(0));
count++;
}
Assert.assertEquals(simpleDirFileCount * simpleRowCount, count);
}
@Test
public void testReadingSingleFile() throws IOException {
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
String singlePartitionedFile = simpleDataFile.getAbsolutePath();
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedFile) + "' using " + funcSpecString
+ ";");
server.registerQuery("b = foreach a generate f1;");
Iterator<Tuple> result = server.openIterator("b");
int count = 0;
Tuple t = null;
while ((t = result.next()) != null) {
assertEquals(1, t.size());
assertEquals(DataType.CHARARRAY, t.getType(0));
count++;
}
Assert.assertEquals(simpleRowCount, count);
}
@Test
public void testYearMonthDayHourPartitionedFilesWithProjection() throws IOException {
int count = 0;
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(yearMonthDayHourPartitionedDir.getAbsolutePath())
+ "' using " + funcSpecString + ";");
server.registerQuery("f = FILTER a by year=='2010';");
server.registerQuery("b = foreach f generate f1,f2;");
Iterator<Tuple> result = server.openIterator("b");
Tuple t = null;
while ((t = result.next()) != null) {
assertEquals(2, t.size());
assertEquals(DataType.CHARARRAY, t.getType(0));
count++;
}
Assert.assertEquals(240, count);
}
@Test
public void testYearMonthDayHourPartitionedFilesWithProjectionAndPartitionColumns()
throws IOException {
int count = 0;
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(yearMonthDayHourPartitionedDir.getAbsolutePath())
+ "' using " + funcSpecString + ";");
server.registerQuery("f = FILTER a by year=='2010';");
server.registerQuery("r = foreach f generate year, f2, f3, month, day, hour;");
server.registerQuery("b = ORDER r BY year, month, day, hour;");
Iterator<Tuple> result = server.openIterator("b");
Tuple t = null;
while ((t = result.next()) != null) {
System.out.println("Tuple: " + t);
assertEquals(6, t.size());
count++;
}
System.out.println("Count: " + count);
Assert.assertEquals(240, count);
}
@Test
public void test1DayDatePartitionedFilesWithProjection() throws IOException {
int count = 0;
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string'"
+ ", '" + startingDate + ":" + startingDate + "')";
System.out.println(funcSpecString);
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(datePartitionedDir.getAbsolutePath()) + "' using "
+ funcSpecString + ";");
server.registerQuery("b = FOREACH a GENERATE f2 as p;");
Iterator<Tuple> result = server.openIterator("b");
Tuple t = null;
while ((t = result.next()) != null) {
assertEquals(1, t.size());
assertEquals(DataType.CHARARRAY, t.getType(0));
count++;
}
Assert.assertEquals(50, count);
}
@Test
public void test1DayDatePartitionedFiles() throws IOException {
int count = 0;
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string'"
+ ", '" + startingDate + ":" + startingDate + "')";
System.out.println(funcSpecString);
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(datePartitionedDir.getAbsolutePath()) + "' using "
+ funcSpecString + ";");
Iterator<Tuple> result = server.openIterator("a");
while ((result.next()) != null) {
count++;
}
Assert.assertEquals(50, count);
}
@Test
public void testDatePartitionedFiles() throws IOException {
int count = 0;
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string'"
+ ", '" + startingDate + ":" + endingDate + "')";
System.out.println(funcSpecString);
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(datePartitionedDir.getAbsolutePath()) + "' using "
+ funcSpecString + ";");
Iterator<Tuple> result = server.openIterator("a");
while ((result.next()) != null) {
count++;
}
Assert.assertEquals(datePartitionedRowCount, count);
}
@Test
public void testNumerOfColumnsWhenDatePartitionedFiles() throws IOException {
int count = 0;
String funcSpecString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string'"
+ ", '" + startingDate + ":" + endingDate + "')";
System.out.println(funcSpecString);
PigServer server = new PigServer(ExecType.LOCAL);
server.setBatchOn();
server.registerFunction("org.apache.pig.piggybank.storage.HiveColumnarLoader",
new FuncSpec(funcSpecString));
server.registerQuery("a = LOAD '" + Util.encodeEscape(datePartitionedDir.getAbsolutePath()) + "' using "
+ funcSpecString + ";");
Iterator<Tuple> result = server.openIterator("a");
Tuple t = null;
while ((t = result.next()) != null) {
Assert.assertEquals(4, t.size());
count++;
}
Assert.assertEquals(datePartitionedRowCount, count);
}
private static void produceDatePartitionedData() throws IOException {
datePartitionedRowCount = 0;
datePartitionedDir = new File("testhiveColumnarLoader-dateDir-"
+ System.currentTimeMillis());
datePartitionedDir.mkdir();
datePartitionedDir.deleteOnExit();
int dates = 4;
calendar = Calendar.getInstance();
calendar.set(Calendar.DAY_OF_MONTH, Calendar.MONDAY);
calendar.set(Calendar.MONTH, Calendar.JANUARY);
startingDate = dateFormat.format(calendar.getTime());
datePartitionedRCFiles = new ArrayList<String>();
datePartitionedDirs = new ArrayList<String>();
for (int i = 0; i < dates; i++) {
File file = new File(datePartitionedDir, "daydate="
+ dateFormat.format(calendar.getTime()));
calendar.add(Calendar.DAY_OF_MONTH, 1);
file.mkdir();
file.deleteOnExit();
// for each daydate write 5 partitions
for (int pi = 0; pi < 5; pi++) {
Path path = new Path(new Path(file.getAbsolutePath()), "parition" + pi);
datePartitionedRowCount += writeRCFileTest(fs, simpleRowCount, path, columnCount,
new DefaultCodec(), columnCount);
new File(path.toString()).deleteOnExit();
datePartitionedRCFiles.add(path.toString());
datePartitionedDirs.add(file.toString());
}
}
endingDate = dateFormat.format(calendar.getTime());
}
private static void produceYearMonthDayHourPartitionedData() throws IOException {
yearMonthDayHourPartitionedDir = new File("testhiveColumnarLoader-yearMonthDayHourDir-"
+ System.currentTimeMillis());
yearMonthDayHourPartitionedDir.mkdir();
yearMonthDayHourPartitionedDir.deleteOnExit();
int years = 1;
int months = 2;
int days = 3;
int hours = 4;
yearMonthDayHourcalendar = Calendar.getInstance();
yearMonthDayHourcalendar.set(Calendar.YEAR, 2010);
yearMonthDayHourcalendar.set(Calendar.DAY_OF_MONTH, Calendar.MONDAY);
yearMonthDayHourcalendar.set(Calendar.MONTH, Calendar.JANUARY);
for (int i = 0; i < years; i++) {
File file = new File(yearMonthDayHourPartitionedDir, "year="
+ yearMonthDayHourcalendar.get(Calendar.YEAR));
file.mkdir();
file.deleteOnExit();
for (int monthIndex = 0; monthIndex < months; monthIndex++) {
File monthFile = new File(file, "month="
+ yearMonthDayHourcalendar.get(Calendar.MONTH));
monthFile.mkdir();
monthFile.deleteOnExit();
for (int dayIndex = 0; dayIndex < days; dayIndex++) {
File dayFile = new File(monthFile, "day="
+ yearMonthDayHourcalendar.get(Calendar.DAY_OF_MONTH));
dayFile.mkdir();
dayFile.deleteOnExit();
for (int hourIndex = 0; hourIndex < hours; hourIndex++) {
File hourFile = new File(dayFile, "hour="
+ yearMonthDayHourcalendar.get(Calendar.HOUR_OF_DAY));
hourFile.mkdir();
hourFile.deleteOnExit();
File rcFile = new File(hourFile.getAbsolutePath() + "/attempt-00000");
Path hourFilePath = new Path(rcFile.getAbsolutePath());
rcFile.deleteOnExit();
writeRCFileTest(fs, simpleRowCount, hourFilePath, columnCount,
new DefaultCodec(), columnCount);
yearMonthDayHourcalendar.add(Calendar.HOUR_OF_DAY, 1);
}
yearMonthDayHourcalendar.add(Calendar.DAY_OF_MONTH, 1);
}
yearMonthDayHourcalendar.add(Calendar.MONTH, 1);
}
}
endingDate = dateFormat.format(calendar.getTime());
}
/**
* Writes out a simple temporary file with 5 columns and 100 rows.<br/>
* Data is random numbers.
*
* @throws SerDeException
* @throws IOException
*/
private static final void produceSimpleData() throws SerDeException, IOException {
// produce on single file
simpleDataFile = File.createTempFile("testhiveColumnarLoader", ".txt");
simpleDataFile.deleteOnExit();
Path path = new Path(simpleDataFile.getPath());
writeRCFileTest(fs, simpleRowCount, path, columnCount, new DefaultCodec(), columnCount);
// produce a folder of simple data
simpleDataDir = new File("simpleDataDir" + System.currentTimeMillis());
simpleDataDir.mkdir();
for (int i = 0; i < simpleDirFileCount; i++) {
simpleDataFile = new File(simpleDataDir, "testhiveColumnarLoader-" + i + ".txt");
Path filePath = new Path(simpleDataFile.getPath());
writeRCFileTest(fs, simpleRowCount, filePath, columnCount, new DefaultCodec(),
columnCount);
}
}
static Random randomCharGenerator = new Random(3);
static Random randColLenGenerator = new Random(20);
private static void resetRandomGenerators() {
randomCharGenerator = new Random(3);
randColLenGenerator = new Random(20);
}
private static int writeRCFileTest(FileSystem fs, int rowCount, Path file, int columnNum,
CompressionCodec codec, int columnCount) throws IOException {
fs.delete(file, true);
int rowsWritten = 0;
resetRandomGenerators();
RCFileOutputFormat.setColumnNumber(conf, columnNum);
RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, codec);
byte[][] columnRandom;
BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
columnRandom = new byte[columnNum][];
for (int i = 0; i < columnNum; i++) {
BytesRefWritable cu = new BytesRefWritable();
bytes.set(i, cu);
}
for (int i = 0; i < rowCount; i++) {
nextRandomRow(columnRandom, bytes, columnCount);
rowsWritten++;
writer.append(bytes);
}
writer.close();
return rowsWritten;
}
private static void nextRandomRow(byte[][] row, BytesRefArrayWritable bytes, int columnCount) {
bytes.resetValid(row.length);
for (int i = 0; i < row.length; i++) {
row[i] = new byte[columnCount];
for (int j = 0; j < columnCount; j++)
row[i][j] = getRandomChar(randomCharGenerator);
bytes.get(i).set(row[i], 0, columnCount);
}
}
private static int CHAR_END = 122 - 7;
private static byte getRandomChar(Random random) {
byte b = 0;
do {
b = (byte) random.nextInt(CHAR_END);
} while ((b < 65));
if (b > 90) {
b = 7;
}
return b;
}
}