/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.test.storage;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;

import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.pig.ExecType;
import org.apache.pig.FuncSpec;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;
import org.apache.pig.piggybank.storage.AllLoader;
import org.apache.pig.piggybank.storage.HiveColumnarLoader;
import org.apache.pig.piggybank.storage.allloader.LoadFuncHelper;
import org.apache.pig.test.Util;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.mortbay.log.Log;

public class TestAllLoader extends TestCase {

    enum TYPE {
        HIVERC(".rc", new HiveRCFileTestWriter()),
        GZIP_PLAIN(".gz", new GzipFileTestWriter());

        String extension;
        FileTestWriter writer;

        TYPE(String extension, FileTestWriter writer) {
            this.extension = extension;
            this.writer = writer;
        }
    }

    String colSeparator = ",";

    /**
     * All test files will contain this number of records
     */
    int fileRecords = 100;

    /**
     * All files will contain this number of columns in each row
     */
    int fileColumns = 2;

    final TYPE[] fileTypes = new TYPE[] { TYPE.GZIP_PLAIN, TYPE.HIVERC };

    final String extensionLoaders = "gz:org.apache.pig.builtin.PigStorage('"
            + colSeparator + "'), rc:"
            + HiveColumnarLoader.class.getCanonicalName()
            + "('v1 float,v2 float')";

    File baseDir;
    File simpleDir;

    // --------------- For date partitioning based on daydate= (this also
    // works for year=, day=, etc.)
    File datePartitionDir;
    final String[] datePartitions = new String[] { "daydate=2010-11-01",
            "daydate=2010-11-02", "daydate=2010-11-03", "daydate=2010-11-04",
            "daydate=2010-11-05" };

    // -------------- Logic partitioning that does not involve dates
    File logicPartitionDir;
    final String[] logicPartitions = new String[] { "block=1", "block=2",
            "block=3" };
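    // Illustrative on-disk layout created by setUp() for the plain logic
    // partitions above. File names are System.currentTimeMillis() values,
    // so the names shown here are examples only:
    //
    //   logicPartitionDir/block=1/1288600000000.gz
    //   logicPartitionDir/block=1/1288600000001.rc
    //   logicPartitionDir/block=2/...
    //   logicPartitionDir/block=3/...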
    // -------------- Logic tagged partitioning, that is /type1/block=1,
    // /type2/block=2. This is needed because the schema for each path might
    // differ, and we want to use a different loader depending on whether we
    // are looking in type1 or in type2; see taggedExtensionLoaders.
    File taggedLogicPartitionDir;
    final String taggedExtensionLoaders = "gz:org.apache.pig.builtin.PigStorage('"
            + colSeparator + "'), rc:type1:"
            + HiveColumnarLoader.class.getCanonicalName()
            + "('v1 float,v2 float'), rc:type2:"
            + HiveColumnarLoader.class.getCanonicalName() + "('v1 float')";

    final String[] tags = new String[] { "type1", "type2" };
    final String[] taggedSchemas = new String[] { "(p: float, q:float)",
            "(p:float)" };

    // -------------- Test loading files by content
    File filesByContentDir;
    final String contentLoaders = "gz:org.apache.pig.builtin.PigStorage('"
            + colSeparator + "'), rc:"
            + HiveColumnarLoader.class.getCanonicalName()
            + "('v1 float,v2 float'), seq::org.apache.hadoop.hive.ql.io.RCFile:"
            + HiveColumnarLoader.class.getCanonicalName()
            + "('v1 float,v2 float')";

    Properties configuration;

    final String allLoaderName = AllLoader.class.getCanonicalName();
    final String allLoaderFuncSpec = allLoaderName + "()";

    PigServer server;

    /**
     * Test that we can load files with the correct loaders as per file
     * content.
     *
     * @throws IOException
     */
    @Test
    public void testFilesByContentDir() throws IOException {

        server.shutdown();

        configuration.setProperty(LoadFuncHelper.FILE_EXTENSION_LOADERS,
                contentLoaders);
        server = new PigServer(ExecType.LOCAL, configuration);
        server.setBatchOn();
        server.registerFunction(allLoaderName, new FuncSpec(allLoaderFuncSpec));

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(filesByContentDir.getAbsolutePath())
                + "' using " + allLoaderFuncSpec + " as (p:float, q:float);");

        readRecordsFromLoader(server, "a", fileRecords);
    }

    /**
     * Test that we can read a tagged logic partitioned directory
     *
     * @throws IOException
     */
    @Test
    public void testTaggedLogicPartitionDir() throws IOException {

        server.shutdown();

        configuration.setProperty(LoadFuncHelper.FILE_EXTENSION_LOADERS,
                taggedExtensionLoaders);
        server = new PigServer(ExecType.LOCAL, configuration);
        server.setBatchOn();
        server.registerFunction(allLoaderName, new FuncSpec(allLoaderFuncSpec));

        int i = 0;
        for (String tag : tags) {
            String schema = taggedSchemas[i++];
            server.registerQuery(tag
                    + " = LOAD '"
                    + Util.encodeEscape(taggedLogicPartitionDir
                            .getAbsolutePath()) + "/" + tag + "' using "
                    + allLoaderFuncSpec + " as " + schema + ";");

            readRecordsFromLoader(server, tag, fileRecords
                    * logicPartitions.length * fileTypes.length);
        }
    }

    /**
     * Test that we can filter by a logic partition based on a range, in this
     * case block<=2
     *
     * @throws IOException
     */
    @Test
    public void testLogicPartitionFilter() throws IOException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(logicPartitionDir.getAbsolutePath())
                + "' using " + allLoaderName + "('block<=2')"
                + " as (q:float, p:float);");
        server.registerQuery("r = FOREACH a GENERATE q, p;");

        Iterator<Tuple> it = server.openIterator("r");

        int count = 0;
        while (it.hasNext()) {
            count++;
            Tuple t = it.next();
            assertEquals(2, t.size());
        }

        // only 2 partitions are used in the query; block=3 is filtered out
        assertEquals(fileRecords * 2 * fileTypes.length, count);
    }
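    // For reference, the queries registered in testLogicPartitionFilter
    // above resolve to Pig Latin of roughly this shape (the absolute path is
    // shortened for readability):
    //
    //   a = LOAD '/.../logicPartitionDir' USING
    //       org.apache.pig.piggybank.storage.AllLoader('block<=2')
    //       AS (q:float, p:float);
    //   r = FOREACH a GENERATE q, p;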
    /**
     * Test that we can extract only the partition column
     *
     * @throws IOException
     */
    @Test
    public void testLogicPartitionPartitionColumnExtract() throws IOException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(logicPartitionDir.getAbsolutePath())
                + "' using " + allLoaderFuncSpec
                + " as (q:float, p:float, block:chararray);");
        server.registerQuery("r = foreach a generate block;");

        Iterator<Tuple> it = server.openIterator("r");

        int count = 0;
        Map<String, AtomicInteger> partitionCount = new HashMap<String, AtomicInteger>();

        while (it.hasNext()) {
            count++;
            Tuple t = it.next();
            assertEquals(1, t.size());

            String key = t.get(0).toString();
            AtomicInteger ati = partitionCount.get(key);
            if (ati == null) {
                ati = new AtomicInteger(1);
                partitionCount.put(key, ati);
            } else {
                ati.incrementAndGet();
            }
        }

        // test that all partitions were read
        for (AtomicInteger ati : partitionCount.values()) {
            assertEquals(fileRecords * fileTypes.length, ati.get());
        }

        assertEquals(fileRecords * logicPartitions.length * fileTypes.length,
                count);
    }

    /**
     * Test that we can read a logic partitioned directory
     *
     * @throws IOException
     */
    @Test
    public void testLogicPartitionDir() throws IOException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(logicPartitionDir.getAbsolutePath())
                + "' using " + allLoaderFuncSpec + " as (q:float, p:float);");

        readRecordsFromLoader(server, "a", fileRecords
                * logicPartitions.length * fileTypes.length);
    }

    /**
     * Test that we can filter a date partitioned directory by a date range
     *
     * @throws IOException
     */
    @Test
    public void testDatePartitionFilterWithAsSchema() throws IOException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(datePartitionDir.getAbsolutePath())
                + "' using "
                + allLoaderName
                + "('daydate >= \"2010-11-02\" and daydate <= \"2010-11-04\"') "
                + "AS (q:float, p:float, daydate:chararray); ");
        server.registerQuery("r = FOREACH a GENERATE $0;");

        Iterator<Tuple> it = server.openIterator("r");

        int count = 0;
        while (it.hasNext()) {
            count++;
            Tuple t = it.next();
            assertEquals(1, t.size());
        }

        // two date partitions are filtered out, leaving only three
        readRecordsFromLoader(server, "a", fileRecords * 3 * fileTypes.length);
    }

    /**
     * Test that we can filter a date partitioned directory by a date range
     *
     * @throws IOException
     */
    @Test
    public void testDatePartitionFilterWithoutSchema() throws IOException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(datePartitionDir.getAbsolutePath())
                + "' using "
                + allLoaderName
                + "('daydate >= \"2010-11-02\" and daydate <= \"2010-11-04\"'); ");
        server.registerQuery("r = FOREACH a GENERATE $0;");

        Iterator<Tuple> it = server.openIterator("r");

        int count = 0;
        while (it.hasNext()) {
            count++;
            Tuple t = it.next();
            // values are Math.random() outputs, so each is < 1
            float f = Float.valueOf(t.get(0).toString());
            assertTrue(f < 1L);
            assertEquals(1, t.size());
        }

        // two date partitions are filtered out, leaving only three
        readRecordsFromLoader(server, "a", fileRecords * 3 * fileTypes.length);
    }
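    // The two filter tests above prune daydate=2010-11-01 and
    // daydate=2010-11-05 from the five datePartitions, which is why the
    // expected row count is fileRecords * 3 * fileTypes.length.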
    /**
     * Test that we can read a date partitioned directory
     *
     * @throws IOException
     * @throws ParseException
     */
    @Test
    public void testDatePartitionDir() throws IOException, ParseException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(datePartitionDir.getAbsolutePath())
                + "' using " + allLoaderFuncSpec
                + " as (q:float, p:float, daydate:chararray);");
        server.registerQuery("r = FOREACH a GENERATE daydate;");

        Iterator<Tuple> it = server.openIterator("r");

        DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd");
        Calendar cal = Calendar.getInstance();

        while (it.hasNext()) {
            Date date = dateformat.parse(it.next().get(0).toString());
            cal.setTime(date);
            assertEquals(2010, cal.get(Calendar.YEAR));
            // Calendar months start at 0, so November is 10
            assertEquals(10, cal.get(Calendar.MONTH));
        }

        readRecordsFromLoader(server, "a", fileRecords * datePartitions.length
                * fileTypes.length);
    }

    /**
     * Test that we can read from a simple directory
     *
     * @throws IOException
     */
    @Test
    public void testSimpleDir() throws IOException {

        server.registerQuery("a = LOAD '"
                + Util.encodeEscape(simpleDir.getAbsolutePath()) + "' using "
                + allLoaderFuncSpec + " as (q:float, p:float);");

        readRecordsFromLoader(server, "a", fileRecords * fileTypes.length);
    }

    /**
     * Validates that the loadAlias reads the expected number of records
     *
     * @param server
     * @param loadAlias
     * @param totalRowCount
     * @throws IOException
     */
    private void readRecordsFromLoader(PigServer server, String loadAlias,
            int totalRowCount) throws IOException {

        Iterator<Tuple> result = server.openIterator(loadAlias);

        int count = 0;
        // use hasNext() rather than relying on next() returning null at the
        // end of the iterator
        while (result.hasNext()) {
            result.next();
            count++;
        }

        Log.info("Validating expected: " + totalRowCount + " against " + count);
        assertEquals(totalRowCount, count);
    }

    @Before
    public void setUp() throws Exception {

        Configuration hadoopConf = new Configuration();
        FileSystem.setDefaultUri(hadoopConf,
                LocalFileSystem.getDefaultUri(hadoopConf));

        if (baseDir == null) {
            configuration = new Properties();
            configuration.setProperty(LoadFuncHelper.FILE_EXTENSION_LOADERS,
                    extensionLoaders);

            baseDir = new File("build/test/testAllLoader");
            if (baseDir.exists()) {
                FileUtil.fullyDelete(baseDir);
            }
            assertTrue(baseDir.mkdirs());

            createSimpleDir();
            createDatePartitionDir();
            createLogicPartitionDir();
            createTaggedLogicPartitionDir();
            createFileByContentDir();

            server = new PigServer(ExecType.LOCAL, configuration);
            server.setBatchOn();
        }
    }

    /**
     * Write out all files without their extensions
     *
     * @throws IOException
     */
    private void createFileByContentDir() throws IOException {
        filesByContentDir = new File(baseDir, "filesByContentDir");
        assertTrue(filesByContentDir.mkdirs());

        // for each type create the files without its extension
        File uniqueFile = new File(filesByContentDir, ""
                + System.currentTimeMillis());
        TYPE.HIVERC.writer.writeTestData(uniqueFile, fileRecords, fileColumns,
                colSeparator);
    }
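    // Note on content-based loading: the file written above carries no
    // extension, so AllLoader must choose a loader from the file content
    // instead. Judging by the contentLoaders mapping string, the
    // "seq::org.apache.hadoop.hive.ql.io.RCFile:" entry associates sequence
    // files whose header references the RCFile classes with
    // HiveColumnarLoader; the actual detection logic lives in LoadFuncHelper.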
    /**
     * Write out the tagged logical partitioning directories, e.g.
     * type1/block=1
     *
     * @throws IOException
     */
    private void createTaggedLogicPartitionDir() throws IOException {
        taggedLogicPartitionDir = new File(baseDir, "taggedLogicPartitionDir");
        assertTrue(taggedLogicPartitionDir.mkdirs());

        // for each tag create a directory
        for (String tag : tags) {
            File tagDir = new File(taggedLogicPartitionDir, tag);

            // for each logic partition create a directory
            for (String partition : logicPartitions) {
                File logicPartition = new File(tagDir, partition);
                assertTrue(logicPartition.mkdirs());
                for (TYPE fileType : fileTypes) {
                    writeFile(logicPartition, fileType);
                }
            }
        }
    }

    /**
     * Write out logical partitioned directories with one file per
     * fileType:TYPE
     *
     * @throws IOException
     */
    private void createLogicPartitionDir() throws IOException {
        logicPartitionDir = new File(baseDir, "logicPartitionDir");
        assertTrue(logicPartitionDir.mkdirs());

        // for each logic partition create a directory
        for (String partition : logicPartitions) {
            File logicPartition = new File(logicPartitionDir, partition);
            assertTrue(logicPartition.mkdirs());
            for (TYPE fileType : fileTypes) {
                writeFile(logicPartition, fileType);
            }
        }
    }

    /**
     * Write out date partitioned directories with one file per fileType:TYPE
     *
     * @throws IOException
     */
    private void createDatePartitionDir() throws IOException {
        datePartitionDir = new File(baseDir, "datePartitionDir");
        assertTrue(datePartitionDir.mkdirs());

        // for each date partition create a directory
        for (String partition : datePartitions) {
            File datePartition = new File(datePartitionDir, partition);
            assertTrue(datePartition.mkdirs());
            for (TYPE fileType : fileTypes) {
                writeFile(datePartition, fileType);
            }
        }
    }

    /**
     * Write out a simple directory with one file per fileType:TYPE
     *
     * @throws IOException
     */
    private void createSimpleDir() throws IOException {
        simpleDir = new File(baseDir, "simpleDir");
        assertTrue(simpleDir.mkdirs());

        for (TYPE fileType : fileTypes) {
            writeFile(simpleDir, fileType);
        }
    }

    /**
     * Create a unique file name with format
     * [currentTimeInMillis][type.extension]
     *
     * @param dir
     * @param type
     * @throws IOException
     */
    private void writeFile(File dir, TYPE type) throws IOException {
        File uniqueFile = new File(dir, System.currentTimeMillis()
                + type.extension);
        type.writer.writeTestData(uniqueFile, fileRecords, fileColumns,
                colSeparator);
    }

    @After
    public void tearDown() throws Exception {
        server.shutdown();
        FileUtil.fullyDelete(baseDir);
        baseDir = null;
    }
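    // Both writers below emit the same logical data: recordCounts rows of
    // columnCount random float columns, so every loader exercised by these
    // tests sees rows of floats regardless of the on-disk format.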
    /**
     * Simple interface to help with writing test data.
     */
    private interface FileTestWriter {

        void writeTestData(File file, int recordCounts, int columnCount,
                String colSeparator) throws IOException;

    }

    /**
     * Write gzip-compressed plain text content
     */
    private static class GzipFileTestWriter implements FileTestWriter {

        @Override
        public void writeTestData(File file, int recordCounts,
                int columnCount, String colSeparator) throws IOException {

            // write random test data
            GzipCodec gzipCodec = new GzipCodec();
            CompressionOutputStream out = gzipCodec
                    .createOutputStream(new FileOutputStream(file));
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                    out));

            try {
                for (int r = 0; r < recordCounts; r++) {
                    // for each row write n columns
                    for (int c = 0; c < columnCount; c++) {
                        if (c != 0) {
                            writer.append(colSeparator);
                        }
                        writer.append(String.valueOf(Math.random()));
                    }
                    writer.append("\n");
                }
            } finally {
                writer.close();
                out.close();
            }
        }
    }

    /**
     * Write RCFile content
     */
    private static class HiveRCFileTestWriter implements FileTestWriter {

        @Override
        public void writeTestData(File file, int recordCounts,
                int columnCount, String colSeparator) throws IOException {

            // write random test data
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.getLocal(conf);

            RCFileOutputFormat.setColumnNumber(conf, columnCount);
            RCFile.Writer writer = new RCFile.Writer(fs, conf, new Path(
                    file.getAbsolutePath()));

            BytesRefArrayWritable bytes = new BytesRefArrayWritable(
                    columnCount);
            for (int c = 0; c < columnCount; c++) {
                bytes.set(c, new BytesRefWritable());
            }

            try {
                for (int r = 0; r < recordCounts; r++) {
                    // for each row write n columns
                    for (int c = 0; c < columnCount; c++) {
                        byte[] stringbytes = String.valueOf(Math.random())
                                .getBytes();
                        bytes.get(c).set(stringbytes, 0, stringbytes.length);
                    }
                    writer.append(bytes);
                }
            } finally {
                writer.close();
            }
        }
    }

}