package org.apache.pig.piggybank.test.storage;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.regex.Pattern;

import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.ql.io.RCFileRecordReader;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
import org.apache.hadoop.hive.serde2.columnar.ColumnarStruct;
import org.apache.hadoop.hive.serde2.lazy.LazyArray;
import org.apache.hadoop.hive.serde2.lazy.LazyMap;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.piggybank.storage.hiverc.HiveRCSchemaUtil;
import org.apache.pig.test.Util;
import org.junit.Test;

/**
 * Tests that HiveColumnarStorage writes Pig relations out as RCFiles that
 * Hive's ColumnarSerDe can read back, covering plain rows, tuples, bags and
 * maps.
 */
public class TestHiveColumnarStorage extends TestCase {

    static Configuration conf = null;
    static private FileSystem fs;

    static File simpleDataFile = null;
    static File simpleDataDir = null;

    static int simpleDirFileCount = 3;
    static int simpleRowCount = 10;
    static int columnCount = 3;

    @Override
    public synchronized void setUp() throws Exception {
        conf = new Configuration();
        fs = LocalFileSystem.getLocal(conf);
        produceSimpleData();
        // Util.deleteDirectory(new File("testhiveColumnarStore"));
    }

    @Override
    public void tearDown() {
        Util.deleteDirectory(simpleDataDir);
        Util.deleteDirectory(new File("testhiveColumnarStore"));
        simpleDataFile.delete();
    }

    /**
     * Stores a plain three-column relation and checks that every field of the
     * resulting RCFile row deserializes as a Hive string (LazyString).
     */
    @Test
    public void testShouldStoreRowInHiveFormat() throws IOException, InterruptedException, SerDeException {
        String loadString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
        String storeString = "org.apache.pig.piggybank.storage.HiveColumnarStorage()";

        String singlePartitionedFile = simpleDataFile.getAbsolutePath();
        File outputFile = new File("testhiveColumnarStore");

        PigServer server = new PigServer(ExecType.LOCAL);
        server.setBatchOn();
        server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedFile)
                + "' using " + loadString + ";");
        // when
        server.store("a", outputFile.getAbsolutePath(), storeString);

        // then
        Path outputPath = new Path(outputFile.getAbsolutePath() + "/part-m-00000.rc");
        ColumnarStruct struct = readRow(outputFile, outputPath, "f1 string,f2 string,f3 string");

        assertEquals(3, struct.getFieldsAsList().size());
        Object o = struct.getField(0);
        assertEquals(LazyString.class, o.getClass());
        o = struct.getField(1);
        assertEquals(LazyString.class, o.getClass());
        o = struct.getField(2);
        assertEquals(LazyString.class, o.getClass());
    }
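    /**
     * Stores a relation whose second field is a Pig tuple (TOTUPLE(f2,f3))
     * and checks that it is written as a Hive array<string> whose elements
     * are the original "Sample value" cells.
     */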
    @Test
    public void testShouldStoreTupleAsHiveArray() throws IOException, InterruptedException, SerDeException {
        String loadString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
        String storeString = "org.apache.pig.piggybank.storage.HiveColumnarStorage()";

        String singlePartitionedFile = simpleDataFile.getAbsolutePath();
        File outputFile = new File("testhiveColumnarStore");

        PigServer server = new PigServer(ExecType.LOCAL);
        server.setBatchOn();
        server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedFile)
                + "' using " + loadString + ";");
        server.registerQuery("b = FOREACH a GENERATE f1, TOTUPLE(f2,f3);");
        // when
        server.store("b", outputFile.getAbsolutePath(), storeString);

        // then
        Path outputPath = new Path(outputFile.getAbsolutePath() + "/part-m-00000.rc");
        ColumnarStruct struct = readRow(outputFile, outputPath, "f1 string,f2 array<string>");

        assertEquals(2, struct.getFieldsAsList().size());
        Object o = struct.getField(0);
        assertEquals(LazyString.class, o.getClass());
        o = struct.getField(1);
        assertEquals(LazyArray.class, o.getClass());

        LazyArray arr = (LazyArray) o;
        List<Object> values = arr.getList();
        for (Object value : values) {
            assertEquals(LazyString.class, value.getClass());
            String valueStr = ((LazyString) value).getWritableObject().toString();
            assertEquals("Sample value", valueStr);
        }
    }

    /**
     * Stores a relation whose second field is a Pig bag (TOBAG(f2,f3)) and
     * checks that it is also written as a Hive array<string>.
     */
    @Test
    public void testShouldStoreBagAsHiveArray() throws IOException, InterruptedException, SerDeException {
        String loadString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
        String storeString = "org.apache.pig.piggybank.storage.HiveColumnarStorage()";

        String singlePartitionedFile = simpleDataFile.getAbsolutePath();
        File outputFile = new File("testhiveColumnarStore");

        PigServer server = new PigServer(ExecType.LOCAL);
        server.setBatchOn();
        server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedFile)
                + "' using " + loadString + ";");
        server.registerQuery("b = FOREACH a GENERATE f1, TOBAG(f2,f3);");
        // when
        server.store("b", outputFile.getAbsolutePath(), storeString);

        // then
        Path outputPath = new Path(outputFile.getAbsolutePath() + "/part-m-00000.rc");
        ColumnarStruct struct = readRow(outputFile, outputPath, "f1 string,f2 array<string>");

        assertEquals(2, struct.getFieldsAsList().size());
        Object o = struct.getField(0);
        assertEquals(LazyString.class, o.getClass());
        o = struct.getField(1);
        assertEquals(LazyArray.class, o.getClass());

        LazyArray arr = (LazyArray) o;
        List<Object> values = arr.getList();
        for (Object value : values) {
            assertEquals(LazyString.class, value.getClass());
            String valueStr = ((LazyString) value).getWritableObject().toString();
            assertEquals("Sample value", valueStr);
        }
    }
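    /**
     * Stores a relation whose second field is a Pig map (TOMAP(f2,f3)) and
     * checks that it is written as a Hive map<string,string> with
     * "Sample value" for both keys and values.
     */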
    @Test
    public void testShouldStoreMapAsHiveMap() throws IOException, InterruptedException, SerDeException {
        String loadString = "org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string')";
        String storeString = "org.apache.pig.piggybank.storage.HiveColumnarStorage()";

        String singlePartitionedFile = simpleDataFile.getAbsolutePath();
        File outputFile = new File("testhiveColumnarStore");

        PigServer server = new PigServer(ExecType.LOCAL);
        server.setBatchOn();
        server.registerQuery("a = LOAD '" + Util.encodeEscape(singlePartitionedFile)
                + "' using " + loadString + ";");
        server.registerQuery("b = FOREACH a GENERATE f1, TOMAP(f2,f3);");
        // when
        server.store("b", outputFile.getAbsolutePath(), storeString);

        // then
        Path outputPath = new Path(outputFile.getAbsolutePath() + "/part-m-00000.rc");
        ColumnarStruct struct = readRow(outputFile, outputPath, "f1 string,f2 map<string,string>");

        assertEquals(2, struct.getFieldsAsList().size());
        Object o = struct.getField(0);
        assertEquals(LazyString.class, o.getClass());
        o = struct.getField(1);
        assertEquals(LazyMap.class, o.getClass());

        LazyMap map = (LazyMap) o;
        Map<Object, Object> values = map.getMap();
        for (Entry<Object, Object> entry : values.entrySet()) {
            assertEquals(LazyString.class, entry.getKey().getClass());
            assertEquals(LazyString.class, entry.getValue().getClass());

            String keyStr = ((LazyString) entry.getKey()).getWritableObject().toString();
            assertEquals("Sample value", keyStr);
            String valueStr = ((LazyString) entry.getValue()).getWritableObject().toString();
            assertEquals("Sample value", valueStr);
        }
    }

    /**
     * Reads the first row of the RCFile at outputPath and deserializes it
     * with a ColumnarSerDe configured from the given Hive schema string.
     */
    private ColumnarStruct readRow(File outputFile, Path outputPath, String schema) throws IOException,
            InterruptedException, SerDeException {

        FileSplit fileSplit = new FileSplit(outputPath, 0L, outputFile.length(), (String[]) null);
        Path splitPath = fileSplit.getPath();

        RCFileRecordReader<LongWritable, BytesRefArrayWritable> rcFileRecordReader =
            new RCFileRecordReader<LongWritable, BytesRefArrayWritable>(new Configuration(false),
                new org.apache.hadoop.mapred.FileSplit(splitPath, fileSplit.getStart(),
                    fileSplit.getLength(), new org.apache.hadoop.mapred.JobConf(conf)));

        LongWritable key = rcFileRecordReader.createKey();
        BytesRefArrayWritable value = rcFileRecordReader.createValue();
        rcFileRecordReader.next(key, value);
        rcFileRecordReader.close();

        return readColumnarStruct(value, schema);
    }

    /**
     * Builds a ColumnarSerDe from the column names and types in the schema
     * string and uses it to deserialize one row's worth of column bytes.
     */
    private ColumnarStruct readColumnarStruct(BytesRefArrayWritable buff, String schema) throws SerDeException {
        Pattern pcols = Pattern.compile("[a-zA-Z_0-9]*[ ]");
        List<String> types = HiveRCSchemaUtil.parseSchemaTypes(schema);
        List<String> cols = HiveRCSchemaUtil.parseSchema(pcols, schema);

        List<FieldSchema> fieldSchemaList = new ArrayList<FieldSchema>(cols.size());
        for (int i = 0; i < cols.size(); i++) {
            fieldSchemaList.add(new FieldSchema(cols.get(i),
                    HiveRCSchemaUtil.findPigDataType(types.get(i))));
        }

        // ColumnarSerDe is configured through the column name/type properties.
        Properties props = new Properties();
        props.setProperty(Constants.LIST_COLUMNS, HiveRCSchemaUtil.listToString(cols));
        props.setProperty(Constants.LIST_COLUMN_TYPES, HiveRCSchemaUtil.listToString(types));

        Configuration hiveConf = new HiveConf(conf, SessionState.class);
        ColumnarSerDe serde = new ColumnarSerDe();
        serde.initialize(hiveConf, props);

        return (ColumnarStruct) serde.deserialize(buff);
    }
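    /*
     * For reference, the store pipeline exercised by the tests above is the
     * equivalent of running something like the following from the grunt
     * shell ('simpleData' is a placeholder for the generated test file):
     *
     *   a = LOAD 'simpleData' USING
     *       org.apache.pig.piggybank.storage.HiveColumnarLoader('f1 string,f2 string,f3 string');
     *   b = FOREACH a GENERATE f1, TOMAP(f2,f3);
     *   STORE b INTO 'testhiveColumnarStore' USING
     *       org.apache.pig.piggybank.storage.HiveColumnarStorage();
     */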
    /**
     * Writes out a single temporary RCFile with {@link #columnCount} columns
     * and {@link #simpleRowCount} rows, plus a directory of
     * {@link #simpleDirFileCount} files of the same shape.<br/>
     * Every cell holds the constant string "Sample value".
     *
     * @throws SerDeException
     * @throws IOException
     */
    private static final void produceSimpleData() throws SerDeException, IOException {
        // produce one single file
        simpleDataFile = File.createTempFile("testhiveColumnarLoader", ".txt");
        simpleDataFile.deleteOnExit();

        Path path = new Path(simpleDataFile.getPath());
        writeRCFileTest(fs, simpleRowCount, path, columnCount, new DefaultCodec());

        // produce a folder of simple data
        simpleDataDir = new File("simpleDataDir" + System.currentTimeMillis());
        simpleDataDir.mkdir();

        for (int i = 0; i < simpleDirFileCount; i++) {
            // use a local variable so simpleDataFile keeps pointing at the
            // single file written above
            File dirFile = new File(simpleDataDir, "testhiveColumnarLoader-" + i + ".txt");
            Path filePath = new Path(dirFile.getPath());
            writeRCFileTest(fs, simpleRowCount, filePath, columnCount, new DefaultCodec());
        }
    }

    private static int writeRCFileTest(FileSystem fs, int rowCount, Path file, int columnNum,
            CompressionCodec codec) throws IOException {
        fs.delete(file, true);

        int rowsWritten = 0;

        RCFileOutputFormat.setColumnNumber(conf, columnNum);
        RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, codec);

        // one BytesRefWritable per column, reused for every row
        BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
        byte[][] columnValues = new byte[columnNum][];
        for (int i = 0; i < columnNum; i++) {
            BytesRefWritable cu = new BytesRefWritable();
            bytes.set(i, cu);
        }

        for (int i = 0; i < rowCount; i++) {
            bytes.resetValid(columnValues.length);
            for (int j = 0; j < columnValues.length; j++) {
                columnValues[j] = "Sample value".getBytes();
                bytes.get(j).set(columnValues[j], 0, columnValues[j].length);
            }
            rowsWritten++;
            writer.append(bytes);
        }
        writer.close();

        return rowsWritten;
    }
}