/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.howl.rcfile;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import junit.framework.Assert;
import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.howl.common.HowlException;
import org.apache.howl.common.HowlUtil;
import org.apache.howl.data.DefaultHowlRecord;
import org.apache.howl.data.HowlRecord;
import org.apache.howl.data.schema.HowlSchema;

public class TestRCFileInputStorageDriver extends TestCase {

  private static Configuration conf = new Configuration();
  private static Path file;
  private static FileSystem fs;

  static {
    try {
      fs = FileSystem.getLocal(conf);
      Path dir = new Path(System.getProperty("test.data.dir", ".") + "/mapred");
      file = new Path(dir, "test_rcfile");
      fs.delete(dir, true);
    } catch (Exception e) {
    }
  }

  public void testConvertValueToTuple() throws IOException, InterruptedException {
    fs.delete(file, true);

    byte[][] record_1 = {"123".getBytes("UTF-8"), "456".getBytes("UTF-8"),
        "789".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
        "howl and hadoop".getBytes("UTF-8"), new byte[0], "\\N".getBytes("UTF-8")};
    byte[][] record_2 = {"100".getBytes("UTF-8"), "200".getBytes("UTF-8"),
        "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
        "howl and hadoop".getBytes("UTF-8"), new byte[0], "\\N".getBytes("UTF-8")};

    RCFileOutputFormat.setColumnNumber(conf, 8);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
      bytes.set(i, cu);
    }
    writer.append(bytes);
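    // Append a second row so the reader below has two records to iterate over.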
    BytesRefArrayWritable bytes2 = new BytesRefArrayWritable(record_2.length);
    for (int i = 0; i < record_2.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
      bytes2.set(i, cu);
    }
    writer.append(bytes2);
    writer.close();

    BytesRefArrayWritable[] bytesArr = new BytesRefArrayWritable[]{bytes, bytes2};

    HowlSchema schema = buildHiveSchema();
    RCFileInputDriver sd = new RCFileInputDriver();
    JobContext jc = new JobContext(conf, new JobID());
    sd.setInputPath(jc, file.toString());
    InputFormat<?,?> iF = sd.getInputFormat(null);
    InputSplit split = iF.getSplits(jc).get(0);
    sd.setOriginalSchema(jc, schema);
    sd.setOutputSchema(jc, schema);
    sd.initialize(jc, getProps());

    TaskAttemptContext tac = new TaskAttemptContext(conf, new TaskAttemptID());
    RecordReader<?,?> rr = iF.createRecordReader(split, tac);
    rr.initialize(split, tac);
    HowlRecord[] tuples = getExpectedRecords();
    for (int j = 0; j < 2; j++) {
      Assert.assertTrue(rr.nextKeyValue());
      BytesRefArrayWritable w = (BytesRefArrayWritable) rr.getCurrentValue();
      Assert.assertEquals(bytesArr[j], w);
      HowlRecord t = sd.convertToHowlRecord(null, w);
      Assert.assertEquals(8, t.size());
      Assert.assertEquals(t, tuples[j]);
    }
  }

  public void testPruning() throws IOException, InterruptedException {
    fs.delete(file, true);

    byte[][] record_1 = {"123".getBytes("UTF-8"), "456".getBytes("UTF-8"),
        "789".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
        "howl and hadoop".getBytes("UTF-8"), new byte[0], "\\N".getBytes("UTF-8")};
    byte[][] record_2 = {"100".getBytes("UTF-8"), "200".getBytes("UTF-8"),
        "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
        "howl and hadoop".getBytes("UTF-8"), new byte[0], "\\N".getBytes("UTF-8")};

    RCFileOutputFormat.setColumnNumber(conf, 8);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
      bytes.set(i, cu);
    }
    writer.append(bytes);

    BytesRefArrayWritable bytes2 = new BytesRefArrayWritable(record_2.length);
    for (int i = 0; i < record_2.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
      bytes2.set(i, cu);
    }
    writer.append(bytes2);
    writer.close();

    BytesRefArrayWritable[] bytesArr = new BytesRefArrayWritable[]{bytes, bytes2};

    RCFileInputDriver sd = new RCFileInputDriver();
    JobContext jc = new JobContext(conf, new JobID());
    sd.setInputPath(jc, file.toString());
    InputFormat<?,?> iF = sd.getInputFormat(null);
    InputSplit split = iF.getSplits(jc).get(0);
    sd.setOriginalSchema(jc, buildHiveSchema());
    sd.setOutputSchema(jc, buildPrunedSchema());
    sd.initialize(jc, getProps());
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
        jc.getConfiguration().get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));

    TaskAttemptContext tac = new TaskAttemptContext(conf, new TaskAttemptID());
    RecordReader<?,?> rr = iF.createRecordReader(split, tac);
    rr.initialize(split, tac);
    HowlRecord[] tuples = getPrunedRecords();
    for (int j = 0; j < 2; j++) {
      Assert.assertTrue(rr.nextKeyValue());
      BytesRefArrayWritable w = (BytesRefArrayWritable) rr.getCurrentValue();
      Assert.assertFalse(bytesArr[j].equals(w));
      Assert.assertEquals(w.size(), 8);
      HowlRecord t = sd.convertToHowlRecord(null, w);
      Assert.assertEquals(5, t.size());
      Assert.assertEquals(t, tuples[j]);
    }
    assertFalse(rr.nextKeyValue());
  }
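  /**
   * Verifies column reordering: the output schema lists the columns in a different
   * order than the file, adds a partition column ("part1") and a column that does
   * not exist in the data ("newCol"), which should come back as null.
   */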
"789".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"), new byte[0], "\\N".getBytes("UTF-8")}; byte[][] record_2 = {"100".getBytes("UTF-8"), "200".getBytes("UTF-8"), "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "howl and hadoop".getBytes("UTF-8"), new byte[0], "\\N".getBytes("UTF-8")}; RCFileOutputFormat.setColumnNumber(conf, 8); RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec()); BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length); for (int i = 0; i < record_1.length; i++) { BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length); bytes.set(i, cu); } writer.append(bytes); BytesRefArrayWritable bytes2 = new BytesRefArrayWritable(record_2.length); for (int i = 0; i < record_2.length; i++) { BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length); bytes2.set(i, cu); } writer.append(bytes2); writer.close(); BytesRefArrayWritable[] bytesArr = new BytesRefArrayWritable[]{bytes,bytes2}; RCFileInputDriver sd = new RCFileInputDriver(); JobContext jc = new JobContext(conf, new JobID()); sd.setInputPath(jc, file.toString()); InputFormat<?,?> iF = sd.getInputFormat(null); InputSplit split = iF.getSplits(jc).get(0); sd.setOriginalSchema(jc, buildHiveSchema()); sd.setOutputSchema(jc, buildReorderedSchema()); sd.initialize(jc, getProps()); Map<String,String> map = new HashMap<String,String>(1); map.put("part1", "first-part"); sd.setPartitionValues(jc, map); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,jc.getConfiguration().get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); TaskAttemptContext tac = new TaskAttemptContext(conf, new TaskAttemptID()); RecordReader<?,?> rr = iF.createRecordReader(split,tac); rr.initialize(split, tac); HowlRecord[] tuples = getReorderedCols(); for(int j=0; j < 2; j++){ Assert.assertTrue(rr.nextKeyValue()); BytesRefArrayWritable w = (BytesRefArrayWritable)rr.getCurrentValue(); Assert.assertFalse(bytesArr[j].equals(w)); Assert.assertEquals(w.size(), 8); HowlRecord t = sd.convertToHowlRecord(null,w); Assert.assertEquals(7, t.size()); Assert.assertEquals(t,tuples[j]); } assertFalse(rr.nextKeyValue()); } private HowlRecord[] getExpectedRecords(){ List<Object> rec_1 = new ArrayList<Object>(8); rec_1.add(new Byte("123")); rec_1.add(new Short("456")); rec_1.add( new Integer(789)); rec_1.add( new Long(1000L)); rec_1.add( new Double(5.3D)); rec_1.add( new String("howl and hadoop")); rec_1.add( null); rec_1.add( null); HowlRecord tup_1 = new DefaultHowlRecord(rec_1); List<Object> rec_2 = new ArrayList<Object>(8); rec_2.add( new Byte("100")); rec_2.add( new Short("200")); rec_2.add( new Integer(123)); rec_2.add( new Long(1000L)); rec_2.add( new Double(5.3D)); rec_2.add( new String("howl and hadoop")); rec_2.add( null); rec_2.add( null); HowlRecord tup_2 = new DefaultHowlRecord(rec_2); return new HowlRecord[]{tup_1,tup_2}; } private HowlRecord[] getPrunedRecords(){ List<Object> rec_1 = new ArrayList<Object>(8); rec_1.add(new Byte("123")); rec_1.add( new Integer(789)); rec_1.add( new Double(5.3D)); rec_1.add( new String("howl and hadoop")); rec_1.add( null); HowlRecord tup_1 = new DefaultHowlRecord(rec_1); List<Object> rec_2 = new ArrayList<Object>(8); rec_2.add( new Byte("100")); rec_2.add( new Integer(123)); rec_2.add( new Double(5.3D)); rec_2.add( new String("howl and hadoop")); rec_2.add( null); HowlRecord tup_2 = new DefaultHowlRecord(rec_2); return new HowlRecord[]{tup_1,tup_2}; } 
  private HowlSchema buildHiveSchema() throws HowlException {
    List<FieldSchema> fields = new ArrayList<FieldSchema>(8);
    fields.add(new FieldSchema("atinyint", "tinyint", ""));
    fields.add(new FieldSchema("asmallint", "smallint", ""));
    fields.add(new FieldSchema("aint", "int", ""));
    fields.add(new FieldSchema("along", "bigint", ""));
    fields.add(new FieldSchema("adouble", "double", ""));
    fields.add(new FieldSchema("astring", "string", ""));
    fields.add(new FieldSchema("anullint", "int", ""));
    fields.add(new FieldSchema("anullstring", "string", ""));
    return new HowlSchema(HowlUtil.getHowlFieldSchemaList(fields));
  }

  private HowlSchema buildPrunedSchema() throws HowlException {
    List<FieldSchema> fields = new ArrayList<FieldSchema>(5);
    fields.add(new FieldSchema("atinyint", "tinyint", ""));
    fields.add(new FieldSchema("aint", "int", ""));
    fields.add(new FieldSchema("adouble", "double", ""));
    fields.add(new FieldSchema("astring", "string", ""));
    fields.add(new FieldSchema("anullint", "int", ""));
    return new HowlSchema(HowlUtil.getHowlFieldSchemaList(fields));
  }

  private HowlSchema buildReorderedSchema() throws HowlException {
    List<FieldSchema> fields = new ArrayList<FieldSchema>(7);
    fields.add(new FieldSchema("aint", "int", ""));
    fields.add(new FieldSchema("part1", "string", ""));
    fields.add(new FieldSchema("adouble", "double", ""));
    fields.add(new FieldSchema("newCol", "tinyint", ""));
    fields.add(new FieldSchema("astring", "string", ""));
    fields.add(new FieldSchema("atinyint", "tinyint", ""));
    fields.add(new FieldSchema("anullint", "int", ""));
    return new HowlSchema(HowlUtil.getHowlFieldSchemaList(fields));
  }

  private HowlRecord[] getReorderedCols() {
    List<Object> rec_1 = new ArrayList<Object>(7);
    rec_1.add(new Integer(789));
    rec_1.add(new String("first-part"));
    rec_1.add(new Double(5.3D));
    rec_1.add(null); // new column
    rec_1.add(new String("howl and hadoop"));
    rec_1.add(new Byte("123"));
    rec_1.add(null);
    HowlRecord tup_1 = new DefaultHowlRecord(rec_1);

    List<Object> rec_2 = new ArrayList<Object>(7);
    rec_2.add(new Integer(123));
    rec_2.add(new String("first-part"));
    rec_2.add(new Double(5.3D));
    rec_2.add(null);
    rec_2.add(new String("howl and hadoop"));
    rec_2.add(new Byte("100"));
    rec_2.add(null);
    HowlRecord tup_2 = new DefaultHowlRecord(rec_2);

    return new HowlRecord[]{tup_1, tup_2};
  }

  private Properties getProps() {
    Properties props = new Properties();
    props.setProperty(Constants.SERIALIZATION_NULL_FORMAT, "\\N");
    props.setProperty(Constants.SERIALIZATION_FORMAT, "9");
    return props;
  }
}