/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.zebra.io;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import junit.framework.Assert;
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.zebra.tfile.RawComparable;
import org.apache.hadoop.zebra.io.BasicTableStatus;
import org.apache.hadoop.zebra.io.ColumnGroup;
import org.apache.hadoop.zebra.io.KeyDistribution;
import org.apache.hadoop.zebra.io.TableInserter;
import org.apache.hadoop.zebra.io.TableScanner;
import org.apache.hadoop.zebra.io.ColumnGroup.Reader.CGRangeSplit;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* Tests the ColumnGroup APIs, invoking them the way MapReduce jobs would.
*/
public class TestColumnGroup {
static Configuration conf;
static Random random;
static Path rootPath;
static FileSystem fs;
/**
 * One-time fixture setup: initializes the shared configuration, the random
 * source, the root path, and the file system used by the tests in this class.
 *
 * @throws IOException propagated from resolving the file system of the root path
 */
@BeforeClass
public static void setUpOnce() throws IOException {
  conf = new Configuration();
  conf.setInt("table.output.tfile.minBlock.size", 64 * 1024);
  conf.setInt("table.input.split.minSize", 64 * 1024);
  conf.set("table.output.tfile.compression", "none");
  conf.set("io.compression.codec.lzo.class", "no");
  // The tests are randomized; report the seed so a failing run can be
  // reproduced by temporarily hard-coding it.
  long seed = System.nanoTime();
  System.out.println("TestColumnGroup random seed: " + seed);
  random = new Random(seed);
  rootPath = new Path(System.getProperty("test.build.data",
      "build/test/data/workdir3"));
  fs = rootPath.getFileSystem(conf);
}
/**
 * One-time teardown hook. The body is empty, so this method performs no
 * cleanup of anything the tests created.
 */
@AfterClass
public static void tearDownOnce() throws IOException {
}
/**
 * Builds a key from an index drawn from the shared random source.
 *
 * @param max exclusive upper bound of the random index
 * @return the key produced by {@code makeKey} for the drawn index
 */
BytesWritable makeRandomKey(int max) {
  final int keyIndex = random.nextInt(max);
  return makeKey(keyIndex);
}
/**
 * Encodes an index as a key: the text "key" followed by the index formatted
 * with {@code %09d}, converted to bytes with the platform default charset.
 *
 * @param i the index to encode
 * @return the key bytes wrapped in a BytesWritable
 */
static BytesWritable makeKey(int i) {
  final String keyText = "key" + String.format("%09d", i);
  final byte[] keyBytes = keyText.getBytes();
  return new BytesWritable(keyBytes);
}
/**
 * Builds a string made of the given prefix followed by a random number
 * formatted with {@code %09d}.
 *
 * @param prefix text placed in front of the number
 * @param max exclusive upper bound of the random number
 * @return the prefix followed by the formatted number
 */
String makeString(String prefix, int max) {
  final int suffix = random.nextInt(max);
  return prefix + String.format("%09d", suffix);
}
/**
 * Creates a column group at the given path and fills it with random rows.
 *
 * <p>Any existing column group at the path is dropped first. The parts are
 * written in a randomly shuffled order, each through its own writer and an
 * inserter named {@code part-NNNNNN}. A part receives
 * {@code random.nextInt(rows) + rows / 2} rows, unless {@code rows} is zero
 * or the part index is listed in {@code emptyTFiles}, in which case the
 * inserter is closed without inserting anything.
 *
 * @param parts number of parts (inserters) to write
 * @param rows base number of rows per part; zero writes no rows at all
 * @param strSchema schema string of the column group
 * @param path location of the column group
 * @param properClose whether to open a final writer and close it; for sorted
 *        column groups the row count reported by the status is then verified
 * @param sorted whether keys are generated in increasing order (otherwise
 *        random keys are used)
 * @param emptyTFiles part indices that must stay empty; may be null
 * @return the total number of rows inserted
 * @throws IOException on I/O failure, or if a tuple field cannot be set
 * @throws ParseException if the schema cannot be parsed
 */
int createCG(int parts, int rows, String strSchema, Path path,
    boolean properClose, boolean sorted, int[] emptyTFiles)
    throws IOException, ParseException {
  if (fs.exists(path)) {
    ColumnGroup.drop(path, conf);
  }
  Set<Integer> emptyTFileSet = new HashSet<Integer>();
  if (emptyTFiles != null) {
    for (int emptyPart : emptyTFiles) {
      emptyTFileSet.add(emptyPart);
    }
  }
  ColumnGroup.Writer writer = new ColumnGroup.Writer(path, strSchema, sorted, path.getName(),
      "pig", "gz", "root", null, Short.parseShort("755", 8), false, conf);
  writer.finish();
  int total = 0;
  Schema schema = new Schema(strSchema);
  String[] colNames = schema.getColumns();
  Tuple tuple = TypesUtils.createTuple(schema);
  // Write the parts in a random order: start from the identity permutation
  // and swap each position with a randomly chosen earlier-or-equal one.
  int[] permutation = new int[parts];
  for (int i = 0; i < parts; ++i) {
    permutation[i] = i;
  }
  for (int i = parts - 1; i > 0; --i) {
    int targetIndex = random.nextInt(i + 1);
    int tmp = permutation[i];
    permutation[i] = permutation[targetIndex];
    permutation[targetIndex] = tmp;
  }
  for (int i = 0; i < parts; ++i) {
    writer = new ColumnGroup.Writer(path, conf);
    TableInserter inserter = writer.getInserter(String.format("part-%06d",
        permutation[i]), true);
    if ((rows > 0) && !emptyTFileSet.contains(permutation[i])) {
      int actualRows = random.nextInt(rows) + rows / 2;
      for (int j = 0; j < actualRows; ++j, ++total) {
        // Sorted column groups get strictly increasing keys across all parts.
        BytesWritable key = sorted ? makeKey(total) : makeRandomKey(rows * 10);
        TypesUtils.resetTuple(tuple);
        for (int k = 0; k < tuple.size(); ++k) {
          try {
            tuple.set(k, new DataByteArray(makeString("col-" + colNames[k], rows * 10).getBytes()));
          } catch (ExecException e) {
            // Fail the test instead of silently inserting an incomplete tuple.
            throw new IOException("Failed to set field " + k + " of the test tuple", e);
          }
        }
        inserter.insert(key, tuple);
      }
    }
    inserter.close();
  }
  if (properClose) {
    writer = new ColumnGroup.Writer(path, conf);
    writer.close();
    /* We can only test number of rows on sorted tables. */
    if (sorted) {
      BasicTableStatus status = getStatus(path);
      Assert.assertEquals(total, status.getRows());
    }
  }
  return total;
}
/**
 * Key generator that emits runs of duplicate keys. Every distinct key index
 * is returned for a whole run; the run lengths double while growing and halve
 * while shrinking, switching direction once the current length passes
 * {@code high} (when growing) or drops below {@code low} (when shrinking).
 */
static class DupKeyGen {
  int low, high;
  int current;
  boolean grow = true;
  int index = 0;
  int count = 0;

  /**
   * @param low requested minimum run length; values below 10 are raised to 10
   * @param high requested maximum run length; raised to at least twice the
   *        effective minimum
   */
  DupKeyGen(int low, int high) {
    final int floor = Math.max(10, low);
    final int ceiling = Math.max(floor * 2, high);
    this.low = floor;
    this.high = ceiling;
    this.current = floor;
  }

  /** Returns the next key, moving on to a new key index when the run is used up. */
  BytesWritable next() {
    if (count == 0) {
      index += 1;
      count = nextCount();
    }
    count -= 1;
    return makeKey(index);
  }

  /** Returns the length of the next run and updates the length for the one after. */
  int nextCount() {
    final int runLength = current;
    final boolean pastLimit = grow ? (current > high) : (current < low);
    if (pastLimit) {
      grow = !grow;
    }
    current = grow ? current * 2 : current / 2;
    return runLength;
  }
}
/**
 * Creates a sorted column group whose keys come in runs of duplicates
 * produced by {@link DupKeyGen}.
 *
 * <p>Any existing column group at the path is dropped first. The parts are
 * written in a randomly shuffled order; when {@code rows} is positive each
 * part receives {@code random.nextInt(rows * 2 / 3) + rows * 2 / 3} rows.
 * After all parts are written, a final writer is opened and closed and the
 * row count reported by the status is checked against the number inserted.
 *
 * @param parts number of parts (inserters) to write
 * @param rows base number of rows per part; zero writes no rows
 * @param strSchema schema string of the column group
 * @param path location of the column group
 * @return the total number of rows inserted
 * @throws IOException on I/O failure, or if a tuple field cannot be set
 * @throws ParseException if the schema cannot be parsed
 */
int createCGDupKeys(int parts, int rows, String strSchema, Path path)
    throws IOException, ParseException {
  if (fs.exists(path)) {
    ColumnGroup.drop(path, conf);
  }
  ColumnGroup.Writer writer = new ColumnGroup.Writer(path, strSchema, true, path.getName(),
      "pig", "gz", "root", null, Short.parseShort("777", 8), false, conf);
  writer.finish();
  int total = 0;
  DupKeyGen keyGen = new DupKeyGen(10, rows * 3);
  Schema schema = new Schema(strSchema);
  String[] colNames = schema.getColumns();
  Tuple tuple = TypesUtils.createTuple(schema);
  // Write the parts in a random order: start from the identity permutation
  // and swap each position with a randomly chosen earlier-or-equal one.
  int[] permutation = new int[parts];
  for (int i = 0; i < parts; ++i) {
    permutation[i] = i;
  }
  for (int i = parts - 1; i > 0; --i) {
    int targetIndex = random.nextInt(i + 1);
    int tmp = permutation[i];
    permutation[i] = permutation[targetIndex];
    permutation[targetIndex] = tmp;
  }
  for (int i = 0; i < parts; ++i) {
    writer = new ColumnGroup.Writer(path, conf);
    TableInserter inserter = writer.getInserter(String.format("part-%06d",
        permutation[i]), true);
    if (rows > 0) {
      // NOTE: rows == 1 makes the bound below zero, which Random.nextInt rejects.
      int actualRows = random.nextInt(rows * 2 / 3) + rows * 2 / 3;
      for (int j = 0; j < actualRows; ++j, ++total) {
        BytesWritable key = keyGen.next();
        TypesUtils.resetTuple(tuple);
        for (int k = 0; k < tuple.size(); ++k) {
          try {
            tuple.set(k, new DataByteArray(makeString("col-" + colNames[k], rows * 10).getBytes()));
          } catch (ExecException e) {
            // Fail the test instead of silently inserting an incomplete tuple.
            throw new IOException("Failed to set field " + k + " of the test tuple", e);
          }
        }
        inserter.insert(key, tuple);
      }
    }
    inserter.close();
  }
  writer = new ColumnGroup.Writer(path, conf);
  writer.close();
  BasicTableStatus status = getStatus(path);
  Assert.assertEquals(total, status.getRows());
  return total;
}
/**
 * Splits the column group into range splits, scans every split with a fresh
 * reader, and verifies that the splits together cover all rows and all bytes.
 *
 * @param numSplits number of splits requested from the reader
 * @param totalRows number of rows the splits are expected to contain in total
 * @param strProjection projection applied to every reader
 * @param path location of the column group
 * @throws IOException on I/O failure
 * @throws ParseException if the projection cannot be parsed
 */
void rangeSplitCG(int numSplits, int totalRows, String strProjection,
    Path path) throws IOException, ParseException {
  long totalBytes;
  List<CGRangeSplit> splits;
  ColumnGroup.Reader reader = new ColumnGroup.Reader(path, conf);
  try {
    reader.setProjection(strProjection);
    totalBytes = reader.getStatus().getSize();
    splits = reader.rangeSplit(numSplits);
  } finally {
    reader.close();
  }
  int total = 0;
  for (CGRangeSplit split : splits) {
    ColumnGroup.Reader splitReader = new ColumnGroup.Reader(path, conf);
    try {
      splitReader.setProjection(strProjection);
      // doReadOnly closes the scanner it is given.
      total += doReadOnly(splitReader.getScanner(split, true));
      totalBytes -= splitReader.getBlockDistribution(split).getLength();
    } finally {
      // Each split opens its own reader; release it before the next one.
      splitReader.close();
    }
  }
  Assert.assertEquals(totalRows, total);
  // The block distributions of all splits must add up to the reported size.
  Assert.assertEquals(0L, totalBytes);
}
/**
 * Runs {@code rangeSplitCG} once for every positive entry of
 * {@code numSplits}; zero and negative entries are skipped.
 *
 * @param numSplits candidate split counts
 * @param totalRows number of rows expected in the column group
 * @param projection projection applied while reading
 * @param path location of the column group
 */
void doRangeSplit(int[] numSplits, int totalRows, String projection, Path path)
    throws IOException, ParseException {
  for (int idx = 0; idx < numSplits.length; ++idx) {
    final int splitCount = numSplits[idx];
    if (splitCount <= 0) {
      continue;
    }
    rangeSplitCG(splitCount, totalRows, projection, path);
  }
}
/**
 * Splits the column group by key boundaries, scans every key range with a
 * fresh reader, and verifies that the ranges together cover all rows.
 *
 * <p>The boundaries come from the reader's key distribution when it holds at
 * least {@code numSplits} keys; otherwise random keys are generated with
 * {@code makeKey}. With {@code n} boundary keys, {@code n} ranges are
 * scanned: the first starts at the beginning, the last runs to the end, and
 * the final boundary key is not used.
 *
 * @param numSplits number of key splits to aim for
 * @param totalRows number of rows the ranges are expected to contain in total
 * @param strProjection projection applied to every reader
 * @param path location of the column group
 * @throws IOException on I/O failure
 * @throws ParseException if the projection cannot be parsed
 */
void keySplitCG(int numSplits, int totalRows, String strProjection, Path path)
    throws IOException, ParseException {
  long totalBytes;
  BlockDistribution lastBd = new BlockDistribution();
  KeyDistribution keyDistri;
  ColumnGroup.Reader reader = new ColumnGroup.Reader(path, conf);
  try {
    reader.setProjection(strProjection);
    totalBytes = reader.getStatus().getSize();
    keyDistri = reader.getKeyDistribution(numSplits * 10, 1, lastBd);
    Assert.assertEquals(totalBytes, keyDistri.length() + lastBd.getLength());
  } finally {
    reader.close();
  }
  BytesWritable[] keys;
  if (keyDistri.size() >= numSplits) {
    keyDistri.resize(lastBd);
    Assert.assertEquals(totalBytes, keyDistri.length() + lastBd.getLength());
    RawComparable[] rawComparables = keyDistri.getKeys();
    keys = new BytesWritable[rawComparables.length];
    for (int i = 0; i < keys.length; ++i) {
      RawComparable raw = rawComparables[i];
      keys[i] = new BytesWritable();
      keys[i].setSize(raw.size());
      System.arraycopy(raw.buffer(), raw.offset(), keys[i].get(), 0, raw.size());
    }
  } else {
    int targetSize = Math.min(totalRows / 10, numSplits);
    // revert to manually cooked up keys; the TreeSet keeps them distinct and
    // in ascending order.
    Set<Integer> keySets = new TreeSet<Integer>();
    while (keySets.size() < targetSize) {
      keySets.add(random.nextInt(totalRows));
    }
    keys = new BytesWritable[targetSize];
    int j = 0;
    for (int keyIndex : keySets) {
      keys[j] = makeKey(keyIndex);
      ++j;
    }
  }
  int total = 0;
  for (int i = 0; i < keys.length; ++i) {
    ColumnGroup.Reader rangeReader = new ColumnGroup.Reader(path, conf);
    try {
      rangeReader.setProjection(strProjection);
      BytesWritable begin = (i == 0) ? null : keys[i - 1];
      BytesWritable end = (i == keys.length - 1) ? null : keys[i];
      // doReadOnly closes the scanner it is given.
      total += doReadOnly(rangeReader.getScanner(begin, end, true));
    } finally {
      // Each range opens its own reader; release it before the next one.
      rangeReader.close();
    }
  }
  Assert.assertEquals(totalRows, total);
}
/**
 * Runs {@code keySplitCG} once for every positive entry of
 * {@code numSplits}; zero and negative entries are skipped.
 *
 * @param numSplits candidate split counts
 * @param totalRows number of rows expected in the column group
 * @param projection projection applied while reading
 * @param path location of the column group
 */
void doKeySplit(int[] numSplits, int totalRows, String projection, Path path)
    throws IOException, ParseException {
  for (int idx = 0; idx < numSplits.length; ++idx) {
    final int splitCount = numSplits[idx];
    if (splitCount <= 0) {
      continue;
    }
    keySplitCG(splitCount, totalRows, projection, path);
  }
}
/**
 * Opens a reader on the column group, fetches its status, and closes the
 * reader again whether or not fetching the status succeeded.
 *
 * @param path location of the column group
 * @return the status reported by the reader
 */
BasicTableStatus getStatus(Path path) throws IOException, ParseException {
  final ColumnGroup.Reader cgReader = new ColumnGroup.Reader(path, conf);
  BasicTableStatus status;
  try {
    status = cgReader.getStatus();
  } finally {
    cgReader.close();
  }
  return status;
}
/**
 * Creates a column group with {@code createCG} and reads it back through
 * range splits and, for sorted column groups, key splits, using several
 * split counts derived from {@code parts}.
 *
 * @param path location of the column group
 * @param parts number of parts to write
 * @param rows base number of rows per part
 * @param schema schema string used for writing
 * @param projection projection used for reading
 * @param properClose forwarded to {@code createCG}
 * @param sorted whether the column group is sorted; key splits are only
 *        exercised when it is
 * @param emptyTFiles part indices to leave empty; may be null
 * @throws IOException on I/O failure
 * @throws ParseException if the schema or projection cannot be parsed
 */
void doReadWrite(Path path, int parts, int rows, String schema,
    String projection, boolean properClose, boolean sorted, int[] emptyTFiles)
    throws IOException, ParseException {
  int totalRows = createCG(parts, rows, schema, path, properClose, sorted,
      emptyTFiles);
  if (rows == 0) {
    // Requesting zero rows per part must produce an empty column group.
    Assert.assertEquals(0, totalRows);
  }
  doRangeSplit(new int[] { 1, 2, parts / 2, parts, 2 * parts }, totalRows,
      projection, path);
  if (sorted) {
    doKeySplit(new int[] { 1, 2, parts / 2, parts, 2 * parts, 10 * parts },
        totalRows, projection, path);
  }
}
/**
 * Scans to the end of the given scanner and counts the rows. For each row it
 * randomly reads the key, the value, both, or neither. The scanner is closed
 * before returning, including when the scan fails.
 *
 * @param scanner the scanner to drain; closed by this method
 * @return the number of rows visited
 * @throws IOException on I/O failure
 * @throws ParseException propagated from the scanner
 */
int doReadOnly(TableScanner scanner) throws IOException, ParseException {
  int total = 0;
  try {
    BytesWritable key = new BytesWritable();
    Tuple value = TypesUtils.createTuple(scanner.getSchema());
    for (; !scanner.atEnd(); scanner.advance()) {
      ++total;
      // nextInt(4) is uniform over 0..3; "nextInt() % 4" also produced
      // negative remainders, which all fell into the no-op branch.
      switch (random.nextInt(4)) {
      case 0:
        scanner.getKey(key);
        break;
      case 1:
        scanner.getValue(value);
        break;
      case 2:
        scanner.getKey(key);
        scanner.getValue(value);
        break;
      default: // no-op.
      }
    }
  } finally {
    scanner.close();
  }
  return total;
}
/**
 * Scanning with a null split, and with null begin and end keys, must each
 * visit every row of the column group. The projection names columns
 * ("d", "f") that are not part of the written schema.
 */
@Test
public void testNullSplits() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestColumnGroupNullSplits");
  final int expectedRows = createCG(2, 10, "a, b, c", cgPath, true, true, null);
  final ColumnGroup.Reader cgReader = new ColumnGroup.Reader(cgPath, conf);
  cgReader.setProjection("a,d,c,f");

  final int rowsViaNullSplit = doReadOnly(cgReader.getScanner(null, false));
  Assert.assertEquals(expectedRows, rowsViaNullSplit);

  final int rowsViaNullKeys = doReadOnly(cgReader.getScanner(null, null, false));
  Assert.assertEquals(expectedRows, rowsViaNullKeys);

  cgReader.close();
}
/**
 * Requests a negative number of range splits (-1) and expects the resulting
 * splits to still cover every row.
 */
@Test
public void testNegativeSplits() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestNegativeSplits");
  final int expectedRows = createCG(2, 100, "a, b, c", cgPath, true, true, null);
  final int requestedSplits = -1;
  rangeSplitCG(requestedSplits, expectedRows, "a,d,c,f", cgPath);
}
/**
 * Round-trips a column group with no parts and no rows, first unsorted and
 * then sorted.
 */
@Test
public void testEmptyCG() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestColumnGroupEmptyCG");
  final boolean[] sortedModes = { false, true };
  for (boolean sortedMode : sortedModes) {
    doReadWrite(cgPath, 0, 0, "a, b, c", "a, d, c, f", true, sortedMode, null);
  }
}
/**
 * Round-trips a column group with two parts that contain no rows, first
 * unsorted and then sorted.
 */
@Test
public void testEmptyTFiles() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestColumnGroupEmptyTFile");
  final boolean[] sortedModes = { false, true };
  for (boolean sortedMode : sortedModes) {
    doReadWrite(cgPath, 2, 0, "a, b, c", "a, d, c, f", true, sortedMode, null);
  }
}
/**
 * Round-trips a column group with two populated parts (base of 500 rows
 * each), first unsorted and then sorted.
 */
@Test
public void testNormalCases() throws IOException, ParseException {
  Path path = new Path(rootPath, "TestColumnGroupNormal");
  doReadWrite(path, 2, 500, "a, b, c", "a, d, c, f", true, false, null);
  doReadWrite(path, 2, 500, "a, b, c", "a, d, c, f", true, true, null);
}
/**
 * Round-trips a two-part column group while passing the part indices
 * {1, 2} as the ones to leave empty, first unsorted and then sorted.
 */
@Test
public void testSomeEmptyTFiles() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestColumnGroupSomeEmptyTFile");
  final int[][] emptyTFileCases = { { 1, 2 } };
  final boolean[] sortedModes = { false, true };
  for (int caseIdx = 0; caseIdx < emptyTFileCases.length; ++caseIdx) {
    final int[] emptyTFiles = emptyTFileCases[caseIdx];
    for (boolean sortedMode : sortedModes) {
      doReadWrite(cgPath, 2, 250, "a, b, c", "a, d, c, f", true, sortedMode,
          emptyTFiles);
    }
  }
}
/**
 * Counts the rows of a column group by scanning it from start to end.
 * The scanner and the reader are both closed before returning, including
 * when the scan fails.
 *
 * @param path location of the column group
 * @param projection projection to apply, or null to leave the reader's
 *        projection untouched
 * @return the number of rows visited by the scanner
 * @throws IOException on I/O failure
 * @throws ParseException if the projection cannot be parsed
 */
int countRows(Path path, String projection) throws IOException,
    ParseException {
  ColumnGroup.Reader reader = new ColumnGroup.Reader(path, conf);
  try {
    if (projection != null) {
      reader.setProjection(projection);
    }
    int totalRows = 0;
    TableScanner scanner = reader.getScanner(null, true);
    try {
      for (; !scanner.atEnd(); scanner.advance()) {
        ++totalRows;
      }
    } finally {
      scanner.close();
    }
    return totalRows;
  } finally {
    // The original never closed the reader.
    reader.close();
  }
}
/**
 * The row count must be the same whether no projection is set (null) or an
 * empty projection string is used.
 */
@Test
public void testProjection() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestColumnGroupProjection");
  final int expectedRows = createCG(2, 250, "a, b, c", cgPath, true, true, null);

  final int rowsWithoutProjection = countRows(cgPath, null);
  Assert.assertEquals(expectedRows, rowsWithoutProjection);

  final int rowsWithEmptyProjection = countRows(cgPath, "");
  Assert.assertEquals(expectedRows, rowsWithEmptyProjection);
}
/**
 * Key splits over a column group containing runs of duplicate keys must
 * still cover every row, for split counts 1 and 5.
 */
@Test
public void testDuplicateKeys() throws IOException, ParseException {
  final Path cgPath = new Path(rootPath, "TestColumnGroupDuplicateKeys");
  final int expectedRows = createCGDupKeys(2, 250, "a, b, c", cgPath);
  final int[] splitCounts = { 1, 5 };
  doKeySplit(splitCounts, expectedRows, "a, d, c, f", cgPath);
}
/**
 * Key splits over a sorted column group written with a larger minimum TFile
 * block size (640 KB) must cover every row, for split counts 1 and 5.
 *
 * <p>The block-size setting lives in the configuration shared by all tests,
 * so the previous value is restored afterwards to keep the other tests
 * independent of execution order.
 */
@Test
public void testSortedCGKeySplit() throws IOException, ParseException {
  final String minBlockSizeKey = "table.output.tfile.minBlock.size";
  // Fallback matches the value assigned in setUpOnce.
  final int previousMinBlockSize = conf.getInt(minBlockSizeKey, 64 * 1024);
  conf.setInt(minBlockSizeKey, 640 * 1024);
  try {
    Path path = new Path(rootPath, "TestSortedCGKeySplit");
    int totalRows = createCG(2, 250, "a, b, c", path, true, true, null);
    doKeySplit(new int[] { 1, 5 }, totalRows, "a, d, c, f",
        path);
  } finally {
    conf.setInt(minBlockSizeKey, previousMinBlockSize);
  }
}
}