/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector.util;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.TestVectorizedORCReader;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
/**
* This class generates an orc file from a specified record class. The orc file
* will contain 3 batches worth of rows for each column for all kinds of data distribution:
* all values, no nulls, repeating value, and repeating null.
*
*/
public class OrcFileGenerator {
/**
* The shapes of data a generated batch can take. The declaration order matters: callers index
* into values() to rotate the distributions across columns and batches.
*/
enum BatchDataDistribution {
/** Random values interleaved with occasional nulls and, where defined, fixed point values. */
AllValues,
/** Random values and, where defined, fixed point values; no entry is null. */
NoNulls,
/** A single random non-null value repeated for every entry of the batch. */
RepeatingValue,
/** Every entry of the batch is null. */
RepeatingNull
}
/**
 * Shared machinery for the per-type batch generators. A concrete generator supplies two things:
 * generateRandomNonNullValue, which produces a random value of its type, and
 * initializeFixedPointValues, which lists a handful of well-known values that are sprinkled
 * into the data (handy when writing query predicates against the generated file). A generator
 * may return null from initializeFixedPointValues to opt out of fixed point values.
 */
private abstract static class BatchGenerator<T> {
  /** A null is emitted in the AllValues distribution whenever the counter is a multiple of this. */
  private static final int NULL_PERIOD = 73;
  /** A fixed point value is emitted whenever the counter is a multiple of this. */
  private static final int FIXED_POINT_PERIOD = 233;

  // Fixed seed keeps the generated data reproducible from run to run.
  private final Random rand = new Random(0xfa57);
  // Running counter (randomly offset at start) that decides when nulls / fixed points appear.
  private int valueCounter = rand.nextInt();
  private final T[] fixedPointValues;

  public BatchGenerator() {
    fixedPointValues = initializeFixedPointValues();
  }

  /**
   * @return the well-known values to mix into the data, or null if there are none
   */
  protected abstract T[] initializeFixedPointValues();

  /**
   * @param rand the source of randomness to draw from
   * @return a random value that is never null
   */
  protected abstract T generateRandomNonNullValue(Random rand);

  /**
   * Produces one batch of VectorizedRowBatch.DEFAULT_SIZE values following the requested
   * distribution.
   *
   * @param dist the data distribution the batch should follow
   * @return the generated values; the backing array is an Object[]
   * @throws UnsupportedOperationException if the distribution is not handled
   */
  @SuppressWarnings("unchecked")
  public T[] generateBatch(BatchDataDistribution dist) {
    final Object[] values = new Object[VectorizedRowBatch.DEFAULT_SIZE];
    for (int slot = 0; slot < values.length; slot++) {
      final Object value;
      switch (dist) {
        case AllValues:
          value = nextMixedValue(true);
          break;
        case NoNulls:
          value = nextMixedValue(false);
          break;
        case RepeatingNull:
          value = null;
          break;
        case RepeatingValue:
          // Only the first slot draws a value; every other slot copies it.
          value = (slot == 0) ? generateRandomNonNullValue(rand) : values[0];
          break;
        default:
          throw new UnsupportedOperationException(
              dist.toString() + " data distribution is not implemented.");
      }
      values[slot] = value;
    }
    // Unchecked: the array really is an Object[]; this only holds up thanks to erasure.
    return (T[]) values;
  }

  /**
   * Picks the next value for the AllValues / NoNulls distributions and advances the counter.
   *
   * @param nullsAllowed whether a null may be produced on this draw
   * @return a null, a fixed point value, or a random value
   */
  private T nextMixedValue(boolean nullsAllowed) {
    final T value;
    if (nullsAllowed && valueCounter % NULL_PERIOD == 0) {
      value = null;
    } else if (fixedPointValues != null && valueCounter % FIXED_POINT_PERIOD == 0) {
      value = fixedPointValues[rand.nextInt(fixedPointValues.length)];
    } else {
      value = generateRandomNonNullValue(rand);
    }
    valueCounter++;
    return value;
  }
}
/**
 * Generator for Byte columns. Random values fall in the range [-64, 62].
 */
private static class ByteBatchGenerator extends BatchGenerator<Byte> {
  @Override
  protected Byte[] initializeFixedPointValues() {
    return new Byte[] {-23, -1, 17, 33};
  }

  @Override
  protected Byte generateRandomNonNullValue(Random rand) {
    // Half the width of the byte range, shifted down so it is roughly centred on zero.
    final int span = (Byte.MAX_VALUE - Byte.MIN_VALUE) / 2;
    final int shift = Math.abs(Byte.MIN_VALUE / 2);
    return (byte) (rand.nextInt(span) - shift);
  }
}
/**
 * Generator for Short columns. Random values fall in the range [-16384, 16382].
 */
private static class ShortBatchGenerator extends BatchGenerator<Short> {
  @Override
  protected Short[] initializeFixedPointValues() {
    return new Short[] {-257, -75, 197, 359};
  }

  @Override
  protected Short generateRandomNonNullValue(Random rand) {
    // Half the width of the short range, shifted down so it is roughly centred on zero.
    final int span = (Short.MAX_VALUE - Short.MIN_VALUE) / 2;
    final int shift = Short.MIN_VALUE / 2;
    return (short) (rand.nextInt(span) + shift);
  }
}
/**
 * Generator for Integer columns. Random values fall in the range [-2^30, 2^30 - 2].
 */
private static class IntegerBatchGenerator extends BatchGenerator<Integer> {
  @Override
  protected Integer[] initializeFixedPointValues() {
    return new Integer[] {-3728, -563, 762, 6981};
  }

  @Override
  protected Integer generateRandomNonNullValue(Random rand) {
    // A non-negative draw shifted down by a quarter of the int range; cannot overflow.
    final int shift = Integer.MIN_VALUE / 2;
    final int drawn = rand.nextInt(Integer.MAX_VALUE);
    return Integer.valueOf(drawn + shift);
  }
}
/**
 * Generator for Long columns. Random values are int-sized draws widened to long.
 */
private static class LongBatchGenerator extends BatchGenerator<Long> {
  @Override
  protected Long[] initializeFixedPointValues() {
    return new Long[] {-89010L, -6432L, 3569L, 988888L};
  }

  @Override
  protected Long generateRandomNonNullValue(Random rand) {
    final long widened = rand.nextInt();
    return Long.valueOf(widened);
  }
}
/**
 * Generator for Float columns. Random values are whole numbers borrowed from the byte
 * generator; only the fixed point values carry a fractional part.
 */
private static class FloatBatchGenerator extends BatchGenerator<Float> {
  // Used purely as a value function; its own batch state is never touched.
  private final ByteBatchGenerator byteGenerator = new ByteBatchGenerator();

  @Override
  protected Float[] initializeFixedPointValues() {
    return new Float[] {
        (float) -26.28,
        (float) -1.389,
        (float) 10.175,
        (float) 79.553
    };
  }

  @Override
  protected Float generateRandomNonNullValue(Random rand) {
    final Byte whole = byteGenerator.generateRandomNonNullValue(rand);
    return Float.valueOf(whole.floatValue());
  }
}
/**
 * Generator for Double columns. Random values are whole numbers borrowed from the short
 * generator; only the fixed point values carry a fractional part.
 */
private static class DoubleBatchGenerator extends BatchGenerator<Double> {
  // Used purely as a value function; its own batch state is never touched.
  private final ShortBatchGenerator shortGenerator = new ShortBatchGenerator();

  @Override
  protected Double[] initializeFixedPointValues() {
    return new Double[] {-5638.15, -863.257, 2563.58, 9763215.5639};
  }

  @Override
  protected Double generateRandomNonNullValue(Random rand) {
    final Short whole = shortGenerator.generateRandomNonNullValue(rand);
    return Double.valueOf(whole.doubleValue());
  }
}
/**
 * Generator for Boolean columns. It defines no fixed point values.
 */
private static class BooleanBatchGenerator extends BatchGenerator<Boolean> {
  @Override
  protected Boolean[] initializeFixedPointValues() {
    // Null means "no fixed point values" to the base class.
    return null;
  }

  @Override
  protected Boolean generateRandomNonNullValue(Random rand) {
    return Boolean.valueOf(rand.nextBoolean());
  }
}
/**
 * Generator for String columns. Random values are 5 to 24 characters long and each character
 * is drawn from one of three classes chosen at random: lower case letters, upper case letters
 * or decimal digits.
 */
private static class StringBatchGenerator extends BatchGenerator<String> {
  @Override
  protected String generateRandomNonNullValue(Random rand) {
    int length = rand.nextInt(20) + 5;
    char[] values = new char[length];
    for (int j = 0; j < length; j++) {
      switch (rand.nextInt(3)) {
        case 0:
          values[j] = randomCharInRange(rand, 'a', 'z');
          break;
        case 1:
          values[j] = randomCharInRange(rand, 'A', 'Z');
          break;
        case 2:
          values[j] = randomCharInRange(rand, '0', '9');
          break;
        default:
          throw new UnsupportedOperationException();
      }
    }
    return new String(values);
  }

  /**
   * Draws one character uniformly from the inclusive range [first, last].
   *
   * @param rand the source of randomness to draw from
   * @param first the lowest character that may be returned
   * @param last the highest character that may be returned
   * @return a character between first and last, both included
   */
  private static char randomCharInRange(Random rand, char first, char last) {
    // Random.nextInt(bound) excludes the bound, so +1 is needed to make 'last' reachable.
    // Without it 'z', 'Z' and '9' could never be generated.
    return (char) (first + rand.nextInt(last - first + 1));
  }

  @Override
  protected String[] initializeFixedPointValues() {
    return new String[] {"a", "b", "ss", "10"};
  }
}
/**
 * Generator for Timestamp columns. Random values are built from a short-sized number passed
 * to the Timestamp(long) constructor.
 */
private static class TimestampBatchGenerator extends BatchGenerator<Timestamp> {
  // Used purely as a value function; its own batch state is never touched.
  private final ShortBatchGenerator shortGen = new ShortBatchGenerator();

  @Override
  protected Timestamp[] initializeFixedPointValues() {
    final Timestamp[] fixedPoints = new Timestamp[4];
    fixedPoints[0] = new Timestamp(-29071);
    fixedPoints[1] = new Timestamp(-10669);
    fixedPoints[2] = new Timestamp(16558);
    fixedPoints[3] = new Timestamp(31808);
    return fixedPoints;
  }

  @Override
  protected Timestamp generateRandomNonNullValue(Random rand) {
    final long time = shortGen.generateRandomNonNullValue(rand).longValue();
    return new Timestamp(time);
  }
}
private static final Map<Class, BatchGenerator> TYPE_TO_BATCH_GEN_MAP;
static {
TYPE_TO_BATCH_GEN_MAP = new HashMap<Class, BatchGenerator>();
TYPE_TO_BATCH_GEN_MAP.put(Boolean.class, new BooleanBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Byte.class, new ByteBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Integer.class, new IntegerBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Long.class, new LongBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Short.class, new ShortBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Float.class, new FloatBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Double.class, new DoubleBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(String.class, new StringBatchGenerator());
TYPE_TO_BATCH_GEN_MAP.put(Timestamp.class, new TimestampBatchGenerator());
}
/**
 * Generates an ORC file at the given output path whose rows are instances of the provided
 * record class. The parameter types of the record class's single public constructor define
 * the columns. Three batches are written per data distribution, and the distribution is
 * rotated by column index so that a single batch mixes distributions across columns.
 *
 * @param conf the configuration used to initialize the ORC writer
 * @param fs the file system that will contain the generated ORC file
 * @param outputPath the path where the generated ORC file will be placed
 * @param recordClass a class that defines the record format for the generated ORC file; this
 *        class must have exactly one public constructor, and every parameter of that
 *        constructor must be of a type for which a batch generator is registered
 * @throws UnsupportedOperationException if recordClass does not have exactly one public
 *         constructor, or one of its parameter types has no batch generator; this is checked
 *         before the writer is created, so no file is written in that case
 * @throws IOException if the ORC writer fails
 * @throws InstantiationException if a record instance cannot be created
 * @throws IllegalAccessException if the record constructor is not accessible
 * @throws InvocationTargetException if the record constructor itself throws
 */
public static void generateOrcFile(Configuration conf, FileSystem fs, Path outputPath,
    Class recordClass)
    throws IOException, InstantiationException,
    IllegalAccessException, InvocationTargetException {
  // Validate the record class up front so that an unsupported class does not leave a
  // freshly created ORC file behind.
  Constructor[] constructors = recordClass.getConstructors();
  if (constructors.length != 1) {
    throw new UnsupportedOperationException(
        "The provided recordClass must have exactly one constructor.");
  }
  Constructor<?> constructor = constructors[0];
  Class<?>[] columns = constructor.getParameterTypes();

  // Resolve the generator of every column once instead of on every batch.
  BatchGenerator[] generators = new BatchGenerator[columns.length];
  for (int c = 0; c < columns.length; c++) {
    generators[c] = TYPE_TO_BATCH_GEN_MAP.get(columns[c]);
    if (generators[c] == null) {
      throw new UnsupportedOperationException("No batch generator defined for type "
          + columns[c].getName());
    }
  }

  ObjectInspector inspector;
  // NOTE(review): presumably this lock serializes reflection inspector creation with the
  // ORC reader tests - confirm before changing it.
  synchronized (TestVectorizedORCReader.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        recordClass, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }

  // NOTE(review): the numeric arguments look like stripe size, buffer size and row index
  // stride - confirm against OrcFile.createWriter.
  Writer writer = OrcFile.createWriter(
      fs,
      outputPath,
      conf,
      inspector,
      100000,
      CompressionKind.ZLIB,
      10000,
      10000);
  try {
    BatchDataDistribution[] dataDist = BatchDataDistribution.values();
    final int batchesPerDistribution = 3;
    for (int i = 0; i < dataDist.length * batchesPerDistribution; i++) {
      // Column-major staging area; each generator supplies its own column array.
      Object[][] rows = new Object[columns.length][];
      for (int c = 0; c < columns.length; c++) {
        // Offsetting by the column index gives neighbouring columns different distributions.
        rows[c] = generators[c].generateBatch(dataDist[(i + c) % dataDist.length]);
      }
      // Transpose to row-major and hand each row to the writer as a record instance.
      for (int r = 0; r < VectorizedRowBatch.DEFAULT_SIZE; r++) {
        Object[] row = new Object[columns.length];
        for (int c = 0; c < columns.length; c++) {
          row[c] = rows[c][r];
        }
        writer.addRow(constructor.newInstance(row));
      }
    }
  } finally {
    writer.close();
  }
}
}