package org.apache.hadoop.hbase.regionserver.pbase.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import parquet.column.ParquetProperties;
import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
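// Needed only by the verifyFile sketch below; both classes ship with the same
// parquet-mr version as the writer classes above.
import parquet.hadoop.ParquetReader;
import parquet.hadoop.example.GroupReadSupport;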
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by wangxiaoyi on 15/7/2.
 *
 * Generates Parquet files for testing the scanners:
 *
 * 1. generates 10 separate files
 * 2. each file contains 50 records
 * 3. records in each file are sorted by row key
 * 4. row keys range from 1 to 500
 */
public class GenerateParquetFile {

    public static Path root = new Path("hdfs://localhost:9000/parquet/");
    public static Configuration conf = new Configuration();

    // Schema for the test records: a row key, three columns in family "cf",
    // and a timestamp.
    public static MessageType schema = MessageTypeParser.parseMessageType(
            " message people { " +
                    "required binary rowkey;" +
                    "required binary cf:name;" +
                    "required binary cf:age;" +
                    "required binary cf:job;" +
                    "required int64 timestamp;" +
                    " }"
    );

    public static SimpleGroupFactory sfg = new SimpleGroupFactory(schema);

    /** Resolves a file name against the HDFS root directory. */
    public static Path initFile(String fileName) {
        return new Path(root, fileName);
    }
    /**
     * Creates a writer for the given file, carrying the file's start/end row
     * keys as extra metadata.
     */
    public static ParquetWriter<Group> initWriter(String fileName, Map<String, String> metas)
            throws IOException {
        GroupWriteSupport.setSchema(schema, conf);
        return new ParquetWriter<Group>(
                initFile(fileName),
                new GroupWriteSupport(metas), // write support carrying the extra metadata
                CompressionCodecName.SNAPPY,
                1024,   // block size in bytes, deliberately tiny for these test files
                1024,   // page size in bytes
                512,    // dictionary page size in bytes
                true,   // enable dictionary encoding
                false,  // disable schema validation
                ParquetProperties.WriterVersion.PARQUET_1_0,
                conf);
    }
    /**
     * Formats a numeric row key. Fixed-width, right-aligned padding keeps the
     * lexicographic order of the keys consistent with their numeric order.
     */
    public static String genRowKey(String format, int i) {
        return String.format(format, i);
    }
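
    /**
     * Reads a generated file back and prints its row keys, for eyeballing the
     * output. A minimal sketch, not part of the original generator: it assumes
     * parquet.hadoop.ParquetReader and parquet.hadoop.example.GroupReadSupport
     * from the same parquet-mr version used by the writer above.
     *
     * Usage: verifyFile("pfile/scanner_test_file0");
     */
    public static void verifyFile(String fileName) throws IOException {
        ParquetReader<Group> reader =
                new ParquetReader<Group>(initFile(fileName), new GroupReadSupport());
        try {
            Group group;
            while ((group = reader.read()) != null) { // read() returns null at end of file
                System.out.println(group.getString("rowkey", 0));
            }
        } finally {
            reader.close();
        }
    }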
    public static void main(String[] args) throws IOException {
        int fileNum = 10;        // number of files to generate
        int fileRecordNum = 50;  // number of records per file
        int rowKey = 0;
        for (int i = 0; i < fileNum; ++i) {
            // Record the key range covered by this file in its metadata.
            Map<String, String> metas = new HashMap<>();
            metas.put(HConstants.START_KEY, genRowKey("%10d", rowKey + 1));
            metas.put(HConstants.END_KEY, genRowKey("%10d", rowKey + fileRecordNum));

            ParquetWriter<Group> writer = initWriter("pfile/scanner_test_file" + i, metas);
            for (int j = 0; j < fileRecordNum; ++j) {
                rowKey++;
                Group group = sfg.newGroup()
                        .append("rowkey", genRowKey("%10d", rowKey))
                        .append("cf:name", "wangxiaoyi" + rowKey)
                        .append("cf:age", String.format("%10d", rowKey))
                        .append("cf:job", "student")
                        .append("timestamp", System.currentTimeMillis());
                writer.write(group);
            }
            writer.close();
        }
    }
}