package org.apache.hadoop.hbase.io.pfile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.regionserver.InternalRecordScanner;
import org.apache.hadoop.hbase.util.Bytes;

import parquet.example.data.Group;
import parquet.hadoop.ParquetFileInfo;
import parquet.hadoop.ParquetReader;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.example.GroupReadSupport;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.Type;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * Created by wangxiaoyi on 15/4/24.
 *
 * Parquet file reader.
 */
public class PFileReader implements PFile.Reader {

    private static final Log LOG = LogFactory.getLog(PFileReader.class);
    private static final String ROW_KEY = "rowkey";

    private Path path = null;
    private Configuration conf = null;
    private MessageType schema = null;
    private ParquetReader<Group> reader = null;

    /**
     * @param fileToRead path of the parquet file to read
     * @param conf       configuration for the reader
     * @param schema     read (projection) schema, or null to use the file's own schema
     */
    public PFileReader(Path fileToRead, Configuration conf, MessageType schema) throws IOException {
        this.path = fileToRead;
        this.conf = conf;
        this.schema = schema;
        if (schema != null) {
            // encapsulate changes in a local copy of the config
            this.conf = new Configuration(conf);
            this.conf.set(ReadSupport.PARQUET_READ_SCHEMA, this.schema.toString());
        }
        initReader();
    }

    public void initReader() throws IOException {
        reader = ParquetReader
                .builder(new GroupReadSupport(), path)
                .withConf(conf)
                .build();
    }

    /**
     * Read the next row.
     *
     * @return the next {@link Group}, or null when the file is exhausted
     */
    @Override
    public Group readGroup() {
        Group group = null;
        try {
            group = reader.read();
        } catch (IOException ioe) {
            LOG.error(ioe);
        }
        return group;
    }

    /**
     * Read the next row from the parquet file as cells.
     *
     * @return the cells of the next row, or null when the file is exhausted
     */
    @Override
    public List<Cell> readCells() {
        Group group = readGroup();
        if (group == null) {
            return null;
        }
        // delegate the group-to-cell conversion to groupToCells(...)
        return groupToCells(group);
    }
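    /*
     * Note on the expected record layout (illustrative, inferred from
     * groupToCells below): each record carries a binary HConstants.ROW_KEY
     * field, an int64 HConstants.TIME_STAMP field, and one binary field per
     * column named "family:qualifier". Built programmatically, such a schema
     * might look like:
     *
     *   new MessageType("pfile",
     *       new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "rowkey"),
     *       new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.INT64, "timestamp"),
     *       new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "cf:q1"));
     *
     * The names "timestamp" and "cf:q1" are assumptions for illustration; the
     * real field names come from this project's HConstants.ROW_KEY and
     * HConstants.TIME_STAMP constants and from the table's column layout.
     */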
    /**
     * Close the reader.
     */
    @Override
    public void close() {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException ioe) {
            LOG.error(ioe);
        }
    }

    public ParquetFileInfo getFileInfo() {
        return reader.getFileInfo();
    }

    /**
     * Get a scanner over this parquet file.
     *
     * @return {@link org.apache.hadoop.hbase.io.pfile.PFileReader.PFileScanner}
     */
    public PFileScanner getScanner() {
        return new PFileScanner(this);
    }

    /**
     * @return start key of the parquet file
     */
    public byte[] getStartKey() {
        return getFileInfo().getMetaData(HConstants.START_KEY).getBytes();
    }

    /**
     * @return end key of the parquet file
     */
    public byte[] getEndKey() {
        return getFileInfo().getMetaData(HConstants.END_KEY).getBytes();
    }

    /**
     * @return number of records left to read
     */
    public long getMaxResultLeft() {
        if (reader != null) {
            return reader.getTotalCountLeft();
        }
        return 0L;
    }

    /**
     * @return total record count of this file
     */
    public long getRecordCount() {
        return reader.getTotal();
    }

    /**
     * Transform the data in a group into cells
     * (a List<Cell> is later assembled into a {@link org.apache.hadoop.hbase.client.Result}).
     *
     * @param group the group to transform
     * @return the cells of the group, empty if the group is null
     */
    public static List<Cell> groupToCells(Group group) {
        List<Cell> cells = new LinkedList<>();
        if (group != null) {
            GroupType groupType = group.getType();
            List<Type> types = groupType.getFields();
            byte[] rowKey = group.getBinary(HConstants.ROW_KEY, 0).getBytes();
            long timestamp = group.getLong(HConstants.TIME_STAMP, 0);
            for (Type t : types) {
                String name = t.getName();
                if (!name.equals(HConstants.ROW_KEY) && !name.equals(HConstants.TIME_STAMP)) {
                    // column fields are named "family:qualifier"
                    String[] names = name.split(":");
                    if (names.length == 2) {
                        byte[] value = group.getBinary(name, 0).getBytes();
                        Cell cell = new KeyValue(rowKey, names[0].getBytes(),
                                names[1].getBytes(), timestamp, value);
                        cells.add(cell);
                    }
                }
            }
        }
        return cells;
    }

    /**
     * Scanner over a parquet file.
     */
    public class PFileScanner implements InternalRecordScanner {

        private Group curr = null;
        private Group next = null;
        private PFileReader reader = null;

        public PFileScanner(PFileReader reader) {
            this.reader = reader;
            seek(null); // position the scanner on the first row
        }

        /**
         * Seek to the first row whose key is greater than or equal to the
         * given row key; a null row key positions the scanner on the first row.
         *
         * @param rowkey the row key to seek to, or null to start from the beginning
         */
        public void seek(byte[] rowkey) {
            if (rowkey != null) {
                boolean seeked = false;
                while (curr != null) {
                    byte[] key = curr.getBinary(ROW_KEY, 0).getBytes();
                    if (Bytes.compareTo(key, rowkey) >= 0) {
                        seeked = true;
                        break;
                    }
                    curr = next;
                    next = reader.readGroup();
                }
                if (!seeked) {
                    curr = null;
                    next = null;
                }
            } else {
                curr = reader.readGroup();
                if (curr != null) {
                    next = reader.readGroup();
                }
            }
        }

        public byte[] getStartKey() {
            return reader.getStartKey();
        }

        public byte[] getEndKey() {
            return reader.getEndKey();
        }

        /**
         * @return true if there is a next row
         */
        public boolean hasNext() {
            return curr != null;
        }

        /**
         * @return the next {@link Group}
         */
        public Group nextRow() {
            Group rs = curr;
            curr = next;
            next = reader.readGroup();
            return rs;
        }

        /**
         * @return the cells of the next record
         */
        @Override
        public List<Cell> next() {
            List<Cell> record = new LinkedList<>();
            Group group = nextRow();
            if (group != null) {
                record.addAll(groupToCells(group));
            }
            return record;
        }

        /**
         * @return total record count of this scanner
         */
        @Override
        public long getRecordCount() {
            return reader.getRecordCount();
        }

        /**
         * @return number of results left to read from this scanner,
         *         including the rows currently buffered in curr/next
         */
        @Override
        public long getMaxResultsCount() {
            int buffered = 0;
            if (curr != null) buffered++;
            if (next != null) buffered++;
            return reader.getMaxResultLeft() + buffered;
        }

        /**
         * Peek at the first element of the scanner without advancing it.
         *
         * @return the cells of the current row
         */
        @Override
        public List<Cell> peek() {
            return groupToCells(curr);
        }

        /**
         * Closes this stream and releases any system resources associated
         * with it. If the stream is already closed then invoking this
         * method has no effect.
         *
         * @throws IOException if an I/O error occurs
         */
        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}
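/*
 * Usage sketch (illustrative only, not part of the original class). Assumes a
 * parquet file previously written with the rowkey/timestamp layout described
 * above; the path used here is hypothetical.
 */
class PFileReaderUsageExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/region/pfile.parquet"); // hypothetical path
        // a null schema reads the file with its own schema instead of a projection
        PFileReader reader = new PFileReader(path, conf, null);
        PFileReader.PFileScanner scanner = reader.getScanner();
        try {
            while (scanner.hasNext()) {
                for (Cell cell : scanner.next()) {
                    System.out.println(Bytes.toString(
                            org.apache.hadoop.hbase.CellUtil.cloneRow(cell)));
                }
            }
        } finally {
            scanner.close(); // also closes the underlying PFileReader
        }
    }
}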