package org.apache.hadoop.hbase.io.pfile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.regionserver.InternalRecordScanner;
import org.apache.hadoop.hbase.util.Bytes;

import parquet.example.data.Group;
import parquet.hadoop.ParquetFileInfo;
import parquet.hadoop.ParquetReader;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.example.GroupReadSupport;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.Type;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * Created by wangxiaoyi on 15/4/24.
 *
 * Parquet file reader.
 */
public class PFileReader implements PFile.Reader {

    private static final Log LOG = LogFactory.getLog(PFileReader.class);
    private static final String ROW_KEY = "rowkey";

    private Path path = null;
    private Configuration conf = null;
    private MessageType schema = null;
    private ParquetReader<Group> reader = null;

    /**
     * @param fileToRead path of the parquet file to read
     * @param conf       configuration for the reader
     * @param schema     read (projection) schema, or null to use the file's own schema
     */
    public PFileReader(Path fileToRead, Configuration conf, MessageType schema) throws IOException {
        this.path = fileToRead;
        this.conf = conf;
        this.schema = schema;
        if (schema != null) {
            // encapsulate changes in a local copy of the config
            this.conf = new Configuration(conf);
            this.conf.set(ReadSupport.PARQUET_READ_SCHEMA, this.schema.toString());
        }
        initReader();
    }

    public void initReader() throws IOException {
        reader = ParquetReader
                .builder(new GroupReadSupport(), path)
                .withConf(conf)
                .build();
    }

    /**
     * Read the next row.
     *
     * @return the next {@link Group}, or null when the file is exhausted
     */
    @Override
    public Group readGroup() {
        Group group = null;
        try {
            group = reader.read();
        } catch (IOException ioe) {
            LOG.error(ioe);
        }
        return group;
    }

    /**
     * Read the next row from the parquet file as cells.
     *
     * @return the cells of the next row, or null when the file is exhausted
     */
    @Override
    public List<Cell> readCells() {
        Group group = readGroup();
        if (group == null) {
            return null;
        }
        // delegate the group-to-cell conversion to groupToCells(...)
        return groupToCells(group);
    }
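    /*
     * Note on the expected record layout (illustrative, inferred from
     * groupToCells below): each record carries a binary HConstants.ROW_KEY
     * field, an int64 HConstants.TIME_STAMP field, and one binary field per
     * column named "family:qualifier". Built programmatically, such a schema
     * might look like:
     *
     *   new MessageType("pfile",
     *       new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "rowkey"),
     *       new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.INT64, "timestamp"),
     *       new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "cf:q1"));
     *
     * The names "timestamp" and "cf:q1" are assumptions for illustration; the
     * real field names come from this project's HConstants.ROW_KEY and
     * HConstants.TIME_STAMP constants and from the table's column layout.
     */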
    /**
     * Close the reader.
     */
    @Override
    public void close() {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException ioe) {
            LOG.error(ioe);
        }
    }

    public ParquetFileInfo getFileInfo() {
        return reader.getFileInfo();
    }

    /**
     * Get a scanner over this parquet file.
     *
     * @return {@link org.apache.hadoop.hbase.io.pfile.PFileReader.PFileScanner}
     */
    public PFileScanner getScanner() {
        return new PFileScanner(this);
    }

    /**
     * @return start key of the parquet file
     */
    public byte[] getStartKey() {
        return getFileInfo().getMetaData(HConstants.START_KEY).getBytes();
    }

    /**
     * @return end key of the parquet file
     */
    public byte[] getEndKey() {
        return getFileInfo().getMetaData(HConstants.END_KEY).getBytes();
    }

    /**
     * @return number of records left to read
     */
    public long getMaxResultLeft() {
        if (reader != null) {
            return reader.getTotalCountLeft();
        }
        return 0L;
    }

    /**
     * @return total record count of this file
     */
    public long getRecordCount() {
        return reader.getTotal();
    }

    /**
     * Transform the data in a group into cells
     * (a List<Cell> is later assembled into a {@link org.apache.hadoop.hbase.client.Result}).
     *
     * @param group the group to transform
     * @return the cells of the group, empty if the group is null
     */
    public static List<Cell> groupToCells(Group group) {
        List<Cell> cells = new LinkedList<>();
        if (group != null) {
            GroupType groupType = group.getType();
            List<Type> types = groupType.getFields();
            byte[] rowKey = group.getBinary(HConstants.ROW_KEY, 0).getBytes();
            long timestamp = group.getLong(HConstants.TIME_STAMP, 0);
            for (Type t : types) {
                String name = t.getName();
                if (!name.equals(HConstants.ROW_KEY) && !name.equals(HConstants.TIME_STAMP)) {
                    // column fields are named "family:qualifier"
                    String[] names = name.split(":");
                    if (names.length == 2) {
                        byte[] value = group.getBinary(name, 0).getBytes();
                        Cell cell = new KeyValue(rowKey, names[0].getBytes(),
                                names[1].getBytes(), timestamp, value);
                        cells.add(cell);
                    }
                }
            }
        }
        return cells;
    }

    /**
     * Scanner over a parquet file.
     */
    public class PFileScanner implements InternalRecordScanner {

        private Group curr = null;
        private Group next = null;
        private PFileReader reader = null;

        public PFileScanner(PFileReader reader) {
            this.reader = reader;
            seek(null); // position the scanner on the first row
        }

        /**
         * Seek to the first row whose key is greater than or equal to the
         * given row key; a null row key positions the scanner on the first row.
         *
         * @param rowkey the row key to seek to, or null to start from the beginning
         */
        public void seek(byte[] rowkey) {
            if (rowkey != null) {
                boolean seeked = false;
                while (curr != null) {
                    byte[] key = curr.getBinary(ROW_KEY, 0).getBytes();
                    if (Bytes.compareTo(key, rowkey) >= 0) {
                        seeked = true;
                        break;
                    }
                    curr = next;
                    next = reader.readGroup();
                }
                if (!seeked) {
                    curr = null;
                    next = null;
                }
            } else {
                curr = reader.readGroup();
                if (curr != null) {
                    next = reader.readGroup();
                }
            }
        }

        public byte[] getStartKey() {
            return reader.getStartKey();
        }

        public byte[] getEndKey() {
            return reader.getEndKey();
        }

        /**
         * @return true if there is a next row
         */
        public boolean hasNext() {
            return curr != null;
        }

        /**
         * @return the next {@link Group}
         */
        public Group nextRow() {
            Group rs = curr;
            curr = next;
            next = reader.readGroup();
            return rs;
        }

        /**
         * @return the cells of the next record
         */
        @Override
        public List<Cell> next() {
            List<Cell> record = new LinkedList<>();
            Group group = nextRow();
            if (group != null) {
                record.addAll(groupToCells(group));
            }
            return record;
        }

        /**
         * @return total record count of this scanner
         */
        @Override
        public long getRecordCount() {
            return reader.getRecordCount();
        }

        /**
         * @return number of results left to read from this scanner,
         *         including the rows currently buffered in curr/next
         */
        @Override
        public long getMaxResultsCount() {
            int buffered = 0;
            if (curr != null) buffered++;
            if (next != null) buffered++;
            return reader.getMaxResultLeft() + buffered;
        }

        /**
         * Peek at the first element of the scanner without advancing it.
         *
         * @return the cells of the current row
         */
        @Override
        public List<Cell> peek() {
            return groupToCells(curr);
        }

        /**
         * Closes this stream and releases any system resources associated
         * with it. If the stream is already closed then invoking this
         * method has no effect.
         *
         * @throws IOException if an I/O error occurs
         */
        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}
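/*
 * Usage sketch (illustrative only, not part of the original class). Assumes a
 * parquet file previously written with the rowkey/timestamp layout described
 * above; the path used here is hypothetical.
 */
class PFileReaderUsageExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/region/pfile.parquet"); // hypothetical path
        // a null schema reads the file with its own schema instead of a projection
        PFileReader reader = new PFileReader(path, conf, null);
        PFileReader.PFileScanner scanner = reader.getScanner();
        try {
            while (scanner.hasNext()) {
                for (Cell cell : scanner.next()) {
                    System.out.println(Bytes.toString(
                            org.apache.hadoop.hbase.CellUtil.cloneRow(cell)));
                }
            }
        } finally {
            scanner.close(); // also closes the underlying PFileReader
        }
    }
}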