/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import parquet.Log;
import parquet.column.ColumnDescriptor;
import parquet.column.page.PageReadStore;
import parquet.filter.UnboundRecordFilter;
import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.FilterCompat.Filter;
import parquet.hadoop.api.InitContext;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.util.counters.BenchmarkCounter;
import parquet.io.ColumnIOFactory;
import parquet.io.MessageColumnIO;
import parquet.io.ParquetDecodingException;
import parquet.io.api.RecordMaterializer;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.Type;

import static java.lang.String.format;
import static parquet.Log.DEBUG;
import static parquet.Preconditions.checkNotNull;
import static parquet.hadoop.ParquetInputFormat.STRICT_TYPE_CHECKING;
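
/**
 * Reads records from a Parquet file one row group (block) at a time, materializing
 * each record into the caller's type {@code T} through the supplied {@link ReadSupport}.
 * Callers drive the reader through {@link #initialize}, {@link #nextKeyValue()},
 * {@link #getCurrentValue()} and {@link #close()}; the public readers in this package
 * wrap this class rather than exposing it directly.
 *
 * <p>A minimal usage sketch, for illustration only: {@code GroupReadSupport},
 * {@code Group} and the footer-reading calls below come from other parts of
 * parquet-mr and are assumptions of this sketch, not part of this class.
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Path path = new Path("file:///tmp/example.parquet");
 * ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
 *
 * InternalParquetRecordReader<Group> reader =
 *     new InternalParquetRecordReader<Group>(new GroupReadSupport());
 * reader.initialize(
 *     footer.getFileMetaData().getSchema(),
 *     footer.getFileMetaData().getKeyValueMetaData(),
 *     path, footer.getBlocks(), conf);
 * while (reader.nextKeyValue()) {
 *   Group record = reader.getCurrentValue();
 *   // process the record here
 * }
 * reader.close();
 * }</pre>
 */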
class InternalParquetRecordReader<T> {

  //private static final Log LOG = Log.getLog(InternalParquetRecordReader.class);

  /**
   * Changed from {@code parquet.Log} to an SLF4J {@link Logger} for better control
   * over the log output.
   *
   * @author wangxiaoyi
   */
  private static final Logger LOG = LoggerFactory.getLogger(InternalParquetRecordReader.class);

  private static final boolean PRINT_LOG_INFO = false;

  private final ColumnIOFactory columnIOFactory = new ColumnIOFactory();
  private final Filter filter;

  private MessageType requestedSchema;
  private MessageType fileSchema;
  private int columnCount;
  private final ReadSupport<T> readSupport;

  private RecordMaterializer<T> recordConverter;

  private T currentValue;
  private long total;
  private long current = 0;
  private int currentBlock = -1;
  private ParquetFileReader reader;
  private parquet.io.RecordReader<T> recordReader;
  private boolean strictTypeChecking;

  private long totalTimeSpentReadingBytes;
  private long totalTimeSpentProcessingRecords;
  private long startedAssemblingCurrentBlockAt;

  private long totalCountLoadedSoFar = 0;

  private Path file;

  /**
   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
   * @param filter for filtering individual records
   */
  public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
    this.readSupport = readSupport;
    this.filter = checkNotNull(filter, "filter");
  }

  /**
   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
   */
  public InternalParquetRecordReader(ReadSupport<T> readSupport) {
    this(readSupport, FilterCompat.NOOP);
  }

  /**
   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
   * @param filter Optional filter for only returning matching records.
   * @deprecated use {@link #InternalParquetRecordReader(ReadSupport, Filter)}
   */
  @Deprecated
  public InternalParquetRecordReader(ReadSupport<T> readSupport, UnboundRecordFilter filter) {
    this(readSupport, FilterCompat.get(filter));
  }
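
  /**
   * Loads the next row group once the records read so far have exhausted the one
   * currently in memory: logs assembly timing for the previous group (when
   * PRINT_LOG_INFO is enabled), reads the next row group from the file, records
   * the read time, and creates a new record reader over the loaded pages.
   *
   * @throws IOException if no more row groups remain while records are still expected
   */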
  private void checkRead() throws IOException {
    if (current == totalCountLoadedSoFar) {
      if (current != 0) {
        totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
        if (PRINT_LOG_INFO) {
          LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
              + " columns in " + totalTimeSpentProcessingRecords + " ms: "
              + ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
              + ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
          final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
          if (totalTime != 0) {
            final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
            final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
            LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes
                + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
          }
        }
      }

      if (PRINT_LOG_INFO) LOG.info("at row " + current + ". reading next block");
      long t0 = System.currentTimeMillis();
      PageReadStore pages = reader.readNextRowGroup();
      if (pages == null) {
        throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
      }
      long timeSpentReading = System.currentTimeMillis() - t0;
      totalTimeSpentReadingBytes += timeSpentReading;
      BenchmarkCounter.incrementTime(timeSpentReading);
      if (PRINT_LOG_INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
      if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
      MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
      recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
      startedAssemblingCurrentBlockAt = System.currentTimeMillis();
      totalCountLoadedSoFar += pages.getRowCount();
      ++currentBlock;
    }
  }

  public void close() throws IOException {
    if (reader != null) {
      reader.close();
    }
  }

  public Void getCurrentKey() throws IOException, InterruptedException {
    return null;
  }

  public T getCurrentValue() throws IOException, InterruptedException {
    return currentValue;
  }

  public float getProgress() throws IOException, InterruptedException {
    return (float) current / total;
  }
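
  /**
   * Prepares this reader for the given file: asks the {@link ReadSupport} for the
   * requested (projected) schema via its {@code init} callback, creates the
   * {@link RecordMaterializer}, opens a {@link ParquetFileReader} over the given
   * row groups and requested columns, and sums the block row counts into the
   * total number of records to read.
   */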
  public void initialize(MessageType fileSchema, Map<String, String> fileMetadata,
      Path file, List<BlockMetaData> blocks, Configuration configuration) throws IOException {
    // initialize a ReadContext for this file
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
        configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
        configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
      total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
  }

  private boolean contains(GroupType group, String[] path, int index) {
    if (index == path.length) {
      return false;
    }
    if (group.containsField(path[index])) {
      Type type = group.getType(path[index]);
      if (type.isPrimitive()) {
        return index + 1 == path.length;
      } else {
        return contains(type.asGroupType(), path, index + 1);
      }
    }
    return false;
  }

  public boolean nextKeyValue() throws IOException, InterruptedException {
    boolean recordFound = false;

    while (!recordFound) {
      // no more records left
      if (current >= total) {
        return false;
      }

      try {
        checkRead();
        currentValue = recordReader.read();
        current++;
        if (recordReader.shouldSkipCurrentRecord()) {
          // this record is being filtered via the filter2 package
          if (DEBUG) LOG.debug("skipping record");
          continue;
        }
        if (currentValue == null) {
          // only happens with FilteredRecordReader at end of block
          current = totalCountLoadedSoFar;
          if (DEBUG) LOG.debug("filtered record reader reached end of block");
          continue;
        }

        recordFound = true;

        if (DEBUG) LOG.debug("read value: " + currentValue);
      } catch (RuntimeException e) {
        throw new ParquetDecodingException(
            format("Can not read value at %d in block %d in file %s", current, currentBlock, file), e);
      }
    }
    return true;
  }

  private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
    Map<K, Set<V>> setMultiMap = new HashMap<K, Set<V>>();
    for (Map.Entry<K, V> entry : map.entrySet()) {
      Set<V> set = new HashSet<V>();
      set.add(entry.getValue());
      setMultiMap.put(entry.getKey(), Collections.unmodifiableSet(set));
    }
    return Collections.unmodifiableMap(setMultiMap);
  }

  /**
   * @return the total number of records to be read
   */
  public long getTotal() {
    return total;
  }

  /**
   * @return the number of records read so far
   */
  public long getCurrent() {
    return current;
  }

  /**
   * @return the number of records remaining to be read
   */
  public long getTotalCountLeft() {
    return total - current;
  }
}