/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import parquet.Log;
import parquet.column.ColumnDescriptor;
import parquet.column.page.PageReadStore;
import parquet.filter.UnboundRecordFilter;
import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.FilterCompat.Filter;
import parquet.hadoop.api.InitContext;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.util.counters.BenchmarkCounter;
import parquet.io.ColumnIOFactory;
import parquet.io.MessageColumnIO;
import parquet.io.ParquetDecodingException;
import parquet.io.api.RecordMaterializer;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.Type;

import static java.lang.String.format;
import static parquet.Log.DEBUG;
import static parquet.Preconditions.checkNotNull;
import static parquet.hadoop.ParquetInputFormat.STRICT_TYPE_CHECKING;
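
/**
 * Reads records from a Parquet file one row group (block) at a time, materializing
 * each record into the caller's type {@code T} through the supplied {@link ReadSupport}.
 * Callers drive the reader through {@link #initialize}, {@link #nextKeyValue()},
 * {@link #getCurrentValue()} and {@link #close()}; the public readers in this package
 * wrap this class rather than exposing it directly.
 *
 * <p>A minimal usage sketch, for illustration only: {@code GroupReadSupport},
 * {@code Group} and the footer-reading calls below come from other parts of
 * parquet-mr and are assumptions of this sketch, not part of this class.
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Path path = new Path("file:///tmp/example.parquet");
 * ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
 *
 * InternalParquetRecordReader<Group> reader =
 *     new InternalParquetRecordReader<Group>(new GroupReadSupport());
 * reader.initialize(
 *     footer.getFileMetaData().getSchema(),
 *     footer.getFileMetaData().getKeyValueMetaData(),
 *     path, footer.getBlocks(), conf);
 * while (reader.nextKeyValue()) {
 *   Group record = reader.getCurrentValue();
 *   // process the record here
 * }
 * reader.close();
 * }</pre>
 */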
class InternalParquetRecordReader<T> {

  //private static final Log LOG = Log.getLog(InternalParquetRecordReader.class);

  /**
   * Changed from {@code parquet.Log} to an SLF4J {@link Logger} for better control
   * over the log output.
   *
   * @author wangxiaoyi
   */
  private static final Logger LOG = LoggerFactory.getLogger(InternalParquetRecordReader.class);

  private static final boolean PRINT_LOG_INFO = false;

  private final ColumnIOFactory columnIOFactory = new ColumnIOFactory();
  private final Filter filter;

  private MessageType requestedSchema;
  private MessageType fileSchema;
  private int columnCount;
  private final ReadSupport<T> readSupport;

  private RecordMaterializer<T> recordConverter;

  private T currentValue;
  private long total;
  private long current = 0;
  private int currentBlock = -1;
  private ParquetFileReader reader;
  private parquet.io.RecordReader<T> recordReader;
  private boolean strictTypeChecking;

  private long totalTimeSpentReadingBytes;
  private long totalTimeSpentProcessingRecords;
  private long startedAssemblingCurrentBlockAt;

  private long totalCountLoadedSoFar = 0;

  private Path file;

  /**
   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
   * @param filter for filtering individual records
   */
  public InternalParquetRecordReader(ReadSupport<T> readSupport, Filter filter) {
    this.readSupport = readSupport;
    this.filter = checkNotNull(filter, "filter");
  }

  /**
   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
   */
  public InternalParquetRecordReader(ReadSupport<T> readSupport) {
    this(readSupport, FilterCompat.NOOP);
  }

  /**
   * @param readSupport Object which helps read files of the given type, e.g. Thrift, Avro.
   * @param filter Optional filter for only returning matching records.
   * @deprecated use {@link #InternalParquetRecordReader(ReadSupport, Filter)}
   */
  @Deprecated
  public InternalParquetRecordReader(ReadSupport<T> readSupport, UnboundRecordFilter filter) {
    this(readSupport, FilterCompat.get(filter));
  }
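
  /**
   * Loads the next row group once the records read so far have exhausted the one
   * currently in memory: logs assembly timing for the previous group (when
   * PRINT_LOG_INFO is enabled), reads the next row group from the file, records
   * the read time, and creates a new record reader over the loaded pages.
   *
   * @throws IOException if no more row groups remain while records are still expected
   */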
  private void checkRead() throws IOException {
    if (current == totalCountLoadedSoFar) {
      if (current != 0) {
        totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
        if (PRINT_LOG_INFO) {
          LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
              + " columns in " + totalTimeSpentProcessingRecords + " ms: "
              + ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
              + ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
          final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
          if (totalTime != 0) {
            final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
            final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
            LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes
                + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
          }
        }
      }

      if (PRINT_LOG_INFO) LOG.info("at row " + current + ". reading next block");
      long t0 = System.currentTimeMillis();
      PageReadStore pages = reader.readNextRowGroup();
      if (pages == null) {
        throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
      }
      long timeSpentReading = System.currentTimeMillis() - t0;
      totalTimeSpentReadingBytes += timeSpentReading;
      BenchmarkCounter.incrementTime(timeSpentReading);
      if (PRINT_LOG_INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
      if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
      MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
      recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
      startedAssemblingCurrentBlockAt = System.currentTimeMillis();
      totalCountLoadedSoFar += pages.getRowCount();
      ++currentBlock;
    }
  }

  public void close() throws IOException {
    if (reader != null) {
      reader.close();
    }
  }

  public Void getCurrentKey() throws IOException, InterruptedException {
    return null;
  }

  public T getCurrentValue() throws IOException, InterruptedException {
    return currentValue;
  }

  public float getProgress() throws IOException, InterruptedException {
    return (float) current / total;
  }
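
  /**
   * Prepares this reader for the given file: asks the {@link ReadSupport} for the
   * requested (projected) schema via its {@code init} callback, creates the
   * {@link RecordMaterializer}, opens a {@link ParquetFileReader} over the given
   * row groups and requested columns, and sums the block row counts into the
   * total number of records to read.
   */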
  public void initialize(MessageType fileSchema, Map<String, String> fileMetadata,
      Path file, List<BlockMetaData> blocks, Configuration configuration) throws IOException {
    // initialize a ReadContext for this file
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
        configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
        configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
      total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
  }

  private boolean contains(GroupType group, String[] path, int index) {
    if (index == path.length) {
      return false;
    }
    if (group.containsField(path[index])) {
      Type type = group.getType(path[index]);
      if (type.isPrimitive()) {
        return index + 1 == path.length;
      } else {
        return contains(type.asGroupType(), path, index + 1);
      }
    }
    return false;
  }

  public boolean nextKeyValue() throws IOException, InterruptedException {
    boolean recordFound = false;

    while (!recordFound) {
      // no more records left
      if (current >= total) {
        return false;
      }

      try {
        checkRead();
        currentValue = recordReader.read();
        current++;
        if (recordReader.shouldSkipCurrentRecord()) {
          // this record is being filtered via the filter2 package
          if (DEBUG) LOG.debug("skipping record");
          continue;
        }
        if (currentValue == null) {
          // only happens with FilteredRecordReader at end of block
          current = totalCountLoadedSoFar;
          if (DEBUG) LOG.debug("filtered record reader reached end of block");
          continue;
        }

        recordFound = true;

        if (DEBUG) LOG.debug("read value: " + currentValue);
      } catch (RuntimeException e) {
        throw new ParquetDecodingException(
            format("Can not read value at %d in block %d in file %s", current, currentBlock, file), e);
      }
    }
    return true;
  }

  private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
    Map<K, Set<V>> setMultiMap = new HashMap<K, Set<V>>();
    for (Map.Entry<K, V> entry : map.entrySet()) {
      Set<V> set = new HashSet<V>();
      set.add(entry.getValue());
      setMultiMap.put(entry.getKey(), Collections.unmodifiableSet(set));
    }
    return Collections.unmodifiableMap(setMultiMap);
  }

  /**
   * @return the total number of records to be read
   */
  public long getTotal() {
    return total;
  }

  /**
   * @return the number of records read so far
   */
  public long getCurrent() {
    return current;
  }

  /**
   * @return the number of records remaining to be read
   */
  public long getTotalCountLeft() {
    return total - current;
  }
}