/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import parquet.Ints;
import parquet.Log;
import parquet.column.ColumnDescriptor;
import parquet.column.page.DataPage;
import parquet.column.page.DataPageV1;
import parquet.column.page.DataPageV2;
import parquet.column.page.DictionaryPage;
import parquet.column.page.PageReadStore;
import parquet.column.page.PageReader;
import parquet.hadoop.CodecFactory.BytesDecompressor;
import parquet.io.ParquetDecodingException;

/**
 * TODO: should this actually be called RowGroupImpl or something?
 * The name is kind of confusing since it references three different "entities"
 * in our format: columns, chunks, and pages.
 */
class ColumnChunkPageReadStore implements PageReadStore {
  private static final Log LOG = Log.getLog(ColumnChunkPageReadStore.class);

  /**
   * PageReader for a single column chunk. A column chunk contains
   * several pages, which are yielded one by one in order.
   * <p>
   * This implementation is provided with a list of pages, each of which
   * is decompressed and passed through.
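   * <p>
   * A rough usage sketch (illustrative only; {@code decompressor},
   * {@code compressedPages}, and {@code dictPage} are assumed to already be
   * in hand, e.g. as assembled by the file reader):
   * <pre>
   *   ColumnChunkPageReader reader =
   *       new ColumnChunkPageReader(decompressor, compressedPages, dictPage);
   *   for (DataPage page = reader.readPage(); page != null; page = reader.readPage()) {
   *     // each page comes back decompressed, in the order it was stored
   *   }
   * </pre>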
   */
  static final class ColumnChunkPageReader implements PageReader {

    private final BytesDecompressor decompressor;
    private final long valueCount;
    private final List<DataPage> compressedPages;
    private final DictionaryPage compressedDictionaryPage;

    ColumnChunkPageReader(BytesDecompressor decompressor, List<DataPage> compressedPages, DictionaryPage compressedDictionaryPage) {
      this.decompressor = decompressor;
      this.compressedPages = new LinkedList<DataPage>(compressedPages);
      this.compressedDictionaryPage = compressedDictionaryPage;
      // the chunk's total value count is the sum of the per-page counts;
      // accumulate in a long to avoid int overflow on large chunks
      long count = 0;
      for (DataPage p : compressedPages) {
        count += p.getValueCount();
      }
      this.valueCount = count;
    }

    @Override
    public long getTotalValueCount() {
      return valueCount;
    }

    @Override
    public DataPage readPage() {
      if (compressedPages.isEmpty()) {
        return null;
      }
      DataPage compressedPage = compressedPages.remove(0);
      return compressedPage.accept(new DataPage.Visitor<DataPage>() {
        @Override
        public DataPage visit(DataPageV1 dataPageV1) {
          try {
            // a v1 page is compressed as a whole: levels and values together
            return new DataPageV1(
                decompressor.decompress(dataPageV1.getBytes(), dataPageV1.getUncompressedSize()),
                dataPageV1.getValueCount(),
                dataPageV1.getUncompressedSize(),
                dataPageV1.getStatistics(),
                dataPageV1.getRlEncoding(),
                dataPageV1.getDlEncoding(),
                dataPageV1.getValueEncoding());
          } catch (IOException e) {
            throw new ParquetDecodingException("could not decompress page", e);
          }
        }

        @Override
        public DataPage visit(DataPageV2 dataPageV2) {
          if (!dataPageV2.isCompressed()) {
            return dataPageV2;
          }
          try {
            // in a v2 page the repetition and definition levels are stored
            // uncompressed, so only the data section needs decompressing
            int uncompressedSize = Ints.checkedCast(
                dataPageV2.getUncompressedSize()
                    - dataPageV2.getDefinitionLevels().size()
                    - dataPageV2.getRepetitionLevels().size());
            return DataPageV2.uncompressed(
                dataPageV2.getRowCount(),
                dataPageV2.getNullCount(),
                dataPageV2.getValueCount(),
                dataPageV2.getRepetitionLevels(),
                dataPageV2.getDefinitionLevels(),
                dataPageV2.getDataEncoding(),
                decompressor.decompress(dataPageV2.getData(), uncompressedSize),
                dataPageV2.getStatistics());
          } catch (IOException e) {
            throw new ParquetDecodingException("could not decompress page", e);
          }
        }
      });
    }

    @Override
    public DictionaryPage readDictionaryPage() {
      if (compressedDictionaryPage == null) {
        return null;
      }
      try {
        return new DictionaryPage(
            decompressor.decompress(compressedDictionaryPage.getBytes(), compressedDictionaryPage.getUncompressedSize()),
            compressedDictionaryPage.getDictionarySize(),
            compressedDictionaryPage.getEncoding());
      } catch (IOException e) {
        throw new ParquetDecodingException("could not decompress dictionary page", e);
      }
    }
  }

  private final Map<ColumnDescriptor, ColumnChunkPageReader> readers = new HashMap<ColumnDescriptor, ColumnChunkPageReader>();
  private final long rowCount;

  public ColumnChunkPageReadStore(long rowCount) {
    this.rowCount = rowCount;
  }

  @Override
  public long getRowCount() {
    return rowCount;
  }

  @Override
  public PageReader getPageReader(ColumnDescriptor path) {
    if (!readers.containsKey(path)) {
      throw new IllegalArgumentException(path + " is not in the store: " + readers.keySet() + " " + rowCount);
    }
    return readers.get(path);
  }

  void addColumn(ColumnDescriptor path, ColumnChunkPageReader reader) {
    if (readers.put(path, reader) != null) {
      throw new RuntimeException(path + " was added twice");
    }
  }
}
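
/*
 * A rough assembly/consumption sketch for the store as a whole (illustrative
 * only; in practice construction is driven by the file reader, and the
 * descriptor, decompressor, and page values below are placeholders):
 *
 *   ColumnChunkPageReadStore store = new ColumnChunkPageReadStore(rowCount);
 *   store.addColumn(descriptor,
 *       new ColumnChunkPageReader(decompressor, compressedPages, dictionaryPage));
 *   PageReader pages = store.getPageReader(descriptor);
 *   DictionaryPage dict = pages.readDictionaryPage();  // null if the chunk has none
 *   for (DataPage page = pages.readPage(); page != null; page = pages.readPage()) {
 *     // pages come back decompressed, in on-disk order
 *   }
 */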