/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import static parquet.Log.INFO;
import static parquet.column.statistics.Statistics.getStatsBasedOnType;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import parquet.Log;
import parquet.bytes.BytesInput;
import parquet.bytes.ConcatenatingByteArrayCollector;
import parquet.column.ColumnDescriptor;
import parquet.column.Encoding;
import parquet.column.page.DictionaryPage;
import parquet.column.page.PageWriteStore;
import parquet.column.page.PageWriter;
import parquet.column.statistics.Statistics;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.CodecFactory.BytesCompressor;
import parquet.io.ParquetEncodingException;
import parquet.schema.MessageType;

/**
 * PageWriteStore that buffers the compressed pages of each column chunk in memory
 * until the row group is flushed to a {@link ParquetFileWriter}.
 */
class ColumnChunkPageWriteStore implements PageWriteStore {
  private static final Log LOG = Log.getLog(ColumnChunkPageWriteStore.class);

  private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();

  private static final class ColumnChunkPageWriter implements PageWriter {

    private final ColumnDescriptor path;
    private final BytesCompressor compressor;

    private final ByteArrayOutputStream tempOutputStream = new ByteArrayOutputStream();
    private final ConcatenatingByteArrayCollector buf;
    private DictionaryPage dictionaryPage;

    private long uncompressedLength;
    private long compressedLength;
    private long totalValueCount;
    private int pageCount;

    private Set<Encoding> encodings = new HashSet<Encoding>();

    private Statistics totalStatistics;

    private ColumnChunkPageWriter(ColumnDescriptor path, BytesCompressor compressor, int pageSize) {
      this.path = path;
      this.compressor = compressor;
      this.buf = new ConcatenatingByteArrayCollector();
      this.totalStatistics = getStatsBasedOnType(this.path.getType());
    }

    @Override
    public void writePage(BytesInput bytes,
                          int valueCount,
                          Statistics statistics,
                          Encoding rlEncoding,
                          Encoding dlEncoding,
                          Encoding valuesEncoding) throws IOException {
      long uncompressedSize = bytes.size();
      if (uncompressedSize > Integer.MAX_VALUE) {
        throw new ParquetEncodingException(
            "Cannot write page larger than Integer.MAX_VALUE bytes: " +
            uncompressedSize);
      }
      BytesInput compressedBytes = compressor.compress(bytes);
      long compressedSize = compressedBytes.size();
      if (compressedSize > Integer.MAX_VALUE) {
        throw new ParquetEncodingException(
            "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
            + compressedSize);
      }
      tempOutputStream.reset();
      parquetMetadataConverter.writeDataPageHeader(
          (int) uncompressedSize,
          (int) compressedSize,
          valueCount,
          statistics,
          rlEncoding,
          dlEncoding,
          valuesEncoding,
          tempOutputStream);
      this.uncompressedLength += uncompressedSize;
      this.compressedLength += compressedSize;
      this.totalValueCount += valueCount;
      this.pageCount += 1;
      this.totalStatistics.mergeStatistics(statistics);
      // by concatenating before collecting instead of collecting twice,
      // we only allocate one buffer to copy into instead of multiple.
      buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes));
      encodings.add(rlEncoding);
      encodings.add(dlEncoding);
      encodings.add(valuesEncoding);
    }

    @Override
    public void writePageV2(
        int rowCount, int nullCount, int valueCount,
        BytesInput repetitionLevels, BytesInput definitionLevels,
        Encoding dataEncoding, BytesInput data,
        Statistics<?> statistics) throws IOException {
      int rlByteLength = toIntWithCheck(repetitionLevels.size());
      int dlByteLength = toIntWithCheck(definitionLevels.size());
      int uncompressedSize = toIntWithCheck(
          data.size() + repetitionLevels.size() + definitionLevels.size()
      );
      // TODO: decide if we compress
      BytesInput compressedData = compressor.compress(data);
      int compressedSize = toIntWithCheck(
          compressedData.size() + repetitionLevels.size() + definitionLevels.size()
      );
      tempOutputStream.reset();
      parquetMetadataConverter.writeDataPageV2Header(
          uncompressedSize, compressedSize,
          valueCount, nullCount, rowCount,
          statistics,
          dataEncoding,
          rlByteLength,
          dlByteLength,
          tempOutputStream);
      this.uncompressedLength += uncompressedSize;
      this.compressedLength += compressedSize;
      this.totalValueCount += valueCount;
      this.pageCount += 1;
      this.totalStatistics.mergeStatistics(statistics);

      // by concatenating before collecting instead of collecting twice,
      // we only allocate one buffer to copy into instead of multiple.
      buf.collect(
          BytesInput.concat(
              BytesInput.from(tempOutputStream),
              repetitionLevels,
              definitionLevels,
              compressedData)
      );
      encodings.add(dataEncoding);
    }

    private int toIntWithCheck(long size) {
      if (size > Integer.MAX_VALUE) {
        throw new ParquetEncodingException(
            "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " +
            size);
      }
      return (int) size;
    }

    @Override
    public long getMemSize() {
      return buf.size();
    }

    public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
      writer.startColumn(path, totalValueCount, compressor.getCodecName());
      if (dictionaryPage != null) {
        writer.writeDictionaryPage(dictionaryPage);
        encodings.add(dictionaryPage.getEncoding());
      }
      writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics, new ArrayList<Encoding>(encodings));
      writer.endColumn();
      if (INFO) {
        LOG.info(
            String.format(
                "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
                buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, encodings)
            + (dictionaryPage != null
String.format( ", dic { %,d entries, %,dB raw, %,dB comp}", dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getDictionarySize()) : "")); } encodings.clear(); pageCount = 0; } @Override public long allocatedSize() { return buf.size(); } @Override public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException { if (this.dictionaryPage != null) { throw new ParquetEncodingException("Only one dictionary page is allowed"); } BytesInput dictionaryBytes = dictionaryPage.getBytes(); int uncompressedSize = (int) dictionaryBytes.size(); BytesInput compressedBytes = compressor.compress(dictionaryBytes); this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize, dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding()); } @Override public String memUsageString(String prefix) { return buf.memUsageString(prefix + " ColumnChunkPageWriter"); } } private final Map<ColumnDescriptor, ColumnChunkPageWriter> writers = new HashMap<ColumnDescriptor, ColumnChunkPageWriter>(); private final MessageType schema; public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int pageSize) { this.schema = schema; for (ColumnDescriptor path : schema.getColumns()) { writers.put(path, new ColumnChunkPageWriter(path, compressor, pageSize)); } } @Override public PageWriter getPageWriter(ColumnDescriptor path) { return writers.get(path); } public void flushToFileWriter(ParquetFileWriter writer) throws IOException { for (ColumnDescriptor path : schema.getColumns()) { ColumnChunkPageWriter pageWriter = writers.get(path); pageWriter.writeToFileWriter(writer); } } }