/******************************************************************************* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package org.apache.drill.exec.store.parquet.columnreaders; import java.io.IOException; import org.apache.drill.common.exceptions.ExecutionSetupException; import org.apache.drill.exec.vector.BaseDataValueVector; import org.apache.drill.exec.vector.UInt4Vector; import org.apache.drill.exec.vector.complex.RepeatedValueVector; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.format.SchemaElement; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; public class FixedWidthRepeatedReader extends VarLengthColumn<RepeatedValueVector> { ColumnReader<?> dataReader; int dataTypeLengthInBytes; // we can do a vector copy of the data once we figure out how much we need to copy // this tracks the number of values to transfer (the dataReader will translate this to a number // of bytes to transfer and re-use the code from the non-repeated types) int valuesToRead; int repeatedGroupsReadInCurrentPass; int repeatedValuesInCurrentList; // empty lists are notated by definition levels, to stop reading at the correct time, we must keep // track of the number of empty lists as well as the length of all of the defined lists together int definitionLevelsRead; // parquet currently does not restrict lists reaching across pages for repeated values, this necessitates // tracking when this happens to stop some of the state updates until we know the full length of the repeated // value for the current record boolean notFishedReadingList; byte[] leftOverBytes; FixedWidthRepeatedReader(ParquetRecordReader parentReader, ColumnReader<?> dataReader, int dataTypeLengthInBytes, int allocateSize, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, RepeatedValueVector valueVector, SchemaElement schemaElement) throws ExecutionSetupException { super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, valueVector, schemaElement); this.dataTypeLengthInBytes = dataTypeLengthInBytes; this.dataReader = dataReader; this.dataReader.pageReader.clear(); this.dataReader.pageReader = this.pageReader; // this is not in the reset method because it needs to be initialized only for the very first page read // in all other cases if a read ends at a page boundary we will need to keep track of this flag and not // clear it at the start of the next read loop notFishedReadingList = false; } @Override public void reset() { bytesReadInCurrentPass = 0; valuesReadInCurrentPass = 0; pageReader.valuesReadyToRead = 0; dataReader.vectorData = BaseDataValueVector.class.cast(valueVec.getDataVector()).getBuffer(); dataReader.valuesReadInCurrentPass = 0; repeatedGroupsReadInCurrentPass = 0; } @Override public int getRecordsReadInCurrentPass() { return repeatedGroupsReadInCurrentPass; } @Override protected void readField(long recordsToRead) { //To change body of implemented methods use File | Settings | File Templates. } @Override public boolean skipReadyToReadPositionUpdate() { return false; } @Override public void updateReadyToReadPosition() { valuesToRead += repeatedValuesInCurrentList; pageReader.valuesReadyToRead += repeatedValuesInCurrentList; repeatedGroupsReadInCurrentPass++; currDictVal = null; if ( ! notFishedReadingList) { repeatedValuesInCurrentList = -1; } } @Override public void updatePosition() { pageReader.readPosInBytes += dataTypeLengthInBits; bytesReadInCurrentPass += dataTypeLengthInBits; valuesReadInCurrentPass++; } @Override public void hitRowGroupEnd() { pageReader.valuesReadyToRead = 0; definitionLevelsRead = 0; } @Override public void postPageRead() { super.postPageRead(); // this is no longer correct as we figured out that lists can reach across pages if ( ! notFishedReadingList) { repeatedValuesInCurrentList = -1; } definitionLevelsRead = 0; } @Override protected int totalValuesReadAndReadyToReadInPage() { // we need to prevent the page reader from getting rid of the current page in the case where we have a repeated // value split across a page boundary if (notFishedReadingList) { return definitionLevelsRead - repeatedValuesInCurrentList; } return definitionLevelsRead; } @Override protected boolean checkVectorCapacityReached() { boolean doneReading = super.checkVectorCapacityReached(); if (doneReading) { return true; } if (valuesReadInCurrentPass + pageReader.valuesReadyToRead + repeatedValuesInCurrentList >= valueVec.getValueCapacity()) { return true; } else { return false; } } @SuppressWarnings("resource") @Override protected boolean readAndStoreValueSizeInformation() { int numLeftoverVals = 0; if (notFishedReadingList) { numLeftoverVals = repeatedValuesInCurrentList; readRecords(numLeftoverVals); notFishedReadingList = false; pageReader.valuesReadyToRead = 0; try { boolean stopReading = readPage(); if (stopReading) { // hit the end of a row group return false; } } catch (IOException e) { throw new RuntimeException("Unexpected error reading parquet repeated column.", e); } } if ( currDefLevel == -1 ) { currDefLevel = pageReader.definitionLevels.readInteger(); definitionLevelsRead++; } int repLevel; if ( columnDescriptor.getMaxDefinitionLevel() == currDefLevel) { if (repeatedValuesInCurrentList == -1 || notFishedReadingList) { repeatedValuesInCurrentList = 1; do { repLevel = pageReader.repetitionLevels.readInteger(); if (repLevel > 0) { repeatedValuesInCurrentList++; currDefLevel = pageReader.definitionLevels.readInteger(); definitionLevelsRead++; // we hit the end of this page, without confirmation that we reached the end of the current record if (definitionLevelsRead == pageReader.currentPageCount) { // check that we have not hit the end of the row group (in which case we will not find the repetition level indicating // the end of this record as there is no next page to check, we have read all the values in this repetition so it is okay // to add it to the read ) if (totalValuesRead + pageReader.valuesReadyToRead + repeatedValuesInCurrentList != columnChunkMetaData.getValueCount()) { notFishedReadingList = true; // if we hit this case, we cut off the current batch at the previous value, these extra values as well // as those that spill into the next page will be added to the next batch return true; } } } } while (repLevel != 0); } } else { repeatedValuesInCurrentList = 0; } // this should not fail final UInt4Vector offsets = valueVec.getOffsetVector(); offsets.getMutator().setSafe(repeatedGroupsReadInCurrentPass + 1, offsets.getAccessor().get(repeatedGroupsReadInCurrentPass)); // This field is being referenced in the superclass determineSize method, so we need to set it here // again going to make this the length in BYTES to avoid repetitive multiplication/division dataTypeLengthInBits = repeatedValuesInCurrentList * dataTypeLengthInBytes; return false; } @Override protected void readRecords(int valuesToRead) { if (valuesToRead == 0) { return; } // TODO - validate that this works in all cases, it fixes a bug when reading from multiple pages into // a single vector dataReader.valuesReadInCurrentPass = 0; dataReader.readValues(valuesToRead); valuesReadInCurrentPass += valuesToRead; valueVec.getMutator().setValueCount(repeatedGroupsReadInCurrentPass); valueVec.getDataVector().getMutator().setValueCount(valuesReadInCurrentPass); } @Override public int capacity() { return BaseDataValueVector.class.cast(valueVec.getDataVector()).getBuffer().capacity(); } @Override public void clear() { super.clear(); dataReader.clear(); } }