/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.rcfile;
import com.facebook.presto.rcfile.binary.BinaryRcFileEncoding;
import com.facebook.presto.spi.block.Block;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;
import io.airlift.units.DataSize;
import org.testng.annotations.Test;
import java.io.IOException;
import java.util.List;
import static com.facebook.presto.spi.type.SmallintType.SMALLINT;
import static com.google.common.base.Preconditions.checkArgument;
import static io.airlift.slice.Slices.utf8Slice;
import static io.airlift.units.DataSize.Unit.MEGABYTE;
import static java.lang.Math.toIntExact;
import static java.util.stream.Collectors.toList;
import static org.testng.Assert.assertEquals;
public class TestRcFileReaderManual
{
// metadata key written into the file header; its value carries the column count
private static final Slice COLUMN_COUNT_METADATA_KEY = utf8Slice("hive.io.rcfile.column.number");
// magic bytes written at the very start of the file header
private static final Slice RCFILE_MAGIC = utf8Slice("RCF");
// version byte written directly after the magic
private static final int CURRENT_VERSION = 1;
// arbitrary pair of longs written back to back as the sync sequence by writeFileHeader and writeSync
private static final long syncFirst = 0x1234_5678_9012_3456L;
private static final long syncSecond = 0x7890_1234_5678_9012L;
/**
 * Builds a file whose first segment (the one that starts with the file header) already
 * carries row groups, then checks reads over many byte ranges.
 */
@Test
public void testNoStartSync()
        throws Exception
{
    SliceOutput file = new DynamicSliceOutput(10 * 1024);

    // segments are appended to the shared output, so they must be written in this order
    ImmutableList.Builder<Segment> written = ImmutableList.builder();
    written.add(writeSegment(file, ImmutableList.of(ImmutableList.of(0, 2, 3, 4), ImmutableList.of(10, 12, 13))));
    written.add(writeSegment(file, ImmutableList.of(ImmutableList.of(20, 22), ImmutableList.of(30, 33), ImmutableList.of(40, 44))));
    written.add(writeSegment(file, ImmutableList.of(ImmutableList.of(100, 101, 102))));

    assertFileSegments(file.slice(), written.build());
}
/**
 * Builds a file whose first segment holds only the file header (no row groups), so every
 * row group lives in a later segment that begins with a sync marker.
 */
@Test
public void testStartSync()
        throws Exception
{
    SliceOutput file = new DynamicSliceOutput(10 * 1024);

    // segments are appended to the shared output, so they must be written in this order
    ImmutableList.Builder<Segment> written = ImmutableList.builder();
    // header-only segment
    written.add(writeSegment(file, ImmutableList.of()));
    written.add(writeSegment(file, ImmutableList.of(ImmutableList.of(0, 2, 3, 4), ImmutableList.of(10, 12, 13))));
    written.add(writeSegment(file, ImmutableList.of(ImmutableList.of(20, 22), ImmutableList.of(30, 33), ImmutableList.of(40, 44))));
    written.add(writeSegment(file, ImmutableList.of(ImmutableList.of(100, 101, 102))));

    assertFileSegments(file.slice(), written.build());
}
/**
 * Reads {@code file} over a large set of byte ranges and checks the values returned for each range.
 * <p>
 * The assertions encode the expectation that a segment's values are returned exactly when the
 * requested range contains the first byte of that segment; ranges that start after the first
 * byte of a segment and end within it are expected to return nothing.
 *
 * @param file the complete serialized file
 * @param segments the segments of {@code file}, in the order they were written
 * @throws IOException if reading the file fails
 */
private static void assertFileSegments(Slice file, List<Segment> segments)
        throws IOException
{
    // read whole file
    List<Integer> allValues = segments.stream()
            .map(Segment::getValues)
            .flatMap(List::stream)
            .collect(toList());
    assertReadValues(file, 0, file.length(), allValues);

    for (Segment segment : segments) {
        int segmentOffset = segment.getOffset();
        int segmentLength = segment.getLength();
        List<Integer> segmentValues = segment.getValues();

        // whole segment
        assertReadValues(file, segmentOffset, segmentLength, segmentValues);
        // first byte of segment
        assertReadValues(file, segmentOffset, 1, segmentValues);
        // straddle segment start
        assertReadValues(file, segmentOffset - 1, 2, segmentValues);

        // regions entirely within the segment
        assertReadValues(file, segmentOffset + 1, 1, ImmutableList.of());
        assertReadValues(file, segmentOffset + 1, segmentLength - 1, ImmutableList.of());

        for (int rowGroupOffset : segment.getRowGroupSegmentOffsets()) {
            // segment header to row group start
            assertReadValues(file, segmentOffset, rowGroupOffset, segmentValues);
            assertReadValues(file, segmentOffset, rowGroupOffset - 1, segmentValues);
            assertReadValues(file, segmentOffset, rowGroupOffset + 1, segmentValues);

            // region from row group start until end of segment (row group offset is always inside of the
            // segment since a segment starts with a file header or sync sequence)
            assertReadValues(file, segmentOffset + rowGroupOffset, segmentLength - rowGroupOffset, ImmutableList.of());
        }
    }

    // all combinations of segments
    for (int startSegmentIndex = 0; startSegmentIndex < segments.size(); startSegmentIndex++) {
        Segment startSegment = segments.get(startSegmentIndex);
        for (int endSegmentIndex = startSegmentIndex; endSegmentIndex < segments.size(); endSegmentIndex++) {
            Segment endSegment = segments.get(endSegmentIndex);

            List<Integer> segmentsValues = segments.subList(startSegmentIndex, endSegmentIndex + 1).stream()
                    .map(Segment::getValues)
                    .flatMap(List::stream)
                    .collect(toList());

            // start of first segment through end of last segment
            assertReadValues(file, startSegment.getOffset(), endSegment.getOffset() + endSegment.getLength() - startSegment.getOffset(), segmentsValues);
            // start of first segment through first byte of last segment
            assertReadValues(file, startSegment.getOffset(), endSegment.getOffset() + 1 - startSegment.getOffset(), segmentsValues);
            // same two ranges, but starting one byte before the first segment
            assertReadValues(file, startSegment.getOffset() - 1, endSegment.getOffset() + 1 + endSegment.getLength() - startSegment.getOffset(), segmentsValues);
            assertReadValues(file, startSegment.getOffset() - 1, endSegment.getOffset() + 1 + 1 - startSegment.getOffset(), segmentsValues);
        }
    }
}

/**
 * Asserts that reading {@code length} bytes of {@code file} starting at {@code offset} yields {@code expected}.
 */
private static void assertReadValues(Slice file, int offset, int length, List<Integer> expected)
        throws IOException
{
    // TestNG's assertEquals takes (actual, expected); keeping that order makes failure messages accurate
    assertEquals(readValues(file, offset, length), expected);
}
/**
 * Appends one segment to {@code output}: a file header (when the output is still empty) or a
 * sync marker, followed by one row group per entry of {@code rowGroups}.
 *
 * @return a description of the written segment: its values, its absolute offset and length in
 * the output, and the offset of each row group relative to the segment start
 */
private static Segment writeSegment(SliceOutput output, List<List<Integer>> rowGroups)
{
    int segmentStart = output.size();

    // only the very first segment carries the file header; later ones open with a sync
    if (segmentStart != 0) {
        writeSync(output);
    }
    else {
        writeFileHeader(output);
    }

    ImmutableList.Builder<Integer> relativeOffsets = ImmutableList.builder();
    ImmutableList.Builder<Integer> segmentValues = ImmutableList.builder();
    for (List<Integer> rowGroup : rowGroups) {
        // record where this row group begins, relative to the segment start
        relativeOffsets.add(output.size() - segmentStart);
        writeRowGroup(output, rowGroup);
        segmentValues.addAll(rowGroup);
    }

    int segmentLength = output.size() - segmentStart;
    return new Segment(segmentValues.build(), segmentStart, segmentLength, relativeOffsets.build());
}
/**
 * Writes the file header: magic, version, codec flag, a single metadata entry holding the
 * column count, and finally the sync sequence.
 */
private static void writeFileHeader(SliceOutput output)
{
    // magic followed by the format version
    output.writeBytes(RCFILE_MAGIC);
    output.writeByte(CURRENT_VERSION);

    // codec information: just a false flag
    output.writeBoolean(false);

    // metadata: entry count (byte-reversed int), then the one column-count entry
    int metadataEntryCount = 1;
    output.writeInt(Integer.reverseBytes(metadataEntryCount));
    Slice key = COLUMN_COUNT_METADATA_KEY;
    output.writeByte(key.length());
    output.writeBytes(key);
    // a 1, then the character '1' for the single column
    output.writeByte(1);
    output.writeByte('1');

    // sync sequence closing the header
    output.writeLong(syncFirst);
    output.writeLong(syncSecond);
}
/**
 * Writes a sync: an int of -1 followed by the two sync sequence longs.
 */
private static void writeSync(SliceOutput output)
{
    // -1 precedes the sync sequence
    int syncMarker = -1;
    output.writeInt(syncMarker);

    // same sequence that terminates the file header
    output.writeLong(syncFirst);
    output.writeLong(syncSecond);
}
/**
 * Writes a single-column row group in which every value is stored as a two byte short.
 *
 * @param output destination the row group is appended to
 * @param shortValues the values of the row group; fewer than 32 are allowed
 */
private static void writeRowGroup(SliceOutput output, List<Integer> shortValues)
{
    final int bytesPerValue = 2;
    int rowCount = shortValues.size();

    // arbitrary cap so that every length below is written as a single vint byte
    checkArgument(rowCount < 32);

    // the key section consists of 4 single-byte vints plus one length byte per row
    int lengthsSize = rowCount;
    int keySectionSize = 4 + lengthsSize;
    int dataSize = rowCount * bytesPerValue;

    // sum of the uncompressed key length and the compressed value length; the reader has no use for it
    output.writeInt(Integer.reverseBytes(keySectionSize + dataSize));

    // key section sizes: uncompressed first, then compressed (identical here)
    output.writeInt(Integer.reverseBytes(keySectionSize));
    output.writeInt(Integer.reverseBytes(keySectionSize));

    // key section body: row count, column data compressed size, column data uncompressed size,
    // and the uncompressed size of the column lengths
    output.writeByte(rowCount);
    output.writeByte(dataSize);
    output.writeByte(dataSize);
    output.writeByte(lengthsSize);

    // key section body: one length entry per value
    for (int unused : shortValues) {
        output.write(bytesPerValue);
    }

    // value section: each value as a byte-reversed short
    for (int shortValue : shortValues) {
        output.writeShort(Short.reverseBytes((short) shortValue));
    }
}
/**
 * Reads column 0 of {@code data} as SMALLINT values using an {@code RcFileReader} restricted to
 * the given byte range.
 * <p>
 * To simplify the callers, a negative {@code offset} is moved to 0 (shrinking {@code length}
 * accordingly) and {@code length} is truncated so the range does not run past the end of the data.
 *
 * @param data the complete serialized file
 * @param offset first byte of the range to read; may be negative
 * @param length number of bytes in the range
 * @return every value produced by the reader for the range, in read order
 * @throws IOException if the reader fails
 */
private static List<Integer> readValues(Slice data, int offset, int length)
        throws IOException
{
    // to simplify the testing:
    //      change negative offsets to 0
    //      truncate length so it is not off the end of the file
    if (offset < 0) {
        // adjust length to new offset
        length += offset;
        offset = 0;
    }
    if (offset + length > data.length()) {
        length = data.length() - offset;
    }

    ImmutableList.Builder<Integer> values = ImmutableList.builder();
    // try-with-resources releases the reader even when reading throws
    try (RcFileReader reader = new RcFileReader(
            new SliceRcFileDataSource(data),
            new BinaryRcFileEncoding(),
            ImmutableMap.of(0, SMALLINT),
            new BogusRcFileCodecFactory(),
            offset,
            length,
            new DataSize(1, MEGABYTE))) {
        // advance() returns a negative value once there is nothing left to read
        while (reader.advance() >= 0) {
            Block block = reader.readBlock(0);
            for (int position = 0; position < block.getPositionCount(); position++) {
                values.add((int) SMALLINT.getLong(block, position));
            }
        }
    }
    return values.build();
}
/**
 * Immutable description of one written segment: the values it holds, where it sits in the
 * file, and where each of its row groups starts relative to the segment start.
 */
private static class Segment
{
    // all values of the segment, in write order
    private final List<Integer> values;
    // absolute position of the segment's first byte in the file
    private final int offset;
    // number of bytes the segment occupies
    private final int length;
    // start of each row group, measured from the segment offset
    private final List<Integer> rowGroupSegmentOffsets;

    public Segment(List<Integer> values, int offset, int length, List<Integer> rowGroupSegmentOffsets)
    {
        // defensive immutable copies of both lists
        this.values = ImmutableList.copyOf(values);
        this.offset = offset;
        this.length = length;
        this.rowGroupSegmentOffsets = ImmutableList.copyOf(rowGroupSegmentOffsets);
    }

    /** Values stored in this segment. */
    public List<Integer> getValues()
    {
        return this.values;
    }

    /** Absolute offset of the segment in the file. */
    public int getOffset()
    {
        return this.offset;
    }

    /** Length of the segment in bytes. */
    public int getLength()
    {
        return this.length;
    }

    /** Row group start offsets, relative to the segment offset. */
    public List<Integer> getRowGroupSegmentOffsets()
    {
        return this.rowGroupSegmentOffsets;
    }
}
/**
 * {@link RcFileDataSource} that serves reads straight out of an in-memory {@link Slice}.
 * Read statistics are not tracked and closing is a no-op.
 */
private static class SliceRcFileDataSource
        implements RcFileDataSource
{
    private final Slice data;

    public SliceRcFileDataSource(Slice data)
    {
        this.data = data;
    }

    @Override
    public long getReadBytes()
    {
        // statistics are not tracked
        return 0;
    }

    @Override
    public long getReadTimeNanos()
    {
        // statistics are not tracked
        return 0;
    }

    @Override
    public long getSize()
    {
        return this.data.length();
    }

    @Override
    public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength)
    {
        // slices are int-indexed; toIntExact fails loudly rather than silently truncating the position
        int sliceIndex = Math.toIntExact(position);
        this.data.getBytes(sliceIndex, buffer, bufferOffset, bufferLength);
    }

    @Override
    public void close()
    {
        // nothing to release for an in-memory slice
    }
}
/**
 * {@link RcFileCodecFactory} that refuses every request by throwing
 * {@link UnsupportedOperationException}. The files written by this test set the codec flag in
 * the header to false.
 */
private static class BogusRcFileCodecFactory
        implements RcFileCodecFactory
{
    /** Always throws {@link UnsupportedOperationException}. */
    @Override
    public RcFileCompressor createCompressor(String codecName)
    {
        throw new UnsupportedOperationException();
    }

    /** Always throws {@link UnsupportedOperationException}. */
    @Override
    public RcFileDecompressor createDecompressor(String codecName)
    {
        throw new UnsupportedOperationException();
    }
}
}