package water.parser.parquet;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import water.fvec.Vec;
import water.parser.BufferedString;
import water.parser.ParseWriter;
import water.util.StringUtils;
/**
* Implementation of Parquet's GroupConverter for H2O's chunks.
*
* ChunkConverter is responsible for converting Parquet data into Chunks. As opposed to regular
* Parquet converters, this converter doesn't actually produce any records; instead it writes the
* data to chunks using the provided ParseWriter. The (artificial) output of the converter is the
* index of the record that was written to the chunk.
*
* Note: It is meant to be used as a root converter.
*/
class ChunkConverter extends GroupConverter {
private final WriterDelegate _writer;
private final Converter[] _converters;
private int _currentRecordIdx = -1;
ChunkConverter(MessageType parquetSchema, byte[] chunkSchema, ParseWriter writer) {
_writer = new WriterDelegate(writer, chunkSchema.length);
int colIdx = 0;
_converters = new Converter[chunkSchema.length];
for (Type parquetField : parquetSchema.getFields()) {
assert parquetField.isPrimitive();
_converters[colIdx] = newConverter(colIdx, chunkSchema[colIdx], parquetField.asPrimitiveType());
colIdx++;
}
}
@Override
public Converter getConverter(int fieldIndex) {
return _converters[fieldIndex];
}
@Override
public void start() {
_currentRecordIdx++;
_writer.startLine();
}
@Override
public void end() {
_writer.endLine();
assert _writer.lineNum() - 1 == _currentRecordIdx;
}
int getCurrentRecordIdx() {
return _currentRecordIdx;
}
private PrimitiveConverter newConverter(int colIdx, byte vecType, PrimitiveType parquetType) {
switch (vecType) {
case Vec.T_BAD:
case Vec.T_CAT:
case Vec.T_STR:
case Vec.T_UUID:
case Vec.T_TIME:
if (parquetType.getOriginalType() == OriginalType.TIMESTAMP_MILLIS) {
return new TimestampConverter(colIdx, _writer);
} else {
boolean dictSupport = parquetType.getOriginalType() == OriginalType.UTF8 || parquetType.getOriginalType() == OriginalType.ENUM;
return new StringConverter(_writer, colIdx, dictSupport);
}
case Vec.T_NUM:
return new NumberConverter(colIdx, _writer);
default:
throw new UnsupportedOperationException("Unsupported type " + vecType);
}
}
private static class StringConverter extends PrimitiveConverter {
private final BufferedString _bs = new BufferedString();
private final int _colIdx;
private final WriterDelegate _writer;
private final boolean _dictionarySupport;
private String[] _dict;
StringConverter(WriterDelegate writer, int colIdx, boolean dictionarySupport) {
_colIdx = colIdx;
_writer = writer;
_dictionarySupport = dictionarySupport;
}
@Override
public void addBinary(Binary value) {
_bs.set(StringUtils.bytesOf(value.toStringUsingUTF8()));
_writer.addStrCol(_colIdx, _bs);
}
@Override
public boolean hasDictionarySupport() {
return _dictionarySupport;
}
@Override
public void setDictionary(Dictionary dictionary) {
_dict = new String[dictionary.getMaxId() + 1];
for (int i = 0; i <= dictionary.getMaxId(); i++) {
_dict[i] = dictionary.decodeToBinary(i).toStringUsingUTF8();
}
}
@Override
public void addValueFromDictionary(int dictionaryId) {
_bs.set(StringUtils.bytesOf(_dict[dictionaryId]));
_writer.addStrCol(_colIdx, _bs);
}
}
private static class NumberConverter extends PrimitiveConverter {
private final int _colIdx;
private final WriterDelegate _writer;
private final BufferedString _bs = new BufferedString();
NumberConverter(int _colIdx, WriterDelegate _writer) {
this._colIdx = _colIdx;
this._writer = _writer;
}
@Override
public void addBoolean(boolean value) {
_writer.addNumCol(_colIdx, value ? 1 : 0);
}
@Override
public void addDouble(double value) {
_writer.addNumCol(_colIdx, value);
}
@Override
public void addFloat(float value) {
_writer.addNumCol(_colIdx, value);
}
@Override
public void addInt(int value) {
_writer.addNumCol(_colIdx, value, 0);
}
@Override
public void addLong(long value) {
_writer.addNumCol(_colIdx, value, 0);
}
@Override
public void addBinary(Binary value) {
_bs.set(StringUtils.bytesOf(value.toStringUsingUTF8()));
_writer.addStrCol(_colIdx, _bs);
}
}
private static class TimestampConverter extends PrimitiveConverter {
private final int _colIdx;
private final WriterDelegate _writer;
TimestampConverter(int _colIdx, WriterDelegate _writer) {
this._colIdx = _colIdx;
this._writer = _writer;
}
@Override
public void addLong(long value) {
_writer.addNumCol(_colIdx, value, 0);
}
}
private static class WriterDelegate {
private final ParseWriter _writer;
private final int _numCols;
private int _col;
WriterDelegate(ParseWriter writer, int numCols) {
_writer = writer;
_numCols = numCols;
_col = Integer.MIN_VALUE;
}
void startLine() {
_col = -1;
}
void endLine() {
moveToCol(_numCols);
_writer.newLine();
}
int moveToCol(int colIdx) {
for (int c = _col + 1; c < colIdx; c++) _writer.addInvalidCol(c);
_col = colIdx;
return _col;
}
void addNumCol(int colIdx, long number, int exp) {
_writer.addNumCol(moveToCol(colIdx), number, exp);
}
void addNumCol(int colIdx, double d) {
_writer.addNumCol(moveToCol(colIdx), d);
}
void addStrCol(int colIdx, BufferedString str) {
_writer.addStrCol(moveToCol(colIdx), str);
}
long lineNum() {
return _writer.lineNum();
}
}
}