/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.raptor.storage;
import com.facebook.presto.raptor.util.Closer;
import com.facebook.presto.raptor.util.SyncingFileSystem;
import com.facebook.presto.spi.classloader.ThreadContextClassLoader;
import io.airlift.log.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.NullMemoryManager;
import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.OrcWriterOptions;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import java.io.File;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.List;
import java.util.Map;
import static com.facebook.presto.raptor.util.Closer.closer;
import static io.airlift.slice.SizeOf.SIZE_OF_BYTE;
import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE;
import static io.airlift.slice.SizeOf.SIZE_OF_LONG;
import static io.airlift.units.Duration.nanosSince;
import static java.lang.Math.toIntExact;
import static org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader;
import static org.apache.hadoop.hive.ql.io.orc.OrcFile.createWriter;
import static org.apache.hadoop.hive.ql.io.orc.OrcUtil.getFieldValue;
/**
 * Static utility that compacts an ORC file: copies {@code input} to {@code output},
 * omitting every row whose index is set in a deletion {@link BitSet}.
 * <p>
 * File-level user metadata stored under {@link OrcFileMetadata#KEY} is carried over
 * to the rewritten file; compression codec and object inspector are taken from the
 * input file's reader so the output matches the input's layout settings.
 */
public final class OrcFileRewriter
{
    private static final Logger log = Logger.get(OrcFileRewriter.class);
    private static final Configuration CONFIGURATION = new Configuration();

    // Utility class: all entry points are static.
    private OrcFileRewriter() {}

    /**
     * Rewrites {@code input} into {@code output}, dropping the rows whose indexes
     * are set in {@code rowsToDelete}.
     *
     * @param input source ORC file; must contain at least {@code rowsToDelete.length()} rows
     * @param output destination ORC file (created via {@code SyncingFileSystem})
     * @param rowsToDelete bit {@code i} set means row {@code i} of {@code input} is omitted
     * @return row count and estimated uncompressed size of the rows actually written;
     *         {@code (0, 0)} when every row is deleted, in which case no output file is
     *         written at all (presumably the caller discards the shard — confirm at call site)
     * @throws IOException if the deletion vector is longer than the file, the file has
     *         {@code >= Integer.MAX_VALUE} rows, or any underlying I/O fails
     */
    public static OrcFileInfo rewrite(File input, File output, BitSet rowsToDelete)
            throws IOException
    {
        // Hadoop's FileSystem/ORC classes resolve services via the thread context
        // class loader, so pin it to the loader that owns the Hadoop classes.
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
                FileSystem fileSystem = new SyncingFileSystem(CONFIGURATION)) {
            // NOTE(review): the Reader is not closed explicitly; this Hive ORC Reader
            // vintage presumably holds no open handle after construction — confirm.
            Reader reader = createReader(fileSystem, path(input));

            if (reader.getNumberOfRows() < rowsToDelete.length()) {
                throw new IOException("File has fewer rows than deletion vector");
            }
            int deleteRowCount = rowsToDelete.cardinality();
            if (reader.getNumberOfRows() == deleteRowCount) {
                // Every row deleted: skip the rewrite entirely and report an empty result.
                return new OrcFileInfo(0, 0);
            }
            if (reader.getNumberOfRows() >= Integer.MAX_VALUE) {
                // Guards the toIntExact() below; row indexes are tracked as int.
                throw new IOException("File has too many rows");
            }
            int inputRowCount = toIntExact(reader.getNumberOfRows());

            WriterOptions writerOptions = new OrcWriterOptions(CONFIGURATION)
                    // NullMemoryManager: disables ORC's memory-pressure-driven stripe
                    // flushing — presumably to keep output layout deterministic; confirm.
                    .memory(new NullMemoryManager(CONFIGURATION))
                    .fileSystem(fileSystem)
                    // Preserve the input file's compression and schema.
                    .compress(reader.getCompression())
                    .inspector(reader.getObjectInspector());

            long start = System.nanoTime();
            try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close);
                    Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) {
                // Carry over file-level user metadata (e.g. column type mapping).
                if (reader.hasMetadataValue(OrcFileMetadata.KEY)) {
                    ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY);
                    writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
                }

                OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, inputRowCount);
                log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
                return fileInfo;
            }
        }
    }

    /**
     * Copies every row not flagged in {@code rowsToDelete} from {@code reader} to
     * {@code writer}.
     * <p>
     * Strategy: {@code nextClearBit} finds the next surviving row index. A single
     * {@code seekToRow} jumps to the first survivor; after that, deleted rows are
     * skipped by calling {@code next()} repeatedly until the reader catches up,
     * because per-row seeks are far more expensive than sequential reads.
     *
     * @param reader positioned at the start of the file
     * @param writer destination for surviving rows
     * @param rowsToDelete set bits mark rows to omit
     * @param inputRowCount total rows in the input (bounds the scan)
     * @return count and estimated uncompressed size of the rows written
     * @throws InterruptedIOException if the calling thread is interrupted mid-copy
     *         (the interrupt flag is left set, since isInterrupted() does not clear it)
     */
    private static OrcFileInfo rewrite(RecordReader reader, Writer writer, BitSet rowsToDelete, int inputRowCount)
            throws IOException
    {
        Object object = null;
        int row = 0;
        long rowCount = 0;
        long uncompressedSize = 0;

        // Jump directly to the first surviving row (if any survive).
        row = rowsToDelete.nextClearBit(row);
        if (row < inputRowCount) {
            reader.seekToRow(row);
        }

        while (row < inputRowCount) {
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedIOException();
            }

            // seekToRow() is extremely expensive
            if (reader.getRowNumber() < row) {
                // Still behind the next surviving row: consume and discard one row.
                // The return value is deliberately ignored; 'object' is only a
                // reusable buffer handed back to next() below.
                reader.next(object);
                continue;
            }

            object = reader.next(object);
            writer.addRow(object);
            rowCount++;
            uncompressedSize += uncompressedSize(object);
            // Advance to the next surviving row index after this one.
            row = rowsToDelete.nextClearBit(row + 1);
        }
        return new OrcFileInfo(rowCount, uncompressedSize);
    }

    // Converts a local File to a Hadoop Path via its URI (preserves the file:// scheme).
    private static Path path(File input)
    {
        return new Path(input.toURI());
    }

    /**
     * Recursively estimates the logical uncompressed size of a deserialized ORC value,
     * using fixed per-type sizes for primitives and actual lengths for byte/text data.
     * Nulls and booleans count as one byte.
     *
     * @throws IOException for any writable type not handled below
     */
    private static int uncompressedSize(Object object)
            throws IOException
    {
        if (object instanceof OrcStruct) {
            // Struct: sum of all field sizes.
            OrcStruct struct = (OrcStruct) object;
            int size = 0;
            for (int i = 0; i < struct.getNumFields(); i++) {
                size += uncompressedSize(getFieldValue(struct, i));
            }
            return size;
        }
        if ((object == null) || (object instanceof BooleanWritable)) {
            return SIZE_OF_BYTE;
        }
        if (object instanceof LongWritable) {
            return SIZE_OF_LONG;
        }
        if (object instanceof DoubleWritable) {
            return SIZE_OF_DOUBLE;
        }
        if (object instanceof Text) {
            return ((Text) object).getLength();
        }
        if (object instanceof BytesWritable) {
            return ((BytesWritable) object).getLength();
        }
        if (object instanceof List<?>) {
            // List: sum of element sizes.
            int size = 0;
            for (Object element : (Iterable<?>) object) {
                size += uncompressedSize(element);
            }
            return size;
        }
        if (object instanceof Map<?, ?>) {
            // Map: sum of key and value sizes over all entries.
            int size = 0;
            for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
                size += uncompressedSize(entry.getKey());
                size += uncompressedSize(entry.getValue());
            }
            return size;
        }
        throw new IOException("Unhandled ORC object: " + object.getClass().getName());
    }

    /**
     * Immutable result of a rewrite: the number of rows written and their
     * estimated uncompressed size in bytes.
     */
    public static class OrcFileInfo
    {
        private final long rowCount;
        private final long uncompressedSize;

        public OrcFileInfo(long rowCount, long uncompressedSize)
        {
            this.rowCount = rowCount;
            this.uncompressedSize = uncompressedSize;
        }

        public long getRowCount()
        {
            return rowCount;
        }

        public long getUncompressedSize()
        {
            return uncompressedSize;
        }
    }
}