/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive.orc;
import com.facebook.presto.hive.FileFormatDataSourceStats;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveClientConfig;
import com.facebook.presto.hive.HiveColumnHandle;
import com.facebook.presto.hive.HivePageSourceFactory;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcDataSourceId;
import com.facebook.presto.orc.OrcPredicate;
import com.facebook.presto.orc.OrcReader;
import com.facebook.presto.orc.OrcRecordReader;
import com.facebook.presto.orc.TupleDomainOrcPredicate;
import com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference;
import com.facebook.presto.orc.memory.AggregatedMemoryContext;
import com.facebook.presto.orc.metadata.MetadataReader;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.TypeManager;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.units.DataSize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.joda.time.DateTimeZone;
import javax.inject.Inject;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.regex.Pattern;
import static com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR;
import static com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT;
import static com.facebook.presto.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES;
import static com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA;
import static com.facebook.presto.hive.HiveSessionProperties.getOrcMaxBufferSize;
import static com.facebook.presto.hive.HiveSessionProperties.getOrcMaxMergeDistance;
import static com.facebook.presto.hive.HiveSessionProperties.getOrcStreamBufferSize;
import static com.facebook.presto.hive.HiveSessionProperties.isOrcBloomFiltersEnabled;
import static com.facebook.presto.hive.HiveUtil.isDeserializerClass;
import static com.google.common.base.Strings.nullToEmpty;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
/**
 * {@link HivePageSourceFactory} that produces {@link OrcPageSource} instances for splits
 * whose table schema names {@link OrcSerde} as the deserializer.
 *
 * <p>When {@code useOrcColumnNames} is enabled, requested columns are matched to the file's
 * columns by name (taken from {@link OrcReader#getColumnNames()}) instead of by the Hive
 * column index carried in the {@link HiveColumnHandle}.
 */
public class OrcPageSourceFactory
        implements HivePageSourceFactory
{
    // Column names of the form "_col<digits>"; a file whose columns ALL match this pattern
    // is treated as carrying no real column names (see verifyFileHasColumnNames).
    private static final Pattern DEFAULT_HIVE_COLUMN_NAME_PATTERN = Pattern.compile("_col\\d+");

    private final TypeManager typeManager;
    private final boolean useOrcColumnNames;
    private final HdfsEnvironment hdfsEnvironment;
    private final FileFormatDataSourceStats stats;

    /**
     * Injection constructor; reads the {@code useOrcColumnNames} flag from the supplied config.
     *
     * @throws NullPointerException if any argument is null
     */
    @Inject
    public OrcPageSourceFactory(TypeManager typeManager, HiveClientConfig config, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats)
    {
        this(typeManager, requireNonNull(config, "hiveClientConfig is null").isUseOrcColumnNames(), hdfsEnvironment, stats);
    }

    /**
     * @param typeManager resolves column type signatures to {@link Type} instances
     * @param useOrcColumnNames whether columns are matched to the file by name rather than by index
     * @param hdfsEnvironment used to obtain the {@link FileSystem} for a split's path
     * @param stats passed through to every data source this factory opens
     * @throws NullPointerException if any reference argument is null
     */
    public OrcPageSourceFactory(TypeManager typeManager, boolean useOrcColumnNames, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats)
    {
        this.typeManager = requireNonNull(typeManager, "typeManager is null");
        this.useOrcColumnNames = useOrcColumnNames;
        this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
        this.stats = requireNonNull(stats, "stats is null");
    }

    /**
     * Creates a page source for the given split if the schema's deserializer is {@link OrcSerde}.
     *
     * @return the page source, or {@link Optional#empty()} when the schema does not use
     *         {@link OrcSerde} (so another factory may handle the split)
     * @throws PrestoException if the split cannot be opened (see {@link #createOrcPageSource})
     */
    @Override
    public Optional<? extends ConnectorPageSource> createPageSource(
            Configuration configuration,
            ConnectorSession session,
            Path path,
            long start,
            long length,
            Properties schema,
            List<HiveColumnHandle> columns,
            TupleDomain<HiveColumnHandle> effectivePredicate,
            DateTimeZone hiveStorageTimeZone)
    {
        if (!isDeserializerClass(schema, OrcSerde.class)) {
            return Optional.empty();
        }

        return Optional.of(createOrcPageSource(
                new OrcMetadataReader(),
                hdfsEnvironment,
                session.getUser(),
                configuration,
                path,
                start,
                length,
                columns,
                useOrcColumnNames,
                effectivePredicate,
                hiveStorageTimeZone,
                typeManager,
                getOrcMaxMergeDistance(session),
                getOrcMaxBufferSize(session),
                getOrcStreamBufferSize(session),
                isOrcBloomFiltersEnabled(session),
                stats));
    }

    /**
     * Opens {@code path} and builds an {@link OrcPageSource} over the byte range described by
     * {@code start} and {@code length}.
     *
     * <p>Only columns whose column type is {@code REGULAR} are requested from the ORC reader and
     * made available to the predicate; the page source itself still receives every column.
     *
     * <p>If anything fails after the data source has been opened, the data source is closed
     * before the failure is reported. A failure of that close is attached to the reported
     * exception as a suppressed exception and never replaces it.
     *
     * @throws PrestoException with {@code HIVE_CANNOT_OPEN_SPLIT} when the file cannot be opened
     *         or the reader cannot be created; with {@code HIVE_MISSING_DATA} when the failure's
     *         class is named {@code BlockMissingException}; any {@link PrestoException} raised
     *         while building the reader (for example {@code HIVE_FILE_MISSING_COLUMN_NAMES})
     *         is rethrown unchanged
     */
    public static OrcPageSource createOrcPageSource(
            MetadataReader metadataReader,
            HdfsEnvironment hdfsEnvironment,
            String sessionUser,
            Configuration configuration,
            Path path,
            long start,
            long length,
            List<HiveColumnHandle> columns,
            boolean useOrcColumnNames,
            TupleDomain<HiveColumnHandle> effectivePredicate,
            DateTimeZone hiveStorageTimeZone,
            TypeManager typeManager,
            DataSize maxMergeDistance,
            DataSize maxBufferSize,
            DataSize streamBufferSize,
            boolean orcBloomFiltersEnabled,
            FileFormatDataSourceStats stats)
    {
        OrcDataSource orcDataSource = openOrcDataSource(
                hdfsEnvironment,
                sessionUser,
                configuration,
                path,
                start,
                length,
                maxMergeDistance,
                maxBufferSize,
                streamBufferSize,
                stats);

        AggregatedMemoryContext systemMemoryUsage = new AggregatedMemoryContext();
        try {
            OrcReader reader = new OrcReader(orcDataSource, metadataReader, maxMergeDistance, maxBufferSize);

            List<HiveColumnHandle> physicalColumns = getPhysicalHiveColumnHandles(columns, useOrcColumnNames, reader, path);
            ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
            ImmutableList.Builder<ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
            for (HiveColumnHandle column : physicalColumns) {
                if (column.getColumnType() == REGULAR) {
                    Type type = typeManager.getType(column.getTypeSignature());
                    includedColumns.put(column.getHiveColumnIndex(), type);
                    columnReferences.add(new ColumnReference<>(column, column.getHiveColumnIndex(), type));
                }
            }

            OrcPredicate predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences.build(), orcBloomFiltersEnabled);

            OrcRecordReader recordReader = reader.createRecordReader(
                    includedColumns.build(),
                    predicate,
                    start,
                    length,
                    hiveStorageTimeZone,
                    systemMemoryUsage);

            return new OrcPageSource(
                    recordReader,
                    orcDataSource,
                    physicalColumns,
                    typeManager,
                    systemMemoryUsage);
        }
        catch (Exception e) {
            // Release the data source, but never let a failing close() hide the real cause:
            // both checked and unchecked close failures are recorded as suppressed on 'e'.
            try {
                orcDataSource.close();
            }
            catch (IOException | RuntimeException closeFailure) {
                if (closeFailure != e) {
                    e.addSuppressed(closeFailure);
                }
            }

            if (e instanceof PrestoException) {
                throw (PrestoException) e;
            }
            String message = splitError(e, path, start, length);
            // Matched by simple class name rather than by type.
            if (e.getClass().getSimpleName().equals("BlockMissingException")) {
                throw new PrestoException(HIVE_MISSING_DATA, message, e);
            }
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
        }
    }

    /**
     * Resolves the file system for {@code path}, reads the file length, opens the file and wraps
     * the stream in an {@link HdfsOrcDataSource}.
     *
     * <p>If a failure occurs after the stream has been opened, the stream is closed before the
     * failure is reported so that it is not leaked.
     *
     * @throws PrestoException with {@code HIVE_CANNOT_OPEN_SPLIT} on any failure; the split
     *         details are added to the message unless the failure is a
     *         {@link FileNotFoundException} or its message is "Filesystem closed"
     */
    private static OrcDataSource openOrcDataSource(
            HdfsEnvironment hdfsEnvironment,
            String sessionUser,
            Configuration configuration,
            Path path,
            long start,
            long length,
            DataSize maxMergeDistance,
            DataSize maxBufferSize,
            DataSize streamBufferSize,
            FileFormatDataSourceStats stats)
    {
        FSDataInputStream inputStream = null;
        try {
            FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
            long size = fileSystem.getFileStatus(path).getLen();
            inputStream = fileSystem.open(path);
            return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), size, maxMergeDistance, maxBufferSize, streamBufferSize, inputStream, stats);
        }
        catch (Exception e) {
            // The stream is only non-null when open() succeeded and a later step failed.
            if (inputStream != null) {
                try {
                    inputStream.close();
                }
                catch (IOException | RuntimeException closeFailure) {
                    if (closeFailure != e) {
                        e.addSuppressed(closeFailure);
                    }
                }
            }

            if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") ||
                    e instanceof FileNotFoundException) {
                throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
            }
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
        }
    }

    /**
     * Formats the standard "Error opening Hive split" message for the given split and failure.
     */
    private static String splitError(Throwable t, Path path, long start, long length)
    {
        return format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, t.getMessage());
    }

    /**
     * Returns the column handles to use against the file.
     *
     * <p>When {@code useOrcColumnNames} is false the input list is returned as is. Otherwise each
     * handle is copied with its Hive column index replaced by the position of the same-named
     * column in the file.
     *
     * @throws PrestoException with {@code HIVE_FILE_MISSING_COLUMN_NAMES} if name matching is
     *         requested but the file only has default-style column names
     */
    private static List<HiveColumnHandle> getPhysicalHiveColumnHandles(List<HiveColumnHandle> columns, boolean useOrcColumnNames, OrcReader reader, Path path)
    {
        if (!useOrcColumnNames) {
            return columns;
        }

        verifyFileHasColumnNames(reader.getColumnNames(), path);

        Map<String, Integer> physicalNameOrdinalMap = buildPhysicalNameOrdinalMap(reader);
        int nextMissingColumnIndex = physicalNameOrdinalMap.size();

        ImmutableList.Builder<HiveColumnHandle> physicalColumns = ImmutableList.builder();
        for (HiveColumnHandle column : columns) {
            Integer physicalOrdinal = physicalNameOrdinalMap.get(column.getName());
            if (physicalOrdinal == null) {
                // if the column is missing from the file, assign it a column number larger
                // than the number of columns in the file so the reader will fill it with nulls
                physicalOrdinal = nextMissingColumnIndex;
                nextMissingColumnIndex++;
            }
            physicalColumns.add(new HiveColumnHandle(column.getClientId(), column.getName(), column.getHiveType(), column.getTypeSignature(), physicalOrdinal, column.getColumnType(), column.getComment()));
        }
        return physicalColumns.build();
    }

    /**
     * Rejects files whose column list is non-empty and consists solely of names matching
     * {@code _col<digits>}.
     *
     * @throws PrestoException with {@code HIVE_FILE_MISSING_COLUMN_NAMES} in that case
     */
    private static void verifyFileHasColumnNames(List<String> physicalColumnNames, Path path)
    {
        if (!physicalColumnNames.isEmpty() && physicalColumnNames.stream().allMatch(physicalColumnName -> DEFAULT_HIVE_COLUMN_NAME_PATTERN.matcher(physicalColumnName).matches())) {
            throw new PrestoException(
                    HIVE_FILE_MISSING_COLUMN_NAMES,
                    "ORC file does not contain column names in the footer: " + path);
        }
    }

    /**
     * Maps each column name reported by the reader to its zero-based position in that list.
     */
    private static Map<String, Integer> buildPhysicalNameOrdinalMap(OrcReader reader)
    {
        ImmutableMap.Builder<String, Integer> physicalNameOrdinalMap = ImmutableMap.builder();

        int ordinal = 0;
        for (String physicalColumnName : reader.getColumnNames()) {
            physicalNameOrdinalMap.put(physicalColumnName, ordinal);
            ordinal++;
        }

        return physicalNameOrdinalMap.build();
    }
}