/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.parquet;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.logical.StoragePluginConfig;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.physical.base.AbstractWriter;
import org.apache.drill.exec.physical.base.PhysicalOperator;
import org.apache.drill.exec.physical.impl.WriterRecordBatch;
import org.apache.drill.exec.planner.logical.DrillTable;
import org.apache.drill.exec.planner.logical.DynamicDrillTable;
import org.apache.drill.exec.proto.ExecProtos.FragmentHandle;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.server.DrillbitContext;
import org.apache.drill.exec.store.RecordWriter;
import org.apache.drill.exec.store.StoragePluginOptimizerRule;
import org.apache.drill.exec.store.dfs.BasicFormatMatcher;
import org.apache.drill.exec.store.dfs.DrillFileSystem;
import org.apache.drill.exec.store.dfs.DrillPathFilter;
import org.apache.drill.exec.store.dfs.FileSelection;
import org.apache.drill.exec.store.dfs.FileSystemConfig;
import org.apache.drill.exec.store.dfs.FileSystemPlugin;
import org.apache.drill.exec.store.dfs.FormatMatcher;
import org.apache.drill.exec.store.dfs.FormatPlugin;
import org.apache.drill.exec.store.dfs.FormatSelection;
import org.apache.drill.exec.store.dfs.MagicString;
import org.apache.drill.exec.store.dfs.MetadataContext;
import org.apache.drill.exec.store.mock.MockStorageEngine;
import org.apache.drill.exec.store.parquet.Metadata.ParquetTableMetadataDirs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileWriter;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
 * {@link FormatPlugin} implementation for Parquet files.
 *
 * <p>The plugin wires together a {@link ParquetFormatMatcher} (used to decide whether a file
 * selection can be read as Parquet), the group scan used for reads, and the writer / record
 * writer used for writes.
 */
public class ParquetFormatPlugin implements FormatPlugin {
  // Log under this class's own category so messages are attributed to the Parquet format plugin.
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ParquetFormatPlugin.class);

  /** Shared converter instance exposed for use by other Parquet classes. */
  public static final ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();

  /** Name used when the plugin is constructed with a {@code null} name. */
  private static final String DEFAULT_NAME = "parquet";

  /** File name patterns handed to the matcher: "*.parquet" files and the Parquet summary metadata file. */
  private static final List<Pattern> PATTERNS = Lists.newArrayList(
      Pattern.compile(".*\\.parquet$"),
      Pattern.compile(".*/" + ParquetFileWriter.PARQUET_METADATA_FILE));

  /** Magic string handed to the matcher: the Parquet magic bytes at offset 0. */
  private static final List<MagicString> MAGIC_STRINGS =
      Lists.newArrayList(new MagicString(0, ParquetFileWriter.MAGIC));

  private final DrillbitContext context;
  private final Configuration fsConf;
  private final ParquetFormatMatcher formatMatcher;
  private final ParquetFormatConfig config;
  private final StoragePluginConfig storageConfig;
  private final String name;

  /**
   * Creates a plugin that uses a freshly constructed, default {@link ParquetFormatConfig}.
   *
   * @param name          plugin name; {@code null} selects the default name "parquet"
   * @param context       drillbit context, returned as-is by {@link #getContext()}
   * @param fsConf        file system configuration, returned as-is by {@link #getFsConf()}
   * @param storageConfig storage plugin configuration, returned as-is by {@link #getStorageConfig()}
   */
  public ParquetFormatPlugin(String name, DrillbitContext context, Configuration fsConf,
      StoragePluginConfig storageConfig) {
    this(name, context, fsConf, storageConfig, new ParquetFormatConfig());
  }

  /**
   * Creates a plugin with an explicit format configuration.
   *
   * @param name          plugin name; {@code null} selects the default name "parquet"
   * @param context       drillbit context, returned as-is by {@link #getContext()}
   * @param fsConf        file system configuration, returned as-is by {@link #getFsConf()}
   * @param storageConfig storage plugin configuration, returned as-is by {@link #getStorageConfig()}
   * @param formatConfig  Parquet format configuration, also passed to the format matcher
   */
  public ParquetFormatPlugin(String name, DrillbitContext context, Configuration fsConf,
      StoragePluginConfig storageConfig, ParquetFormatConfig formatConfig) {
    this.context = context;
    this.config = formatConfig;
    this.formatMatcher = new ParquetFormatMatcher(this, config);
    this.storageConfig = storageConfig;
    this.fsConf = fsConf;
    this.name = name == null ? DEFAULT_NAME : name;
  }

  /** @return the file system configuration supplied at construction */
  @Override
  public Configuration getFsConf() {
    return fsConf;
  }

  /** @return the Parquet format configuration supplied at construction */
  @Override
  public ParquetFormatConfig getConfig() {
    return config;
  }

  /** @return the drillbit context supplied at construction */
  public DrillbitContext getContext() {
    return this.context;
  }

  /** @return {@code true}; reads are served through {@link #getGroupScan} */
  @Override
  public boolean supportsRead() {
    return true;
  }

  /** @return an empty set; this plugin contributes no optimizer rules */
  @Override
  public Set<StoragePluginOptimizerRule> getOptimizerRules() {
    return ImmutableSet.of();
  }

  /**
   * Creates the physical writer operator for this format.
   *
   * @param child            operator producing the data to write
   * @param location         target location, passed through to the {@link ParquetWriter}
   * @param partitionColumns partition columns, passed through to the {@link ParquetWriter}
   * @return a new {@link ParquetWriter} bound to this plugin
   * @throws IOException declared by the interface; propagated if writer construction throws it
   */
  @Override
  public AbstractWriter getWriter(PhysicalOperator child, String location, List<String> partitionColumns)
      throws IOException {
    return new ParquetWriter(child, location, partitionColumns, this);
  }

  /**
   * Builds and initializes a {@link ParquetRecordWriter} for one fragment.
   *
   * <p>The option map handed to {@link RecordWriter#init} contains the writer location, a file
   * prefix of the form {@code <majorFragmentId>_<minorFragmentId>}, the default file system name
   * taken from the writer's storage config, and the Parquet block size, page size, dictionary
   * page size, compression type and dictionary-encoding options read from the fragment's options.
   *
   * @param context fragment context supplying the fragment handle and option values
   * @param writer  writer operator supplying the location and storage config
   * @return the initialized record writer
   * @throws IOException          if creating or initializing the record writer fails
   * @throws OutOfMemoryException declared for the record writer construction
   */
  public RecordWriter getRecordWriter(FragmentContext context, ParquetWriter writer)
      throws IOException, OutOfMemoryException {
    Map<String, String> options = Maps.newHashMap();

    options.put("location", writer.getLocation());

    FragmentHandle handle = context.getHandle();
    String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());
    options.put("prefix", fragmentId);

    // The cast requires the writer's storage config to be a FileSystemConfig; anything else
    // fails here with a ClassCastException.
    options.put(FileSystem.FS_DEFAULT_NAME_KEY, ((FileSystemConfig) writer.getStorageConfig()).connection);

    options.put(ExecConstants.PARQUET_BLOCK_SIZE,
        context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE).num_val.toString());
    options.put(ExecConstants.PARQUET_PAGE_SIZE,
        context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE).num_val.toString());
    options.put(ExecConstants.PARQUET_DICT_PAGE_SIZE,
        context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE).num_val.toString());

    options.put(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE,
        context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE).string_val);

    options.put(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING,
        context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING).bool_val.toString());

    RecordWriter recordWriter = new ParquetRecordWriter(context, writer);
    recordWriter.init(options);

    return recordWriter;
  }

  /**
   * Creates the record batch that drives writing for one fragment.
   *
   * @param context  fragment context
   * @param incoming upstream batch supplying the records to write
   * @param writer   writer operator configuration
   * @return a {@link WriterRecordBatch} wrapping a freshly initialized record writer
   * @throws ExecutionSetupException if creating the record writer fails with an {@link IOException};
   *                                 the original exception is kept as the cause
   */
  public WriterRecordBatch getWriterBatch(FragmentContext context, RecordBatch incoming, ParquetWriter writer)
      throws ExecutionSetupException {
    try {
      return new WriterRecordBatch(writer, incoming, context, getRecordWriter(context, writer));
    } catch (IOException e) {
      throw new ExecutionSetupException(
          String.format("Failed to create the WriterRecordBatch. %s", e.getMessage()), e);
    }
  }

  /**
   * Creates the group scan used to read the given selection.
   *
   * @param userName  user name passed through to the scan
   * @param selection files to scan; its selection root and cache file root are passed to the scan
   * @param columns   columns to read
   * @return a new {@link ParquetGroupScan}
   * @throws IOException if constructing the scan fails
   */
  @Override
  public ParquetGroupScan getGroupScan(String userName, FileSelection selection, List<SchemaPath> columns)
      throws IOException {
    return new ParquetGroupScan(userName, selection, this, selection.selectionRoot, selection.cacheFileRoot, columns);
  }

  /** @return the storage plugin configuration supplied at construction */
  @Override
  public StoragePluginConfig getStorageConfig() {
    return storageConfig;
  }

  /** @return the plugin name, or "parquet" if {@code null} was supplied at construction */
  public String getName() {
    return name;
  }

  /**
   * @return {@code true}; this plugin provides a writer via {@link #getWriter},
   *         {@link #getRecordWriter} and {@link #getWriterBatch}
   */
  @Override
  public boolean supportsWrite() {
    return true;
  }

  /** @return {@code true} */
  @Override
  public boolean supportsAutoPartitioning() {
    return true;
  }

  /** @return the matcher created for this plugin at construction */
  @Override
  public FormatMatcher getMatcher() {
    return formatMatcher;
  }

  /**
   * Format matcher for Parquet. In addition to the pattern / magic-string matching inherited
   * from {@link BasicFormatMatcher}, it accepts directory selections when the directory holds
   * metadata files or a readable first file.
   */
  private static class ParquetFormatMatcher extends BasicFormatMatcher {

    private final ParquetFormatConfig formatConfig;

    /**
     * @param plugin       owning plugin, passed to the base matcher
     * @param formatConfig format configuration used when reading the metadata 'directories' file
     */
    public ParquetFormatMatcher(ParquetFormatPlugin plugin, ParquetFormatConfig formatConfig) {
      super(plugin, PATTERNS, MAGIC_STRINGS);
      this.formatConfig = formatConfig;
    }

    /** @return {@code true}; directory selections are handled in {@link #isReadable} */
    @Override
    public boolean supportDirectoryReads() {
      return true;
    }

    /**
     * Decides whether the selection can be read as a Parquet table.
     *
     * <p>For selections that contain directories, the metadata 'directories' file under the
     * selection root is tried first, then {@link #isDirReadable} on the selection's first path.
     * If neither yields a table, or the selection has no directories, the decision is delegated
     * to the base matcher.
     *
     * @return a {@link DynamicDrillTable} for a readable directory selection, otherwise whatever
     *         the base matcher returns
     * @throws IOException if a file system or metadata read operation fails
     */
    @Override
    public DrillTable isReadable(DrillFileSystem fs, FileSelection selection,
        FileSystemPlugin fsPlugin, String storageEngineName, String userName)
        throws IOException {
      if (selection.containsDirectories(fs)) {
        Path dirMetaPath = new Path(selection.getSelectionRoot(), Metadata.METADATA_DIRECTORIES_FILENAME);
        // check if the metadata 'directories' file exists; if it does, there is an implicit assumption that
        // the directory is readable since the metadata 'directories' file cannot be created otherwise. Note
        // that isDirReadable() does a similar check with the metadata 'cache' file.
        if (fs.exists(dirMetaPath)) {
          // create a metadata context that will be used for the duration of the query for this table
          MetadataContext metaContext = new MetadataContext();

          ParquetTableMetadataDirs mDirs =
              Metadata.readMetadataDirs(fs, dirMetaPath.toString(), metaContext, formatConfig);
          if (mDirs.getDirectories().size() > 0) {
            FileSelection dirSelection = FileSelection.createFromDirectories(mDirs.getDirectories(), selection,
                selection.getSelectionRoot() /* cacheFileRoot initially points to selectionRoot */);
            dirSelection.setExpandedPartial();
            dirSelection.setMetaContext(metaContext);

            return new DynamicDrillTable(fsPlugin, storageEngineName, userName,
                new FormatSelection(plugin.getConfig(), dirSelection));
          }
          // An empty directories list falls through to the plain directory check below.
        }
        if (isDirReadable(fs, selection.getFirstPath(fs))) {
          return new DynamicDrillTable(fsPlugin, storageEngineName, userName,
              new FormatSelection(plugin.getConfig(), selection));
        }
      }
      return super.isReadable(fs, selection, fsPlugin, storageEngineName, userName);
    }

    /** Returns the path of the metadata cache file inside {@code dir}. */
    private Path getMetadataPath(FileStatus dir) {
      return new Path(dir.getPath(), Metadata.METADATA_FILENAME);
    }

    /** Returns whether the metadata cache file exists inside {@code dir}. */
    private boolean metaDataFileExists(FileSystem fs, FileStatus dir) throws IOException {
      return fs.exists(getMetadataPath(dir));
    }

    /**
     * Checks whether a directory can be treated as a Parquet table.
     *
     * <p>The directory is accepted if it contains the Parquet summary metadata file or the
     * metadata cache file. Otherwise the directory is listed through a {@link DrillPathFilter}
     * and the first listed entry is tested with the base matcher's file check; only that one
     * entry is examined.
     *
     * @param fs  file system to query
     * @param dir directory to inspect
     * @return {@code true} if the directory is considered readable; {@code false} if the filtered
     *         listing is empty, the first entry is not readable, or an {@link IOException} occurs
     *         (the exception is logged at info level and not propagated)
     */
    boolean isDirReadable(DrillFileSystem fs, FileStatus dir) {
      Path p = new Path(dir.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      try {
        if (fs.exists(p)) {
          return true;
        } else {

          if (metaDataFileExists(fs, dir)) {
            return true;
          }
          PathFilter filter = new DrillPathFilter();

          FileStatus[] files = fs.listStatus(dir.getPath(), filter);
          if (files.length == 0) {
            return false;
          }
          return super.isFileReadable(fs, files[0]);
        }
      } catch (IOException e) {
        logger.info("Failure while attempting to check for Parquet metadata file.", e);
        return false;
      }
    }
  }
}