/**
* Copyright 2011-2017 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.directio.hive.orc;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcFile.OrcTableProperties;
import org.apache.hadoop.hive.ql.io.orc.OrcFile.Version;
import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.StripeInformation;
import com.asakusafw.directio.hive.info.BuiltinStorageFormatInfo;
import com.asakusafw.directio.hive.info.StorageFormatInfo;
import com.asakusafw.directio.hive.info.TableInfo;
import com.asakusafw.directio.hive.serde.DataModelDescriptor;
import com.asakusafw.directio.hive.serde.DataModelInspector;
import com.asakusafw.directio.hive.serde.DataModelMapping;
import com.asakusafw.directio.hive.serde.PropertyDescriptor;
import com.asakusafw.runtime.directio.Counter;
import com.asakusafw.runtime.directio.DirectInputFragment;
import com.asakusafw.runtime.directio.hadoop.BlockMap;
import com.asakusafw.runtime.directio.hadoop.HadoopFileFormat;
import com.asakusafw.runtime.directio.hadoop.StripedDataFormat;
import com.asakusafw.runtime.io.ModelInput;
import com.asakusafw.runtime.io.ModelOutput;
/**
* An abstract implementation of {@link HadoopFileFormat} for ORCFile.
* @param <T> the data model type
* @since 0.7.0
*/
public abstract class AbstractOrcFileFormat<T> extends HadoopFileFormat<T>
implements StripedDataFormat<T>, TableInfo.Provider {
static final Log LOG = LogFactory.getLog(AbstractOrcFileFormat.class);
/**
 * Returns the format configuration.
 * Within this class the result is read by {@link #getSchema()} (compression kind and
 * stripe size table properties), by {@code createInput(...)} (field mapping and
 * error handling strategies), and by {@code createOutput(...)} (format version,
 * compression kind and stripe size). Each individual setting may be {@code null},
 * in which case it is skipped.
 * NOTE(review): the returned object itself is dereferenced without a null check,
 * so implementations are presumably expected to never return {@code null} - confirm.
 * @return the format configuration
 */
public abstract OrcFormatConfiguration getFormatConfiguration();
/**
 * Returns the target data model descriptor.
 * Within this class the descriptor supplies the table columns and comment for
 * {@link #getSchema()}, the data model class for {@link #getSupportedType()},
 * and is handed to the ORC input/output objects created by this format.
 * @return the target data model descriptor
 */
public abstract DataModelDescriptor getDataModelDescriptor();
/**
 * Returns the table name.
 * The value is used as the name of the {@link TableInfo} built by {@link #getSchema()}.
 * @return the table name
 */
public abstract String getTableName();
/**
 * Builds the table schema for this format from the data model descriptor
 * and the ORC format configuration.
 * @return the table information, declared as ORC storage format
 */
@Override
public TableInfo getSchema() {
    DataModelDescriptor descriptor = getDataModelDescriptor();
    TableInfo.Builder schema = new TableInfo.Builder(getTableName());

    // one column per data model property, in the order the descriptor reports them
    for (PropertyDescriptor column : descriptor.getPropertyDescriptors()) {
        schema.withColumn(column.getSchema());
    }
    schema.withComment(descriptor.getDataModelComment());
    schema.withStorageFormat(BuiltinStorageFormatInfo.of(StorageFormatInfo.FormatKind.ORC));

    // only explicitly configured ORC settings become table properties
    OrcFormatConfiguration format = getFormatConfiguration();
    Map<String, String> tableProperties = new HashMap<>();
    putTableProperty(tableProperties, OrcTableProperties.COMPRESSION, format.getCompressionKind());
    putTableProperty(tableProperties, OrcTableProperties.STRIPE_SIZE, format.getStripeSize());
    schema.withProperties(tableProperties);

    return schema.build();
}
/**
 * Registers a table property using the string form of the value.
 * Nothing is registered when the value is {@code null}.
 * @param results the destination property map
 * @param property the ORC table property whose name is used as the key
 * @param value the property value, or {@code null} if it is not configured
 */
private void putTableProperty(Map<String, String> results, OrcTableProperties property, Object value) {
    if (value != null) {
        results.put(property.getPropName(), String.valueOf(value));
    }
}
/**
 * Returns the data model class reported by the data model descriptor.
 * @return the supported data model type
 */
@Override
@SuppressWarnings("unchecked")
public Class<T> getSupportedType() {
    // the descriptor is not parameterized by T, so the cast cannot be checked
    Class<T> modelClass = (Class<T>) getDataModelDescriptor().getDataModelClass();
    return modelClass;
}
/**
 * Computes the input fragments for the input files of the given context.
 * Each ORC stripe of each file yields one fragment, obtained from the file's
 * block map for the byte range the stripe occupies.
 * @param context the input context providing the files, file system and data type
 * @return the fragments, in file order and then in stripe order
 * @throws IOException if failed to read file metadata
 * @throws InterruptedException if interrupted while computing fragments
 */
@Override
public List<DirectInputFragment> computeInputFragments(
        InputContext context) throws IOException, InterruptedException {
    // TODO parallel?
    List<DirectInputFragment> fragments = new ArrayList<>();
    for (FileStatus file : context.getInputFiles()) {
        Path path = file.getPath();
        if (LOG.isInfoEnabled()) {
            LOG.info(MessageFormat.format(
                    Messages.getString("AbstractOrcFileFormat.infoLoadMetadata"), //$NON-NLS-1$
                    context.getDataType().getSimpleName(),
                    path));
        }

        FileSystem fs = context.getFileSystem();
        Reader reader = OrcFile.createReader(fs, path);
        if (LOG.isInfoEnabled()) {
            LOG.info(MessageFormat.format(
                    Messages.getString("AbstractOrcFileFormat.infoAnalyzeMetadata"), //$NON-NLS-1$
                    context.getDataType().getSimpleName(),
                    path,
                    reader.getNumberOfRows(),
                    reader.getRawDataSize()));
        }

        BlockMap blocks = BlockMap.create(
                path.toString(),
                file.getLen(),
                BlockMap.computeBlocks(fs, file),
                false);

        // TODO configurable split
        for (StripeInformation stripe : reader.getStripes()) {
            long stripeOffset = stripe.getOffset();
            long stripeLength = stripe.getLength();
            DirectInputFragment piece = blocks.get(stripeOffset, stripeOffset + stripeLength);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "Detect ORCFile stripe: path={0}, rows={1}, range={2}+{3}, allocation={4}", //$NON-NLS-1$
                        piece.getPath(),
                        stripe.getNumberOfRows(),
                        piece.getOffset(),
                        piece.getSize(),
                        piece.getOwnerNodeNames()));
            }
            fragments.add(piece);
        }
    }
    return fragments;
}
/**
 * Creates an ORC file input for the given file fragment.
 * Mapping strategies that are set in the format configuration are copied to the
 * mapping passed to the reader; unset ({@code null}) ones are left untouched.
 * @param dataType the data model type
 * @param fileSystem the file system holding the file
 * @param path the file path
 * @param offset the fragment offset in bytes
 * @param fragmentSize the fragment size in bytes; a negative value makes the
 *     length of the file be used instead
 * @param counter the counter passed to the created input
 * @return the created input
 * @throws IOException if failed to obtain the file status or to create the input
 * @throws InterruptedException if interrupted while creating the input
 */
@Override
public ModelInput<T> createInput(
        Class<? extends T> dataType,
        FileSystem fileSystem, Path path,
        long offset, long fragmentSize,
        Counter counter) throws IOException, InterruptedException {
    DataModelMapping mapping = new DataModelMapping();
    OrcFormatConfiguration format = getFormatConfiguration();
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "ORCFile input ({0}): {1}", //$NON-NLS-1$
                path,
                format));
    }

    // override only the strategies that are explicitly configured
    if (format.getFieldMappingStrategy() != null) {
        mapping.setFieldMappingStrategy(format.getFieldMappingStrategy());
    }
    if (format.getOnMissingSource() != null) {
        mapping.setOnMissingSource(format.getOnMissingSource());
    }
    if (format.getOnMissingTarget() != null) {
        mapping.setOnMissingTarget(format.getOnMissingTarget());
    }
    if (format.getOnIncompatibleType() != null) {
        mapping.setOnIncompatibleType(format.getOnIncompatibleType());
    }

    // a negative fragment size falls back to the length of the file
    long length = fragmentSize >= 0L
            ? fragmentSize
            : fileSystem.getFileStatus(path).getLen();

    return new OrcFileInput<>(
            getDataModelDescriptor(), mapping,
            fileSystem, path,
            offset, length, counter);
}
/**
 * Creates an ORC file output for the given path.
 * The format version, compression kind and stripe size are applied to the writer
 * options only when they are set in the format configuration.
 * @param dataType the data model type
 * @param fileSystem the file system to write to
 * @param path the output file path
 * @param counter the counter passed to the created output
 * @return the created output
 * @throws IOException if failed to create the output
 * @throws InterruptedException if interrupted while creating the output
 */
@Override
public ModelOutput<T> createOutput(
        Class<? extends T> dataType,
        FileSystem fileSystem, Path path,
        Counter counter) throws IOException, InterruptedException {
    WriterOptions writerOptions = OrcFile.writerOptions(getConf())
            .fileSystem(fileSystem)
            .inspector(new DataModelInspector(getDataModelDescriptor()));

    OrcFormatConfiguration format = getFormatConfiguration();
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "ORCFile output ({0}): {1}", //$NON-NLS-1$
                path,
                format));
    }

    // leave the writer defaults in place for anything that is not configured
    Version version = format.getFormatVersion();
    if (version != null) {
        writerOptions.version(version);
    }
    CompressionKind compression = format.getCompressionKind();
    if (compression != null) {
        writerOptions.compress(compression);
    }
    Long stripe = format.getStripeSize();
    if (stripe != null) {
        writerOptions.stripeSize(stripe);
    }

    return new OrcFileOutput<>(getDataModelDescriptor(), path, writerOptions, counter);
}
}