/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
import org.apache.hadoop.hive.ql.io.orc.OrcFlowFileWriter;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.hive.HiveJdbcCommon;
import org.apache.nifi.util.hive.HiveUtils;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
/**
* The ConvertAvroToORC processor takes an Avro-formatted flow file as input and converts it into ORC format.
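* <p>
* Illustrative example (the exact DDL is produced by NiFiOrcUtils.generateHiveDDL; the record and fields here
* are hypothetical): an incoming Avro record named "test.record" with fields "name" (string) and "age" (int)
* might yield a hive.ddl attribute such as
* "CREATE EXTERNAL TABLE IF NOT EXISTS test_record (name STRING, age INT) STORED AS ORC".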
*/
@SideEffectFree
@SupportsBatching
@Tags({"avro", "orc", "hive", "convert"})
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@CapabilityDescription("Converts an Avro record into ORC file format. This processor provides a direct mapping of an Avro record to an ORC record, such "
+ "that the resulting ORC file will have the same hierarchical structure as the Avro document. If an incoming FlowFile contains a stream of "
+ "multiple Avro records, the resultant FlowFile will contain a ORC file containing all of the Avro records. If an incoming FlowFile does "
+ "not contain any records, an empty ORC file is the output. NOTE: Many Avro datatypes (collections, primitives, and unions of primitives, e.g.) can "
+ "be converted to ORC, but unions of collections and other complex datatypes may not be able to be converted to ORC.")
@WritesAttributes({
@WritesAttribute(attribute = "mime.type", description = "Sets the mime type to application/octet-stream"),
@WritesAttribute(attribute = "filename", description = "Sets the filename to the existing filename with the extension replaced by / added to by .orc"),
@WritesAttribute(attribute = "record.count", description = "Sets the number of records in the ORC file."),
@WritesAttribute(attribute = "hive.ddl", description = "Creates a partial Hive DDL statement for creating a table in Hive from this ORC file. "
+ "This can be used in ReplaceText for setting the content to the DDL. To make it valid DDL, add \"LOCATION '<path_to_orc_file_in_hdfs>'\", where "
+ "the path is the directory that contains this ORC file on HDFS. For example, ConvertAvroToORC can send flow files to a PutHDFS processor to send the file to "
+ "HDFS, then to a ReplaceText to set the content to this DDL (plus the LOCATION clause as described), then to PutHiveQL processor to create the table "
+ "if it doesn't exist.")
})
public class ConvertAvroToORC extends AbstractProcessor {
// Attributes
public static final String ORC_MIME_TYPE = "application/octet-stream";
public static final String HIVE_DDL_ATTRIBUTE = "hive.ddl";
public static final String RECORD_COUNT_ATTRIBUTE = "record.count";
// Properties
public static final PropertyDescriptor ORC_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder()
.name("orc-config-resources")
.displayName("ORC Configuration Resources")
.description("A file or comma separated list of files which contains the ORC configuration (hive-site.xml, e.g.). Without this, Hadoop "
+ "will search the classpath for a 'hive-site.xml' file or will revert to a default configuration. Please see the ORC documentation for more details.")
.required(false).addValidator(HiveUtils.createMultipleFilesExistValidator()).build();
public static final PropertyDescriptor STRIPE_SIZE = new PropertyDescriptor.Builder()
.name("orc-stripe-size")
.displayName("Stripe Size")
.description("The size of the memory buffer (in bytes) for writing stripes to an ORC file")
.required(true)
.addValidator(StandardValidators.DATA_SIZE_VALIDATOR)
.defaultValue("64 MB")
.build();
public static final PropertyDescriptor BUFFER_SIZE = new PropertyDescriptor.Builder()
.name("orc-buffer-size")
.displayName("Buffer Size")
.description("The maximum size of the memory buffers (in bytes) used for compressing and storing a stripe in memory. This is a hint to the ORC writer, "
+ "which may choose to use a smaller buffer size based on stripe size and number of columns for efficient stripe writing and memory utilization.")
.required(true)
.addValidator(StandardValidators.DATA_SIZE_VALIDATOR)
.defaultValue("10 KB")
.build();
public static final PropertyDescriptor COMPRESSION_TYPE = new PropertyDescriptor.Builder()
.name("orc-compression-type")
.displayName("Compression Type")
.required(true)
.allowableValues("NONE", "ZLIB", "SNAPPY", "LZO")
.defaultValue("NONE")
.build();
public static final PropertyDescriptor HIVE_TABLE_NAME = new PropertyDescriptor.Builder()
.name("orc-hive-table-name")
.displayName("Hive Table Name")
.description("An optional table name to insert into the hive.ddl attribute. The generated DDL can be used by "
+ "a PutHiveQL processor (presumably after a PutHDFS processor) to create a table backed by the converted ORC file. "
+ "If this property is not provided, the full name (including namespace) of the incoming Avro record will be normalized "
+ "and used as the table name.")
.required(false)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_BLANK_VALIDATOR)
.build();
// Relationships
static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("A FlowFile is routed to this relationship after it has been converted to ORC format.")
.build();
static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("A FlowFile is routed to this relationship if it cannot be parsed as Avro or cannot be converted to ORC for any reason")
.build();
private final static List<PropertyDescriptor> propertyDescriptors;
private final static Set<Relationship> relationships;
private volatile Configuration orcConfig;
/*
 * Builds the immutable list of property descriptors and the immutable set of relationships once, when the class is loaded.
 */
static {
List<PropertyDescriptor> _propertyDescriptors = new ArrayList<>();
_propertyDescriptors.add(ORC_CONFIGURATION_RESOURCES);
_propertyDescriptors.add(STRIPE_SIZE);
_propertyDescriptors.add(BUFFER_SIZE);
_propertyDescriptors.add(COMPRESSION_TYPE);
_propertyDescriptors.add(HIVE_TABLE_NAME);
propertyDescriptors = Collections.unmodifiableList(_propertyDescriptors);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(_relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
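/*
 * Loads the optional ORC configuration resources (e.g. hive-site.xml) each time the processor is scheduled to run.
 */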
@OnScheduled
public void setup(ProcessContext context) {
boolean confFileProvided = context.getProperty(ORC_CONFIGURATION_RESOURCES).isSet();
if (confFileProvided) {
final String configFiles = context.getProperty(ORC_CONFIGURATION_RESOURCES).getValue();
orcConfig = HiveJdbcCommon.getConfigurationFromFiles(configFiles);
} else {
// Reset so onTrigger creates a default Configuration rather than reusing stale resources from a previous schedule
orcConfig = null;
}
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
try {
long startTime = System.currentTimeMillis();
final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
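// These atomics capture the Avro schema and record count from inside the streaming write callback below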
final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
final AtomicInteger totalRecordCount = new AtomicInteger(0);
final String fileName = flowFile.getAttribute(CoreAttributes.FILENAME.key());
flowFile = session.write(flowFile, (rawIn, rawOut) -> {
try (final InputStream in = new BufferedInputStream(rawIn);
final OutputStream out = new BufferedOutputStream(rawOut);
final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<>())) {
// Create ORC schema from Avro schema
Schema avroSchema = reader.getSchema();
TypeInfo orcSchema = NiFiOrcUtils.getOrcField(avroSchema);
if (orcConfig == null) {
orcConfig = new Configuration();
}
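// Create the ORC writer, which buffers rows into stripes and flushes them to the FlowFile's output stream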
OrcFlowFileWriter orcWriter = NiFiOrcUtils.createWriter(
out,
new Path(fileName),
orcConfig,
orcSchema,
stripeSize,
compressionType,
bufferSize);
try {
int recordCount = 0;
GenericRecord currRecord = null;
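// Convert each Avro record, field by field, into its ORC equivalent and write it out as a row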
while (reader.hasNext()) {
currRecord = reader.next(currRecord);
List<Schema.Field> fields = currRecord.getSchema().getFields();
if (fields != null) {
Object[] row = new Object[fields.size()];
for (int i = 0; i < fields.size(); i++) {
Schema.Field field = fields.get(i);
Schema fieldSchema = field.schema();
Object o = currRecord.get(field.name());
try {
row[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), o);
} catch (ArrayIndexOutOfBoundsException aioobe) {
getLogger().error("Index out of bounds at record {} for column {}, type {}, and object {}",
new Object[]{recordCount, i, fieldSchema.getType().getName(), o},
aioobe);
throw new IOException(aioobe);
}
}
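// Wrap the converted field values in an ORC struct and append it to the file as a single row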
orcWriter.addRow(NiFiOrcUtils.createOrcStruct(orcSchema, row));
recordCount++;
}
}
hiveAvroSchema.set(avroSchema);
totalRecordCount.set(recordCount);
} finally {
// finished writing all records; close the writer (which flushes the remaining data to the flow file)
orcWriter.close();
}
}
});
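// Derive the Hive table name: use the configured property if set, otherwise normalize the Avro record's full name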
final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet()
? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue()
: NiFiOrcUtils.normalizeHiveTableName(hiveAvroSchema.get().getFullName());
String hiveDDL = NiFiOrcUtils.generateHiveDDL(hiveAvroSchema.get(), hiveTableName);
// Add attributes and transfer to success
flowFile = session.putAttribute(flowFile, RECORD_COUNT_ATTRIBUTE, Integer.toString(totalRecordCount.get()));
flowFile = session.putAttribute(flowFile, HIVE_DDL_ATTRIBUTE, hiveDDL);
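// Replace the existing filename extension (if any) with .orc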
StringBuilder newFilename = new StringBuilder();
int extensionIndex = fileName.lastIndexOf(".");
if (extensionIndex != -1) {
newFilename.append(fileName.substring(0, extensionIndex));
} else {
newFilename.append(fileName);
}
newFilename.append(".orc");
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), ORC_MIME_TYPE);
flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename.toString());
session.transfer(flowFile, REL_SUCCESS);
session.getProvenanceReporter().modifyContent(flowFile, "Converted " + totalRecordCount.get() + " records", System.currentTimeMillis() - startTime);
} catch (final ProcessException pe) {
getLogger().error("Failed to convert {} from Avro to ORC due to {}; transferring to failure", new Object[]{flowFile, pe});
session.transfer(flowFile, REL_FAILURE);
}
}
}