/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.nifi.processors.kite;
import org.apache.avro.Schema;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.io.OutputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.kitesdk.data.spi.JsonUtil;
import org.kitesdk.data.spi.filesystem.CSVProperties;
import org.kitesdk.data.spi.filesystem.CSVUtil;
import java.io.InputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.BufferedReader;
import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
@Tags({"kite", "avro", "infer", "schema", "csv", "json"})
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@CapabilityDescription("Examines the contents of the incoming FlowFile to infer an Avro schema. The processor will" +
" use the Kite SDK to make an attempt to automatically generate an Avro schema from the incoming content." +
" When inferring the schema from JSON data the key names will be used in the resulting Avro schema" +
" definition. When inferring from CSV data a \"header definition\" must be present either as the first line of the incoming data" +
" or the \"header definition\" must be explicitly set in the property \"CSV Header Definition\". A \"header definition\"" +
" is simply a single comma separated line defining the names of each column. The \"header definition\" is" +
" required in order to determine the names that should be given to each field in the resulting Avro definition." +
" When inferring data types the higher order data type is always used if there is ambiguity." +
" For example when examining numerical values the type may be set to \"long\" instead of \"integer\" since a long can" +
" safely hold the value of any \"integer\". Only CSV and JSON content is currently supported for automatically inferring an" +
" Avro schema. The type of content present in the incoming FlowFile is set by using the property \"Input Content Type\"." +
" The property can either be explicitly set to CSV, JSON, or \"use mime.type value\" which will examine the" +
" value of the mime.type attribute on the incoming FlowFile to determine the type of content present.")
@ReadsAttributes({
@ReadsAttribute(attribute = "mime.type", description = "If configured by property \"Input Content Type\" will" +
" use this value to determine what sort of content should be inferred from the incoming FlowFile content."),
})
@WritesAttributes({
@WritesAttribute(attribute = "inferred.avro.schema", description = "If configured by \"Schema output destination\" to" +
" write to an attribute this will hold the resulting Avro schema from inferring the incoming FlowFile content."),
})
public class InferAvroSchema
extends AbstractKiteProcessor {
private static final Validator CHAR_VALIDATOR = new Validator() {
@Override
public ValidationResult validate(String subject, String input, ValidationContext context) {
// Allows special, escaped characters as input, which is then unescaped and converted to a single character.
// Examples for special characters: \t (or \u0009), \f.
input = unescapeString(input);
return new ValidationResult.Builder()
.subject(subject)
.input(input)
.explanation("Only non-null single characters are supported")
.valid(input.length() == 1 && input.charAt(0) != 0 || context.isExpressionLanguagePresent(input))
.build();
}
};
public static final String USE_MIME_TYPE = "use mime.type value";
public static final String JSON_CONTENT = "json";
public static final String CSV_CONTENT = "csv";
public static final String AVRO_SCHEMA_ATTRIBUTE_NAME = "inferred.avro.schema";
public static final String DESTINATION_ATTRIBUTE = "flowfile-attribute";
public static final String DESTINATION_CONTENT = "flowfile-content";
public static final String JSON_MIME_TYPE = "application/json";
public static final String CSV_MIME_TYPE = "text/csv";
public static final String AVRO_MIME_TYPE = "application/avro-binary";
public static final String AVRO_FILE_EXTENSION = ".avro";
public static final Pattern AVRO_RECORD_NAME_PATTERN = Pattern.compile("[A-Za-z_]+[A-Za-z0-9_.]*[^.]");
public static final PropertyDescriptor SCHEMA_DESTINATION = new PropertyDescriptor.Builder()
.name("Schema Output Destination")
.description("Control if Avro schema is written as a new flowfile attribute '" + AVRO_SCHEMA_ATTRIBUTE_NAME + "' " +
"or written in the flowfile content. Writing to flowfile content will overwrite any " +
"existing flowfile content.")
.required(true)
.allowableValues(DESTINATION_ATTRIBUTE, DESTINATION_CONTENT)
.defaultValue(DESTINATION_CONTENT)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
public static final PropertyDescriptor INPUT_CONTENT_TYPE = new PropertyDescriptor.Builder()
.name("Input Content Type")
.description("Content Type of data present in the incoming FlowFile's content. Only \"" +
JSON_CONTENT + "\" or \"" + CSV_CONTENT + "\" are supported." +
" If this value is set to \"" + USE_MIME_TYPE + "\" the incoming Flowfile's attribute \"" + CoreAttributes.MIME_TYPE + "\"" +
" will be used to determine the Content Type.")
.allowableValues(USE_MIME_TYPE, JSON_CONTENT, CSV_CONTENT)
.defaultValue(USE_MIME_TYPE)
.required(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
public static final PropertyDescriptor GET_CSV_HEADER_DEFINITION_FROM_INPUT = new PropertyDescriptor.Builder()
.name("Get CSV Header Definition From Data")
.description("This property only applies to CSV content type. If \"true\" the processor will attempt to read the CSV header definition from the" +
" first line of the input data.")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor CSV_HEADER_DEFINITION = new PropertyDescriptor.Builder()
.name("CSV Header Definition")
.description("This property only applies to CSV content type. Comma separated string defining the column names expected in the CSV data." +
" EX: \"fname,lname,zip,address\". The elements present in this string should be in the same order" +
" as the underlying data. Setting this property will cause the value of" +
" \"" + GET_CSV_HEADER_DEFINITION_FROM_INPUT.getName() + "\" to be ignored instead using this value.")
.required(false)
.expressionLanguageSupported(true)
.defaultValue(null)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
public static final PropertyDescriptor HEADER_LINE_SKIP_COUNT = new PropertyDescriptor.Builder()
.name("CSV Header Line Skip Count")
.description("This property only applies to CSV content type. Specifies the number of lines that should be skipped when reading the CSV data." +
" Setting this value to 0 is equivalent to saying \"the entire contents of the file should be read\". If the" +
" property \"" + GET_CSV_HEADER_DEFINITION_FROM_INPUT.getName() + "\" is set then the first line of the CSV " +
" file will be read in and treated as the CSV header definition. Since this will remove the header line from the data" +
" care should be taken to make sure the value of \"CSV header Line Skip Count\" is set to 0 to ensure" +
" no data is skipped.")
.required(true)
.defaultValue("0")
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.build();
public static final PropertyDescriptor DELIMITER = new PropertyDescriptor.Builder()
.name("CSV delimiter")
.description("Delimiter character for CSV records")
.expressionLanguageSupported(true)
.addValidator(CHAR_VALIDATOR)
.defaultValue(",")
.build();
public static final PropertyDescriptor ESCAPE_STRING = new PropertyDescriptor.Builder()
.name("CSV Escape String")
.description("This property only applies to CSV content type. String that represents an escape sequence" +
" in the CSV FlowFile content data.")
.required(true)
.defaultValue("\\")
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
public static final PropertyDescriptor QUOTE_STRING = new PropertyDescriptor.Builder()
.name("CSV Quote String")
.description("This property only applies to CSV content type. String that represents a literal quote" +
" character in the CSV FlowFile content data.")
.required(true)
.defaultValue("'")
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
public static final PropertyDescriptor RECORD_NAME = new PropertyDescriptor.Builder()
.name("Avro Record Name")
.description("Value to be placed in the Avro record schema \"name\" field. The value must adhere to the Avro naming "
+ "rules for fullname. If Expression Language is present then the evaluated value must adhere to the Avro naming rules.")
.required(true)
.expressionLanguageSupported(true)
.addValidator(StandardValidators.createRegexMatchingValidator(AVRO_RECORD_NAME_PATTERN))
.build();
public static final PropertyDescriptor CHARSET = new PropertyDescriptor.Builder()
.name("Charset")
.description("Character encoding of CSV data.")
.required(true)
.defaultValue("UTF-8")
.expressionLanguageSupported(true)
.addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
.build();
public static final PropertyDescriptor PRETTY_AVRO_OUTPUT = new PropertyDescriptor.Builder()
.name("Pretty Avro Output")
.description("If true the Avro output will be formatted.")
.required(true)
.defaultValue("true")
.allowableValues("true", "false")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor NUM_RECORDS_TO_ANALYZE = new PropertyDescriptor.Builder()
.name("Number Of Records To Analyze")
.description("This property only applies to JSON content type. The number of JSON records that should be" +
" examined to determine the Avro schema. The higher the value the better chance kite has of detecting" +
" the appropriate type. However the default value of 10 is almost always enough.")
.required(true)
.defaultValue("10")
.expressionLanguageSupported(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.build();
public static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
.description("Successfully created Avro schema from data.").build();
public static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original")
.description("Original incoming FlowFile data").build();
public static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
.description("Failed to create Avro schema from data.").build();
public static final Relationship REL_UNSUPPORTED_CONTENT = new Relationship.Builder().name("unsupported content")
.description("The content found in the flowfile content is not of the required format.").build();
private List<PropertyDescriptor> properties;
private Set<Relationship> relationships;
@Override
protected void init(final ProcessorInitializationContext context) {
final List<PropertyDescriptor> properties = new ArrayList<>();
properties.add(SCHEMA_DESTINATION);
properties.add(INPUT_CONTENT_TYPE);
properties.add(CSV_HEADER_DEFINITION);
properties.add(GET_CSV_HEADER_DEFINITION_FROM_INPUT);
properties.add(HEADER_LINE_SKIP_COUNT);
properties.add(DELIMITER);
properties.add(ESCAPE_STRING);
properties.add(QUOTE_STRING);
properties.add(PRETTY_AVRO_OUTPUT);
properties.add(RECORD_NAME);
properties.add(NUM_RECORDS_TO_ANALYZE);
properties.add(CHARSET);
this.properties = Collections.unmodifiableList(properties);
final Set<Relationship> relationships = new HashSet<>();
relationships.add(REL_SUCCESS);
relationships.add(REL_FAILURE);
relationships.add(REL_ORIGINAL);
relationships.add(REL_UNSUPPORTED_CONTENT);
this.relationships = Collections.unmodifiableSet(relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return properties;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final FlowFile original = session.get();
if (original == null) {
return;
}
try {
final AtomicReference<String> avroSchema = new AtomicReference<>();
switch (context.getProperty(INPUT_CONTENT_TYPE).getValue()) {
case USE_MIME_TYPE:
avroSchema.set(inferAvroSchemaFromMimeType(original, context, session));
break;
case JSON_CONTENT:
avroSchema.set(inferAvroSchemaFromJSON(original, context, session));
break;
case CSV_CONTENT:
avroSchema.set(inferAvroSchemaFromCSV(original, context, session));
break;
default:
//Shouldn't be possible but just in case
session.transfer(original, REL_UNSUPPORTED_CONTENT);
break;
}
if (StringUtils.isNotEmpty(avroSchema.get())) {
String destination = context.getProperty(SCHEMA_DESTINATION).getValue();
FlowFile avroSchemaFF = null;
switch (destination) {
case DESTINATION_ATTRIBUTE:
avroSchemaFF = session.putAttribute(session.clone(original), AVRO_SCHEMA_ATTRIBUTE_NAME, avroSchema.get());
//Leaves the original CoreAttributes.MIME_TYPE in place.
break;
case DESTINATION_CONTENT:
avroSchemaFF = session.write(session.create(), new OutputStreamCallback() {
@Override
public void process(OutputStream out) throws IOException {
out.write(avroSchema.get().getBytes());
}
});
avroSchemaFF = session.putAttribute(avroSchemaFF, CoreAttributes.MIME_TYPE.key(), AVRO_MIME_TYPE);
break;
default:
break;
}
//Transfer the sessions.
avroSchemaFF = session.putAttribute(avroSchemaFF, CoreAttributes.FILENAME.key(), (original.getAttribute(CoreAttributes.FILENAME.key()) + AVRO_FILE_EXTENSION));
session.transfer(avroSchemaFF, REL_SUCCESS);
session.transfer(original, REL_ORIGINAL);
} else {
//If the avroSchema is null then the content type is unknown and therefore unsupported
session.transfer(original, REL_UNSUPPORTED_CONTENT);
}
} catch (Exception ex) {
getLogger().error("Failed to infer Avro schema for {} due to {}", new Object[] {original, ex});
session.transfer(original, REL_FAILURE);
}
}
/**
* Infers the Avro schema from the input Flowfile content. To infer an Avro schema for CSV content a header line is
* required. You can configure the processor to pull that header line from the first line of the CSV data if it is
* present OR you can manually supply the desired header line as a property value.
*
* @param inputFlowFile
* The original input FlowFile containing the CSV content as it entered this processor.
*
* @param context
* ProcessContext to pull processor configurations.
*
* @param session
* ProcessSession to transfer FlowFiles
*/
private String inferAvroSchemaFromCSV(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) {
//Determines the header line either from the property input or the first line of the delimited file.
final AtomicReference<String> header = new AtomicReference<>();
final AtomicReference<Boolean> hasHeader = new AtomicReference<>();
if (context.getProperty(GET_CSV_HEADER_DEFINITION_FROM_INPUT).asBoolean() == Boolean.TRUE) {
//Read the first line of the file to get the header value.
session.read(inputFlowFile, new InputStreamCallback() {
@Override
public void process(InputStream in) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(in));
header.set(br.readLine());
hasHeader.set(Boolean.TRUE);
br.close();
}
});
hasHeader.set(Boolean.TRUE);
} else {
header.set(context.getProperty(CSV_HEADER_DEFINITION).evaluateAttributeExpressions(inputFlowFile).getValue());
hasHeader.set(Boolean.FALSE);
}
//Prepares the CSVProperties for kite
CSVProperties props = new CSVProperties.Builder()
.charset(context.getProperty(CHARSET).evaluateAttributeExpressions(inputFlowFile).getValue())
.delimiter(context.getProperty(DELIMITER).evaluateAttributeExpressions(inputFlowFile).getValue())
.quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
.escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions(inputFlowFile).getValue())
.linesToSkip(context.getProperty(HEADER_LINE_SKIP_COUNT).evaluateAttributeExpressions(inputFlowFile).asInteger())
.header(header.get())
.hasHeader(hasHeader.get())
.build();
final AtomicReference<String> avroSchema = new AtomicReference<>();
session.read(inputFlowFile, new InputStreamCallback() {
@Override
public void process(InputStream in) throws IOException {
avroSchema.set(CSVUtil
.inferSchema(
context.getProperty(RECORD_NAME).evaluateAttributeExpressions(inputFlowFile).getValue(), in, props)
.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
}
});
return avroSchema.get();
}
/**
* Infers the Avro schema from the input Flowfile content.
*
* @param inputFlowFile
* The original input FlowFile containing the JSON content as it entered this processor.
*
* @param context
* ProcessContext to pull processor configurations.
*
* @param session
* ProcessSession to transfer FlowFiles
*/
private String inferAvroSchemaFromJSON(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) {
final AtomicReference<String> avroSchema = new AtomicReference<>();
session.read(inputFlowFile, new InputStreamCallback() {
@Override
public void process(InputStream in) throws IOException {
Schema as = JsonUtil.inferSchema(
in, context.getProperty(RECORD_NAME).evaluateAttributeExpressions(inputFlowFile).getValue(),
context.getProperty(NUM_RECORDS_TO_ANALYZE).evaluateAttributeExpressions(inputFlowFile).asInteger());
avroSchema.set(as.toString(context.getProperty(PRETTY_AVRO_OUTPUT).asBoolean()));
}
});
return avroSchema.get();
}
/**
* Examines the incoming FlowFiles mime.type attribute to determine if the schema should be inferred for CSV or JSON data.
*
* @param inputFlowFile
* The original input FlowFile containing the content.
*
* @param context
* ProcessContext to pull processor configurations.
*
* @param session
* ProcessSession to transfer FlowFiles
*/
private String inferAvroSchemaFromMimeType(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) {
String mimeType = inputFlowFile.getAttribute(CoreAttributes.MIME_TYPE.key());
String avroSchema = "";
if (mimeType!= null) {
switch (mimeType) {
case JSON_MIME_TYPE:
getLogger().debug("Inferred content type as JSON from \"{}\" value of \"{}\"", new Object[]{CoreAttributes.MIME_TYPE.key(),
inputFlowFile.getAttribute(CoreAttributes.MIME_TYPE.key())});
avroSchema = inferAvroSchemaFromJSON(inputFlowFile, context, session);
break;
case CSV_MIME_TYPE:
getLogger().debug("Inferred content type as CSV from \"{}\" value of \"{}\"", new Object[]{CoreAttributes.MIME_TYPE.key(),
inputFlowFile.getAttribute(CoreAttributes.MIME_TYPE.key())});
avroSchema = inferAvroSchemaFromCSV(inputFlowFile, context, session);
break;
default:
getLogger().warn("Unable to infer Avro Schema from {} because its mime type is {}, " +
" which is not supported by this Processor", new Object[] {inputFlowFile, mimeType} );
break;
}
}
return avroSchema;
}
private static String unescapeString(String input) {
if (input.length() > 1) {
input = StringEscapeUtils.unescapeJava(input);
}
return input;
}
}