/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.nifi.processors.kite;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
public class TestInferAvroSchema {
private TestRunner runner = null;
/**
 * Creates a fresh {@link TestRunner} for {@link InferAvroSchema} and applies the
 * property values that the individual tests start from.
 */
@Before
public void setup() {
    runner = TestRunners.newTestRunner(InferAvroSchema.class);

    // Before any property has been configured the processor is expected to be invalid.
    runner.assertNotValid();

    // How the input is identified and where the inferred schema is written.
    runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);
    runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_CONTENT);
    runner.setProperty(InferAvroSchema.RECORD_NAME, "org.apache.nifi.contact");
    runner.setProperty(InferAvroSchema.PRETTY_AVRO_OUTPUT, "true");

    // CSV parsing defaults shared by the tests.
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
    runner.setProperty(InferAvroSchema.HEADER_LINE_SKIP_COUNT, "0");
    runner.setProperty(InferAvroSchema.ESCAPE_STRING, "\\");
    runner.setProperty(InferAvroSchema.QUOTE_STRING, "'");
    runner.setProperty(InferAvroSchema.CHARSET, "UTF-8");
}
/**
 * Checks validation of the record name property: malformed names must be rejected,
 * while well-formed names and Expression Language references must be accepted.
 */
@Test
public void testRecordName() throws Exception {
    final String[] rejectedNames = {
        "org.apache.nifi.contact.", // dot at the end is invalid
        "avro-schema",              // dashes are invalid
        "1Record",                  // name cannot start with a digit
        ".record"                   // name cannot start with a dot
    };
    for (final String rejected : rejectedNames) {
        runner.setProperty(InferAvroSchema.RECORD_NAME, rejected);
        runner.assertNotValid();
    }

    final String[] acceptedNames = {
        "avro_schema",
        "org.apache.nifi.contact",
        "${filename}" // EL is valid, although its value may not be when evaluated
    };
    for (final String accepted : acceptedNames) {
        runner.setProperty(InferAvroSchema.RECORD_NAME, accepted);
        runner.assertValid();
    }
}
/**
 * Infers a schema from a CSV file whose header line is part of the input and
 * compares the flow file content with the expected schema file.
 */
@Test
public void inferAvroSchemaFromHeaderDefinitionOfCSVFile() throws Exception {
    runner.assertValid();

    final Map<String, String> csvAttributes = new HashMap<>();
    csvAttributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

    runner.enqueue(Paths.get("src/test/resources/Shapes_Header.csv"), csvAttributes);
    runner.run();

    // Exactly one schema flow file plus the untouched original are expected.
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    final MockFlowFile inferred = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    final byte[] expectedSchema = unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro"));
    inferred.assertContentEquals(expectedSchema);
    inferred.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
/**
 * Infers a schema from a JSON file, writes it to a flow file attribute and compares
 * it with the expected schema file.
 */
@Test
public void inferAvroSchemaFromJSONFile() throws Exception {
    runner.assertValid();

    runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);

    // Purposely set to true to test that none of the JSON file is read as a CSV header, which would cause issues.
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
    runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE);

    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
    runner.enqueue(new File("src/test/resources/Shapes.json").toPath(), attributes);

    runner.run();
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME);
    String knownSchema = new String(unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes.json.avro")), StandardCharsets.UTF_8);

    // JUnit expects (expected, actual); the known schema is the expected value.
    Assert.assertEquals(knownSchema, avroSchema);

    // Since the Avro schema is written to an attribute, the MIME type should be the same as the original.
    data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json");
}
/**
 * Infers a schema for a header-less CSV file, with the header definition supplied
 * through the processor property instead of being read from the input.
 */
@Test
public void inferAvroSchemaFromCSVFile() throws Exception {
    runner.assertValid();

    // Read in the header; try-with-resources makes sure the file stream is closed again.
    final StringWriter writer = new StringWriter();
    try (final BufferedInputStream headerStream = new BufferedInputStream(
            Files.newInputStream(Paths.get("src/test/resources/ShapesHeader.csv"), StandardOpenOption.READ))) {
        IOUtils.copy(headerStream, writer, "UTF-8");
    }
    runner.setProperty(InferAvroSchema.CSV_HEADER_DEFINITION, writer.toString());
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "false");

    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
    runner.enqueue(new File("src/test/resources/Shapes_NoHeader.csv").toPath(), attributes);

    runner.run();
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    data.assertContentEquals(unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
    data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
/**
 * Supplies the CSV header through the processor property while the enqueued content
 * also starts with that header line, which is skipped via the header line skip count.
 */
@Test
public void inferSchemaFormHeaderLinePropertyOfProcessor() throws Exception {
    // Decode and encode explicitly as UTF-8 (the charset configured in setup) instead of
    // relying on the platform default charset.
    final String csvHeaderLine = FileUtils.readFileToString(new File("src/test/resources/ShapesHeader.csv"), StandardCharsets.UTF_8);

    runner.assertValid();
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "false");
    runner.setProperty(InferAvroSchema.CSV_HEADER_DEFINITION, csvHeaderLine);
    runner.setProperty(InferAvroSchema.HEADER_LINE_SKIP_COUNT, "1");
    runner.assertValid();

    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
    runner.enqueue((csvHeaderLine + "\nJane,Doe,29,55555").getBytes(StandardCharsets.UTF_8), attributes);

    runner.run();
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
/**
 * Enqueues an empty CSV flow file and expects it to be routed to failure only.
 */
@Test
public void inferSchemaFromEmptyContent() throws Exception {
    runner.assertValid();

    final Map<String, String> csvAttributes = new HashMap<>();
    csvAttributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

    final String emptyContent = "";
    runner.enqueue(emptyContent, csvAttributes);
    runner.run();

    // Nothing can be inferred from empty content, so only the failure relationship receives a flow file.
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);
}
/**
 * Infers a schema from a tab-delimited file with the delimiter property given as the
 * escaped sequence {@code \t}, and compares the result with the expected schema file.
 */
@Test
public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFile() throws Exception {
    final String escapedTab = "\\t";
    runner.setProperty(InferAvroSchema.DELIMITER, escapedTab);
    runner.assertValid();

    final Map<String, String> csvAttributes = new HashMap<>();
    csvAttributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

    runner.enqueue(Paths.get("src/test/resources/Shapes_Header_TabDelimited.csv"), csvAttributes);
    runner.run();

    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    final MockFlowFile inferred = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    final byte[] expectedSchema = unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro"));
    inferred.assertContentEquals(expectedSchema);
    inferred.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
/**
 * Negative case: the tab-delimited file is processed with a delimiter that does not
 * match it, and the unchanged input is expected on the failure relationship.
 */
@Test
public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFileNegativeTest() throws Exception {
    // Improper InferAvroSchema.DELIMITER -> the original goes to InferAvroSchema.REL_FAILURE
    runner.setProperty(InferAvroSchema.DELIMITER, ";");
    runner.assertValid();

    final Map<String, String> csvAttributes = new HashMap<>();
    csvAttributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

    runner.enqueue(Paths.get("src/test/resources/Shapes_Header_TabDelimited.csv"), csvAttributes);
    runner.run();

    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);

    // The failed flow file must still carry the input's content and MIME type.
    final MockFlowFile failed = runner.getFlowFilesForRelationship(InferAvroSchema.REL_FAILURE).get(0);
    failed.assertContentEquals(Paths.get("src/test/resources/Shapes_Header_TabDelimited.csv"));
    failed.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv");
}
/**
 * Configures the CSV delimiter, escape, quote and charset properties through Expression
 * Language and supplies their values as attributes of the enqueued flow file.
 */
@Test
public void specifyCSVparametersInExpressionLanguage() throws Exception {
    runner.setProperty(InferAvroSchema.DELIMITER, "${csv.delimiter}");
    runner.setProperty(InferAvroSchema.ESCAPE_STRING, "${csv.escape}");
    runner.setProperty(InferAvroSchema.QUOTE_STRING, "${csv.quote}");
    runner.setProperty(InferAvroSchema.CHARSET, "${csv.charset}");
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");

    runner.assertValid();

    // A plain HashMap avoids the anonymous subclass created by double-brace initialization,
    // which captured the test instance and needed a "serial" warning suppression.
    final Map<String, String> attributes = new HashMap<>();
    attributes.put("csv.delimiter", ",");
    attributes.put("csv.escape", "\\");
    attributes.put("csv.quote", "\"");
    attributes.put("csv.charset", "UTF-8");
    attributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");

    runner.enqueue(new File("src/test/resources/Shapes_Header.csv").toPath(), attributes);

    runner.run();
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile flowFile = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    flowFile.assertContentEquals(unix2PlatformSpecificLineEndings(new File("src/test/resources/Shapes_header.csv.avro")));
    flowFile.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
/**
 * Configures the record name and the number of records to analyze through Expression
 * Language and checks that the record name from the flow file attribute appears in
 * the schema written to the schema attribute.
 */
@Test
public void specifyJsonParametersInExpressionLanguage() throws Exception {
    runner.assertValid();
    runner.setProperty(InferAvroSchema.INPUT_CONTENT_TYPE, InferAvroSchema.USE_MIME_TYPE);

    // Purposely set to true to test that none of the JSON file is read as a CSV header, which would cause issues.
    runner.setProperty(InferAvroSchema.GET_CSV_HEADER_DEFINITION_FROM_INPUT, "true");
    runner.setProperty(InferAvroSchema.SCHEMA_DESTINATION, InferAvroSchema.DESTINATION_ATTRIBUTE);
    runner.setProperty(InferAvroSchema.RECORD_NAME, "${record.name}");
    runner.setProperty(InferAvroSchema.NUM_RECORDS_TO_ANALYZE, "${records.analyze}");

    Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.MIME_TYPE.key(), "application/json");
    attributes.put("record.name", "myrecord");
    attributes.put("records.analyze", "2");
    runner.enqueue(new File("src/test/resources/Shapes.json").toPath(), attributes);

    runner.run();
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    MockFlowFile data = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    String avroSchema = data.getAttribute(InferAvroSchema.AVRO_SCHEMA_ATTRIBUTE_NAME);

    // Fail with a clear assertion instead of a NullPointerException when the attribute is missing.
    Assert.assertNotNull("Schema attribute is missing on the success flow file", avroSchema);
    Assert.assertTrue("Record name not found in inferred schema: " + avroSchema,
            avroSchema.contains("\"name\" : \"myrecord\""));

    // Since the Avro schema is written to an attribute, the MIME type should be the same as the original.
    data.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/json");
}
/**
 * Reads the given file and returns its bytes with every line break replaced by the
 * platform line separator (encoded as UTF-8).
 *
 * <p>Both a lone {@code \n} and a {@code \r\n} pair count as one line break, so a file
 * that already has Windows line endings is not turned into {@code \r\r\n}. A {@code \r}
 * that is not followed by {@code \n} is copied unchanged.
 *
 * @param file the file to read
 * @return the file content with platform-specific line endings
 * @throws IOException if the file cannot be read
 */
static byte[] unix2PlatformSpecificLineEndings(final File file) throws IOException {
    final byte[] eol = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
    try (final BufferedInputStream in = new BufferedInputStream(new FileInputStream(file));
            final ByteArrayOutputStream out = new ByteArrayOutputStream()) {
        // True while a '\r' has been read but not yet written, because it may belong to a "\r\n" pair.
        boolean pendingCarriageReturn = false;
        int justRead;
        while ((justRead = in.read()) != -1) {
            if (justRead == '\n') {
                // Covers both "\n" and "\r\n": a pending '\r' is absorbed into the line break.
                out.write(eol);
                pendingCarriageReturn = false;
            } else {
                if (pendingCarriageReturn) {
                    // The previous '\r' was not part of a line break; emit it as-is.
                    out.write('\r');
                }
                pendingCarriageReturn = justRead == '\r';
                if (!pendingCarriageReturn) {
                    out.write(justRead);
                }
            }
        }
        if (pendingCarriageReturn) {
            // A trailing '\r' at the end of the file is preserved.
            out.write('\r');
        }
        return out.toByteArray();
    }
}
}