/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.nifi.processors.kite;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData.Record;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.util.StopWatch;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.IncompatibleSchemaException;
import org.kitesdk.data.ValidationException;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.SchemaValidationUtil;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;

@InputRequirement(Requirement.INPUT_REQUIRED)
@Tags({"kite", "avro", "parquet", "hadoop", "hive", "hdfs", "hbase"})
@CapabilityDescription("Stores Avro records in a Kite dataset")
public class StoreInKiteDataset extends AbstractKiteProcessor {

    private static final Relationship SUCCESS = new Relationship.Builder()
            .name("success")
            .description("FlowFile content has been successfully saved")
            .build();

    private static final Relationship INCOMPATIBLE = new Relationship.Builder()
            .name("incompatible")
            .description("FlowFile content is not compatible with the target dataset")
            .build();

    private static final Relationship FAILURE = new Relationship.Builder()
            .name("failure")
            .description("FlowFile content could not be processed")
            .build();

    public static final PropertyDescriptor KITE_DATASET_URI = new PropertyDescriptor.Builder()
            .name("Target dataset URI")
            .description("URI that identifies a Kite dataset where data will be stored")
            .addValidator(RECOGNIZED_URI)
            .expressionLanguageSupported(true)
            .required(true)
            .build();

    private static final List<PropertyDescriptor> PROPERTIES = ImmutableList.<PropertyDescriptor>builder()
            .addAll(AbstractKiteProcessor.getProperties())
            .add(KITE_DATASET_URI)
            .build();

    private static final Set<Relationship> RELATIONSHIPS = ImmutableSet.<Relationship>builder()
            .add(SUCCESS)
            .add(INCOMPATIBLE)
            .add(FAILURE)
            .build();
    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return PROPERTIES;
    }

    @Override
    public Set<Relationship> getRelationships() {
        return RELATIONSHIPS;
    }

    @Override
    public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        // Resolve the target dataset view and the Avro schema that incoming records must match
        final View<Record> target = load(context, flowFile);
        final Schema schema = target.getDataset().getDescriptor().getSchema();

        try {
            StopWatch timer = new StopWatch(true);
            session.read(flowFile, new InputStreamCallback() {
                @Override
                public void process(InputStream in) throws IOException {
                    try (DataFileStream<Record> stream = new DataFileStream<>(
                            in, AvroUtil.newDatumReader(schema, Record.class))) {
                        // A schema mismatch throws IncompatibleSchemaException, a
                        // ValidationException that routes the FlowFile to 'incompatible'
                        IncompatibleSchemaException.check(
                                SchemaValidationUtil.canRead(stream.getSchema(), schema),
                                "Incompatible file schema %s, expected %s",
                                stream.getSchema(), schema);

                        // Copy each record from the Avro file into the dataset, counting
                        // records written so far even if the copy fails partway through
                        long written = 0L;
                        try (DatasetWriter<Record> writer = target.newWriter()) {
                            for (Record record : stream) {
                                writer.write(record);
                                written += 1;
                            }
                        } finally {
                            session.adjustCounter("Stored records", written,
                                    true /* cannot roll back the write */);
                        }
                    }
                }
            });
            timer.stop();

            session.getProvenanceReporter().send(flowFile,
                    target.getUri().toString(),
                    timer.getDuration(TimeUnit.MILLISECONDS),
                    true /* cannot roll back the write */);

            session.transfer(flowFile, SUCCESS);
        } catch (ProcessException | DatasetIOException e) {
            getLogger().error("Failed to read FlowFile", e);
            session.transfer(flowFile, FAILURE);
        } catch (ValidationException e) {
            getLogger().error(e.getMessage());
            getLogger().debug("Incompatible schema error", e);
            session.transfer(flowFile, INCOMPATIBLE);
        }
    }

    // The dataset URI property supports expression language, evaluated against the incoming FlowFile
    private View<Record> load(ProcessContext context, FlowFile file) {
        String uri = context.getProperty(KITE_DATASET_URI)
                .evaluateAttributeExpressions(file)
                .getValue();
        return Datasets.load(uri, Record.class);
    }
}